In [15]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [16]:
# Open the tab-separated eBird sampling file with dask. Every column is read
# as `object`, so no dtype inference happens at load time; numeric and date
# columns are converted explicitly later in the notebook.
# The path uses a forward slash so it contains no backslash escape sequence
# and resolves on every operating system.
data_dd = dd.read_csv(
    "india_birds/ebd_IN_smp_relMar-2024_sampling.txt",
    sep="\t",
    header=0,
    dtype=object,
)
# Keep the full list of column names of the raw file for reference.
columns = data_dd.columns

In [17]:
# Restrict the table to the fields used below, keep only checklists recorded
# under the two accepted protocol types, and discard rows with missing values.
wanted_columns = ["STATE", "LATITUDE", "LONGITUDE", "OBSERVATION DATE", "OBSERVER ID", "PROTOCOL TYPE"]
accepted_protocols = ["Traveling", "Stationary"]

selected_dd = data_dd[wanted_columns]
protocol_mask = selected_dd["PROTOCOL TYPE"].isin(accepted_protocols)
data_dd = (
    selected_dd[protocol_mask]
    .drop("PROTOCOL TYPE", axis=1)  # only needed for the filter above
    .dropna()
)

In [18]:
# Evaluate the dask pipeline built above and keep the result as `data_df`;
# the following cells operate on it with pandas.
data_df = data_dd.compute()

In [19]:
# Every column was loaded as `object`; give the ones used downstream a proper
# dtype: float coordinates, categorical state, datetime observation date.
data_df = data_df.astype(
    {
        "LATITUDE": float,
        "LONGITUDE": float,
        "STATE": "category",
    }
)
data_df["OBSERVATION DATE"] = pd.to_datetime(data_df["OBSERVATION DATE"])

In [20]:
# Count the rows (checklists) contributed by each observer ...
checklists_per_observer = data_df.groupby("OBSERVER ID").size()
numlist_df = checklists_per_observer.rename("NUMLIST").reset_index()

# ... and keep only observers with more than 10 of them.
numlist_df_filtered = numlist_df.loc[numlist_df["NUMLIST"].gt(10)]

In [21]:
# Keep the checklists of the retained observers and attach each observer's
# checklist count (NUMLIST) to every one of their rows.
retained_observers = numlist_df_filtered["OBSERVER ID"]
from_retained_observer = data_df["OBSERVER ID"].isin(retained_observers)
data_df_filtered = pd.merge(
    data_df.loc[from_retained_observer],
    numlist_df_filtered,
    on="OBSERVER ID",
)

In [22]:
# Preview the first rows of the merged frame (checklist fields plus NUMLIST).
data_df_filtered.head()

Unnamed: 0,STATE,LATITUDE,LONGITUDE,OBSERVATION DATE,OBSERVER ID,NUMLIST
0,Tamil Nadu,11.407898,76.871835,2018-05-13,obsr1145548,520
1,Tamil Nadu,11.407898,76.871835,2018-05-13,obsr1145548,520
2,Karnataka,12.444002,75.712597,2019-05-13,obsr1145548,520
3,Karnataka,13.148651,77.487903,2019-12-15,obsr1145548,520
4,Karnataka,12.9358,77.604335,2019-02-16,obsr1145548,520


In [23]:
from sklearn.cluster import MeanShift, estimate_bandwidth

# First-level clustering: run MeanShift on the raw (latitude, longitude)
# pairs of every retained checklist. The kernel bandwidth is estimated from a
# seeded subsample of at most 10000 rows.
X = data_df_filtered[["LATITUDE", "LONGITUDE"]]
bandwidth = estimate_bandwidth(X, quantile=0.089, n_samples=10000, random_state=0)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1).fit(X)
labels, cluster_centers_1 = ms.labels_, ms.cluster_centers_

# Store the label of each row as a categorical column.
data_df_filtered["CLUSTER 1"] = pd.Series(labels, index=data_df_filtered.index).astype("category")

In [24]:
# Map a random subset of the checklists, coloured by first-level cluster.
# The subset size is capped at the number of available rows (sampling more
# rows than exist without replacement raises), and the draw is seeded so the
# figure is identical on every run.
n_points = min(40000, len(data_df_filtered))
fig = px.scatter_mapbox(
    data_df_filtered.sample(n_points, random_state=0),
    lat="LATITUDE",
    lon="LONGITUDE",
    color="CLUSTER 1",
    zoom=3,
    height=800,
    width=1200,
)
fig.update_layout(mapbox_style="open-street-map")

fig.show()

In [25]:
# Second-level clustering: run MeanShift again inside every first-level
# cluster and record the resulting sub-cluster label of each row.
#
# Labels are collected in an integer array and written to the frame once, so
# "CLUSTER 2" holds integers (assigning slice by slice into a not-yet-existing
# column would create it as float and turn label 0 into 0.0). Writing the
# whole column at the end also makes the cell safe to re-run.
sub_labels = np.full(len(data_df_filtered), -1, dtype=int)  # -1 = not assigned

for i in range(len(cluster_centers_1)):
    # Compute the membership mask once per first-level cluster.
    in_cluster = (data_df_filtered["CLUSTER 1"] == i).to_numpy()
    X = data_df_filtered.loc[in_cluster, ["LATITUDE", "LONGITUDE"]]
    if X.empty:
        # No row carries this label; nothing to sub-cluster.
        continue

    bandwidth = estimate_bandwidth(X, quantile=0.3, n_samples=10000, random_state=0)
    if bandwidth > 0:
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1)
        ms.fit(X)
        labels = ms.labels_
        cluster_centers_2 = ms.cluster_centers_
    else:
        # The estimate can be 0 for very small clusters or clusters whose
        # points coincide, and MeanShift requires a positive bandwidth.
        # Treat such a cluster as a single sub-cluster centred on its mean.
        labels = np.zeros(len(X), dtype=int)
        cluster_centers_2 = X.to_numpy().mean(axis=0, keepdims=True)

    sub_labels[in_cluster] = labels

data_df_filtered["CLUSTER 2"] = sub_labels


In [26]:
# Render both cluster levels as text, keep the textual second-level label in
# the frame, and join the two with "00" into one combined identifier.
level_1_text = data_df_filtered["CLUSTER 1"].astype(str)
level_2_text = data_df_filtered["CLUSTER 2"].astype(str)

data_df_filtered["CLUSTER 2"] = level_2_text
data_df_filtered["CLUSTER ID"] = level_1_text + "00" + level_2_text

In [27]:
# Map a random subset of the checklists, coloured by the combined cluster
# identifier. The subset size is capped at the number of available rows
# (sampling more rows than exist without replacement raises), and the draw is
# seeded so the figure is identical on every run.
n_points = min(50000, len(data_df_filtered))
fig = px.scatter_mapbox(
    data_df_filtered.sample(n_points, random_state=0),
    lat="LATITUDE",
    lon="LONGITUDE",
    color="CLUSTER ID",
    zoom=3,
    height=800,
    width=1200,
)
fig.update_layout(mapbox_style="open-street-map")

fig.show()