In [None]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
import geopandas as gpd
warnings.filterwarnings('ignore')

In [None]:
import random

# Seed both the stdlib generator and NumPy's global generator. pandas'
# DataFrame.sample (used for the map plots) draws from NumPy's global state
# when no random_state is passed, so seeding `random` alone does not make
# those samples repeatable.
random.seed(0)
np.random.seed(0)

In [None]:
# Read the tab-separated eBird sampling file with dask, keeping every column
# as `object` so no dtype inference happens at load time.
# The path uses a forward slash: a backslash before "e" is an invalid escape
# sequence in a normal string literal and only resolves as a path on Windows.
data_dd = dd.read_csv("india_birds/ebd_IN_smp_relMar-2024_sampling.txt", sep='\t', header=0, dtype=object)
columns = data_dd.columns

In [None]:
# Load the India outline and express it in WGS84; it is drawn as a line layer
# on top of the latitude/longitude scatter maps further down.
india_geojson = gpd.read_file("india-composite.geojson").to_crs("WGS84")

Keep only checklists recorded with the Traveling or Stationary protocol.

In [None]:
# Columns needed by the rest of the analysis.
kept_columns = [
    "STATE", "LATITUDE", "LONGITUDE", "OBSERVATION DATE", "OBSERVER ID",
    "PROTOCOL TYPE", 'TIME OBSERVATIONS STARTED', 'SAMPLING EVENT IDENTIFIER',
    'PROJECT CODE', 'GROUP IDENTIFIER',
]
data_dd = data_dd[kept_columns]

# Restrict to the two protocols of interest; the protocol column is not
# needed once the filter has been applied.
wanted_protocol = data_dd["PROTOCOL TYPE"].isin(["Traveling", "Stationary"])
data_dd = data_dd[wanted_protocol].drop(columns=["PROTOCOL TYPE"])

# A missing group identifier is replaced by a placeholder so the dropna()
# below only removes rows with gaps in the remaining columns.
data_dd["GROUP IDENTIFIER"] = data_dd["GROUP IDENTIFIER"].fillna("No Group")
data_dd = data_dd.dropna()

In [None]:
# Evaluate the filtered dask frame and keep the result as `data_df`, which all
# later cells work on.
data_df = data_dd.compute()

In [None]:
# Parse the start-time strings into datetimes; values that cannot be parsed
# become NaT instead of raising (errors='coerce').
data_df["TIME OBSERVATIONS STARTED"] = pd.to_datetime(data_df["TIME OBSERVATIONS STARTED"], errors='coerce')

In [None]:
# data_dd.visualize()

In [None]:
# Everything was read as `object`; give the columns used numerically,
# temporally or as group keys their proper dtypes.
for coordinate in ("LATITUDE", "LONGITUDE"):
    data_df[coordinate] = data_df[coordinate].astype(float)
data_df["OBSERVATION DATE"] = pd.to_datetime(data_df["OBSERVATION DATE"])
data_df["STATE"] = data_df["STATE"].astype('category')

In [None]:
# Combine the observation date and the start time into one timestamp.
date_text = data_df["OBSERVATION DATE"].astype(str)
start_time = data_df["TIME OBSERVATIONS STARTED"].dt.time
data_df["DATE TIME"] = pd.to_datetime(date_text + " " + start_time.astype(str), errors='coerce')

# From here on the start-time column holds only the time-of-day part.
data_df["TIME OBSERVATIONS STARTED"] = start_time

In [None]:
# NUMLIST = number of checklists submitted by each observer.
checklists_per_observer = data_df.groupby("OBSERVER ID").size()
numlist_df = checklists_per_observer.rename("NUMLIST").reset_index()

# Only observers with more than 10 checklists are analysed.
numlist_df_filtered = numlist_df.loc[numlist_df["NUMLIST"] > 10]

In [None]:
# Keep the checklists of the retained observers and attach their NUMLIST.
# The merge also gives the result a fresh 0..n-1 index.
is_retained_observer = data_df["OBSERVER ID"].isin(numlist_df_filtered["OBSERVER ID"])
data_df_filtered = data_df.loc[is_retained_observer].merge(numlist_df_filtered, on="OBSERVER ID")

In [None]:
# Display the filtered checklists for inspection.
data_df_filtered

In [None]:
from sklearn.cluster import MeanShift, estimate_bandwidth

# First-level clustering: MeanShift over the coordinates of all checklists.
X = data_df_filtered[["LATITUDE", "LONGITUDE"]]
# Earlier experiments used quantile=0.01 and quantile=0.005 with n_samples=10000.
bandwidth = estimate_bandwidth(X, quantile=0.5, n_samples=20000, n_jobs=-1, random_state=0)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1).fit(X)
labels, cluster_centers = ms.labels_, ms.cluster_centers_

# Store the first-level label of every checklist as a categorical column.
data_df_filtered["CLUSTER"] = pd.Series(labels, index=data_df_filtered.index).astype('category')


In [None]:
# Keep the first-level centers under their own name; `cluster_centers` is
# reassigned by later clustering cells.
cluster_centers_first = cluster_centers

In [None]:
# Alias (not a copy) of data_df_filtered used for the first-level map.
first_map = data_df_filtered

In [None]:
# Plot a random subset of checklists coloured by first-level cluster.
sampling_rate = 20000
# Sampling without replacement raises when more rows are requested than the
# frame holds, so cap the request at the number of available rows.
first_map_sample = first_map.sample(min(sampling_rate, len(first_map)))
fig = px.scatter_mapbox(first_map_sample, lat="LATITUDE", lon="LONGITUDE", color="CLUSTER", zoom=2)
fig.update_layout(mapbox_style="open-street-map", width=1000, height=1000)
# Draw the India outline as a thin black line layer on top of the base map.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:

# Second-level clustering: run MeanShift separately inside every first-level
# cluster. Labels and centers are collected per first-level cluster, in
# first-level label order.
second_clusters = []
second_cluster_centers = []
for first_label in range(len(cluster_centers_first)):
    in_first_cluster = data_df_filtered["CLUSTER"] == first_label
    X = data_df_filtered.loc[in_first_cluster, ["LATITUDE", "LONGITUDE"]]
    # Earlier experiments: quantile=0.01 / 0.005 (n_samples=10000) and
    # quantile=0.017 (n_samples=20000).
    bandwidth = estimate_bandwidth(X, quantile=0.014, n_samples=20000, n_jobs=-1, random_state=0)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1).fit(X)
    labels, cluster_centers = ms.labels_, ms.cluster_centers_
    second_clusters.append(labels)
    second_cluster_centers.append(cluster_centers)



In [None]:
# Global id of a second-level cluster: (first-level label + 1) * 1000 plus its
# position among the distinct labels found inside that first-level cluster.
second_clusters_indexes = np.array([
    (first_label + 1) * 1000 + second_label
    for first_label, sub_labels in enumerate(second_clusters)
    for second_label in range(len(np.unique(sub_labels)))
])


In [None]:
# Stack the per-first-level-cluster center arrays into one array.
# NOTE: this rebinds second_cluster_centers from a list of arrays to a single
# array, so re-running this cell alone would operate on the stacked array.
second_cluster_centers = np.concatenate(second_cluster_centers, axis=0)

In [None]:
# Table of second-level cluster centers with their global cluster ids.
second_clusters_df = (
    pd.DataFrame(second_cluster_centers, columns=["LATITUDE", "LONGITUDE"])
    .assign(CLUSTER=second_clusters_indexes)
    .astype({"CLUSTER": "category"})
)

In [None]:
# Display the second-level cluster centers for inspection.
second_clusters_df

In [None]:
# Write the second-level labels back onto the checklists, offset by
# (first-level label + 1) * 1000 so ids are unique across first-level clusters.
for first_label in range(len(cluster_centers_first)):
    in_first_cluster = data_df_filtered["CLUSTER"] == first_label
    offset = (first_label + 1) * 1000
    data_df_filtered.loc[in_first_cluster, "SECOND CLUSTER"] = second_clusters[first_label] + offset

In [None]:
# Integer-valued categorical version of SECOND CLUSTER.
data_df_filtered["SECOND CLUSTER cat"] = data_df_filtered["SECOND CLUSTER"].astype('int64').astype('category')

In [None]:
# Alias (not a copy) of data_df_filtered used for the second-level map.
second_map = data_df_filtered

In [None]:
# Plot a random sample (drawn with replacement) coloured by second-level cluster.
sampling_rate = 40000
second_map_sample = second_map.sample(sampling_rate, replace=True)

# India outline drawn as a thin black line layer on top of the base map.
outline_layer = {
    "source": india_geojson["geometry"].__geo_interface__,
    "type": "line",
    "color": "black",
    "line": {"width": 0.5},
}

fig = px.scatter_mapbox(
    second_map_sample,
    lat="LATITUDE",
    lon="LONGITUDE",
    color="SECOND CLUSTER cat",
    zoom=2,
)
fig.update_layout(mapbox_style="open-street-map", width=1000, height=1000)
fig.update_layout(mapbox={"layers": [outline_layer]})
fig.show()

In [None]:
# Replace the first-level CLUSTER column by the categorical second-level ids.
# All steps modify data_df_filtered in place.
data_df_filtered.drop(columns=["SECOND CLUSTER", "CLUSTER"], inplace=True)
# pop() removes the helper column and hands back its values in one step.
data_df_filtered["CLUSTER"] = data_df_filtered.pop("SECOND CLUSTER cat")

In [None]:
# Display the frame after the second-level cluster ids became CLUSTER.
data_df_filtered

In [None]:
# Regional subsets that get their own cluster ids in the following cells.
# Each subset is an explicit copy, so assigning new CLUSTER values to it later
# is a plain column assignment on an independent frame rather than a write
# into a filtered selection of data_df_filtered.
lakshadweep_df = data_df_filtered[data_df_filtered["STATE"] == "Lakshadweep"].copy()
andaman_nicobar_df = data_df_filtered[data_df_filtered["STATE"] == "Andaman and Nicobar Islands"]
# Latitude 10 separates the two island groups. NOTE: rows at exactly
# latitude 10 fall into neither subset.
andaman_df = andaman_nicobar_df[andaman_nicobar_df["LATITUDE"] > 10].copy()
nicobar_df = andaman_nicobar_df[andaman_nicobar_df["LATITUDE"] < 10].copy()
# NOTE(review): the cluster ids below are hard-coded and presumably come from
# inspecting one particular clustering run - confirm they still identify the
# intended regions whenever the clustering inputs change.
sikkim_north_bengal = data_df_filtered[data_df_filtered["CLUSTER"] == 2008].copy()
himalayas = data_df_filtered[data_df_filtered["CLUSTER"].isin([2005,2030,2002,2013,2023])].copy()

The Andaman Islands, the Nicobar Islands and Lakshadweep need to be separate clusters; Sikkim/North Bengal and the Himalayas each need to be re-clustered.

In [None]:

# All Lakshadweep checklists form the single cluster 4000. The assignment
# targets lakshadweep_df (a selection taken from data_df_filtered), not
# data_df_filtered itself.
lakshadweep_df["CLUSTER"] = 4000

In [None]:

# The center of cluster 4000 is the mean coordinate of the Lakshadweep checklists.
lat = lakshadweep_df["LATITUDE"].mean()
lon = lakshadweep_df["LONGITUDE"].mean()
# Add the center through the public pd.concat API instead of the private
# DataFrame._append.
# NOTE(review): the CLUSTER value is passed as an integer in its own column;
# confirm the resulting CLUSTER dtype is what readers of the exported
# cluster-center CSV expect.
lakshadweep_center = pd.DataFrame([{"LATITUDE": lat, "LONGITUDE": lon, "CLUSTER": 4000}])
second_clusters_df = pd.concat([second_clusters_df, lakshadweep_center], ignore_index=True)

In [None]:
# Re-cluster the Nicobar checklists on their own; their ids start at 5000.
X = nicobar_df[["LATITUDE", "LONGITUDE"]]
bandwidth = estimate_bandwidth(X, quantile=0.805, n_samples=10000, n_jobs=-1, random_state=0)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
# assign() returns a new frame, so the labels are never written into a
# filtered selection of another frame.
nicobar_df = nicobar_df.assign(CLUSTER=labels + 5000)

# Register all new centers with one pd.concat call (public API) instead of
# calling the private DataFrame._append once per center.
nicobar_centers = pd.DataFrame(cluster_centers, columns=["LATITUDE", "LONGITUDE"])
nicobar_centers["CLUSTER"] = 5000 + np.arange(len(cluster_centers))
second_clusters_df = pd.concat([second_clusters_df, nicobar_centers], ignore_index=True)

In [None]:
# Re-cluster the Andaman checklists on their own; their ids start at 6000.
X = andaman_df[["LATITUDE", "LONGITUDE"]]
bandwidth = estimate_bandwidth(X, quantile=0.5, n_samples=10000, n_jobs=-1, random_state=0)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
# assign() returns a new frame, so the labels are never written into a
# filtered selection of another frame.
andaman_df = andaman_df.assign(CLUSTER=labels + 6000)

# Register all new centers with one pd.concat call (public API) instead of
# calling the private DataFrame._append once per center.
andaman_centers = pd.DataFrame(cluster_centers, columns=["LATITUDE", "LONGITUDE"])
andaman_centers["CLUSTER"] = 6000 + np.arange(len(cluster_centers))
second_clusters_df = pd.concat([second_clusters_df, andaman_centers], ignore_index=True)

In [None]:
# Re-cluster the Himalayan checklists on their own; their ids start at 7000.
X = himalayas[["LATITUDE", "LONGITUDE"]]
bandwidth = estimate_bandwidth(X, quantile=0.03, n_samples=10000, n_jobs=-1, random_state=0)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
# assign() returns a new frame, so the labels are never written into a
# filtered selection of another frame.
himalayas = himalayas.assign(CLUSTER=labels + 7000)

# Register all new centers with one pd.concat call (public API) instead of
# calling the private DataFrame._append once per center.
himalaya_centers = pd.DataFrame(cluster_centers, columns=["LATITUDE", "LONGITUDE"])
himalaya_centers["CLUSTER"] = 7000 + np.arange(len(cluster_centers))
second_clusters_df = pd.concat([second_clusters_df, himalaya_centers], ignore_index=True)

In [None]:
# Re-cluster the Sikkim / North Bengal checklists on their own; their ids
# start at 8000.
X = sikkim_north_bengal[["LATITUDE", "LONGITUDE"]]
bandwidth = estimate_bandwidth(X, quantile=0.07, n_samples=10000, n_jobs=-1, random_state=0)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
# assign() returns a new frame, so the labels are never written into a
# filtered selection of another frame.
sikkim_north_bengal = sikkim_north_bengal.assign(CLUSTER=labels + 8000)

# Register all new centers with one pd.concat call (public API) instead of
# calling the private DataFrame._append once per center.
sikkim_centers = pd.DataFrame(cluster_centers, columns=["LATITUDE", "LONGITUDE"])
sikkim_centers["CLUSTER"] = 8000 + np.arange(len(cluster_centers))
second_clusters_df = pd.concat([second_clusters_df, sikkim_centers], ignore_index=True)

In [None]:
# All re-clustered regional checklists in one frame, with CLUSTER as a category.
regional_frames = [andaman_df, nicobar_df, lakshadweep_df, himalayas, sikkim_north_bengal]
mapping_df = pd.concat(regional_frames).astype({"CLUSTER": "category"})


In [None]:
# Remove the re-clustered rows (matched by index label) from the main frame,
# in place; they are concatenated back with their new ids further down.
data_df_filtered.drop(mapping_df.index, inplace=True)

In [None]:

# Plot a random sample (drawn with replacement) of the re-clustered regions.
sampling_rate = 20000
regional_sample = mapping_df.sample(sampling_rate, replace=True)

fig = px.scatter_mapbox(
    regional_sample,
    lat="LATITUDE",
    lon="LONGITUDE",
    color="CLUSTER",
    zoom=2,
    opacity=0.8,
)
fig.update_layout(mapbox_style="open-street-map", width=1000, height=1000)
fig.show()

In [None]:
# Rebuild CLUSTER as an integer-valued categorical from the remaining rows.
data_df_filtered["CLUSTER"] = data_df_filtered["CLUSTER"].astype('int64').astype('category')

In [None]:
# Add the re-clustered regional rows back; row index labels are kept as they are.
data_df_filtered = pd.concat([data_df_filtered, mapping_df])

In [None]:
# After the concatenation, cast again so the combined column is one
# integer-valued categorical.
data_df_filtered["CLUSTER"] = data_df_filtered["CLUSTER"].astype('int64').astype('category')

In [None]:
# Plot a random sample (drawn with replacement) coloured by the final cluster ids.
sampling_rate = 20000
final_sample = data_df_filtered.sample(sampling_rate, replace=True)

# India outline drawn as a thin black line layer on top of the base map.
outline_layer = {
    "source": india_geojson["geometry"].__geo_interface__,
    "type": "line",
    "color": "black",
    "line": {"width": 0.5},
}

fig = px.scatter_mapbox(final_sample, lat="LATITUDE", lon="LONGITUDE", color="CLUSTER", zoom=2)
fig.update_layout(mapbox_style="open-street-map", width=1000, height=1000)
fig.update_layout(mapbox={"layers": [outline_layer]})
fig.show()

In [None]:
# Most frequent cluster per observer. pd.Series.mode can yield more than one
# value when several clusters tie for the highest count.
largest_cluster_df = data_df_filtered.groupby("OBSERVER ID").agg({"CLUSTER": pd.Series.mode}).reset_index()
# Keep only rows whose aggregated CLUSTER value reports an int64 dtype.
# NOTE(review): presumably meant to discard tied (multi-valued) modes -
# confirm that tied results actually fail this dtype check.
largest_cluster_df = largest_cluster_df[largest_cluster_df.apply(lambda x: x["CLUSTER"].dtype == "int64", axis=1)]
largest_cluster_df = largest_cluster_df.rename(columns={"CLUSTER": "LARGEST CLUSTER"})
# Default (inner) merge: observers removed by the filter above no longer
# appear in data_df_filtered afterwards.
data_df_filtered = data_df_filtered.merge(largest_cluster_df, on="OBSERVER ID")
# LOCAL is True when a checklist lies in its observer's most frequent cluster.
data_df_filtered["LOCAL"] = data_df_filtered["LARGEST CLUSTER"] == data_df_filtered["CLUSTER"]

# GRAPHING

In [None]:
# Distinct observer ids, wrapped in a Series.
observers = pd.Series(data_df_filtered["OBSERVER ID"].unique())

In [None]:
# For every checklist, the cluster of the same observer's next checklist in
# DATE TIME order; an observer's last checklist gets a missing value.
data_df_filtered["NEXT CLUSTER"] = data_df_filtered.sort_values("DATE TIME").groupby("OBSERVER ID")["CLUSTER"].shift(-1)

In [None]:
# Display the frame including the NEXT CLUSTER column.
data_df_filtered

In [None]:
import networkx as nx

# One empty directed graph per observer, keyed by observer id. Built with a
# dict comprehension rather than Series.apply, which was only used for its
# side effect and left a Series of None as the cell output.
observer_graphs = {observer: nx.DiGraph() for observer in observers}

In [None]:
def PutInGraph(row):
    """Record one CLUSTER -> NEXT CLUSTER transition in the observer's graph.

    The graph is looked up in the module-level `observer_graphs` by the row's
    "OBSERVER ID". The edge's `weight` counts how often the transition has
    been recorded so far; `normal_weight` is that count divided by the row's
    "NUMLIST", rounded to 4 decimals.
    """
    graph = observer_graphs[row["OBSERVER ID"]]
    source, target = row['CLUSTER'], row['NEXT CLUSTER']
    checklist_total = row['NUMLIST']

    # First occurrence starts at 1; repeats bump the stored count.
    if graph.has_edge(source, target):
        count = graph.get_edge_data(source, target)['weight'] + 1
    else:
        count = 1
    graph.add_edge(source, target, weight=count, normal_weight=round(count / checklist_total, 4))



In [None]:

# Feed every row without missing values into its observer's graph; rows with
# a missing NEXT CLUSTER are among those removed by dropna().
# NOTE: PutInGraph increments the weight of existing edges, so running this
# cell twice counts every transition twice.
data_df_filtered.dropna().apply(PutInGraph, axis=1)


In [None]:
from networkx.readwrite import json_graph
# Replace each graph by the output of json_graph.adjacency_data so the mapping
# can be passed to json.dump later.
# NOTE: this rebinds observer_graphs; after this cell its values are no longer
# graph objects.
observer_graphs = {k: json_graph.adjacency_data(v) for k, v in observer_graphs.items()}

# GRAPHING END

In [None]:
# Number of checklists per (observer, cluster) pair, ordered by observer id
# descending; pairs with a zero count are discarded.
pair_counts = data_df_filtered.groupby(["OBSERVER ID", "CLUSTER"]).size().rename("COUNT").reset_index()
pair_counts = pair_counts.sort_values(by="OBSERVER ID", ascending=False)
observer_cluster_count = pair_counts.loc[pair_counts["COUNT"] > 0]

In [None]:
# Per observer: the list of clusters, the mean NUMLIST and the list of dates.
# The rows are sorted by OBSERVATION DATE first because positions in the
# CLUSTER list are later matched against CHECKLIST NUM, which numbers each
# observer's checklists in OBSERVATION DATE order. Without the sort the lists
# follow the frame's row order, which is not chronological after the regional
# rows were removed and concatenated back at the end of the frame.
observer_sequences = (
    data_df_filtered
    .sort_values("OBSERVATION DATE")
    .groupby("OBSERVER ID")
    .agg({"CLUSTER": lambda x: list(x), "NUMLIST": "mean", "OBSERVATION DATE": lambda x: list(x)})
    .reset_index()
)

In [None]:
# Zero-based position of each checklist within its observer's history,
# counted in OBSERVATION DATE order.
data_df_filtered["CHECKLIST NUM"] = data_df_filtered.sort_values("OBSERVATION DATE").groupby("OBSERVER ID").cumcount()

In [None]:
# Filled by create_local: one {"OBSERVER ID", "LOCAL"} entry per observer.
local_list = []
def create_local(row):
    """Split an observer's cluster sequence into windows and record each window's mode.

    The window size is a third of the row's "NUMLIST" (rounded, at least 1).
    Each window yields a dict with the most frequent cluster ("CLUSTER") and
    the half-open position range it covers ("FROM", "TO"). The last window
    absorbs any leftover positions, so its "TO" is the sequence length.

    Args:
        row: mapping with "OBSERVER ID", "CLUSTER" (sequence of cluster ids)
            and "NUMLIST".

    Returns:
        None. The result is appended to the module-level `local_list` as
        {"OBSERVER ID": ..., "LOCAL": [window dicts]}.
    """
    observer_id = row["OBSERVER ID"]
    clusters = row["CLUSTER"]
    total = len(clusters)
    # Rounding can give 0 for very small NUMLIST values, and range() rejects a
    # zero step, so never go below one element per window.
    window_size = max(1, int(round(row["NUMLIST"]/3, 0)))

    local = []
    for start in range(0, total, window_size):
        if start + window_size > total:
            # Not enough items left for a full window; the previous window
            # already extended to the end of the sequence.
            break
        # When no further full window fits after this one, stretch it to the end.
        is_last_window = start + 2*window_size > total
        stop = total if is_last_window else start + window_size
        vals, counts = np.unique(clusters[start:stop], return_counts=True)
        local.append({"CLUSTER": vals[np.argmax(counts)], "FROM": start, "TO": stop})
    local_list.append({"OBSERVER ID": observer_id, "LOCAL": local})

In [None]:
# create_local appends one entry per observer to local_list as a side effect.
# NOTE: local_list is not cleared here, so re-running this cell without
# re-running the cell that defines local_list appends duplicate entries.
observer_sequences.apply(create_local, axis=1)
local_list_df = pd.DataFrame(local_list)


In [None]:
# One row per (observer, window) instead of one row per observer.
local_list_df_expanded = local_list_df.explode("LOCAL")

In [None]:
# Spread the fields of each window dict into columns of their own.
for field in ("FROM", "TO", "CLUSTER"):
    local_list_df_expanded[field] = local_list_df_expanded["LOCAL"].apply(lambda window, key=field: window[key])

In [None]:
# The window dicts are no longer needed once their fields are columns (in-place drop).
local_list_df_expanded.drop("LOCAL", axis=1, inplace=True)

In [None]:
# Display the checklist frame for inspection.
data_df_filtered

In [None]:
# Display the per-window table for inspection.
local_list_df_expanded

In [None]:
#TODO add local clusters to data_df_filtered

# Attach each observer's list of windows. Both frames have a LOCAL column, so
# the merge suffixes them: LOCAL_x is the earlier boolean column and LOCAL_y
# is the list of window dicts.
data_df_filtered = data_df_filtered.merge(local_list_df, on="OBSERVER ID")

In [None]:
# Display the frame after the per-observer windows were merged in.
data_df_filtered

In [None]:
# Repeat every checklist once per window of its observer, then spread the
# window fields into columns (the window's CLUSTER becomes LOCAL CLUSTER).
data_df_filtered_exploded = data_df_filtered.explode("LOCAL_y")
for column, field in (("FROM", "FROM"), ("TO", "TO"), ("LOCAL CLUSTER", "CLUSTER")):
    data_df_filtered_exploded[column] = data_df_filtered_exploded["LOCAL_y"].apply(lambda window, key=field: window[key])

In [None]:
# Keep, for every checklist, only the window whose half-open range
# [FROM, TO) contains its CHECKLIST NUM.
checklist_num = data_df_filtered_exploded["CHECKLIST NUM"]
inside_window = (checklist_num >= data_df_filtered_exploded["FROM"]) & (checklist_num < data_df_filtered_exploded["TO"])
data_df_filtered_exploded_filtered = data_df_filtered_exploded[inside_window]


In [None]:
# Drop the window helper columns and give the earlier whole-history flag
# (LOCAL_x) an explicit name.
data_df_filtered_exploded_filtered = (
    data_df_filtered_exploded_filtered
    .drop(columns=["LOCAL_y", "FROM", "TO"])
    .rename(columns={"LOCAL_x": "NAIVE LOCAL"})
)

In [None]:
# Windowed locality flag: True when the checklist's cluster equals the most
# frequent cluster of the window it falls into.
data_df_filtered_exploded_filtered["LOCAL"] = data_df_filtered_exploded_filtered["CLUSTER"] == data_df_filtered_exploded_filtered["LOCAL CLUSTER"]

In [None]:
# Display the frame with both locality flags for inspection.
data_df_filtered_exploded_filtered

In [None]:
# Polygons used below to flag checklists as urban.
# NOTE(review): presumably these are urban-area outlines - confirm the source.
urban_geojson = gpd.read_file("urban_shape.geojson")

In [None]:

# Turn every checklist into a point geometry. The coordinates are
# longitude/latitude pairs, so the CRS is declared as EPSG:4326 instead of
# being left undefined.
# NOTE(review): assumes the eBird coordinates are WGS84 - confirm.
points = gpd.GeoDataFrame(
    data_df_filtered_exploded_filtered,
    geometry=gpd.points_from_xy(data_df_filtered_exploded_filtered.LONGITUDE, data_df_filtered_exploded_filtered.LATITUDE),
    crs="EPSG:4326",
)

# A spatial join compares raw coordinates, so bring the polygons into the
# points' CRS when they declare a different one.
urban_for_join = urban_geojson
if urban_for_join.crs is not None and urban_for_join.crs != points.crs:
    urban_for_join = urban_for_join.to_crs(points.crs)

# Left join: every checklist is kept, with polygon attributes where it matches.
points2 = gpd.sjoin(points, urban_for_join, how="left")

In [None]:
# A checklist counts as urban when the joined "id" column is populated.
# NOTE(review): presumably "id" comes from urban_geojson - confirm.
points2["IS URBAN"] = points2["id"].notna()

In [None]:
# Remove the geometry and the join bookkeeping columns before exporting.
data_df_filtered_exploded_filtered_urban = points2.drop(["geometry", "index_right","id"], axis=1)

In [None]:
# Export the annotated checklists (without the index column).
data_df_filtered_exploded_filtered_urban.to_csv("data_df_filtered_triple_multi.csv", index=False)

In [None]:
# Export the cluster centers (without the index column).
second_clusters_df.to_csv("cluster_centers_triple_multi_df.csv", index=False)

In [None]:
import json

# Write the per-observer graph data to disk. The context manager closes the
# file, so the output is flushed even if json.dump raises.
with open("observer_graphs_triple.json", "w") as graph_file:
    json.dump(observer_graphs, graph_file)