# Loading data

In [None]:
import os
import warnings

import dask.dataframe as dd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from scipy import stats

warnings.filterwarnings('ignore')

In [None]:
# Read the Mapbox access token from the environment instead of committing it
# to the notebook. `token` is None when MAPBOX_ACCESS_TOKEN is unset; later
# cells that pass it to plotly then simply leave the access token unset.
# NOTE(review): the token that used to be hard-coded here has been exposed in
# version control and should be revoked/rotated.
token = os.environ.get("MAPBOX_ACCESS_TOKEN")
if token:
    px.set_mapbox_access_token(token)

In [None]:
# Boundary geometries used as an outline layer on the maps below,
# reprojected to WGS84 right after loading.
india_geojson = gpd.read_file("india-composite.geojson").to_crs("WGS84")

In [None]:
# Number of rows drawn with DataFrame.sample() for the scatter maps below.
sampling_rate = 10000

In [None]:
# Input tables, read from the working directory:
#   data_df            - one row per observation (cluster labels, LOCAL flags, dates, ...)
#   cluster_centers_df - merged on "CLUSTER" below to attach coordinates to cluster ids
data_df = pd.read_csv('data_df_filtered_triple_multi.csv')
cluster_centers_df = pd.read_csv("cluster_centers_triple_multi_df.csv")

In [None]:
# Cluster labels are identifiers, not quantities: store them as categoricals.
# (The original cast "LOCAL CLUSTER" twice; once is enough.)
for cluster_col in ("CLUSTER", "LOCAL CLUSTER", "NEXT CLUSTER"):
    data_df[cluster_col] = data_df[cluster_col].astype("category")

# Derive the year through the datetime parser rather than a per-row string
# split: it is vectorised, and the cell still works when re-run after
# "OBSERVATION DATE" has been converted to datetime further down.
data_df["YEAR"] = pd.to_datetime(data_df["OBSERVATION DATE"]).dt.year

In [None]:
# Optional cut-off: when TOFILTER is True, keep only observations whose YEAR
# is less than or equal to FILTER.
TOFILTER = False
FILTER = 2019

if TOFILTER:
    data_df = data_df.loc[data_df["YEAR"].le(FILTER)]

In [None]:
import json
# Load the serialized graphs; each entry is turned into a networkx graph in
# the next cell.
with open('observer_graphs_triple.json') as f:
    graphs_json = json.load(f)

In [None]:
from networkx.readwrite import json_graph
import networkx as nx

# Rebuild one networkx graph per key from its adjacency-format JSON payload.
graphs = {
    key: json_graph.adjacency_graph(graphs_json[key])
    for key in graphs_json
}

# Preliminary Analysis

In [None]:
# fig = px.scatter_mapbox(data_df.sample(10000), lat="LATITUDE", lon="LONGITUDE", color="CLUSTER")
# fig.update_layout(mapbox_style="open-street-map")
# fig.show()

In [None]:
data_df["LOCAL"].describe()

In [None]:
data_df["LOCAL"].value_counts()

In [None]:
# Histogram of the mean LOCAL flag per observer.
observer_localness = (
    data_df.groupby("OBSERVER ID")
    .agg({"LOCAL": "mean"})
    .sort_values("LOCAL", ascending=False)
)
observer_localness.plot(kind="hist", bins=20, xlabel="Localness", ylabel="Number of Observers")

In [None]:
# Same histogram as above, built from the "NAIVE LOCAL" flag.
observer_naive_localness = (
    data_df.groupby("OBSERVER ID")
    .agg({"NAIVE LOCAL": "mean"})
    .sort_values("NAIVE LOCAL", ascending=False)
)
observer_naive_localness.plot(kind="hist", bins=20, xlabel="Localness", ylabel="Number of Observers")

In [None]:
# Per-observer mean LOCAL flag, restricted to observations after 2020.
recent_obs = data_df[data_df["YEAR"] > 2020]
recent_observer_localness = (
    recent_obs.groupby("OBSERVER ID")
    .agg({"LOCAL": "mean"})
    .sort_values("LOCAL", ascending=False)
)
recent_observer_localness.plot(kind="hist", bins=20, xlabel="Localness", ylabel="Number of Observers")

In [None]:

# Mean LOCAL flag per state, as a horizontal bar chart on a fixed 0-1 axis.
state_localness = (
    data_df.groupby("STATE")
    .agg({"LOCAL": "mean"})
    .sort_values("LOCAL", ascending=False)
)
state_localness.plot(kind="barh", figsize=(7,7), ylabel="", xlim=(0, 1))

In [None]:
# data_df[data_df["STATE"] == "Dadra and Nagar Haveli"].groupby("OBSERVER ID").size().sort_values(ascending=False).plot(kind="bar", figsize=(7,7), ylabel="Number of Observations")

In [None]:

# Mean "NAIVE LOCAL" flag per state, as a horizontal bar chart on a fixed 0-1 axis.
state_naive_localness = (
    data_df.groupby("STATE")
    .agg({"NAIVE LOCAL": "mean"})
    .sort_values("NAIVE LOCAL", ascending=False)
)
state_naive_localness.plot(kind="barh", figsize=(7,7), ylabel="", xlim=(0, 1))

In [None]:
#plot localness vs year from observation date
# Parse the observation dates once, then derive the calendar fields from them.
observation_dates = pd.to_datetime(data_df["OBSERVATION DATE"])
data_df["OBSERVATION DATE"] = observation_dates
data_df["YEAR"] = observation_dates.dt.year
data_df["MONTH"] = observation_dates.dt.month

# Mean LOCAL flag per year, for observations after 2000.
yearly_localness = data_df[data_df["YEAR"]>2000].groupby("YEAR").agg({"LOCAL": "mean"})
yearly_localness.plot(kind="line", ylabel="Localness", xlabel="Year", title="Localness vs Year")


In [None]:
# Mean LOCAL flag per calendar month, for observations after 2000.
monthly_localness = data_df[data_df["YEAR"]>2000].groupby("MONTH").agg({"LOCAL": "mean"})
monthly_localness.plot(kind="line", ylabel="Localness", xlabel="Month", title="Localness vs MONTH", xticks=range(1, 13))



In [None]:
# Number of observations per year, for observations after 2010.
yearly_counts = data_df[data_df["YEAR"]>2010].groupby("YEAR").agg({"OBSERVATION DATE": "count"})
yearly_counts.plot(kind="line", ylabel="COUNT", xlabel="Year", title="COUNT vs Year")

# TOP TOURIST DESTINATIONS

In [None]:
# Count the non-local (LOCAL == False) observations per cluster, largest first.
non_local_obs = data_df[data_df["LOCAL"]==False]
top_tourist = (
    non_local_obs.groupby("CLUSTER")
    .size()
    .reset_index(name="COUNT")
    .sort_values("COUNT", ascending=False)
)

In [None]:
# Total observations per cluster, used as the denominator below.
total_cluster_obs = (
    data_df.groupby("CLUSTER")
    .size()
    .reset_index(name="COUNT")
    .sort_values("COUNT", ascending=False)
)
# After the merge COUNT_x is the non-local count and COUNT_y the overall count.
top_tourist_rel = top_tourist.merge(total_cluster_obs, on="CLUSTER")
top_tourist_rel["REL TOURISM"] = top_tourist_rel["COUNT_x"].div(top_tourist_rel["COUNT_y"])
top_tourist_rel = top_tourist_rel.sort_values("REL TOURISM", ascending=False)

In [None]:
# Map a random sample of non-local observations from the 20 clusters with the
# most non-local observations.
top_tourist_clusters = top_tourist["CLUSTER"].head(20)
# One combined mask: chaining data_df[a][b] makes pandas re-align the second
# mask against the already-filtered frame.
tourist_points = data_df[data_df["CLUSTER"].isin(top_tourist_clusters) & (data_df["LOCAL"]==False)]
# Cap the sample size so .sample() cannot raise when fewer than sampling_rate
# rows match.
tourist_sample = tourist_points.sample(min(sampling_rate, len(tourist_points)))

fig = px.scatter_mapbox(tourist_sample, lat="LATITUDE", lon="LONGITUDE", zoom=2, color="CLUSTER", title="Absolute top tourist clusters")
fig.update_layout(mapbox_style="open-street-map", width=1000, height=1000, mapbox_accesstoken=token)
# Overlay the india_geojson geometries as thin black outlines.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:
top_tourist_rel[top_tourist_rel["COUNT_x"]>2000]

In [None]:
# Map non-local observations from the 20 clusters with the highest share of
# non-local observations, among clusters with more than 2000 of them.
busy_tourist_clusters = top_tourist_rel[top_tourist_rel["COUNT_x"]>2000]["CLUSTER"].head(20)
tourist_points = data_df[data_df["CLUSTER"].isin(busy_tourist_clusters) & (data_df["LOCAL"]==False)]
# Sampling with replacement always returns sampling_rate rows.
tourist_sample = tourist_points.sample(sampling_rate, replace=True)

fig = px.scatter_mapbox(tourist_sample, lat="LATITUDE", lon="LONGITUDE", zoom=2, color="CLUSTER", title="Relative top tourist clusters")
fig.update_layout(mapbox_style="open-street-map", width=1000, height=1000)
# Overlay the india_geojson geometries as thin black outlines.
boundary_layer = {
    "source": india_geojson["geometry"].__geo_interface__,
    "type": "line",
    "color": "black",
    "line": {"width": 0.5},
}
fig.update_layout(mapbox={"layers": [boundary_layer]})
fig.show()

# Top Local Destinations

In [None]:
# Count the local (LOCAL == True) observations per cluster, largest first.
local_obs = data_df[data_df["LOCAL"]==True]
top_local = (
    local_obs.groupby("CLUSTER")
    .size()
    .reset_index(name="COUNT")
    .sort_values("COUNT", ascending=False)
)
top_local

In [None]:
# Total observations per cluster, used as the denominator below.
total_cluster_obs = (
    data_df.groupby("CLUSTER")
    .size()
    .reset_index(name="COUNT")
    .sort_values("COUNT", ascending=False)
)
# After the merge COUNT_x is the local count and COUNT_y the overall count.
top_local_rel = top_local.merge(total_cluster_obs, on="CLUSTER")
top_local_rel["REL LOCAL"] = top_local_rel["COUNT_x"].div(top_local_rel["COUNT_y"])
top_local_rel = top_local_rel.sort_values("REL LOCAL", ascending=False)

In [None]:
top_local_rel.head(20)

In [None]:
# Map a random sample of local observations from the 20 clusters with the most
# local observations.
top_local_clusters = top_local.head(20)["CLUSTER"]
# One combined mask: chaining data_df[a][b] makes pandas re-align the second
# mask against the already-filtered frame.
local_points = data_df[data_df["CLUSTER"].isin(top_local_clusters) & (data_df["LOCAL"]==True)]
# Cap the sample size so .sample() cannot raise when fewer than sampling_rate
# rows match.
local_sample = local_points.sample(min(sampling_rate, len(local_points)))

fig = px.scatter_mapbox(local_sample, lat="LATITUDE", lon="LONGITUDE", zoom=2, color="CLUSTER", title="Absolute top local clusters")
fig.update_layout(mapbox_style="open-street-map", width=1000, height=1000)
# Overlay the india_geojson geometries as thin black outlines.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:
top_local_rel

In [None]:
# Map local observations from the 20 clusters with the highest share of local
# observations; this map uses its own, larger sample size.
temp_sampling = 50000
most_local_clusters = top_local_rel["CLUSTER"].head(20)
local_points = data_df[data_df["CLUSTER"].isin(most_local_clusters) & (data_df["LOCAL"]==True)]
# Sampling with replacement always returns temp_sampling rows.
local_sample = local_points.sample(temp_sampling, replace=True)

fig = px.scatter_mapbox(local_sample, lat="LATITUDE", lon="LONGITUDE", zoom=2, color="CLUSTER", title="Relative top local clusters")
fig.update_layout(mapbox_style="open-street-map", width=1000, height=1000)
# Overlay the india_geojson geometries as thin black outlines.
boundary_layer = {
    "source": india_geojson["geometry"].__geo_interface__,
    "type": "line",
    "color": "black",
    "line": {"width": 0.5},
}
fig.update_layout(mapbox={"layers": [boundary_layer]})
fig.show()

# LOCAL TOURIST PAIRS

In [None]:
# For non-local observations, count how often each (CLUSTER, LOCAL CLUSTER)
# combination occurs, largest first.
non_local_obs = data_df[data_df["LOCAL"]==False]
local_tourist_pairs = (
    non_local_obs.groupby(["CLUSTER", "LOCAL CLUSTER"])
    .size()
    .reset_index(name="COUNT")
    .sort_values("COUNT", ascending=False)
)
local_tourist_pairs

In [None]:
# REL_WRT_S, REL_WRT_LOC, REL_WRT_TOUR, REL_WRT_TOUR_LOC
# Each ratio divides a pair's COUNT by a different total. The ratio columns are
# appended in this exact order because a later cell renames columns by position.

# REL_WRT_S: denominator = summed pair COUNT of the pair's LOCAL CLUSTER.
S = (
    local_tourist_pairs.groupby("LOCAL CLUSTER")
    .agg({"COUNT": "sum"})
    .sort_values("COUNT", ascending=False)
    .reset_index()
    .rename(columns={"COUNT": "COUNT_S"})
)
local_tourist_pairs = local_tourist_pairs.merge(S, on="LOCAL CLUSTER", how="left")
local_tourist_pairs["REL_WRT_S"] = local_tourist_pairs["COUNT"] / local_tourist_pairs["COUNT_S"]
local_tourist_pairs = local_tourist_pairs.drop(columns=["COUNT_S"])

# REL_WRT_LOC: denominator = number of LOCAL == True observations in CLUSTER.
# The +0.01 keeps the denominator non-zero for clusters without local observations.
LOC = (
    data_df[data_df["LOCAL"]==True]
    .groupby("CLUSTER")
    .agg({"LOCAL": "sum"})
    .sort_values("LOCAL", ascending=False)
    .reset_index()
    .rename(columns={"LOCAL": "COUNT_LOC"})
)
local_tourist_pairs = local_tourist_pairs.merge(LOC, on="CLUSTER", how="left")
local_tourist_pairs["REL_WRT_LOC"] = local_tourist_pairs["COUNT"] / (local_tourist_pairs["COUNT_LOC"]+0.01)
local_tourist_pairs = local_tourist_pairs.drop(columns=["COUNT_LOC"])

# REL_WRT_TOUR: denominator = summed pair COUNT of the destination CLUSTER.
TOUR = (
    local_tourist_pairs.groupby("CLUSTER")
    .agg({"COUNT": "sum"})
    .sort_values("COUNT", ascending=False)
    .reset_index()
    .rename(columns={"COUNT": "COUNT_TOUR"})
)
local_tourist_pairs = local_tourist_pairs.merge(TOUR, on="CLUSTER", how="left")
local_tourist_pairs["REL_WRT_TOUR"] = local_tourist_pairs["COUNT"] / local_tourist_pairs["COUNT_TOUR"]
local_tourist_pairs = local_tourist_pairs.drop(columns=["COUNT_TOUR"])

# REL_WRT_TOUR_LOC: denominator = number of non-null LOCAL values in CLUSTER.
TOUR_LOC = (
    data_df.groupby("CLUSTER")
    .agg({"LOCAL": "count"})
    .sort_values("LOCAL", ascending=False)
    .reset_index()
    .rename(columns={"LOCAL": "COUNT_TOUR_LOC"})
)
local_tourist_pairs = local_tourist_pairs.merge(TOUR_LOC, on="CLUSTER", how="left")
local_tourist_pairs["REL_WRT_TOUR_LOC"] = local_tourist_pairs["COUNT"] / (local_tourist_pairs["COUNT_TOUR_LOC"]+0.01)
local_tourist_pairs = local_tourist_pairs.drop(columns=["COUNT_TOUR_LOC"])

# Keep only pairs with more than 20 observations.
local_tourist_pairs = local_tourist_pairs[local_tourist_pairs["COUNT"]>20]

In [None]:
local_tourist_pairs.sort_values("REL_WRT_S", ascending=False)

In [None]:
# Tighten the threshold: keep only pairs with more than 500 observations.
local_tourist_pairs = local_tourist_pairs.loc[local_tourist_pairs["COUNT"].gt(500)]

In [None]:
# Start the route table from the filtered pairs (same object, not a copy);
# the merges in the following cells rebind route_mapping_df to new frames.
route_mapping_df = local_tourist_pairs

In [None]:
cluster_centers_df

In [None]:
# Attach cluster-centre columns twice: first for the destination CLUSTER, then
# for the LOCAL CLUSTER (pandas adds _x/_y suffixes to the clashing names).
route_mapping_df = pd.merge(route_mapping_df, cluster_centers_df, on="CLUSTER", how="left")
route_mapping_df = pd.merge(
    route_mapping_df,
    cluster_centers_df,
    left_on="LOCAL CLUSTER",
    right_on="CLUSTER",
    how="left",
)

In [None]:
route_mapping_df

In [None]:
# Drop the duplicate key from the second merge, then rename all columns by
# position: *_D are the CLUSTER (destination) coordinates, *_S the
# LOCAL CLUSTER (source) coordinates.
route_mapping_df = route_mapping_df.drop(columns=["CLUSTER_y"])
route_mapping_df.columns = [
    "CLUSTER", "LOCAL CLUSTER", "COUNT",
    "REL_WRT_S", "REL_WRT_LOC", "REL_WRT_TOUR", "REL_WRT_TOUR_LOC",
    "LATITUDE_D", "LONGITUDE_D", "LATITUDE_S", "LONGITUDE_S",
]

In [None]:
route_mapping_df

In [None]:
# Store both cluster id columns as categoricals.
for cluster_col in ("CLUSTER", "LOCAL CLUSTER"):
    route_mapping_df[cluster_col] = route_mapping_df[cluster_col].astype('category')

In [None]:
from sklearn.metrics.pairwise import haversine_distances
from math import radians

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points on the unit sphere.

    Args:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.

    Returns:
        The central angle between the points in radians (a float). Multiply by
        a sphere radius to get a length. NaN inputs propagate to a NaN result.

    The haversine formula is evaluated directly with the standard library; the
    previous version built a 2x2 pairwise matrix through scikit-learn on every
    call just to read one off-diagonal entry.
    """
    from math import asin, cos, radians, sin, sqrt

    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    half_chord_sq = (
        sin((lat2 - lat1) / 2) ** 2
        + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    )
    # Rounding can push the value marginally above 1 for antipodal points;
    # the argument order keeps NaN as NaN instead of clamping it to 1.
    return 2 * asin(sqrt(min(half_chord_sq, 1.0)))

def _route_arc(route):
    # Angular distance between a route's destination and source centres.
    return haversine(route["LATITUDE_D"], route["LONGITUDE_D"], route["LATITUDE_S"], route["LONGITUDE_S"])

route_mapping_df["DISTANCE"] = route_mapping_df.apply(_route_arc, axis=1)

# Scale the unit-sphere distance by 6371 (presumably the Earth's mean radius in km).
route_mapping_df["DISTANCE"] = route_mapping_df["DISTANCE"] * 6371

In [None]:
route_mapping_df.describe()

## DISTANCE FILTER

In [None]:
# Keep only routes whose DISTANCE exceeds 500 (same unit as the DISTANCE column).
route_mapping_df_filtered = route_mapping_df.loc[route_mapping_df["DISTANCE"].gt(500)]

In [None]:
# Summarise the distance-filtered routes. The unfiltered frame is already
# described in the cell before the filter, so repeating it here showed nothing new.
route_mapping_df_filtered.describe()

In [None]:

TOP_NUM = 10

fig = go.Figure()
ROUTE_MAPPING_VAR = "COUNT"
temp_route_mapping_df = route_mapping_df.sort_values(ROUTE_MAPPING_VAR, ascending=False)

# Arrowhead geometry, hoisted out of the loop because it never changes: two
# strokes of length c (in degrees) rotated +/- theta from the route direction.
theta = np.radians(30)
c = 0.3

# head(TOP_NUM) yields at most TOP_NUM rows, so a frame with fewer routes no
# longer raises IndexError the way range(TOP_NUM) + iloc[i] did.
for _, route in temp_route_mapping_df.head(TOP_NUM).iterrows():
    # Label-based access: integer keys on a label-indexed Series (dest[0],
    # dest[1]) are a deprecated positional fallback in recent pandas.
    dest_lat, dest_lon = route["LATITUDE_D"], route["LONGITUDE_D"]
    source_lat, source_lon = route["LATITUDE_S"], route["LONGITUDE_S"]

    # Direction of travel in the flat (latitude, longitude) plane.
    phi = np.arctan2(dest_lon - source_lon, dest_lat - source_lat)

    templat1 = dest_lat - c*np.cos(phi - theta)
    templong1 = dest_lon - c*np.sin(phi - theta)
    templat2 = dest_lat - c*np.cos(phi + theta)
    templong2 = dest_lon - c*np.sin(phi + theta)

    # Blue marker: LOCAL CLUSTER centre (source); green marker: CLUSTER centre (destination).
    fig.add_trace(go.Scattermapbox(lat=[source_lat], lon=[source_lon], mode = "markers", marker=go.scattermapbox.Marker(size=20, color="blue", opacity=0.7)))
    fig.add_trace(go.Scattermapbox(lat=[dest_lat], lon=[dest_lon], mode = "markers", marker=go.scattermapbox.Marker(size=15, color="green")))
    fig.add_trace(go.Scattermapbox(lat=[source_lat, dest_lat], lon=[source_lon, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

    # The two arrowhead strokes, both ending at the destination.
    fig.add_trace(go.Scattermapbox(lat=[templat1, dest_lat], lon=[templong1, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat2, dest_lat], lon=[templong2, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

# The title follows TOP_NUM instead of a hard-coded "10".
fig.update_layout(mapbox_style="open-street-map", title = f"Top {TOP_NUM} routes ({ROUTE_MAPPING_VAR})", width=1000, height=1000, showlegend=False)
# Overlay the india_geojson geometries as thin black outlines.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:

TOP_NUM = 10

fig = go.Figure()
ROUTE_MAPPING_VAR = "COUNT"
temp_route_mapping_df = route_mapping_df_filtered.sort_values(ROUTE_MAPPING_VAR, ascending=False)

# Arrowhead geometry, hoisted out of the loop because it never changes: two
# strokes of length c (in degrees) rotated +/- theta from the route direction.
theta = np.radians(30)
c = 0.3

# head(TOP_NUM) yields at most TOP_NUM rows, so a filtered frame with fewer
# routes no longer raises IndexError the way range(TOP_NUM) + iloc[i] did.
for _, route in temp_route_mapping_df.head(TOP_NUM).iterrows():
    # Label-based access: integer keys on a label-indexed Series (dest[0],
    # dest[1]) are a deprecated positional fallback in recent pandas.
    dest_lat, dest_lon = route["LATITUDE_D"], route["LONGITUDE_D"]
    source_lat, source_lon = route["LATITUDE_S"], route["LONGITUDE_S"]

    # Direction of travel in the flat (latitude, longitude) plane.
    phi = np.arctan2(dest_lon - source_lon, dest_lat - source_lat)

    templat1 = dest_lat - c*np.cos(phi - theta)
    templong1 = dest_lon - c*np.sin(phi - theta)
    templat2 = dest_lat - c*np.cos(phi + theta)
    templong2 = dest_lon - c*np.sin(phi + theta)

    # Blue marker: LOCAL CLUSTER centre (source); green marker: CLUSTER centre (destination).
    fig.add_trace(go.Scattermapbox(lat=[source_lat], lon=[source_lon], mode = "markers", marker=go.scattermapbox.Marker(size=20, color="blue", opacity=0.7)))
    fig.add_trace(go.Scattermapbox(lat=[dest_lat], lon=[dest_lon], mode = "markers", marker=go.scattermapbox.Marker(size=15, color="green")))
    fig.add_trace(go.Scattermapbox(lat=[source_lat, dest_lat], lon=[source_lon, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

    # The two arrowhead strokes, both ending at the destination.
    fig.add_trace(go.Scattermapbox(lat=[templat1, dest_lat], lon=[templong1, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat2, dest_lat], lon=[templong2, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

# The title follows TOP_NUM instead of a hard-coded "10".
fig.update_layout(mapbox_style="open-street-map", title = f"Top {TOP_NUM} routes ({ROUTE_MAPPING_VAR}) filtered", width=1000, height=1000, showlegend=False)
# Overlay the india_geojson geometries as thin black outlines.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:

TOP_NUM = 10

fig = go.Figure()
ROUTE_MAPPING_VAR = "REL_WRT_S"
temp_route_mapping_df = route_mapping_df.sort_values(ROUTE_MAPPING_VAR, ascending=False)

# Arrowhead geometry, hoisted out of the loop because it never changes: two
# strokes of length c (in degrees) rotated +/- theta from the route direction.
theta = np.radians(30)
c = 0.3

# head(TOP_NUM) yields at most TOP_NUM rows, so a frame with fewer routes no
# longer raises IndexError the way range(TOP_NUM) + iloc[i] did.
for _, route in temp_route_mapping_df.head(TOP_NUM).iterrows():
    # Label-based access: integer keys on a label-indexed Series (dest[0],
    # dest[1]) are a deprecated positional fallback in recent pandas.
    dest_lat, dest_lon = route["LATITUDE_D"], route["LONGITUDE_D"]
    source_lat, source_lon = route["LATITUDE_S"], route["LONGITUDE_S"]

    # Direction of travel in the flat (latitude, longitude) plane.
    phi = np.arctan2(dest_lon - source_lon, dest_lat - source_lat)

    templat1 = dest_lat - c*np.cos(phi - theta)
    templong1 = dest_lon - c*np.sin(phi - theta)
    templat2 = dest_lat - c*np.cos(phi + theta)
    templong2 = dest_lon - c*np.sin(phi + theta)

    # Blue marker: LOCAL CLUSTER centre (source); green marker: CLUSTER centre (destination).
    fig.add_trace(go.Scattermapbox(lat=[source_lat], lon=[source_lon], mode = "markers", marker=go.scattermapbox.Marker(size=20, color="blue", opacity=0.7)))
    fig.add_trace(go.Scattermapbox(lat=[dest_lat], lon=[dest_lon], mode = "markers", marker=go.scattermapbox.Marker(size=15, color="green")))
    fig.add_trace(go.Scattermapbox(lat=[source_lat, dest_lat], lon=[source_lon, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

    # The two arrowhead strokes, both ending at the destination.
    fig.add_trace(go.Scattermapbox(lat=[templat1, dest_lat], lon=[templong1, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat2, dest_lat], lon=[templong2, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

# The title follows TOP_NUM instead of a hard-coded "10".
fig.update_layout(mapbox_style="open-street-map", title = f"Top {TOP_NUM} routes ({ROUTE_MAPPING_VAR})", width=1000, height=1000, showlegend=False)
# Overlay the india_geojson geometries as thin black outlines.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:

TOP_NUM = 10

fig = go.Figure()
ROUTE_MAPPING_VAR = "REL_WRT_S"
temp_route_mapping_df = route_mapping_df_filtered.sort_values(ROUTE_MAPPING_VAR, ascending=False)

# Arrowhead geometry, hoisted out of the loop because it never changes: two
# strokes of length c (in degrees) rotated +/- theta from the route direction.
theta = np.radians(30)
c = 0.3

# head(TOP_NUM) yields at most TOP_NUM rows, so a filtered frame with fewer
# routes no longer raises IndexError the way range(TOP_NUM) + iloc[i] did.
for _, route in temp_route_mapping_df.head(TOP_NUM).iterrows():
    # Label-based access: integer keys on a label-indexed Series (dest[0],
    # dest[1]) are a deprecated positional fallback in recent pandas.
    dest_lat, dest_lon = route["LATITUDE_D"], route["LONGITUDE_D"]
    source_lat, source_lon = route["LATITUDE_S"], route["LONGITUDE_S"]

    # Direction of travel in the flat (latitude, longitude) plane.
    phi = np.arctan2(dest_lon - source_lon, dest_lat - source_lat)

    templat1 = dest_lat - c*np.cos(phi - theta)
    templong1 = dest_lon - c*np.sin(phi - theta)
    templat2 = dest_lat - c*np.cos(phi + theta)
    templong2 = dest_lon - c*np.sin(phi + theta)

    # Blue marker: LOCAL CLUSTER centre (source); green marker: CLUSTER centre (destination).
    fig.add_trace(go.Scattermapbox(lat=[source_lat], lon=[source_lon], mode = "markers", marker=go.scattermapbox.Marker(size=20, color="blue", opacity=0.7)))
    fig.add_trace(go.Scattermapbox(lat=[dest_lat], lon=[dest_lon], mode = "markers", marker=go.scattermapbox.Marker(size=15, color="green")))
    fig.add_trace(go.Scattermapbox(lat=[source_lat, dest_lat], lon=[source_lon, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

    # The two arrowhead strokes, both ending at the destination.
    fig.add_trace(go.Scattermapbox(lat=[templat1, dest_lat], lon=[templong1, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat2, dest_lat], lon=[templong2, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

# The title follows TOP_NUM instead of a hard-coded "10".
fig.update_layout(mapbox_style="open-street-map", title = f"Top {TOP_NUM} routes ({ROUTE_MAPPING_VAR}) filtered", width=1000, height=1000, showlegend=False)
# Overlay the india_geojson geometries as thin black outlines.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:

TOP_NUM = 10

fig = go.Figure()
ROUTE_MAPPING_VAR = "REL_WRT_LOC"
temp_route_mapping_df = route_mapping_df.sort_values(ROUTE_MAPPING_VAR, ascending=False)

# Arrowhead geometry, hoisted out of the loop because it never changes: two
# strokes of length c (in degrees) rotated +/- theta from the route direction.
theta = np.radians(30)
c = 0.3

# head(TOP_NUM) yields at most TOP_NUM rows, so a frame with fewer routes no
# longer raises IndexError the way range(TOP_NUM) + iloc[i] did.
for _, route in temp_route_mapping_df.head(TOP_NUM).iterrows():
    # Label-based access: integer keys on a label-indexed Series (dest[0],
    # dest[1]) are a deprecated positional fallback in recent pandas.
    dest_lat, dest_lon = route["LATITUDE_D"], route["LONGITUDE_D"]
    source_lat, source_lon = route["LATITUDE_S"], route["LONGITUDE_S"]

    # Direction of travel in the flat (latitude, longitude) plane.
    phi = np.arctan2(dest_lon - source_lon, dest_lat - source_lat)

    templat1 = dest_lat - c*np.cos(phi - theta)
    templong1 = dest_lon - c*np.sin(phi - theta)
    templat2 = dest_lat - c*np.cos(phi + theta)
    templong2 = dest_lon - c*np.sin(phi + theta)

    # Blue marker: LOCAL CLUSTER centre (source); green marker: CLUSTER centre (destination).
    fig.add_trace(go.Scattermapbox(lat=[source_lat], lon=[source_lon], mode = "markers", marker=go.scattermapbox.Marker(size=20, color="blue", opacity=0.7)))
    fig.add_trace(go.Scattermapbox(lat=[dest_lat], lon=[dest_lon], mode = "markers", marker=go.scattermapbox.Marker(size=15, color="green")))
    fig.add_trace(go.Scattermapbox(lat=[source_lat, dest_lat], lon=[source_lon, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

    # The two arrowhead strokes, both ending at the destination.
    fig.add_trace(go.Scattermapbox(lat=[templat1, dest_lat], lon=[templong1, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat2, dest_lat], lon=[templong2, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

# The title follows TOP_NUM instead of a hard-coded "10".
fig.update_layout(mapbox_style="open-street-map", title = f"Top {TOP_NUM} routes ({ROUTE_MAPPING_VAR})", width=1000, height=1000, showlegend=False)
# Overlay the india_geojson geometries as thin black outlines.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:

TOP_NUM = 10

fig = go.Figure()
ROUTE_MAPPING_VAR = "REL_WRT_LOC"
temp_route_mapping_df = route_mapping_df_filtered.sort_values(ROUTE_MAPPING_VAR, ascending=False)

# Arrowhead geometry, hoisted out of the loop because it never changes: two
# strokes of length c (in degrees) rotated +/- theta from the route direction.
theta = np.radians(30)
c = 0.3

# head(TOP_NUM) yields at most TOP_NUM rows, so a filtered frame with fewer
# routes no longer raises IndexError the way range(TOP_NUM) + iloc[i] did.
for _, route in temp_route_mapping_df.head(TOP_NUM).iterrows():
    # Label-based access: integer keys on a label-indexed Series (dest[0],
    # dest[1]) are a deprecated positional fallback in recent pandas.
    dest_lat, dest_lon = route["LATITUDE_D"], route["LONGITUDE_D"]
    source_lat, source_lon = route["LATITUDE_S"], route["LONGITUDE_S"]

    # Direction of travel in the flat (latitude, longitude) plane.
    phi = np.arctan2(dest_lon - source_lon, dest_lat - source_lat)

    templat1 = dest_lat - c*np.cos(phi - theta)
    templong1 = dest_lon - c*np.sin(phi - theta)
    templat2 = dest_lat - c*np.cos(phi + theta)
    templong2 = dest_lon - c*np.sin(phi + theta)

    # Blue marker: LOCAL CLUSTER centre (source); green marker: CLUSTER centre (destination).
    fig.add_trace(go.Scattermapbox(lat=[source_lat], lon=[source_lon], mode = "markers", marker=go.scattermapbox.Marker(size=20, color="blue", opacity=0.7)))
    fig.add_trace(go.Scattermapbox(lat=[dest_lat], lon=[dest_lon], mode = "markers", marker=go.scattermapbox.Marker(size=15, color="green")))
    fig.add_trace(go.Scattermapbox(lat=[source_lat, dest_lat], lon=[source_lon, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

    # The two arrowhead strokes, both ending at the destination.
    fig.add_trace(go.Scattermapbox(lat=[templat1, dest_lat], lon=[templong1, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat2, dest_lat], lon=[templong2, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

# The title follows TOP_NUM instead of a hard-coded "10".
fig.update_layout(mapbox_style="open-street-map", title = f"Top {TOP_NUM} routes ({ROUTE_MAPPING_VAR}) filtered", width=1000, height=1000, showlegend=False)
# Overlay the india_geojson geometries as thin black outlines.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:

TOP_NUM = 10

fig = go.Figure()
ROUTE_MAPPING_VAR = "REL_WRT_TOUR"
temp_route_mapping_df = route_mapping_df.sort_values(ROUTE_MAPPING_VAR, ascending=False)

# Arrowhead geometry, hoisted out of the loop because it never changes: two
# strokes of length c (in degrees) rotated +/- theta from the route direction.
theta = np.radians(30)
c = 0.3

# head(TOP_NUM) yields at most TOP_NUM rows, so a frame with fewer routes no
# longer raises IndexError the way range(TOP_NUM) + iloc[i] did.
for _, route in temp_route_mapping_df.head(TOP_NUM).iterrows():
    # Label-based access: integer keys on a label-indexed Series (dest[0],
    # dest[1]) are a deprecated positional fallback in recent pandas.
    dest_lat, dest_lon = route["LATITUDE_D"], route["LONGITUDE_D"]
    source_lat, source_lon = route["LATITUDE_S"], route["LONGITUDE_S"]

    # Direction of travel in the flat (latitude, longitude) plane.
    phi = np.arctan2(dest_lon - source_lon, dest_lat - source_lat)

    templat1 = dest_lat - c*np.cos(phi - theta)
    templong1 = dest_lon - c*np.sin(phi - theta)
    templat2 = dest_lat - c*np.cos(phi + theta)
    templong2 = dest_lon - c*np.sin(phi + theta)

    # Blue marker: LOCAL CLUSTER centre (source); green marker: CLUSTER centre (destination).
    fig.add_trace(go.Scattermapbox(lat=[source_lat], lon=[source_lon], mode = "markers", marker=go.scattermapbox.Marker(size=20, color="blue", opacity=0.7)))
    fig.add_trace(go.Scattermapbox(lat=[dest_lat], lon=[dest_lon], mode = "markers", marker=go.scattermapbox.Marker(size=15, color="green")))
    fig.add_trace(go.Scattermapbox(lat=[source_lat, dest_lat], lon=[source_lon, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

    # The two arrowhead strokes, both ending at the destination.
    fig.add_trace(go.Scattermapbox(lat=[templat1, dest_lat], lon=[templong1, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat2, dest_lat], lon=[templong2, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

# The title follows TOP_NUM instead of a hard-coded "10".
fig.update_layout(mapbox_style="open-street-map", title = f"Top {TOP_NUM} routes ({ROUTE_MAPPING_VAR})", width=1000, height=1000, showlegend=False)
# Overlay the india_geojson geometries as thin black outlines.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:

TOP_NUM = 10

fig = go.Figure()
ROUTE_MAPPING_VAR = "REL_WRT_TOUR"
temp_route_mapping_df = route_mapping_df_filtered.sort_values(ROUTE_MAPPING_VAR, ascending=False)

# Arrowhead geometry, hoisted out of the loop because it never changes: two
# strokes of length c (in degrees) rotated +/- theta from the route direction.
theta = np.radians(30)
c = 0.3

# head(TOP_NUM) yields at most TOP_NUM rows, so a filtered frame with fewer
# routes no longer raises IndexError the way range(TOP_NUM) + iloc[i] did.
for _, route in temp_route_mapping_df.head(TOP_NUM).iterrows():
    # Label-based access: integer keys on a label-indexed Series (dest[0],
    # dest[1]) are a deprecated positional fallback in recent pandas.
    dest_lat, dest_lon = route["LATITUDE_D"], route["LONGITUDE_D"]
    source_lat, source_lon = route["LATITUDE_S"], route["LONGITUDE_S"]

    # Direction of travel in the flat (latitude, longitude) plane.
    phi = np.arctan2(dest_lon - source_lon, dest_lat - source_lat)

    templat1 = dest_lat - c*np.cos(phi - theta)
    templong1 = dest_lon - c*np.sin(phi - theta)
    templat2 = dest_lat - c*np.cos(phi + theta)
    templong2 = dest_lon - c*np.sin(phi + theta)

    # Blue marker: LOCAL CLUSTER centre (source); green marker: CLUSTER centre (destination).
    fig.add_trace(go.Scattermapbox(lat=[source_lat], lon=[source_lon], mode = "markers", marker=go.scattermapbox.Marker(size=20, color="blue", opacity=0.7)))
    fig.add_trace(go.Scattermapbox(lat=[dest_lat], lon=[dest_lon], mode = "markers", marker=go.scattermapbox.Marker(size=15, color="green")))
    fig.add_trace(go.Scattermapbox(lat=[source_lat, dest_lat], lon=[source_lon, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

    # The two arrowhead strokes, both ending at the destination.
    fig.add_trace(go.Scattermapbox(lat=[templat1, dest_lat], lon=[templong1, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat2, dest_lat], lon=[templong2, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

# The title follows TOP_NUM instead of a hard-coded "10".
fig.update_layout(mapbox_style="open-street-map", title = f"Top {TOP_NUM} routes ({ROUTE_MAPPING_VAR}) filtered", width=1000, height=1000, showlegend=False)
# Overlay the india_geojson geometries as thin black outlines.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:

TOP_NUM = 10

fig = go.Figure()
ROUTE_MAPPING_VAR = "REL_WRT_TOUR_LOC"
temp_route_mapping_df = route_mapping_df.sort_values(ROUTE_MAPPING_VAR, ascending=False)

# Arrowhead geometry, hoisted out of the loop because it never changes: two
# strokes of length c (in degrees) rotated +/- theta from the route direction.
theta = np.radians(30)
c = 0.3

# head(TOP_NUM) yields at most TOP_NUM rows, so a frame with fewer routes no
# longer raises IndexError the way range(TOP_NUM) + iloc[i] did.
for _, route in temp_route_mapping_df.head(TOP_NUM).iterrows():
    # Label-based access: integer keys on a label-indexed Series (dest[0],
    # dest[1]) are a deprecated positional fallback in recent pandas.
    dest_lat, dest_lon = route["LATITUDE_D"], route["LONGITUDE_D"]
    source_lat, source_lon = route["LATITUDE_S"], route["LONGITUDE_S"]

    # Direction of travel in the flat (latitude, longitude) plane.
    phi = np.arctan2(dest_lon - source_lon, dest_lat - source_lat)

    templat1 = dest_lat - c*np.cos(phi - theta)
    templong1 = dest_lon - c*np.sin(phi - theta)
    templat2 = dest_lat - c*np.cos(phi + theta)
    templong2 = dest_lon - c*np.sin(phi + theta)

    # Blue marker: LOCAL CLUSTER centre (source); green marker: CLUSTER centre (destination).
    fig.add_trace(go.Scattermapbox(lat=[source_lat], lon=[source_lon], mode = "markers", marker=go.scattermapbox.Marker(size=20, color="blue", opacity=0.7)))
    fig.add_trace(go.Scattermapbox(lat=[dest_lat], lon=[dest_lon], mode = "markers", marker=go.scattermapbox.Marker(size=15, color="green")))
    fig.add_trace(go.Scattermapbox(lat=[source_lat, dest_lat], lon=[source_lon, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

    # The two arrowhead strokes, both ending at the destination.
    fig.add_trace(go.Scattermapbox(lat=[templat1, dest_lat], lon=[templong1, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat2, dest_lat], lon=[templong2, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

# The title follows TOP_NUM instead of a hard-coded "10".
fig.update_layout(mapbox_style="open-street-map", title = f"Top {TOP_NUM} routes ({ROUTE_MAPPING_VAR})", width=1000, height=1000, showlegend=False)
# Overlay the india_geojson geometries as thin black outlines.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:

TOP_NUM = 10

fig = go.Figure()
ROUTE_MAPPING_VAR = "REL_WRT_TOUR_LOC"
temp_route_mapping_df = route_mapping_df_filtered.sort_values(ROUTE_MAPPING_VAR, ascending=False)

# Arrowhead geometry, hoisted out of the loop because it never changes: two
# strokes of length c (in degrees) rotated +/- theta from the route direction.
theta = np.radians(30)
c = 0.3

# head(TOP_NUM) yields at most TOP_NUM rows, so a filtered frame with fewer
# routes no longer raises IndexError the way range(TOP_NUM) + iloc[i] did.
for _, route in temp_route_mapping_df.head(TOP_NUM).iterrows():
    # Label-based access: integer keys on a label-indexed Series (dest[0],
    # dest[1]) are a deprecated positional fallback in recent pandas.
    dest_lat, dest_lon = route["LATITUDE_D"], route["LONGITUDE_D"]
    source_lat, source_lon = route["LATITUDE_S"], route["LONGITUDE_S"]

    # Direction of travel in the flat (latitude, longitude) plane.
    phi = np.arctan2(dest_lon - source_lon, dest_lat - source_lat)

    templat1 = dest_lat - c*np.cos(phi - theta)
    templong1 = dest_lon - c*np.sin(phi - theta)
    templat2 = dest_lat - c*np.cos(phi + theta)
    templong2 = dest_lon - c*np.sin(phi + theta)

    # Blue marker: LOCAL CLUSTER centre (source); green marker: CLUSTER centre (destination).
    fig.add_trace(go.Scattermapbox(lat=[source_lat], lon=[source_lon], mode = "markers", marker=go.scattermapbox.Marker(size=20, color="blue", opacity=0.7)))
    fig.add_trace(go.Scattermapbox(lat=[dest_lat], lon=[dest_lon], mode = "markers", marker=go.scattermapbox.Marker(size=15, color="green")))
    fig.add_trace(go.Scattermapbox(lat=[source_lat, dest_lat], lon=[source_lon, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

    # The two arrowhead strokes, both ending at the destination.
    fig.add_trace(go.Scattermapbox(lat=[templat1, dest_lat], lon=[templong1, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat2, dest_lat], lon=[templong2, dest_lon], mode="lines", line=go.scattermapbox.Line(width=2, color="black")))

# The title follows TOP_NUM instead of a hard-coded "10".
fig.update_layout(mapbox_style="open-street-map", title = f"Top {TOP_NUM} routes ({ROUTE_MAPPING_VAR}) filtered", width=1000, height=1000, showlegend=False)
# Overlay the india_geojson geometries as thin black outlines.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

# ROUTE ANALYSIS

In [None]:
# Rows where an observation's CLUSTER differs from its NEXT CLUSTER, i.e. a
# move between clusters. dropna() scans the whole frame, so it is computed
# once and reused instead of three times inside one expression.
complete_obs = data_df.dropna()
change_df = complete_obs[complete_obs["CLUSTER"].astype(int) != complete_obs["NEXT CLUSTER"].astype(int)]

In [None]:
# Collapse the individual moves into one row per (CLUSTER, NEXT CLUSTER) pair
# with its number of occurrences, largest first.
change_df = (
    change_df.groupby(["CLUSTER", "NEXT CLUSTER"])
    .size()
    .reset_index(name="COUNT")
    .sort_values("COUNT", ascending=False)
)

In [None]:
# Keep only transitions seen more than 50 times.
change_df = change_df.loc[change_df["COUNT"].gt(50)]

In [None]:
# Attach cluster-centre columns for both ends of each transition: first for
# CLUSTER, then for NEXT CLUSTER (pandas adds _x/_y suffixes to clashing names).
change_df = pd.merge(change_df, cluster_centers_df, left_on="CLUSTER", right_on="CLUSTER", how="left")
change_df = pd.merge(change_df, cluster_centers_df, left_on="NEXT CLUSTER", right_on="CLUSTER", how="left")

In [None]:
# Drop the duplicate key from the second merge, then rename all columns by
# position: *_1 belongs to CLUSTER, *_2 to NEXT CLUSTER.
change_df = change_df.drop(columns=["CLUSTER_y"])
change_df.columns = [
    "CLUSTER", "NEXT CLUSTER", "COUNT",
    "LATITUDE_1", "LONGITUDE_1",
    "LATITUDE_2", "LONGITUDE_2",
]

In [None]:
# Number of observations per cluster.
cluster_sizes = data_df.groupby("CLUSTER").size()
cluster_counts = cluster_sizes.reset_index(name="COUNT")

In [None]:
# NCOUNT = total number of observations in the transition's origin CLUSTER.
# Build the lookup once; the previous lambda re-filtered cluster_counts for
# every row. A cluster missing from cluster_counts still fails loudly
# (KeyError here, IndexError before).
count_by_cluster = dict(zip(cluster_counts["CLUSTER"], cluster_counts["COUNT"]))
change_df["NCOUNT"] = [count_by_cluster[cluster] for cluster in change_df["CLUSTER"]]

In [None]:
master_graph = nx.DiGraph()
#nx.set_node_attributes(master_graph, cluster_centers_df.set_index("CLUSTER").to_dict(orient="index"))
def add_edge(row):
    """Insert one route into the module-level ``master_graph``.

    ``row`` is one record of ``change_df`` (any mapping with the same keys
    works). The edge CLUSTER -> NEXT CLUSTER is added with the route's COUNT
    stored under both ``weight`` and ``COUNT``. Both end points receive their
    LATITUDE / LONGITUDE; only the origin node receives a ``COUNT`` attribute
    (taken from NCOUNT).
    """
    origin, target = row["CLUSTER"], row["NEXT CLUSTER"]
    master_graph.add_edge(origin, target, weight=row["COUNT"])

    origin_attrs = master_graph.nodes[origin]
    origin_attrs["LATITUDE"] = row["LATITUDE_1"]
    origin_attrs["LONGITUDE"] = row["LONGITUDE_1"]

    target_attrs = master_graph.nodes[target]
    target_attrs["LATITUDE"] = row["LATITUDE_2"]
    target_attrs["LONGITUDE"] = row["LONGITUDE_2"]

    # Node COUNT is only written for the origin side of the route.
    origin_attrs["COUNT"] = row["NCOUNT"]
    master_graph.edges[origin, target]["COUNT"] = row["COUNT"]
    

In [None]:
change_df.apply(add_edge, axis=1)

In [None]:
nx.write_gml(master_graph, "master_graph.gml")

In [None]:
# Per-cluster degree table. Every degree is weighted by the edge attribute
# COUNT, i.e. it sums route counts rather than counting neighbours.
degree_df = pd.DataFrame(list(master_graph.degree(weight="COUNT"))).set_axis(
    ["CLUSTER", "DEGREE"], axis=1
)
degree_df["INDEGREE"] = [
    master_graph.in_degree(node, weight="COUNT") for node in degree_df["CLUSTER"]
]
degree_df["OUTDEGREE"] = [
    master_graph.out_degree(node, weight="COUNT") for node in degree_df["CLUSTER"]
]


In [None]:
graphprop_df = degree_df.merge(cluster_centers_df, on="CLUSTER", how="left")

In [None]:
# Node betweenness centrality, joined onto the per-cluster property table.
# NOTE(review): networkx treats `weight` as a distance when it searches for
# shortest paths, so busy routes (high COUNT) count as *long* here - confirm
# that this is the intended reading.
betweenness_by_node = nx.betweenness_centrality(master_graph, weight="COUNT")
node_between_df = pd.DataFrame(betweenness_by_node.items()).set_axis(
    ["CLUSTER", "BETWEENNESS"], axis=1
)
graphprop_df = pd.merge(graphprop_df, node_between_df, how="left", on="CLUSTER")

In [None]:
graphprop_df.sort_values("BETWEENNESS", ascending=False)

In [None]:
# Map of the 20 clusters with the highest weighted degree.
# Marker size encodes DEGREE, marker colour encodes BETWEENNESS.
top_degree_df = graphprop_df.sort_values("DEGREE", ascending=False).head(20)

# Thin black outline drawn from the India GeoJSON geometry.
india_outline_layer = {
    "source": india_geojson["geometry"].__geo_interface__,
    "type": "line",
    "color": "black",
    "line": {"width": 0.5},
}

fig = px.scatter_mapbox(
    top_degree_df,
    lat="LATITUDE",
    lon="LONGITUDE",
    size="DEGREE",
    color="BETWEENNESS",
    hover_name="CLUSTER",
    zoom=3,
    title="TOP DEGREE",
)
fig.update_layout(mapbox_style="open-street-map", width=1000, height=1000)
# Last expression of the cell: update_layout returns the figure, so the notebook renders it.
fig.update_layout(mapbox={"layers": [india_outline_layer]})

In [None]:
# Map of the 20 clusters with the highest node betweenness.
# Marker size encodes BETWEENNESS, marker colour encodes DEGREE.
top_betweenness_df = graphprop_df.sort_values("BETWEENNESS", ascending=False).head(20)

# Thin black outline drawn from the India GeoJSON geometry.
india_outline_layer = {
    "source": india_geojson["geometry"].__geo_interface__,
    "type": "line",
    "color": "black",
    "line": {"width": 0.5},
}

fig = px.scatter_mapbox(
    top_betweenness_df,
    lat="LATITUDE",
    lon="LONGITUDE",
    size="BETWEENNESS",
    color="DEGREE",
    hover_name="CLUSTER",
    zoom=3,
    title="TOP BETWEENNESS",
)
fig.update_layout(mapbox_style="open-street-map", width=1000, height=1000)
# Last expression of the cell: update_layout returns the figure, so the notebook renders it.
fig.update_layout(mapbox={"layers": [india_outline_layer]})

In [None]:
# Edge betweenness for every route, keyed by (origin, destination).
# NOTE(review): no weight is passed here, whereas the node betweenness above
# is computed with weight="COUNT" - confirm the difference is intended.
betweennessdict = nx.edge_betweenness_centrality(master_graph)

change_df["BETWEENNESS"] = [
    betweennessdict[(origin, target)]
    for origin, target in zip(change_df["CLUSTER"], change_df["NEXT CLUSTER"])
]

In [None]:
change_df.sort_values("BETWEENNESS", ascending=False)

In [None]:
# Length of each route between the two cluster centres.
# NOTE(review): `haversine` is defined elsewhere in the notebook; scaling its
# result by 6371 (Earth's mean radius in km) suggests it returns the central
# angle in radians - confirm.
change_df["DISTANCE"] = change_df.apply(
    lambda route: haversine(
        route["LATITUDE_1"], route["LONGITUDE_1"], route["LATITUDE_2"], route["LONGITUDE_2"]
    ),
    axis=1,
).mul(6371)

In [None]:
change_df["DISTANCE"].describe()

In [None]:
# Draw the TOP_NUM most-travelled routes as arrows (origin -> destination).
TOP_NUM = 40
fig = go.Figure()
temp_route_mapping_df = change_df.sort_values("COUNT", ascending=False)

# Arrowhead geometry is the same for every route, so it is computed once.
theta = np.radians(30)  # angle between the route line and each arrowhead barb
c = 0.3                 # barb length, in degrees of latitude / longitude

# head() caps the selection at the rows that actually exist, so the loop no
# longer raises IndexError when fewer than TOP_NUM routes are available.
# Coordinates are read by column name instead of integer position on a
# label-indexed row (`dest[0]`), which pandas has deprecated.
top_routes = temp_route_mapping_df.head(TOP_NUM)
route_coords = zip(
    top_routes["LATITUDE_1"],
    top_routes["LONGITUDE_1"],
    top_routes["LATITUDE_2"],
    top_routes["LONGITUDE_2"],
)

for src_lat, src_lon, dst_lat, dst_lon in route_coords:
    # Direction of travel in (lat, lon) space, measured from the latitude axis.
    phi = np.arctan2(dst_lon - src_lon, dst_lat - src_lat)

    # End points of the two arrowhead barbs; both start at the destination.
    templat1 = dst_lat - c * np.cos(phi - theta)
    templong1 = dst_lon - c * np.sin(phi - theta)
    templat2 = dst_lat - c * np.cos(phi + theta)
    templong2 = dst_lon - c * np.sin(phi + theta)

    # Origin (blue) and destination (green) markers.
    fig.add_trace(go.Scattermapbox(lat=[src_lat], lon=[src_lon], mode="markers", marker=dict(size=20, color="blue", opacity=0.7)))
    fig.add_trace(go.Scattermapbox(lat=[dst_lat], lon=[dst_lon], mode="markers", marker=dict(size=15, color="green")))
    # Route line followed by the two arrowhead barbs.
    fig.add_trace(go.Scattermapbox(lat=[src_lat, dst_lat], lon=[src_lon, dst_lon], mode="lines", line=dict(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat1, dst_lat], lon=[templong1, dst_lon], mode="lines", line=dict(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat2, dst_lat], lon=[templong2, dst_lon], mode="lines", line=dict(width=2, color="black")))

fig.update_layout(mapbox_style="open-street-map", title = f"Top {TOP_NUM} immediate routes (COUNT)", width=1000, height=1000, showlegend=False)
# Overlay the India outline as a thin black line layer.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:
# Draw the TOP_NUM most-travelled routes longer than 200 distance units as
# arrows (origin -> destination).
TOP_NUM = 30
fig = go.Figure()
temp_route_mapping_df = change_df[change_df["DISTANCE"]>200].sort_values("COUNT", ascending=False)

# Arrowhead geometry is the same for every route, so it is computed once.
theta = np.radians(30)  # angle between the route line and each arrowhead barb
c = 0.3                 # barb length, in degrees of latitude / longitude

# head() caps the selection at the rows that actually exist: the distance
# filter can easily leave fewer than TOP_NUM routes, which used to raise
# IndexError. Coordinates are read by column name instead of integer position
# on a label-indexed row (`dest[0]`), which pandas has deprecated.
top_routes = temp_route_mapping_df.head(TOP_NUM)
route_coords = zip(
    top_routes["LATITUDE_1"],
    top_routes["LONGITUDE_1"],
    top_routes["LATITUDE_2"],
    top_routes["LONGITUDE_2"],
)

for src_lat, src_lon, dst_lat, dst_lon in route_coords:
    # Direction of travel in (lat, lon) space, measured from the latitude axis.
    phi = np.arctan2(dst_lon - src_lon, dst_lat - src_lat)

    # End points of the two arrowhead barbs; both start at the destination.
    templat1 = dst_lat - c * np.cos(phi - theta)
    templong1 = dst_lon - c * np.sin(phi - theta)
    templat2 = dst_lat - c * np.cos(phi + theta)
    templong2 = dst_lon - c * np.sin(phi + theta)

    # Origin (blue) and destination (green) markers.
    fig.add_trace(go.Scattermapbox(lat=[src_lat], lon=[src_lon], mode="markers", marker=dict(size=20, color="blue", opacity=0.7)))
    fig.add_trace(go.Scattermapbox(lat=[dst_lat], lon=[dst_lon], mode="markers", marker=dict(size=15, color="green")))
    # Route line followed by the two arrowhead barbs.
    fig.add_trace(go.Scattermapbox(lat=[src_lat, dst_lat], lon=[src_lon, dst_lon], mode="lines", line=dict(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat1, dst_lat], lon=[templong1, dst_lon], mode="lines", line=dict(width=2, color="black")))
    fig.add_trace(go.Scattermapbox(lat=[templat2, dst_lat], lon=[templong2, dst_lon], mode="lines", line=dict(width=2, color="black")))

fig.update_layout(mapbox_style="open-street-map", title = f"Top {TOP_NUM} immediate routes (COUNT) filtered", width=1000, height=1000, showlegend=False)
# Overlay the India outline as a thin black line layer.
fig.update_layout(mapbox={"layers":[
            {
                "source": india_geojson["geometry"].__geo_interface__,
                "type": "line",
                "color": "black",
                "line": {"width": 0.5},
            }
        ]})
fig.show()

In [None]:
# Sub-graph holding only the routes whose weight exceeds 500.
# NOTE(review): this graph is not referenced again in this part of the
# notebook; the community detection below runs on master_graph.
filtered_master_graph = nx.DiGraph()
filtered_master_graph.add_weighted_edges_from(
    (origin, target, attrs["weight"])
    for origin, target, attrs in master_graph.edges(data=True)
    if attrs["weight"] > 500
)


In [None]:
comp = nx.community.louvain_communities(master_graph, seed=0, resolution=1.9, weight="COUNT")

In [None]:
# One row per cluster with the index of the Louvain community it belongs to,
# enriched with the cluster-centre columns for plotting.
community_df = pd.DataFrame(
    [
        {"CLUSTER": member, "COMMUNITY": community_id}
        for community_id, members in enumerate(comp)
        for member in members
    ]
)

community_df = pd.merge(community_df, cluster_centers_df, how="left", on="CLUSTER")

# Categorical dtype makes plotly colour communities discretely.
community_df = community_df.astype({"CLUSTER": "category", "COMMUNITY": "category"})

In [None]:
fig = px.scatter_mapbox(community_df, lat="LATITUDE", lon="LONGITUDE", color="COMMUNITY", title="Communities", zoom=2, )
fig.update_layout(mapbox_style="open-street-map", width=1000, height=1000)

fig.show()

In [None]:
# observer_graph = nx.Graph()

# observer_graph.add_nodes_from(data_df["OBSERVER ID"].unique())
# def addgrouptograph(df):
#     for i in range(len(df)):
#         for j in range(i+1, len(df)):
#             if observer_graph.has_edge(df.iloc[i], df.iloc[j]):
#                 observer_graph.edges[df.iloc[i], df.iloc[j]]["weight"] += 1
#             else:
#                 observer_graph.add_edge(df.iloc[i], df.iloc[j], weight=1)
# obgraphdf = data_df[data_df["GROUP IDENTIFIER"] != "No Group"]
# obgraphdf = obgraphdf[obgraphdf["NUMLIST"]>200]
# obgraphdf.groupby("GROUP IDENTIFIER")["OBSERVER ID"].apply(addgrouptograph)
# nx.write_gml(observer_graph, "observer_graph.gml")

# WEALTH ANALYSIS

In [None]:
wealth_index = pd.read_csv("ind_pak_relative_wealth_index.csv")

In [None]:
# Join the relative wealth index onto the observations via a 0.1-degree grid:
# both tables are keyed by latitude / longitude rounded to one decimal place.
GRID_KEYS = ["latitude truncated", "longitude truncated"]

# Average the wealth index over all source rows that fall into the same grid cell.
wealth_index = (
    wealth_index.assign(
        **{
            "latitude truncated": wealth_index["latitude"].apply(lambda value: round(value, 1)),
            "longitude truncated": wealth_index["longitude"].apply(lambda value: round(value, 1)),
        }
    )
    .drop(columns=["latitude", "longitude"])
    .groupby(GRID_KEYS)
    .mean()
    .reset_index()
)

# NOTE(review): the final dropna() removes rows with a missing value in *any*
# column, not only rows without a wealth match - confirm this is intended.
# NOTE(review): this cell is not idempotent; re-running it fails because the
# raw latitude / longitude columns of wealth_index have already been dropped.
data_df = (
    data_df.assign(
        **{
            "latitude truncated": data_df["LATITUDE"].apply(lambda value: round(value, 1)),
            "longitude truncated": data_df["LONGITUDE"].apply(lambda value: round(value, 1)),
        }
    )
    .merge(wealth_index, how="left", on=GRID_KEYS)
    .drop(columns=GRID_KEYS + ["error"])
    .rename(columns={"rwi": "WEALTH INDEX"})
    .dropna()
)

In [None]:
# Mean wealth index: gridded reference table vs. all / local / tourist observations.
india_pak_mean_wealth = wealth_index["rwi"].mean()
birding_mean_wealth = data_df["WEALTH INDEX"].mean()
local_birding_mean_wealth = data_df.loc[data_df["LOCAL"] == True, "WEALTH INDEX"].mean()
tourist_birding_mean_wealth = data_df.loc[data_df["LOCAL"] == False, "WEALTH INDEX"].mean()

for label, value in (
    ("India and Pakistan mean wealth index: ", india_pak_mean_wealth),
    ("Birding mean wealth index: ", birding_mean_wealth),
    ("Local birding mean wealth index: ", local_birding_mean_wealth),
    ("Tourist birding mean wealth index: ", tourist_birding_mean_wealth),
):
    print(label, round(value, 4))

In [None]:
birding_mean_wealth

In [None]:
# Median wealth index for the same four groups; kept in a dict (in this order)
# because the bar chart below plots its keys and values.
median_wealth_index = {
    "india_pak": wealth_index["rwi"].median(),
    "birding": data_df["WEALTH INDEX"].median(),
    "local_birding": data_df.loc[data_df["LOCAL"] == True, "WEALTH INDEX"].median(),
    "tourist_birding": data_df.loc[data_df["LOCAL"] == False, "WEALTH INDEX"].median(),
}

for label, key in (
    ("India and Pakistan median wealth index: ", "india_pak"),
    ("Birding median wealth index: ", "birding"),
    ("Local birding median wealth index: ", "local_birding"),
    ("Tourist birding median wealth index: ", "tourist_birding"),
):
    print(label, round(median_wealth_index[key], 4))

In [None]:
# Bar chart of the four median wealth indices computed above.
wealth_fig, wealth_ax = plt.subplots(figsize=(10, 5))
wealth_ax.bar(list(median_wealth_index.keys()), list(median_wealth_index.values()))
wealth_ax.set_ylabel("Median Wealth Index")
wealth_ax.set_title("Median Wealth Index of India and Pakistan vs Birding Data")
plt.show()

In [None]:
data_df

In [None]:
# Per-cluster means of wealth index, localness and urban share, richest cluster
# first; clusters whose means are NaN are dropped.
wealth_local = (
    data_df.groupby("CLUSTER")[["WEALTH INDEX", "LOCAL", "IS URBAN"]]
    .mean()
    .sort_values("WEALTH INDEX", ascending=False)
    .dropna()
    .reset_index()
)

In [None]:
# Number of observation rows per cluster, largest first ...
temp = (
    data_df.groupby("CLUSTER")
    .size()
    .rename("COUNT")
    .reset_index()
    .sort_values("COUNT", ascending=False)
)

# ... attached to the per-cluster means as the COUNT column.
wealth_local = pd.merge(wealth_local, temp, how="left", on="CLUSTER")

In [None]:
wealth_local[["WEALTH INDEX", "LOCAL", "COUNT", "IS URBAN"]].corr()

In [None]:
sns.regplot(data=wealth_local, x="WEALTH INDEX", y="LOCAL")

In [None]:
fig= px.scatter(wealth_local, x="WEALTH INDEX", y="COUNT", title="Wealth Index vs COUNT", hover_data="CLUSTER")
fig.update_yaxes(type="log")
fig.show()

In [None]:
px.scatter(wealth_local[wealth_local['COUNT']>1000], x="WEALTH INDEX", y="LOCAL", title="Wealth Index vs Localness", hover_data="CLUSTER", color="IS URBAN")

In [None]:
# Pearson correlation between wealth index and observation count, restricted
# to clusters with more than STAT_COUNT observations; 90% confidence interval.
STAT_COUNT = 1000
well_sampled = wealth_local[wealth_local["COUNT"] > STAT_COUNT]
res = stats.pearsonr(well_sampled["WEALTH INDEX"], well_sampled["COUNT"])
print(res)
print(res.confidence_interval(0.9))

In [None]:
# Pearson correlation between wealth index and localness, restricted to
# clusters with more than STAT_COUNT observations; 90% confidence interval.
STAT_COUNT = 1000
well_sampled = wealth_local[wealth_local["COUNT"] > STAT_COUNT]
res = stats.pearsonr(well_sampled["WEALTH INDEX"], well_sampled["LOCAL"])
print(res)
print(res.confidence_interval(0.9))

In [None]:
sns.regplot(data=wealth_local[wealth_local["COUNT"]>1000], x="WEALTH INDEX", y="LOCAL")
plt.xlim(-1, 1.5)
plt.show()


In [None]:
sns.regplot(data=wealth_local[wealth_local["COUNT"]>10000], x="WEALTH INDEX", y="LOCAL")
plt.xlim(-1, 1.5)
plt.show()

In [None]:
sns.regplot(data=wealth_local[wealth_local['COUNT']>100], x="WEALTH INDEX", y="COUNT")
plt.yscale("log")

In [None]:
fig = px.scatter_mapbox(cluster_centers_df[cluster_centers_df["CLUSTER"].isin(wealth_local[wealth_local["WEALTH INDEX"]<0.3]["CLUSTER"])], lat="LATITUDE", lon="LONGITUDE", zoom=2, title="Wealth Index")
fig.update_layout(mapbox_style="open-street-map", width=1000, height=1000)

fig.show()

In [None]:
from sklearn.linear_model import QuantileRegressor


In [None]:
# Quantile regression of localness on wealth index (5th, 50th and 95th
# percentile), fitted on clusters with more than 1000 observations.
# predictions[q] holds the fitted values for the same clusters.
quantiles = [0.05, 0.5, 0.95]
predictions = {}

qr_sample = wealth_local[wealth_local["COUNT"] > 1000]
wealth_feature = qr_sample["WEALTH INDEX"].values.reshape(-1, 1)  # one feature column
localness_target = qr_sample["LOCAL"].values

for q in quantiles:
    qr = QuantileRegressor(quantile=q, alpha=0)  # alpha=0: no regularisation penalty
    predictions[q] = qr.fit(wealth_feature, localness_target).predict(wealth_feature)


In [None]:
# Fitted quantile lines on top of the per-cluster data points.
plot_sample = wealth_local[wealth_local["COUNT"] > 1000]

for q in quantiles[:3]:
    plt.plot(plot_sample["WEALTH INDEX"], predictions[q], label=f"{q}")

plt.plot(plot_sample["WEALTH INDEX"], plot_sample["LOCAL"], "o", label="Data", alpha=0.5)

plt.xlabel("Wealth Index")
plt.ylabel("Localness")
plt.title("Quantile Regression")

plt.legend()
plt.show()


In [None]:
# Quantile regression of log(observation count) on wealth index (5th, 50th and
# 95th percentile), fitted on clusters with more than 1000 observations.
quantiles1 = [0.05, 0.5, 0.95]
predictions1 = {}

qr_sample1 = wealth_local[wealth_local["COUNT"] > 1000]
wealth_feature1 = qr_sample1["WEALTH INDEX"].values.reshape(-1, 1)  # one feature column
log_count_target = np.log(qr_sample1["COUNT"]).values

for q in quantiles1:
    qr = QuantileRegressor(quantile=q, alpha=0)  # alpha=0: no regularisation penalty
    predictions1[q] = qr.fit(wealth_feature1, log_count_target).predict(wealth_feature1)


In [None]:
# Fitted quantile lines on top of the per-cluster log-counts.
plot_sample1 = wealth_local[wealth_local["COUNT"] > 1000]

for q in quantiles1[:3]:
    plt.plot(plot_sample1["WEALTH INDEX"], predictions1[q], label=f"{q}")

plt.plot(plot_sample1["WEALTH INDEX"], np.log(plot_sample1["COUNT"]), "o", label="Data", alpha=0.5)

plt.xlabel("Wealth Index")
plt.ylabel("Count")
plt.title("Quantile Regression")

plt.legend()
plt.show()


# URBAN RURAL ANALYSIS

Unfortunately, we can only analyse Urban=True: the flag has high precision but low recall, because Facebook's definition of urban is very thin (narrow).

In [None]:
data_df.groupby("IS URBAN").size()

In [None]:
data_df.groupby("IS URBAN").agg({"WEALTH INDEX": "mean"})

In [None]:
data_df.groupby(["IS URBAN", "LOCAL"]).agg({"WEALTH INDEX": "mean"})

In [None]:
data_df.groupby("IS URBAN").agg({"LOCAL": "mean"})

Pairplots

In [None]:
# Per-cluster means (wealth index, localness, urban share) plus the number of
# observations per cluster, used for the correlation table and pair plots below.
corr_df = (
    data_df.groupby("CLUSTER")[["WEALTH INDEX", "LOCAL", "IS URBAN"]]
    .mean()
    .sort_values("WEALTH INDEX", ascending=False)
    .dropna()
    .reset_index()
)
count_df = (
    data_df.groupby("CLUSTER")
    .size()
    .rename("COUNT")
    .reset_index()
    .sort_values("COUNT", ascending=False)
)
corr_df = pd.merge(corr_df, count_df, how="left", on="CLUSTER")

In [None]:
corr_df[["WEALTH INDEX", "LOCAL", "IS URBAN", "COUNT"]].corr()

In [None]:
data_df[["LOCAL", "IS URBAN", "WEALTH INDEX"]].corr()

In [None]:
sns.histplot(data_df["LOCAL"], bins=20)

MOST CLUSTERS ARE TOURIST CLUSTERS

In [None]:
sns.histplot(corr_df["LOCAL"], bins=20)

In [None]:
sns.pairplot(corr_df[["WEALTH INDEX", "LOCAL", "IS URBAN", "COUNT"]])

In [None]:
sns.pairplot(corr_df[corr_df["COUNT"]>1000][["WEALTH INDEX", "LOCAL", "IS URBAN", "COUNT"]])

In [None]:
data_df.groupby("OBSERVER ID").agg({"IS URBAN": "mean"}).sort_values("IS URBAN", ascending=False).plot(kind="hist", bins=20, xlabel="URBANNESS", ylabel="Number of Observers")

In [None]:
# Map a random sample of observations, coloured by the IS URBAN flag.
# random_state pins the sample so a fresh "Restart & Run All" redraws the same
# map; min() keeps sample() from raising when fewer than 20000 rows remain.
urban_map_sample = data_df.sample(n=min(20000, len(data_df)), random_state=0)
fig = px.scatter_mapbox(urban_map_sample, lat="LATITUDE", lon="LONGITUDE", zoom=2, color = "IS URBAN",title="Urban Observers")
fig.update_layout(mapbox_style="open-street-map", width=1000, height=1000)
fig.show()

In [None]:
data_df[data_df["IS URBAN"] == True]["LOCAL"].mean()

In [None]:
data_df[data_df["IS URBAN"] == False]["LOCAL"].mean()

In [None]:
data_df[["LOCAL", "IS URBAN"]].corr()

# Population density Magic

# CLIMATE ANALYSIS

In [None]:
climate_data = pd.read_csv("climate_change.csv")
precipitation_data = pd.read_csv("precipitation.csv")

In [None]:
climate_data

In [None]:
# Shift every yearly column T2000_01 ... T2014_01 by 273.15, then derive the
# absolute and relative change between the first and the last year.
# NOTE(review): +273.15 presumably converts degrees Celsius to Kelvin so the
# relative change has a meaningful zero - confirm the source units.
# NOTE(review): not idempotent - re-running this cell adds the offset again.
temperature_cols = [f"T{year}_01" for year in range(2000, 2015)]
climate_data[temperature_cols] = climate_data[temperature_cols] + 273.15
climate_data["CHANGE"] = climate_data["T2014_01"] - climate_data["T2000_01"]
climate_data["CHANGE PERC"] = climate_data["CHANGE"] / climate_data["T2000_01"]

In [None]:
climate_data

In [None]:
# Absolute and relative change between the T2000_01 and T2014_01 columns.
# NOTE(review): a zero in T2000_01 yields inf / NaN in CHANGE PERC.
precipitation_data["CHANGE"] = precipitation_data["T2014_01"].sub(precipitation_data["T2000_01"])
precipitation_data["CHANGE PERC"] = precipitation_data["CHANGE"].div(precipitation_data["T2000_01"])

In [None]:
# Working copy of the cluster centres with coordinates rounded to two decimals;
# the original cluster_centers_df is left untouched.
cluster_centers_df_clim = cluster_centers_df.assign(
    LATITUDE=lambda centres: centres["LATITUDE"].apply(lambda value: round(value, 2)),
    LONGITUDE=lambda centres: centres["LONGITUDE"].apply(lambda value: round(value, 2)),
)


In [None]:
def _nearest_climate_value(row, column):
    """Return ``column`` of the ``climate_data`` grid cell nearest to ``row``.

    The nearest GID_LON and the nearest GID_LAT are chosen independently (the
    first minimum wins on ties), then the first grid row carrying both values
    is used. The search is vectorised with numpy instead of a Python-level
    ``min()`` over the whole column.

    Raises:
        IndexError: if no grid row has that (longitude, latitude) pair, e.g.
            when the grid is not a complete rectangle.
    """
    grid_lons = climate_data["GID_LON"].to_numpy()
    grid_lats = climate_data["GID_LAT"].to_numpy()
    lon = grid_lons[np.abs(grid_lons - row["LONGITUDE"]).argmin()]
    lat = grid_lats[np.abs(grid_lats - row["LATITUDE"]).argmin()]
    match = climate_data.loc[(grid_lons == lon) & (grid_lats == lat), column]
    if match.empty:
        raise IndexError(f"climate_data has no grid cell at lon={lon}, lat={lat}")
    return match.values[0]


def get_climate_perc_data(row):
    """Relative change ("CHANGE PERC") at the climate grid cell nearest to ``row``.

    ``row`` must provide "LONGITUDE" and "LATITUDE".
    """
    return _nearest_climate_value(row, "CHANGE PERC")


def get_climate_data(row):
    """Absolute change ("CHANGE") at the climate grid cell nearest to ``row``.

    ``row`` must provide "LONGITUDE" and "LATITUDE".
    """
    return _nearest_climate_value(row, "CHANGE")


In [None]:
cluster_centers_df_clim["TEMP CHANGE PERC"] = cluster_centers_df_clim.apply(get_climate_perc_data, axis=1)
cluster_centers_df_clim["TEMP CHANGE"] = cluster_centers_df_clim.apply(get_climate_data, axis=1)


In [None]:
def _nearest_precipitation_value(row, column):
    """Return ``column`` of the ``precipitation_data`` grid cell nearest to ``row``.

    The nearest GID_LON and the nearest GID_LAT are chosen independently (the
    first minimum wins on ties), then the first grid row carrying both values
    is used. The search is vectorised with numpy instead of a Python-level
    ``min()`` over the whole column.

    Raises:
        IndexError: if no grid row has that (longitude, latitude) pair, e.g.
            when the grid is not a complete rectangle.
    """
    grid_lons = precipitation_data["GID_LON"].to_numpy()
    grid_lats = precipitation_data["GID_LAT"].to_numpy()
    lon = grid_lons[np.abs(grid_lons - row["LONGITUDE"]).argmin()]
    lat = grid_lats[np.abs(grid_lats - row["LATITUDE"]).argmin()]
    match = precipitation_data.loc[(grid_lons == lon) & (grid_lats == lat), column]
    if match.empty:
        raise IndexError(f"precipitation_data has no grid cell at lon={lon}, lat={lat}")
    return match.values[0]


def get_preci_perc_data(row):
    """Relative change ("CHANGE PERC") at the precipitation grid cell nearest to ``row``.

    ``row`` must provide "LONGITUDE" and "LATITUDE".
    """
    return _nearest_precipitation_value(row, "CHANGE PERC")


def get_preci_data(row):
    """Absolute change ("CHANGE") at the precipitation grid cell nearest to ``row``.

    ``row`` must provide "LONGITUDE" and "LATITUDE".
    """
    return _nearest_precipitation_value(row, "CHANGE")

In [None]:
cluster_centers_df_clim["PRECI CHANGE PERC"] = cluster_centers_df_clim.apply(get_preci_perc_data, axis=1)
cluster_centers_df_clim["PRECI CHANGE"] = cluster_centers_df_clim.apply(get_preci_data, axis=1)


In [None]:
# Mean localness and number of observations per cluster.
# A single groupby replaces one full scan of data_df per cluster.
# Clusters with no observations keep the previous results: NaN localness, count 0.
local_by_cluster = data_df.groupby("CLUSTER", observed=True)["LOCAL"].agg(["mean", "size"])
localness_by_cluster = local_by_cluster["mean"].to_dict()
rows_per_cluster = local_by_cluster["size"].to_dict()

cluster_centers_df_clim["LOCALNESS"] = cluster_centers_df_clim["CLUSTER"].map(localness_by_cluster)
cluster_centers_df_clim["COUNT"] = (
    cluster_centers_df_clim["CLUSTER"].map(rows_per_cluster).fillna(0).astype("int64")
)

In [None]:
sns.regplot(data=cluster_centers_df_clim, x="TEMP CHANGE", y="PRECI CHANGE PERC")