In [1]:
import geopandas as gpd
import pandas as pd

# Load processed GeoJSON
gdf = gpd.read_file("../data/processed/india_dengue_state_year.geojson")

# Enforce numeric types
for col in ["cases", "year"]:
    gdf[col] = pd.to_numeric(gdf[col], errors="coerce")

# Sort correctly
gdf = gdf.sort_values(by=["state", "year"]).reset_index(drop=True)

# Recreate temporal lags (safe redundancy)
gdf["cases_lag1"] = gdf.groupby("state")["cases"].shift(1)
gdf["cases_lag2"] = gdf.groupby("state")["cases"].shift(2)

# Load adjacency & compute spatial lag again (ensures reproducibility)
adj_df = pd.read_csv("../data/processed/state_adjacency.csv")

lag_lookup = gdf[["state", "year", "cases_lag1"]].copy()

def compute_spatial_lag(row):
    neighbors = adj_df[adj_df["state"] == row["state"]]["neighbor"].tolist()
    target_year = row["year"] - 1
    
    values = lag_lookup[
        (lag_lookup["state"].isin(neighbors)) &
        (lag_lookup["year"] == target_year)
    ]["cases_lag1"]
    
    return values.mean() if not values.empty else None

gdf["spatial_lag_cases"] = gdf.apply(compute_spatial_lag, axis=1)

gdf.head()


Unnamed: 0,GID_1,GID_0,COUNTRY,state_std,VARNAME_1,NL_NAME_1,TYPE_1,ENGTYPE_1,CC_1,HASC_1,ISO_1,state,year,cases,deaths,geometry,cases_lag1,cases_lag2,spatial_lag_cases
0,IND.4_1,IND,India,Assam,,,State,State,,IN.AS,IN-AS,Assam,2019.0,196.0,0,"MULTIPOLYGON (((92.7545 24.5011, 92.7436 24.47...",,,
1,IND.4_1,IND,India,Assam,,,State,State,,IN.AS,IN-AS,Assam,2020.0,33.0,0,"MULTIPOLYGON (((92.7545 24.5011, 92.7436 24.47...",196.0,,
2,IND.4_1,IND,India,Assam,,,State,State,,IN.AS,IN-AS,Assam,2021.0,103.0,0,"MULTIPOLYGON (((92.7545 24.5011, 92.7436 24.47...",33.0,196.0,200.5
3,IND.4_1,IND,India,Assam,,,State,State,,IN.AS,IN-AS,Assam,2022.0,1826.0,2,"MULTIPOLYGON (((92.7545 24.5011, 92.7436 24.47...",103.0,33.0,52.0
4,IND.4_1,IND,India,Assam,,,State,State,,IN.AS,IN-AS,Assam,2023.0,8208.0,7,"MULTIPOLYGON (((92.7545 24.5011, 92.7436 24.47...",1826.0,103.0,143.0


In [2]:
# Create risk labels using quantiles
gdf["risk_label"] = pd.qcut(
    gdf["cases"],
    q=3,
    labels=["Low", "Medium", "High"]
)

gdf[["cases", "risk_label"]].head()


Unnamed: 0,cases,risk_label
0,196.0,Low
1,33.0,Low
2,103.0,Low
3,1826.0,Medium
4,8208.0,High


In [3]:
features = [
    "cases_lag1",
    "cases_lag2",
    "spatial_lag_cases"
]

target = "risk_label"

ml_df = gdf[["state", "year"] + features + [target]].copy()
ml_df.head()


Unnamed: 0,state,year,cases_lag1,cases_lag2,spatial_lag_cases,risk_label
0,Assam,2019.0,,,,Low
1,Assam,2020.0,196.0,,,Low
2,Assam,2021.0,33.0,196.0,200.5,Low
3,Assam,2022.0,103.0,33.0,52.0,Medium
4,Assam,2023.0,1826.0,103.0,143.0,High


In [4]:
ml_df.isna().sum()


state                 19
year                  19
cases_lag1            41
cases_lag2            63
spatial_lag_cases    107
risk_label            19
dtype: int64

In [5]:
ml_df_clean = ml_df.dropna().reset_index(drop=True)

print("Before:", ml_df.shape)
print("After:", ml_df_clean.shape)


Before: (151, 6)
After: (44, 6)


In [6]:
ml_df_clean.describe(include="all")
ml_df_clean["risk_label"].value_counts()


risk_label
High      23
Medium    13
Low        8
Name: count, dtype: int64

In [7]:
ml_df_clean.to_csv(
    "../data/processed/final_ml_dataset.csv",
    index=False
)
