In [1]:
import geopandas as gpd
import pandas as pd

# Read the state-by-year dengue layer into a GeoDataFrame.
dengue_geojson_path = "../data/processed/india_dengue_state_year.geojson"
gdf = gpd.read_file(dengue_geojson_path)

# Force the numeric columns to numeric dtypes; values that cannot be parsed
# become NaN (errors="coerce") instead of raising.
for numeric_col in ("cases", "year"):
    gdf[numeric_col] = pd.to_numeric(gdf[numeric_col], errors="coerce")

# Order rows by state, then by year within each state, and rebuild a clean
# 0..n-1 index so later positional displays line up with the sorted order.
gdf = gdf.sort_values(["state", "year"])
gdf = gdf.reset_index(drop=True)

gdf.head()


Unnamed: 0,GID_1,GID_0,COUNTRY,state_std,VARNAME_1,NL_NAME_1,TYPE_1,ENGTYPE_1,CC_1,HASC_1,ISO_1,state,year,cases,deaths,geometry
0,IND.4_1,IND,India,Assam,,,State,State,,IN.AS,IN-AS,Assam,2019.0,196.0,0,"MULTIPOLYGON (((92.7545 24.5011, 92.7436 24.47..."
1,IND.4_1,IND,India,Assam,,,State,State,,IN.AS,IN-AS,Assam,2020.0,33.0,0,"MULTIPOLYGON (((92.7545 24.5011, 92.7436 24.47..."
2,IND.4_1,IND,India,Assam,,,State,State,,IN.AS,IN-AS,Assam,2021.0,103.0,0,"MULTIPOLYGON (((92.7545 24.5011, 92.7436 24.47..."
3,IND.4_1,IND,India,Assam,,,State,State,,IN.AS,IN-AS,Assam,2022.0,1826.0,2,"MULTIPOLYGON (((92.7545 24.5011, 92.7436 24.47..."
4,IND.4_1,IND,India,Assam,,,State,State,,IN.AS,IN-AS,Assam,2023.0,8208.0,7,"MULTIPOLYGON (((92.7545 24.5011, 92.7436 24.47..."


In [2]:
# Lag 1 year cases (within each state).
#
# The lag is matched on (state, year - 1) rather than taken by row position.
# A positional groupby().shift(1) is only a true one-year lag when every state
# has exactly one row per consecutive year AND gdf is still sorted by
# state/year; with a missing year it would silently hand back an older year's
# count. The key-based join below yields NaN in that case and does not depend
# on the current row order of gdf.
lag1_source = (
    gdf[["state", "year", "cases"]]
    .dropna(subset=["state", "year"])             # rows without a full key cannot be matched
    .drop_duplicates(subset=["state", "year"])    # keep the join one-to-one (row count preserved)
    .assign(year=lambda frame: frame["year"] + 1) # year t's cases are year t+1's lag-1 value
    .rename(columns={"cases": "cases_lag1"})
)
gdf["cases_lag1"] = (
    gdf[["state", "year"]]
    .merge(lag1_source, on=["state", "year"], how="left")["cases_lag1"]
    .to_numpy()  # a left merge keeps the left row order, so assign positionally
)

gdf[["state", "year", "cases", "cases_lag1"]].head(10)


Unnamed: 0,state,year,cases,cases_lag1
0,Assam,2019.0,196.0,
1,Assam,2020.0,33.0,196.0
2,Assam,2021.0,103.0,33.0
3,Assam,2022.0,1826.0,103.0
4,Assam,2023.0,8208.0,1826.0
5,Assam,2024.0,270.0,8208.0
6,Bihar,2019.0,6712.0,
7,Bihar,2020.0,493.0,6712.0
8,Bihar,2021.0,633.0,493.0
9,Bihar,2022.0,13972.0,633.0


In [3]:
# Lag 2 year cases (within each state).
#
# Matched on (state, year - 2) instead of groupby().shift(2). A positional
# shift counts rows, not years: with years 2019, 2020, 2022 the row two
# positions before 2022 is 2019, while the real two-year lag (2020) sits only
# one position back. Joining on the key returns the right row when it exists
# and NaN when that year is absent, regardless of the row order of gdf.
lag2_source = (
    gdf[["state", "year", "cases"]]
    .dropna(subset=["state", "year"])             # rows without a full key cannot be matched
    .drop_duplicates(subset=["state", "year"])    # keep the join one-to-one (row count preserved)
    .assign(year=lambda frame: frame["year"] + 2) # year t's cases are year t+2's lag-2 value
    .rename(columns={"cases": "cases_lag2"})
)
gdf["cases_lag2"] = (
    gdf[["state", "year"]]
    .merge(lag2_source, on=["state", "year"], how="left")["cases_lag2"]
    .to_numpy()  # a left merge keeps the left row order, so assign positionally
)

gdf[["state", "year", "cases", "cases_lag1", "cases_lag2"]].head(12)


Unnamed: 0,state,year,cases,cases_lag1,cases_lag2
0,Assam,2019.0,196.0,,
1,Assam,2020.0,33.0,196.0,
2,Assam,2021.0,103.0,33.0,196.0
3,Assam,2022.0,1826.0,103.0,33.0
4,Assam,2023.0,8208.0,1826.0,103.0
5,Assam,2024.0,270.0,8208.0,1826.0
6,Bihar,2019.0,6712.0,,
7,Bihar,2020.0,493.0,6712.0,
8,Bihar,2021.0,633.0,493.0,6712.0
9,Bihar,2022.0,13972.0,633.0,493.0


In [4]:
# Spot-check one state: show Bihar's yearly cases next to both lag columns.
is_bihar = gdf["state"] == "Bihar"
lag_view_cols = ["year", "cases", "cases_lag1", "cases_lag2"]
gdf.loc[is_bihar, lag_view_cols]


Unnamed: 0,year,cases,cases_lag1,cases_lag2
6,2019.0,6712.0,,
7,2020.0,493.0,6712.0,
8,2021.0,633.0,493.0,6712.0
9,2022.0,13972.0,633.0,493.0
10,2023.0,20224.0,13972.0,633.0
11,2024.0,7.0,20224.0,13972.0


In [5]:
# Count how many rows have no value in each lag column.
lag_cols = ["cases_lag1", "cases_lag2"]
gdf.loc[:, lag_cols].isna().sum()


cases_lag1    41
cases_lag2    63
dtype: int64