## Load and prepare data for regression analysis

In [None]:
import pandas as pd
import geopandas as gpd

In [None]:
# --- Load the three district-level input tables ---------------------------

# Forest cover / forest loss per district.
df_forest = pd.read_csv('Data/district_forest_cover.csv')

# Growing-period SPEI; its 'season_year' column is renamed to 'year' so it
# carries the same key name as the other tables.
df_spei = (
    pd.read_csv('Data/district_province_spei_growingperiod.csv')
    .rename(columns={'season_year': 'year'})
)

# Population data, with the 'Unnamed: 0' column removed.
df_pop = (
    pd.read_csv('Data/district_pop_growth.csv')
    .drop(columns=['Unnamed: 0'])
)

# Normalise district names in every table (trim surrounding whitespace,
# Title Case) so that the same district spells the same everywhere.
for df in (df_forest, df_pop, df_spei):
    normalised_names = df['district'].str.strip().str.title()
    df['district'] = normalised_names

# --- Population density ----------------------------------------------------

# Read the district polygons, declare their coordinates as EPSG:4326, then
# project to EPSG:32735 (a metric CRS) so that areas come out in square
# metres.
districts_gdf = (
    gpd.read_file('Data/district.shp')
    .set_crs(epsg=4326)
    .to_crs(epsg=32735)
)

# District area: m² -> km².
districts_gdf['district_area_km2'] = districts_gdf['geometry'].area.div(1e6)

# Put district names into the same trimmed, Title Case form on both sides of
# the join below (the shapefile stores them in NAME_2).
districts_gdf['district'] = districts_gdf['NAME_2'].str.strip().str.title()
df_pop['district'] = df_pop['district'].str.strip().str.title()

# Keep only the name and area columns; the geometry is not needed any more.
area_df = districts_gdf[['district', 'district_area_km2']]

# Attach each district's area to its population rows; the left join keeps
# every population row even when no matching polygon exists.
df_pop = pd.merge(df_pop, area_df, on='district', how='left')

# People per square kilometre.
df_pop['pop_density'] = df_pop['population'].div(df_pop['district_area_km2'])

# Combine forest, population and SPEI tables on (district, year).
# Outer joins keep district-years present in only some of the tables; the
# columns coming from the missing tables are NaN for those rows.
merged = (df_forest
          .merge(df_pop,  on=['district', 'year'], how='outer')
          .merge(df_spei, on=['district', 'year'], how='outer'))

# calculate percent forest cover in 2000
# forest_2000_m2 is in square metres (per its column name) while
# district_area_km2 is in square kilometres, so convert the forest area to
# km² before taking the ratio; otherwise the "percent" is 1e6 times too large.
merged['percent_forest_2000'] = (
    (merged['forest_2000_m2'] / 1e6) / merged['district_area_km2']
) * 100

# compute SPEI lags (1 to 5 rows back within each district)
# NOTE(review): shift() moves values by row position, so lag k equals
# "k years earlier" only if every district has exactly one row per
# consecutive year after the outer merges -- confirm there are no gaps or
# duplicate (district, year) rows.
merged = merged.sort_values(['district', 'year'])
for lag in range(1, 6):
    merged[f'mean_spei_lag{lag}'] = (
        merged.groupby('district')['mean_growing_spei'].shift(lag)
    )

# Columns that must all be non-missing for a row to be kept.
model_vars = [
    'percent_loss_annual',
    'mean_spei_lag4',
    'pop_density',
    'percent_forest_2000',
]

# Keep rows from 2001-2020 (both ends inclusive) with no missing model
# variable.
in_study_years = merged['year'].between(2001, 2020)
has_all_model_vars = ~merged[model_vars].isna().any(axis=1)
cleaned = merged.loc[in_study_years & has_all_model_vars].copy()

# Population density centred on the sample mean, then divided by 100.
density_centered = cleaned['pop_density'].sub(cleaned['pop_density'].mean())
cleaned['pop_density_c'] = density_centered
cleaned['pop_density_c_scaled'] = density_centered.div(100)

# Within-district centring: subtract each district's own mean so that only
# the variation over time inside a district remains.
district_mean_density = (
    cleaned.groupby('district')['pop_density_c_scaled'].transform('mean')
)
cleaned['pop_density_wi'] = (
    cleaned['pop_density_c_scaled'].sub(district_mean_density)
)

# Interaction between the 4-step SPEI lag and within-district density.
cleaned['spei4_x_popdens'] = (
    cleaned['pop_density_wi'].mul(cleaned['mean_spei_lag4'])
)

# NOTE(review): this is not the last expression of the cell, so presumably
# nothing is displayed -- confirm whether a preview is wanted here.
cleaned.head()

# Write the regression table to CSV without the row index.
cleaned.to_csv('regression_predictors.csv', index=False)