# create train and test set

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import gc

In [2]:
def process_ecv(outbreaks_data, evc_data, ecv):
    outbreaks_ecv = gpd.sjoin(evc_data, outbreaks_data[['district', 'geometry']], how='inner', op='within').reset_index(drop=True)
    outbreaks_ecv = outbreaks_ecv.drop('index_right', axis=1)
    outbreaks_ecv = outbreaks_ecv.groupby(['district', 'year', 'month']).aggregate('mean').reset_index()
    outbreaks_ecv[ecv+'_lag_1'] = outbreaks_ecv.groupby(['district'])[ecv].shift(1)
    outbreaks_ecv[ecv+'_lag_2'] = outbreaks_ecv.groupby(['district'])[ecv].shift(2)
    outbreaks_ecv = outbreaks_ecv.dropna()
    return outbreaks_ecv

## outbreaks

In [3]:
outbreaks = gpd.read_file('../data/cholera_outbreaks/monthly_cholera_outbreaks_per_district_2010_2015.shp')

In [4]:
outbreaks.crs

{'init': 'epsg:4326'}

In [5]:
outbreaks.shape

(329, 6)

In [6]:
outbreaks.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 329 entries, 0 to 328
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   state     329 non-null    object  
 1   district  329 non-null    object  
 2   year      329 non-null    int64   
 3   month     329 non-null    int64   
 4   outbreak  329 non-null    int64   
 5   geometry  329 non-null    geometry
dtypes: geometry(1), int64(3), object(2)
memory usage: 15.5+ KB


In [7]:
outbreaks.head()

Unnamed: 0,state,district,year,month,outbreak,geometry
0,maharashtra,solapur,2010,2,1,"POLYGON ((74.90098 17.23968, 74.90394 17.24113..."
1,gujarat,panch mahals,2010,5,1,"POLYGON ((73.73135 22.28985, 73.72839 22.28884..."
2,tamil nadu,salem,2010,5,1,"POLYGON ((78.22656 11.90686, 78.22643 11.90152..."
3,tamil nadu,theni,2010,5,1,"POLYGON ((77.34090 9.59505, 77.32851 9.57323, ..."
4,tamil nadu,theni,2012,1,1,"POLYGON ((77.34090 9.59505, 77.32851 9.57323, ..."


In [8]:
outbreaks['geometry'] = outbreaks['geometry'].buffer(1) # which crs (e.g. 4326 and buffer 1 degree or 3395 and buffer 111000 meters)?

In [9]:
outbreaks.head()

Unnamed: 0,state,district,year,month,outbreak,geometry
0,maharashtra,solapur,2010,2,1,"POLYGON ((76.33869 16.31147, 76.27006 16.30167..."
1,gujarat,panch mahals,2010,5,1,"POLYGON ((72.37187 22.62004, 72.36983 22.62965..."
2,tamil nadu,salem,2010,5,1,"POLYGON ((78.96695 10.38823, 78.94716 10.37893..."
3,tamil nadu,theni,2010,5,1,"POLYGON ((77.85536 8.62118, 77.82449 8.60658, ..."
4,tamil nadu,theni,2012,1,1,"POLYGON ((77.85536 8.62118, 77.82449 8.60658, ..."


## spatial joins, district level and feature creation

### sea surface salinity

In [10]:
sss = gpd.read_file('../data/sea_surface_salinity/monthly_sss_2010_2015.shp')

In [11]:
sss.crs

{'init': 'epsg:4326'}

In [12]:
sss.shape

(908712, 4)

In [13]:
sss.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 908712 entries, 0 to 908711
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   year      908712 non-null  int64   
 1   month     908712 non-null  int64   
 2   sss       908712 non-null  float64 
 3   geometry  908712 non-null  geometry
dtypes: float64(1), geometry(1), int64(2)
memory usage: 27.7 MB


In [14]:
sss.head()

Unnamed: 0,year,month,sss,geometry
0,2010,1,35.265869,POINT (60.04323 0.09808)
1,2010,2,35.191532,POINT (60.04323 0.09808)
2,2010,3,35.426231,POINT (60.04323 0.09808)
3,2010,4,35.37043,POINT (60.04323 0.09808)
4,2010,5,35.31086,POINT (60.04323 0.09808)


In [15]:
%%time

sss_district_level = process_ecv(outbreaks, sss, 'sss')

CPU times: user 1min 8s, sys: 196 ms, total: 1min 8s
Wall time: 1min 9s


In [16]:
sss_district_level.shape

(2940, 6)

In [17]:
sss_district_level.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 2940 entries, 2 to 3023
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   district   2940 non-null   object 
 1   year       2940 non-null   int64  
 2   month      2940 non-null   int64  
 3   sss        2940 non-null   float64
 4   sss_lag_1  2940 non-null   float64
 5   sss_lag_2  2940 non-null   float64
dtypes: float64(3), int64(2), object(1)
memory usage: 160.8+ KB


In [18]:
sss_district_level.head()

Unnamed: 0,district,year,month,sss,sss_lag_1,sss_lag_2
2,belgaum,2010,3,34.0274,34.600696,35.335807
3,belgaum,2010,4,34.299124,34.0274,34.600696
4,belgaum,2010,5,34.924089,34.299124,34.0274
5,belgaum,2010,6,35.246572,34.924089,34.299124
6,belgaum,2010,7,35.28239,35.246572,34.924089


In [19]:
del sss
gc.collect()

106

### chlorophyll-a concentration

In [20]:
chlor_a = gpd.read_file('../data/chlorophyll_a_concentration/monthly_chlor_a_2010.shp')

In [21]:
chlor_a = chlor_a.rename(columns={'chlor_a_lo': 'chlor_a_log'})

In [22]:
chlor_a.crs

{'init': 'epsg:4326'}

In [23]:
chlor_a.shape

(4933904, 4)

In [24]:
chlor_a.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 4933904 entries, 0 to 4933903
Data columns (total 4 columns):
 #   Column       Dtype   
---  ------       -----   
 0   year         int64   
 1   month        int64   
 2   chlor_a_log  float64 
 3   geometry     geometry
dtypes: float64(1), geometry(1), int64(2)
memory usage: 150.6 MB


In [33]:
chlor_a.head()

Unnamed: 0,year,month,chlor_a_log,geometry
0,2015,6,1.488876,POINT (64.81250 39.85417)
1,2015,6,1.643125,POINT (64.85416 39.85417)
2,2015,6,-0.685363,POINT (73.35416 39.10417)
3,2015,6,-2.577963,POINT (73.35416 39.02083)
4,2015,6,-1.006353,POINT (73.47916 39.02083)


In [26]:
chlor_a_district_level = pd.DataFrame(columns=['district', 'year', 'month', 'chlor_a_log', 'chlor_a_log_lag_1', 'chlor_a_log_lag_2'])

In [27]:
years = [2010,
         2011,
         2012,
         2013,
         2014,
         2015]

In [37]:
%%time

for year in years:
    print('Processing {}...'.format(year))
    chlor_a = gpd.read_file('../data/chlorophyll_a_concentration/monthly_chlor_a_'+str(year)+'.shp')
    chlor_a = chlor_a.rename(columns={'chlor_a_lo': 'chlor_a_log'})
    chlor_a_district_level_ = process_ecv(outbreaks, chlor_a, 'chlor_a_log')
    chlor_a_district_level = chlor_a_district_level.append(chlor_a_district_level_)

Processing 2010...
Processing 2011...
Processing 2012...
Processing 2013...
Processing 2014...
Processing 2015...
CPU times: user 50min 1s, sys: 11.6 s, total: 50min 12s
Wall time: 50min 20s


In [38]:
chlor_a_district_level.shape

(3569, 6)

In [39]:
chlor_a_district_level.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3569 entries, 2 to 670
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   district           3569 non-null   object 
 1   year               3569 non-null   object 
 2   month              3569 non-null   object 
 3   chlor_a_log        3569 non-null   float64
 4   chlor_a_log_lag_1  3569 non-null   float64
 5   chlor_a_log_lag_2  3569 non-null   float64
dtypes: float64(3), object(3)
memory usage: 195.2+ KB


In [40]:
chlor_a_district_level.head()

Unnamed: 0,district,year,month,chlor_a_log,chlor_a_log_lag_1,chlor_a_log_lag_2
2,ahmadabad,2010,3,2.263289,2.25506,2.401536
3,ahmadabad,2010,4,2.283045,2.263289,2.25506
4,ahmadabad,2010,5,2.110295,2.283045,2.263289
5,ahmadabad,2010,6,2.239745,2.110295,2.283045
6,ahmadabad,2010,7,2.174889,2.239745,2.110295


In [41]:
del [chlor_a, chlor_a_district_level_]
gc.collect()

157

## join features and target

In [90]:
train_test = pd.merge(sss_district_level, outbreaks[['district', 'year', 'month', 'outbreak']], how='left', on=['district', 'year', 'month'])

In [91]:
train_test.shape

(2940, 7)

In [92]:
train_test = pd.merge(train_test, chlor_a_district_level, how='inner', on=['district', 'year', 'month'])

In [93]:
train_test.shape

(2396, 10)

In [94]:
train_test.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 2396 entries, 0 to 2395
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   district           2396 non-null   object 
 1   year               2396 non-null   object 
 2   month              2396 non-null   object 
 3   sss                2396 non-null   float64
 4   sss_lag_1          2396 non-null   float64
 5   sss_lag_2          2396 non-null   float64
 6   outbreak           77 non-null     float64
 7   chlor_a_log        2396 non-null   float64
 8   chlor_a_log_lag_1  2396 non-null   float64
 9   chlor_a_log_lag_2  2396 non-null   float64
dtypes: float64(7), object(3)
memory usage: 205.9+ KB


In [95]:
train_test.head()

Unnamed: 0,district,year,month,sss,sss_lag_1,sss_lag_2,outbreak,chlor_a_log,chlor_a_log_lag_1,chlor_a_log_lag_2
0,belgaum,2010,3,34.0274,34.600696,35.335807,,-0.476916,-0.444519,-0.043873
1,belgaum,2010,4,34.299124,34.0274,34.600696,,-0.544771,-0.476916,-0.444519
2,belgaum,2010,5,34.924089,34.299124,34.0274,,-0.459166,-0.544771,-0.476916
3,belgaum,2010,6,35.246572,34.924089,34.299124,,-0.082965,-0.459166,-0.544771
4,belgaum,2010,7,35.28239,35.246572,34.924089,,0.288466,-0.082965,-0.459166


In [96]:
train_test['outbreak'] = train_test.outbreak.fillna(0)

In [97]:
train_test.outbreak.value_counts()

0.0    2319
1.0      77
Name: outbreak, dtype: int64

## map month to season

In [98]:
seasons = {1: 'winter',
           2: 'winter',
           3: 'pre_monsoon',
           4: 'pre_monsoon',
           5: 'pre_monsoon',
           6: 'monsoon',
           7: 'monsoon',
           8: 'monsoon',
           9: 'monsoon',
           10: 'post_monsoon',
           11: 'post_monsoon',
           12: 'post_monsoon'}

In [99]:
train_test['season'] = train_test.month.map(seasons)

In [100]:
dummies = pd.get_dummies(train_test.season, prefix='season', prefix_sep='_')

In [101]:
train_test = pd.concat([train_test, dummies], axis=1).drop('season', axis=1)

In [102]:
train_test.head()

Unnamed: 0,district,year,month,sss,sss_lag_1,sss_lag_2,outbreak,chlor_a_log,chlor_a_log_lag_1,chlor_a_log_lag_2,season_monsoon,season_post_monsoon,season_pre_monsoon
0,belgaum,2010,3,34.0274,34.600696,35.335807,0.0,-0.476916,-0.444519,-0.043873,0,0,1
1,belgaum,2010,4,34.299124,34.0274,34.600696,0.0,-0.544771,-0.476916,-0.444519,0,0,1
2,belgaum,2010,5,34.924089,34.299124,34.0274,0.0,-0.459166,-0.544771,-0.476916,0,0,1
3,belgaum,2010,6,35.246572,34.924089,34.299124,0.0,-0.082965,-0.459166,-0.544771,1,0,0
4,belgaum,2010,7,35.28239,35.246572,34.924089,0.0,0.288466,-0.082965,-0.459166,1,0,0


## map district to location

In [103]:
# find centroid of district
# check whether centroid is east or west of certain longitude
# assign east or west

## save final dataframe

In [104]:
train_test.to_pickle('../data/train_test.pkl.gz', compression='gzip')