# preprocess cholera outbreaks

In [1]:
import os
import numpy as np
import pandas as pd
import PyPDF2
import re
import geopandas as gpd
import requests
import zipfile
import io

## download cholera outbreaks data

In [2]:
!python download_cholera_outbreaks_data.py

Download cholera outbreaks data...
Processing 2010 with 53 files...
Processing 2011 with 53 files...
Processing 2012 with 52 files...
Processing 2013 with 52 files...
Processing 2014 with 53 files...
Processing 2015 with 53 files...
Download cholera outbreaks data complete.


## load pdfs and extract relevant parts

In [3]:
files_to_skip = ['52nd_wk_old.pdf',
                 '11th_wk14_old.pdf']

In [4]:
years = ['2010',
         '2011',
         '2012',
         '2013',
         '2014',
         '2015']

In [5]:
path = '../data/cholera_outbreaks/'

In [6]:
%%time

outbreaks_raw = []

for year in years:
    print('Processing year: {}...'.format(year))
    (_, _, file_names) = next(os.walk(path+year))
    file_names = np.setdiff1d(file_names, files_to_skip) # drop files to skip from file names
    
    for file in file_names:
        print('Processing file: {}...'.format(file))
        document = PyPDF2.PdfFileReader(path+year+'/'+file)
        pages = document.getNumPages()
        
        for page in range(pages):
            text = document.getPage(page).extractText()
            text = text.replace('\n', '') # remove line breaks
            text = text.split(' ') # split on space
            text = [i for i in text if i != ''] # remove empty items
            text = [i.lower() for i in text] # lower case all items
    
            indices = [i for i, s in enumerate(text) if 'cholera' in s] # get indices where cholera is mentioned
        
            if len(indices) > 0:
                for index in indices:
                    cholera_index = []
                    cholera_index.append(index)
                    index_list = np.arange(index-3,index).tolist() + cholera_index + np.arange(index+1,index+5).tolist()
                    index_list = [i for i in index_list if i < len(text)] # get items that are up to 3 indices before and up to 5 indices after cholera is mentioned
                    outbreak_info = [text[i] for i in index_list]
                    while len(outbreak_info) < 9:
                        outbreak_info.append(None)
                    outbreak_info.append(year+'/'+file)
                    outbreaks_raw.append(outbreak_info)
            else:
                pass

Processing year: 2010...
Processing file: 10th_wk10.pdf...
Processing file: 11th_wk10.pdf...
Processing file: 12th_wk10.pdf...
Processing file: 13th_wk10.pdf...
Processing file: 14th_wk10.pdf...
Processing file: 15th_wk10.pdf...
Processing file: 16th_wk10.pdf...
Processing file: 17th_wk10.pdf...
Processing file: 18th_wk10.pdf...
Processing file: 19th_wk10.pdf...
Processing file: 1st_wk10.pdf...
Processing file: 20th_wk10.pdf...
Processing file: 21st_wk10.pdf...
Processing file: 22nd_wk10.pdf...
Processing file: 23rd_wk10.pdf...
Processing file: 24th_wk10.pdf...
Processing file: 25th_wk10.pdf...
Processing file: 26th_wk10.pdf...
Processing file: 27th_wk10.pdf...
Processing file: 28th_wk10.pdf...
Processing file: 29th_wk10.pdf...
Processing file: 2nd_wk10.pdf...
Processing file: 30th_wk10.pdf...
Processing file: 31st_wk10.pdf...
Processing file: 32nd_wk10.pdf...
Processing file: 33rd_wk10.pdf...
Processing file: 34th_wk10.pdf...
Processing file: 35th_wk10.pdf...
Processing file: 36th_wk1



Processing file: 32nd_wk11.pdf...
Processing file: 33rd_wk11.pdf...
Processing file: 34th_wk11.pdf...
Processing file: 35th_wk11.pdf...
Processing file: 36th_wk11.pdf...
Processing file: 37th_wk11.pdf...
Processing file: 38th_wk11.pdf...
Processing file: 39th_wk11.pdf...
Processing file: 3rd_wk11.pdf...
Processing file: 40th_wk11.pdf...
Processing file: 41st_wk11.pdf...
Processing file: 42nd_wk11.pdf...
Processing file: 43rd_wk11.pdf...
Processing file: 44th_wk11.pdf...
Processing file: 45th_wk11.pdf...
Processing file: 46th_wk11.pdf...
Processing file: 47th_wk11.pdf...
Processing file: 48th_wk11.pdf...
Processing file: 49th_wk11.pdf...
Processing file: 4th_wk11.pdf...
Processing file: 50th_wk11.pdf...
Processing file: 51st_wk11.pdf...
Processing file: 52nd_wk11_u.pdf...
Processing file: 5th_wk11.pdf...
Processing file: 6th_wk11.pdf...
Processing file: 7th_wk11.pdf...
Processing file: 8th_wk11.pdf...
Processing file: 9th_wk11.pdf...
Processing year: 2012...
Processing file: 10th_wk12.p

In [7]:
outbreaks_raw = pd.DataFrame(outbreaks_raw)

In [8]:
outbreaks_raw.shape

(1311, 10)

In [9]:
pd.set_option('display.max_columns', None)

In [10]:
outbreaks_raw.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf
1,positive,for,v.,cholera,(el,tor).,chlorination,of,,2010/10th_wk10.pdf
2,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf
3,positive,for,v.,cholera,(el,tor).,chlorination,of,,2010/11th_wk10.pdf
4,tested,negative,for,cholera.,alternate,safe,drinking,water,,2010/13th_wk10.pdf


In [11]:
column_names = []

for i in range(outbreaks_raw.shape[1]):
    column_names.append('col'+str(i))

outbreaks_raw.columns = column_names

In [12]:
outbreaks_raw.columns

Index(['col0', 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
       'col9'],
      dtype='object')

In [13]:
outbreaks_raw.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf
1,positive,for,v.,cholera,(el,tor).,chlorination,of,,2010/10th_wk10.pdf
2,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf
3,positive,for,v.,cholera,(el,tor).,chlorination,of,,2010/11th_wk10.pdf
4,tested,negative,for,cholera.,alternate,safe,drinking,water,,2010/13th_wk10.pdf


In [14]:
for i in range(outbreaks_raw.shape[1]):
    print('Column {} contains cholera: {}'.format(i, 'cholera' in outbreaks_raw.iloc[:,i].unique()))

Column 0 contains cholera: True
Column 1 contains cholera: False
Column 2 contains cholera: False
Column 3 contains cholera: True
Column 4 contains cholera: True
Column 5 contains cholera: False
Column 6 contains cholera: False
Column 7 contains cholera: False
Column 8 contains cholera: False
Column 9 contains cholera: False


In [15]:
outbreaks_raw[outbreaks_raw.col0 == 'cholera']

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9
263,cholera,culture,(v.,cholerae,ogawa).,health,camp,conducted,,2011/36th_wk11.pdf


In [16]:
outbreaks_raw[outbreaks_raw.col3 == 'cholera']

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf
1,positive,for,v.,cholera,(el,tor).,chlorination,of,,2010/10th_wk10.pdf
2,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf
3,positive,for,v.,cholera,(el,tor).,chlorination,of,,2010/11th_wk10.pdf
6,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf
...,...,...,...,...,...,...,...,...,...,...
1299,be,negative,for,cholera,culture.01,water,sample,was,,2015/50th_wk15.pdf
1303,rajasthan,jaipur,xviii.,cholera,07,00,14-12-15,22-12-15,,2015/51st_wk15.pdf
1307,given.,kurnool,ii.,cholera,22,0,18/02/15,19/02/15,,2015/9th_wk15.pdf
1308,gujarat,ahmedabad,vii.,cholera,24,1,24/02/15,24/02/15,,2015/9th_wk15.pdf


In [17]:
outbreaks_raw[outbreaks_raw.col4 == 'cholera']

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9
52,positive,for,vibrio,cholerae.,cholera,cases,occurred,due,,2010/29th_wk10.pdf


## clean main cholera column

In [18]:
outbreaks_raw.col3.value_counts()

cholera                                                                                 848
cholera.                                                                                160
cholerae                                                                                 85
cholerae.                                                                                69
v.cholerae                                                                               35
cholera?                                                                                 32
hepatitisdenguecholerachikungunyamalariano.                                              14
hepatitisdenguecholerachikungunyamalariaacute                                             9
v.cholera                                                                                 7
v.cholerae.                                                                               7
cholera??                                                                       

In [19]:
outbreaks_raw['col3_clean'] = outbreaks_raw.col3.apply(lambda x: 'cholera' if 'cholera' in x else x)

In [20]:
outbreaks_raw.col3_clean.value_counts()

cholera    1311
Name: col3_clean, dtype: int64

## filter outbreaks

In [21]:
# next to where a cholera outbreak is mentioned there should be a number indicating the cases
outbreaks_raw['outbreak'] = outbreaks_raw.col4.apply(lambda x: 0 if re.search('[a-zA-Z]', str(x)) else 1)

In [22]:
outbreaks = outbreaks_raw[outbreaks_raw.outbreak == 1].copy().reset_index(drop=True)

In [23]:
outbreaks.shape

(469, 12)

In [24]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1


In [25]:
outbreaks.tail()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak
464,rajasthan,jaipur,xviii.,cholera,7,00,14-12-15,22-12-15,,2015/51st_wk15.pdf,cholera,1
465,positive,for,v.,cholerae.,1,water,sample,collected;,,2015/51st_wk15.pdf,cholera,1
466,given.,kurnool,ii.,cholera,22,0,18/02/15,19/02/15,,2015/9th_wk15.pdf,cholera,1
467,gujarat,ahmedabad,vii.,cholera,24,1,24/02/15,24/02/15,,2015/9th_wk15.pdf,cholera,1
468,telangana,mahabubnagar,xxi.,cholera,31,0,27/02/15,27/02/15,,2015/9th_wk15.pdf,cholera,1


## extract and clean start dates

In [26]:
# next to where cholera outbreak cases are mentioned there should be a date
for col in outbreaks.columns[5:8]:
    outbreaks[col+'_clean'] = outbreaks[col].apply(lambda x: None if re.search('[a-zA-Z]', str(x)) is not None
                                                   else (None if len(str(x)) < 4
                                                         else x.replace('(', '').replace(')', '').replace('*', '')))

In [27]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1,,,28.02.10
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1,28.02.10,,
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1,,,03.05.10
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10


In [28]:
start_dates = []

for i, row in outbreaks.iterrows():
    dates = row[12:]
    start_date = next((i for i in dates if i is not None), None)
    start_dates.append(start_date)

In [29]:
outbreaks['start_date'] = start_dates

In [30]:
outbreaks.start_date.unique()

array(['28.02.10', '12.05.10', '03.05.10', '23.04.10', '28.12.09',
       '10.05.10', '27.05.10', '30.05.10', '04.06.10', '15.06.10',
       '22.06.10', '03.07.10', '24.06.10', '29.06.10', '27.06.10',
       '13.07.10', '12.07.10', '16.07.10', '20.07.10', '06.07.10',
       '29.07.10', '17.08.10', '20.08.10', '18.08.10', '16.08.10',
       '06.08.10', '10.09.10', None, '06.09.10', '21.09.10', '24.09.10',
       '22.10.10', '03.10.10', '24.10.10', '04.11.10', '29.11.10',
       '11.12.10', '19.12.10', '24.01.10', '03.03.11', '08.03.11',
       '20.03.11', '07.04.11', '20.04.11', '18.04.11', '23.04.11',
       '24.04.11', '04.05.11', '08.05.11', '18.05.11', '24.05.11',
       '02.06.11', '06.06.11', '14.06.11', '20.06.11', '27.06.11',
       '26.06.11', '03.07.11', '21.06.11', '08.07.11', '11.07.11',
       '23.07.11', '03.01.11', '22.07.11', '13.07.11', '14.08.11',
       '04.08.11', '11.08.11', '22.08.11', '18.08.11', '20.08.11',
       '16.08.11', '21.08.11', '01.08.11', '03.09.11', '

In [31]:
outbreaks = outbreaks[outbreaks.start_date.notnull()].copy().reset_index(drop=True)

In [32]:
outbreaks['start_date_year'] = outbreaks.start_date.apply(lambda x: x.split('.')[2] if '.' in x
                                                          else (x.split('/')[2] if '/' in x
                                                                else (x.split('-')[2] if '-' in x else x)))

In [33]:
outbreaks['start_date_year'] = outbreaks.start_date_year.str.pad(3, side='left',fillchar='0')
outbreaks['start_date_year'] = outbreaks.start_date_year.str.pad(4, side='left',fillchar='2')

In [34]:
outbreaks['start_date_year'].unique()

array(['2010', '2009', '2011', '2012', '2013', '2014', '2015', '2019'],
      dtype=object)

In [35]:
outbreaks['start_date_month'] = outbreaks.start_date.apply(lambda x: x.split('.')[1] if '.' in x
                                                           else (x.split('/')[1] if '/' in x
                                                                 else (x.split('-')[1] if '-' in x else x)))

In [36]:
outbreaks['start_date_month'] = outbreaks.start_date_month.str.pad(2, side='left',fillchar='0')

In [37]:
outbreaks['start_date_month'].unique()

array(['02', '05', '04', '12', '06', '07', '08', '09', '10', '11', '01',
       '03'], dtype=object)

In [38]:
outbreaks.shape

(400, 18)

In [39]:
outbreaks = outbreaks[outbreaks.start_date_year.isin(years)].copy().reset_index(drop=True)

In [40]:
outbreaks.shape

(398, 18)

In [41]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,start_date_year,start_date_month
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1,,,28.02.10,28.02.10,2010,2
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1,28.02.10,,,28.02.10,2010,2
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1,,,03.05.10,03.05.10,2010,5
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5


In [42]:
outbreaks.tail()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,start_date_year,start_date_month
393,bengal,purulia,xv.,cholera,31,0,29-11-15,30-11-15,,2015/49th_wk15.pdf,cholera,1,,29-11-15,30-11-15,29-11-15,2015,11
394,rajasthan,jaipur,xviii.,cholera,7,0,14-12-15,22-12-15,,2015/51st_wk15.pdf,cholera,1,,14-12-15,22-12-15,14-12-15,2015,12
395,given.,kurnool,ii.,cholera,22,0,18/02/15,19/02/15,,2015/9th_wk15.pdf,cholera,1,,18/02/15,19/02/15,18/02/15,2015,2
396,gujarat,ahmedabad,vii.,cholera,24,1,24/02/15,24/02/15,,2015/9th_wk15.pdf,cholera,1,,24/02/15,24/02/15,24/02/15,2015,2
397,telangana,mahabubnagar,xxi.,cholera,31,0,27/02/15,27/02/15,,2015/9th_wk15.pdf,cholera,1,,27/02/15,27/02/15,27/02/15,2015,2


## map start date month to season

In [43]:
seasons = {'01': 'winter',
           '02': 'winter',
           '03': 'pre_monsoon',
           '04': 'pre_monsoon',
           '05': 'pre_monsoon',
           '06': 'monsoon',
           '07': 'monsoon',
           '08': 'monsoon',
           '09': 'monsoon',
           '10': 'post_monsoon',
           '11': 'post_monsoon',
           '12': 'post_monsoon'}

In [44]:
outbreaks['season'] = outbreaks.start_date_month.map(seasons)

In [45]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,start_date_year,start_date_month,season
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1,,,28.02.10,28.02.10,2010,2,winter
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1,28.02.10,,,28.02.10,2010,2,winter
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,pre_monsoon
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1,,,03.05.10,03.05.10,2010,5,pre_monsoon
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,pre_monsoon


In [46]:
outbreaks.tail()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,start_date_year,start_date_month,season
393,bengal,purulia,xv.,cholera,31,0,29-11-15,30-11-15,,2015/49th_wk15.pdf,cholera,1,,29-11-15,30-11-15,29-11-15,2015,11,post_monsoon
394,rajasthan,jaipur,xviii.,cholera,7,0,14-12-15,22-12-15,,2015/51st_wk15.pdf,cholera,1,,14-12-15,22-12-15,14-12-15,2015,12,post_monsoon
395,given.,kurnool,ii.,cholera,22,0,18/02/15,19/02/15,,2015/9th_wk15.pdf,cholera,1,,18/02/15,19/02/15,18/02/15,2015,2,winter
396,gujarat,ahmedabad,vii.,cholera,24,1,24/02/15,24/02/15,,2015/9th_wk15.pdf,cholera,1,,24/02/15,24/02/15,24/02/15,2015,2,winter
397,telangana,mahabubnagar,xxi.,cholera,31,0,27/02/15,27/02/15,,2015/9th_wk15.pdf,cholera,1,,27/02/15,27/02/15,27/02/15,2015,2,winter


## map states, districts and location (geometry)

In [47]:
!wget --mirror --continue --no-host-directories https://biogeo.ucdavis.edu/data/gadm3.6/shp/gadm36_IND_shp.zip --directory-prefix=../data/cholera_outbreaks

--2021-02-15 22:07:01--  https://biogeo.ucdavis.edu/data/gadm3.6/shp/gadm36_IND_shp.zip
Resolving biogeo.ucdavis.edu (biogeo.ucdavis.edu)... 128.120.228.172
Connecting to biogeo.ucdavis.edu (biogeo.ucdavis.edu)|128.120.228.172|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://data.biogeo.ucdavis.edu/data/gadm3.6/shp/gadm36_IND_shp.zip [following]
--2021-02-15 22:07:04--  https://data.biogeo.ucdavis.edu/data/gadm3.6/shp/gadm36_IND_shp.zip
Resolving data.biogeo.ucdavis.edu (data.biogeo.ucdavis.edu)... 128.120.228.172
Connecting to data.biogeo.ucdavis.edu (data.biogeo.ucdavis.edu)|128.120.228.172|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17520295 (17M) [application/zip]
Saving to: ‘../data/cholera_outbreaks/data/gadm3.6/shp/gadm36_IND_shp.zip’


2021-02-15 22:07:35 (589 KB/s) - ‘../data/cholera_outbreaks/data/gadm3.6/shp/gadm36_IND_shp.zip’ saved [17520295/17520295]

FINISHED --2021-02-15 22:07:35--
Total

In [48]:
!mv ../data/cholera_outbreaks/data/gadm3.6/shp/* ../data/cholera_outbreaks

In [49]:
!rm -rf ../data/cholera_outbreaks/data

In [50]:
!unzip -d ../data/cholera_outbreaks gadm36_IND_shp.zip

unzip:  cannot find or open gadm36_IND_shp.zip, gadm36_IND_shp.zip.zip or gadm36_IND_shp.zip.ZIP.


In [51]:
path = '../data/cholera_outbreaks/gadm36_IND_shp/'

In [52]:
file = 'gadm36_IND_2.shp'

In [53]:
india = gpd.read_file(path+file)

In [54]:
india.shape

(666, 14)

In [55]:
india.head()

Unnamed: 0,GID_0,NAME_0,GID_1,NAME_1,NL_NAME_1,GID_2,NAME_2,VARNAME_2,NL_NAME_2,TYPE_2,ENGTYPE_2,CC_2,HASC_2,geometry
0,IND,India,IND.1_1,Andaman and Nicobar,,IND.1.1_1,Nicobar Islands,,,District,District,,IN.AN.NI,"MULTIPOLYGON (((93.78773 6.85264, 93.78849 6.8..."
1,IND,India,IND.1_1,Andaman and Nicobar,,IND.1.2_1,North and Middle Andaman,,,District,District,,IN.AN.NM,"MULTIPOLYGON (((92.93898 12.22386, 92.93916 12..."
2,IND,India,IND.1_1,Andaman and Nicobar,,IND.1.3_1,South Andaman,,,District,District,,IN.AN.SA,"MULTIPOLYGON (((92.47972 10.52056, 92.47945 10..."
3,IND,India,IND.2_1,Andhra Pradesh,,IND.2.1_1,Anantapur,"Anantpur, Ananthapur",,District,District,,IN.AD.AN,"POLYGON ((77.71420 13.76079, 77.71314 13.75074..."
4,IND,India,IND.2_1,Andhra Pradesh,,IND.2.2_1,Chittoor,Chitoor|Chittor,,District,District,,IN.AD.CH,"POLYGON ((78.46293 12.63537, 78.46190 12.63228..."


In [56]:
states_districts = india[['NAME_1', 'NAME_2', 'geometry']].copy()

In [57]:
states_districts.columns = ['state', 'district', 'geometry']

In [58]:
states_districts['state'] = states_districts['state'].str.lower()
states_districts['district'] = states_districts['district'].str.lower()

In [59]:
districts = states_districts.district.unique().tolist()

In [60]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,start_date_year,start_date_month,season
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1,,,28.02.10,28.02.10,2010,2,winter
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1,28.02.10,,,28.02.10,2010,2,winter
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,pre_monsoon
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1,,,03.05.10,03.05.10,2010,5,pre_monsoon
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,pre_monsoon


In [61]:
for col in outbreaks.columns[0:3]:
    outbreaks[col+'_clean'] = outbreaks[col].str.replace('\W', '')
    outbreaks[col+'_clean'] = outbreaks[col+'_clean'].apply(lambda x: None if len(x) < 3 else x)

In [62]:
# correct district names in col1_clean
outbreaks.loc[(outbreaks.col1_clean == 'mahabubnagar'), ('col1_clean')] = 'mahbubnagar'
outbreaks.loc[(outbreaks.col1_clean == 'ahmedabad'), ('col1_clean')] = 'ahmadabad'
outbreaks.loc[(outbreaks.col1_clean == 'howrah'), ('col1_clean')] = 'haora'
outbreaks.loc[(outbreaks.col1_clean == 'hooghly'), ('col1_clean')] = 'hugli'
outbreaks.loc[(outbreaks.col1_clean == 'hoogly'), ('col1_clean')] = 'hugli'
outbreaks.loc[(outbreaks.col1_clean == 'villupuram'), ('col1_clean')] = 'viluppuram'
outbreaks.loc[(outbreaks.col1_clean == 'haridwar'), ('col1_clean')] = 'hardwar'
outbreaks.loc[(outbreaks.col1_clean == 'davangere'), ('col1_clean')] = 'davanagere'
outbreaks.loc[(outbreaks.col1_clean == 'davengere'), ('col1_clean')] = 'davanagere'
outbreaks.loc[(outbreaks.col1_clean == 'davangare'), ('col1_clean')] = 'davanagere'
outbreaks.loc[(outbreaks.col1_clean == 'tiruchirapalli'), ('col1_clean')] = 'tiruchirappalli'
outbreaks.loc[(outbreaks.col1_clean == 'darang'), ('col1_clean')] = 'darrang'
outbreaks.loc[(outbreaks.col1_clean == 'virudhunager'), ('col1_clean')] = 'virudunagar'
outbreaks.loc[(outbreaks.col1_clean == 'chikkaballapur'), ('col1_clean')] = 'chikballapura'
outbreaks.loc[(outbreaks.col1_clean == 'gondia'), ('col1_clean')] = 'gondiya'
outbreaks.loc[(outbreaks.col1_clean == 'purulia'), ('col1_clean')] = 'puruliya'
outbreaks.loc[(outbreaks.col1_clean == 'kalaburagi'), ('col1_clean')] = 'gulbarga'
outbreaks.loc[(outbreaks.col1_clean == 'kalburgi'), ('col1_clean')] = 'gulbarga'
outbreaks.loc[(outbreaks.col1_clean == 'berhampur'), ('col1_clean')] = 'ganjam'
outbreaks.loc[(outbreaks.col1_clean == 'sholapur'), ('col1_clean')] = 'solapur'
outbreaks.loc[(outbreaks.col1_clean == 'raigad'), ('col1_clean')] = 'raigarh'
outbreaks.loc[(outbreaks.col1_clean == 'panchmahal'), ('col1_clean')] = 'panch mahals'
outbreaks.loc[(outbreaks.col1_clean == 'sibsagar'), ('col1_clean')] = 'sivasagar'
outbreaks.loc[(outbreaks.col1_clean == 'sibsagar'), ('col1_clean')] = 'kabeerdham'
outbreaks.loc[(outbreaks.col1_clean == 'banaskantha'), ('col1_clean')] = 'banas kantha'
outbreaks.loc[(outbreaks.col1_clean == 'chamarajnagar'), ('col1_clean')] = 'chamrajnagar'
outbreaks.loc[(outbreaks.col1_clean == 'khargaon'), ('col1_clean')] = 'west nimar'
outbreaks.loc[(outbreaks.col1_clean == 'mysuru'), ('col1_clean')] = 'mysore'
outbreaks.loc[(outbreaks.col1_clean == 'mohali'), ('col1_clean')] = 'sahibzada ajit singh nagar'
outbreaks.loc[(outbreaks.col1_clean == 'delhi'), ('col1_clean')] = 'west'
outbreaks.loc[(outbreaks.col1_clean == 'kawardha'), ('col1_clean')] = 'kabeerdham'
outbreaks.loc[(outbreaks.col1_clean == 'chirtadurga'), ('col1_clean')] = 'chitradurga'
outbreaks.loc[(outbreaks.col1_clean == 'budgam'), ('col1_clean')] = 'badgam'
outbreaks.loc[(outbreaks.col1_clean == 'gulburga'), ('col1_clean')] = 'gulbarga'
outbreaks.loc[(outbreaks.col1_clean == 'jangir'), ('col1_clean')] = 'janjgir-champa'
outbreaks.loc[(outbreaks.col1_clean == 'kancheepuramsaidapet'), ('col1_clean')] = 'kancheepuram'

# correct district names in col2_clean
outbreaks.loc[(outbreaks.col2_clean == 'thiruvannamalai'), ('col2_clean')] = 'tiruvannamalai'
outbreaks.loc[(outbreaks.col2_clean == 'raigad'), ('col2_clean')] = 'raigarh'
outbreaks.loc[(outbreaks.col2_clean == 'davangere'), ('col2_clean')] = 'davanagere'
outbreaks.loc[(outbreaks.col2_clean == 'sibsagar'), ('col2_clean')] = 'sivasagar'
outbreaks.loc[(outbreaks.col2_clean == 'sabarkantha'), ('col2_clean')] = 'sabar kantha'

In [63]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,start_date_year,start_date_month,season,col0_clean,col1_clean,col2_clean
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1,,,28.02.10,28.02.10,2010,2,winter,hygiene,solapur,viii
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1,28.02.10,,,28.02.10,2010,2,winter,,maharashtra,solapur
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,pre_monsoon,gujarat,panch mahals,
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1,,,03.05.10,03.05.10,2010,5,pre_monsoon,practices,salem,xviii
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,pre_monsoon,nadu,theni,xix


In [64]:
outbreaks.columns[18:]

Index(['season', 'col0_clean', 'col1_clean', 'col2_clean'], dtype='object')

In [65]:
for col in outbreaks.columns[18:]:
    outbreaks[col+'_district'] = outbreaks[col].apply(lambda x: None if x is None else next((d for d in districts if x in str(d)), None))

In [66]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,start_date_year,start_date_month,season,col0_clean,col1_clean,col2_clean,season_district,col0_clean_district,col1_clean_district,col2_clean_district
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1,,,28.02.10,28.02.10,2010,2,winter,hygiene,solapur,viii,,,solapur,
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1,28.02.10,,,28.02.10,2010,2,winter,,maharashtra,solapur,,,,solapur
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,pre_monsoon,gujarat,panch mahals,,,,panch mahals,
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1,,,03.05.10,03.05.10,2010,5,pre_monsoon,practices,salem,xviii,,,salem,
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,pre_monsoon,nadu,theni,xix,,,theni,


In [67]:
district_cols = [col for col in outbreaks.columns if 'district' in col]

In [68]:
outbreaks_districts = []

for i, row in outbreaks.iterrows():
    all_district_cols = row[district_cols]
    district = next((i for i in all_district_cols if i is not None), None)
    outbreaks_districts.append(district)

In [69]:
outbreaks['district'] = outbreaks_districts

In [70]:
# manually map missing districts
outbreaks.loc[(outbreaks.col4 == '110') & (outbreaks.col6 == '25/10/13'), ('district')] = 'bankura'
outbreaks.loc[(outbreaks.col4 == '50') & (outbreaks.col6 == '25/10/13') & (outbreaks.col7 == '28/10/13'), ('district')] = 'bankura'
outbreaks.loc[(outbreaks.col4 == '23') & (outbreaks.col6 == '23/10/13') & (outbreaks.col7 == '23/10/13'), ('district')] = 'puruliya'
outbreaks.loc[(outbreaks.col4 == '13') & (outbreaks.col6 == '31/07/13') & (outbreaks.col7 == '31/07/13'), ('district')] = 'puruliya'
outbreaks.loc[(outbreaks.col4 == '33') & (outbreaks.col6 == '01/06/13') & (outbreaks.col7 == '08/06/13'), ('district')] = 'chitradurga'
outbreaks.loc[(outbreaks.col4 == '88') & (outbreaks.col6 == '05/05/13'), ('district')] = 'sangli'
outbreaks.loc[(outbreaks.col4 == '60') & (outbreaks.col6 == '01/05/13') & (outbreaks.col7 == '02/05/13'), ('district')] = 'puruliya'
outbreaks.loc[(outbreaks.col4 == '36') & (outbreaks.col6 == '19/04/13'), ('district')] = 'davanagere'
outbreaks.loc[(outbreaks.col4 == '161') & (outbreaks.col6 == '27/08/12'), ('district')] = 'nagpur'
outbreaks.loc[(outbreaks.col4 == '73') & (outbreaks.col6 == '22/07/12') & (outbreaks.col7 == '23/07/12'), ('district')] = 'bankura'
outbreaks.loc[(outbreaks.col4 == '19') & (outbreaks.col6 == '21/07/12') & (outbreaks.col7 == '21/07/12'), ('district')] = 'chikmagalur'
outbreaks.loc[(outbreaks.col4 == '21') & (outbreaks.col6 == '06/07/12') & (outbreaks.col7 == '12/07/12'), ('district')] = 'birbhum'
outbreaks.loc[(outbreaks.col4 == '59') & (outbreaks.col6 == '25/05/12') & (outbreaks.col7 == '25/05/12'), ('district')] = 'tumkur'
outbreaks.loc[(outbreaks.col4 == '8') & (outbreaks.col6 == '03/05/12') & (outbreaks.col7 == '07/05/12'), ('district')] = 'wayanad'
outbreaks.loc[(outbreaks.col4 == '26') & (outbreaks.col7 == '16.08.11'), ('district')] = 'birbhum'
outbreaks.loc[(outbreaks.col4 == '86') & (outbreaks.col6 == '25/10/13') & (outbreaks.col7 == '28/10/13'), ('district')] = 'bankura'
outbreaks.loc[(outbreaks.col4 == '25') & (outbreaks.col6 == '19/05/12'), ('district')] = 'mandya'
outbreaks.loc[(outbreaks.col4 == '12') & (outbreaks.col6 == '13.03.12') & (outbreaks.col7 == '14.03.12'), ('district')] = 'mysore'

In [71]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,start_date_year,start_date_month,season,col0_clean,col1_clean,col2_clean,season_district,col0_clean_district,col1_clean_district,col2_clean_district,district
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1,,,28.02.10,28.02.10,2010,2,winter,hygiene,solapur,viii,,,solapur,,solapur
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1,28.02.10,,,28.02.10,2010,2,winter,,maharashtra,solapur,,,,solapur,solapur
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,pre_monsoon,gujarat,panch mahals,,,,panch mahals,,panch mahals
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1,,,03.05.10,03.05.10,2010,5,pre_monsoon,practices,salem,xviii,,,salem,,salem
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,pre_monsoon,nadu,theni,xix,,,theni,,theni


In [72]:
outbreaks.tail()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,start_date_year,start_date_month,season,col0_clean,col1_clean,col2_clean,season_district,col0_clean_district,col1_clean_district,col2_clean_district,district
393,bengal,purulia,xv.,cholera,31,0,29-11-15,30-11-15,,2015/49th_wk15.pdf,cholera,1,,29-11-15,30-11-15,29-11-15,2015,11,post_monsoon,bengal,puruliya,,,,puruliya,,puruliya
394,rajasthan,jaipur,xviii.,cholera,7,0,14-12-15,22-12-15,,2015/51st_wk15.pdf,cholera,1,,14-12-15,22-12-15,14-12-15,2015,12,post_monsoon,rajasthan,jaipur,xviii,,,jaipur,,jaipur
395,given.,kurnool,ii.,cholera,22,0,18/02/15,19/02/15,,2015/9th_wk15.pdf,cholera,1,,18/02/15,19/02/15,18/02/15,2015,2,winter,given,kurnool,,,,kurnool,,kurnool
396,gujarat,ahmedabad,vii.,cholera,24,1,24/02/15,24/02/15,,2015/9th_wk15.pdf,cholera,1,,24/02/15,24/02/15,24/02/15,2015,2,winter,gujarat,ahmadabad,vii,,,ahmadabad,,ahmadabad
397,telangana,mahabubnagar,xxi.,cholera,31,0,27/02/15,27/02/15,,2015/9th_wk15.pdf,cholera,1,,27/02/15,27/02/15,27/02/15,2015,2,winter,telangana,mahbubnagar,xxi,,,mahbubnagar,,mahbubnagar


In [73]:
outbreaks.shape

(398, 27)

In [74]:
outbreaks = outbreaks[outbreaks.district.notnull()]

In [75]:
outbreaks.shape

(397, 27)

In [76]:
outbreaks_mapped = pd.merge(outbreaks, states_districts, how='left', on='district')[['state', 'district', 'start_date_year', 'start_date_month', 'season', 'outbreak', 'geometry']].reset_index(drop=True)

In [77]:
outbreaks_mapped.shape

(402, 7)

In [78]:
outbreaks_mapped

Unnamed: 0,state,district,start_date_year,start_date_month,season,outbreak,geometry
0,maharashtra,solapur,2010,02,winter,1,"POLYGON ((74.90098 17.23968, 74.90394 17.24113..."
1,maharashtra,solapur,2010,02,winter,1,"POLYGON ((74.90098 17.23968, 74.90394 17.24113..."
2,gujarat,panch mahals,2010,05,pre_monsoon,1,"POLYGON ((73.73135 22.28985, 73.72839 22.28884..."
3,tamil nadu,salem,2010,05,pre_monsoon,1,"POLYGON ((78.22656 11.90686, 78.22643 11.90152..."
4,tamil nadu,theni,2010,05,pre_monsoon,1,"POLYGON ((77.34090 9.59505, 77.32851 9.57323, ..."
...,...,...,...,...,...,...,...
397,west bengal,puruliya,2015,11,post_monsoon,1,"POLYGON ((85.88916 23.15176, 85.88641 23.15335..."
398,rajasthan,jaipur,2015,12,post_monsoon,1,"POLYGON ((75.79135 26.55370, 75.78448 26.55061..."
399,andhra pradesh,kurnool,2015,02,winter,1,"POLYGON ((77.15276 15.13162, 77.13536 15.13601..."
400,gujarat,ahmadabad,2015,02,winter,1,"POLYGON ((71.89120 22.09747, 71.88139 22.09477..."


In [79]:
duplicate_districts = states_districts.district.value_counts()[states_districts.district.value_counts() > 1].index.tolist()

In [80]:
outbreaks_mapped[outbreaks_mapped.district.isin(duplicate_districts)].sort_values('district')

Unnamed: 0,state,district,start_date_year,start_date_month,season,outbreak,geometry
162,chhattisgarh,bijapur,2012,7,monsoon,1,"POLYGON ((81.12064 19.25221, 81.12340 19.24678..."
163,karnataka,bijapur,2012,7,monsoon,1,"POLYGON ((76.41283 16.58693, 76.42019 16.57418..."
186,chhattisgarh,bijapur,2012,8,monsoon,1,"POLYGON ((81.12064 19.25221, 81.12340 19.24678..."
187,karnataka,bijapur,2012,8,monsoon,1,"POLYGON ((76.41283 16.58693, 76.42019 16.57418..."
79,chhattisgarh,raigarh,2011,7,monsoon,1,"POLYGON ((83.48308 21.63960, 83.48334 21.63717..."
80,maharashtra,raigarh,2011,7,monsoon,1,"MULTIPOLYGON (((73.03819 18.05208, 73.03819 18..."
85,chhattisgarh,raigarh,2011,7,monsoon,1,"POLYGON ((83.48308 21.63960, 83.48334 21.63717..."
86,maharashtra,raigarh,2011,7,monsoon,1,"MULTIPOLYGON (((73.03819 18.05208, 73.03819 18..."
356,chhattisgarh,raigarh,2015,3,pre_monsoon,1,"POLYGON ((83.48308 21.63960, 83.48334 21.63717..."
357,maharashtra,raigarh,2015,3,pre_monsoon,1,"MULTIPOLYGON (((73.03819 18.05208, 73.03819 18..."


In [81]:
outbreaks[(outbreaks.district == 'bijapur') & (outbreaks.start_date_month == '07')] # karnataka

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,start_date_year,start_date_month,season,col0_clean,col1_clean,col2_clean,season_district,col0_clean_district,col1_clean_district,col2_clean_district,district
160,imparted.,bijapur,xi,cholera,32,0,08/07/12,12/07/12,,2012/28th_wk12.pdf,cholera,1,,08/07/12,12/07/12,08/07/12,2012,7,monsoon,imparted,bijapur,,,,bijapur,,bijapur


In [82]:
outbreaks[(outbreaks.district == 'bijapur') & (outbreaks.start_date_month == '08')] # karnataka

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,start_date_year,start_date_month,season,col0_clean,col1_clean,col2_clean,season_district,col0_clean_district,col1_clean_district,col2_clean_district,district
184,given.,bijapur,xvi,cholera,85,0,20/08/12,20/08/12,,2012/34th_wk12.pdf,cholera,1,,20/08/12,20/08/12,20/08/12,2012,8,monsoon,given,bijapur,xvi,,,bijapur,,bijapur


In [83]:
outbreaks[(outbreaks.district == 'raigarh') & (outbreaks.start_date_month == '07')] # maharashtra

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,start_date_year,start_date_month,season,col0_clean,col1_clean,col2_clean,season_district,col0_clean_district,col1_clean_district,col2_clean_district,district
79,maharashtra,raigad,xxii,cholera,67,/,0,11.07.11,,2011/29th_wk11.pdf,cholera,1,,,11.07.11,11.07.11,2011,7,monsoon,maharashtra,raigarh,xxii,,,raigarh,,raigarh
84,week.,11,raigad,cholera,94,/,0,11.07.11,,2011/30th_wk11.pdf,cholera,1,,,11.07.11,11.07.11,2011,7,monsoon,week,,raigarh,,,,raigarh,raigarh


In [84]:
outbreaks[(outbreaks.district == 'raigarh') & (outbreaks.start_date_month == '03')] # maharashtra

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,start_date_year,start_date_month,season,col0_clean,col1_clean,col2_clean,season_district,col0_clean_district,col1_clean_district,col2_clean_district,district
353,maharashtra,raigad,xxxi.,cholera,22,0,30/03/15,under,,2015/16th_wk15.pdf,cholera,1,,30/03/15,,30/03/15,2015,3,pre_monsoon,maharashtra,raigarh,xxxi,,,raigarh,,raigarh


In [85]:
outbreaks_mapped.drop(outbreaks_mapped[(outbreaks_mapped.state == 'chhattisgarh') & (outbreaks_mapped.district.isin(['bijapur', 'raigarh']))].index, inplace=True)

In [86]:
outbreaks_mapped.shape

(397, 7)

In [87]:
outbreaks_mapped.drop_duplicates().shape

(328, 7)

In [88]:
outbreaks_mapped.drop_duplicates().reset_index(drop=True).to_pickle('../data/cholera_outbreaks/monthly_cholera_outbreaks_per_district_2010_2015.pkl.gz', compression='gzip')