# preprocess cholera outbreaks

In [1]:
import os
import numpy as np
import pandas as pd
import PyPDF2
import re
import geopandas as gpd
import requests
import zipfile
import io

## download cholera outbreaks data

In [2]:
!python download_cholera_outbreaks_data.py

Download cholera outbreaks data...
Processing 2010 with 53 files...
Processing 2011 with 53 files...
Processing 2012 with 52 files...
Processing 2013 with 52 files...
Processing 2014 with 53 files...
Processing 2015 with 53 files...
Download cholera outbreaks data complete.


## load pdfs and extract relevant parts

In [3]:
files_to_skip = ['52nd_wk_old.pdf',
                 '11th_wk14_old.pdf']

In [4]:
# Reporting years covered by this notebook (2010 through 2015, inclusive).
years = list(range(2010, 2016))

In [5]:
path = '../data/cholera_outbreaks/'

In [6]:
%%time

# Scan every weekly report page by page and, for each token that mentions
# cholera, keep a fixed window of neighbouring tokens. Each record has ten entries:
#   positions 0-2: the three tokens before the mention
#   position  3:   the token containing 'cholera'
#   positions 4-7: up to four tokens after the mention
#   position  8:   always None (records are padded to nine entries)
#   position  9:   '<year>/<file name>' of the source PDF
outbreaks_raw = []

for year in years:
    print('Processing year: {}...'.format(year))
    year_dir = path + str(year)
    (_, _, file_names) = next(os.walk(year_dir))
    file_names = np.setdiff1d(file_names, files_to_skip)  # drop files to skip from file names

    for file in file_names:
        print('Processing file: {}...'.format(file))
        document = PyPDF2.PdfFileReader(year_dir + '/' + file)

        for page in range(document.getNumPages()):
            text = document.getPage(page).extractText()
            text = text.replace('\n', '')  # remove line breaks
            # split on space, remove empty items, lower case all items
            text = [token.lower() for token in text.split(' ') if token != '']

            for index, token in enumerate(text):
                if 'cholera' not in token:
                    continue

                # A mention within the first three tokens of a page has no full
                # left context. Negative indices would silently wrap around to the
                # END of the page, so those positions get an empty-string placeholder
                # instead: the mention stays at position 3 and the leading
                # positions stay plain strings.
                before = ['' if i < 0 else text[i] for i in range(index - 3, index)]
                # Slicing stops at the end of the page, so fewer than four tokens
                # may follow; the record is padded with None below.
                after = text[index + 1:index + 5]

                outbreak_info = before + [token] + after
                outbreak_info += [None] * (9 - len(outbreak_info))
                outbreak_info.append(str(year) + '/' + file)
                outbreaks_raw.append(outbreak_info)

Processing year: 2010...
Processing file: 10th_wk10.pdf...
Processing file: 11th_wk10.pdf...
Processing file: 12th_wk10.pdf...
Processing file: 13th_wk10.pdf...
Processing file: 14th_wk10.pdf...
Processing file: 15th_wk10.pdf...
Processing file: 16th_wk10.pdf...
Processing file: 17th_wk10.pdf...
Processing file: 18th_wk10.pdf...
Processing file: 19th_wk10.pdf...
Processing file: 1st_wk10.pdf...
Processing file: 20th_wk10.pdf...
Processing file: 21st_wk10.pdf...
Processing file: 22nd_wk10.pdf...
Processing file: 23rd_wk10.pdf...
Processing file: 24th_wk10.pdf...
Processing file: 25th_wk10.pdf...
Processing file: 26th_wk10.pdf...
Processing file: 27th_wk10.pdf...
Processing file: 28th_wk10.pdf...
Processing file: 29th_wk10.pdf...
Processing file: 2nd_wk10.pdf...
Processing file: 30th_wk10.pdf...
Processing file: 31st_wk10.pdf...
Processing file: 32nd_wk10.pdf...
Processing file: 33rd_wk10.pdf...
Processing file: 34th_wk10.pdf...
Processing file: 35th_wk10.pdf...
Processing file: 36th_wk1



Processing file: 32nd_wk11.pdf...
Processing file: 33rd_wk11.pdf...
Processing file: 34th_wk11.pdf...
Processing file: 35th_wk11.pdf...
Processing file: 36th_wk11.pdf...
Processing file: 37th_wk11.pdf...
Processing file: 38th_wk11.pdf...
Processing file: 39th_wk11.pdf...
Processing file: 3rd_wk11.pdf...
Processing file: 40th_wk11.pdf...
Processing file: 41st_wk11.pdf...
Processing file: 42nd_wk11.pdf...
Processing file: 43rd_wk11.pdf...
Processing file: 44th_wk11.pdf...
Processing file: 45th_wk11.pdf...
Processing file: 46th_wk11.pdf...
Processing file: 47th_wk11.pdf...
Processing file: 48th_wk11.pdf...
Processing file: 49th_wk11.pdf...
Processing file: 4th_wk11.pdf...
Processing file: 50th_wk11.pdf...
Processing file: 51st_wk11.pdf...
Processing file: 52nd_wk11_u.pdf...
Processing file: 5th_wk11.pdf...
Processing file: 6th_wk11.pdf...
Processing file: 7th_wk11.pdf...
Processing file: 8th_wk11.pdf...
Processing file: 9th_wk11.pdf...
Processing year: 2012...
Processing file: 10th_wk12.p

In [7]:
outbreaks_raw = pd.DataFrame(outbreaks_raw)

In [8]:
outbreaks_raw.shape

(1311, 10)

In [9]:
pd.set_option('display.max_columns', None)

In [10]:
outbreaks_raw.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf
1,positive,for,v.,cholera,(el,tor).,chlorination,of,,2010/10th_wk10.pdf
2,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf
3,positive,for,v.,cholera,(el,tor).,chlorination,of,,2010/11th_wk10.pdf
4,tested,negative,for,cholera.,alternate,safe,drinking,water,,2010/13th_wk10.pdf


In [11]:
# Label the positional columns 'col0', 'col1', ... in order.
column_names = ['col' + str(position) for position in range(len(outbreaks_raw.columns))]
outbreaks_raw.columns = column_names

In [12]:
outbreaks_raw.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf
1,positive,for,v.,cholera,(el,tor).,chlorination,of,,2010/10th_wk10.pdf
2,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf
3,positive,for,v.,cholera,(el,tor).,chlorination,of,,2010/11th_wk10.pdf
4,tested,negative,for,cholera.,alternate,safe,drinking,water,,2010/13th_wk10.pdf


In [13]:
# For every column position, report whether the exact token 'cholera' occurs among its values.
for position, column in enumerate(outbreaks_raw.columns):
    has_exact_token = 'cholera' in outbreaks_raw[column].unique()
    print('Column {} contains cholera: {}'.format(position, has_exact_token))

Column 0 contains cholera: True
Column 1 contains cholera: False
Column 2 contains cholera: False
Column 3 contains cholera: True
Column 4 contains cholera: True
Column 5 contains cholera: False
Column 6 contains cholera: False
Column 7 contains cholera: False
Column 8 contains cholera: False
Column 9 contains cholera: False


In [14]:
outbreaks_raw[outbreaks_raw.col0 == 'cholera']

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9
263,cholera,culture,(v.,cholerae,ogawa).,health,camp,conducted,,2011/36th_wk11.pdf


In [15]:
outbreaks_raw[outbreaks_raw.col3 == 'cholera']

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf
1,positive,for,v.,cholera,(el,tor).,chlorination,of,,2010/10th_wk10.pdf
2,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf
3,positive,for,v.,cholera,(el,tor).,chlorination,of,,2010/11th_wk10.pdf
6,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf
...,...,...,...,...,...,...,...,...,...,...
1299,be,negative,for,cholera,culture.01,water,sample,was,,2015/50th_wk15.pdf
1303,rajasthan,jaipur,xviii.,cholera,07,00,14-12-15,22-12-15,,2015/51st_wk15.pdf
1307,given.,kurnool,ii.,cholera,22,0,18/02/15,19/02/15,,2015/9th_wk15.pdf
1308,gujarat,ahmedabad,vii.,cholera,24,1,24/02/15,24/02/15,,2015/9th_wk15.pdf


In [16]:
outbreaks_raw[outbreaks_raw.col4 == 'cholera']

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9
52,positive,for,vibrio,cholerae.,cholera,cases,occurred,due,,2010/29th_wk10.pdf


## clean main cholera column

In [17]:
outbreaks_raw.col3.value_counts()

cholera                                                                                 848
cholera.                                                                                160
cholerae                                                                                 85
cholerae.                                                                                69
v.cholerae                                                                               35
cholera?                                                                                 32
hepatitisdenguecholerachikungunyamalariano.                                              14
hepatitisdenguecholerachikungunyamalariaacute                                             9
v.cholera                                                                                 7
v.cholerae.                                                                               7
?cholera                                                                        

In [18]:
def _normalise_mention(token):
    """Collapse any token containing 'cholera' to the plain word 'cholera'."""
    if 'cholera' in token:
        return 'cholera'
    return token


outbreaks_raw['col3_clean'] = outbreaks_raw['col3'].apply(_normalise_mention)

In [19]:
outbreaks_raw.col3_clean.value_counts()

cholera    1311
Name: col3_clean, dtype: int64

## filter outbreaks

In [20]:
# next to where a cholera outbreak is mentioned there should be a number indicating the cases:
# a row is flagged 1 when the token after the mention contains no ASCII letter, otherwise 0
outbreaks_raw['outbreak'] = [
    0 if re.search('[a-zA-Z]', str(token)) else 1
    for token in outbreaks_raw['col4']
]

In [21]:
outbreaks = outbreaks_raw[outbreaks_raw.outbreak == 1].copy().reset_index(drop=True)

In [22]:
outbreaks.shape

(469, 12)

In [23]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1


In [24]:
outbreaks.tail()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak
464,rajasthan,jaipur,xviii.,cholera,7,00,14-12-15,22-12-15,,2015/51st_wk15.pdf,cholera,1
465,positive,for,v.,cholerae.,1,water,sample,collected;,,2015/51st_wk15.pdf,cholera,1
466,given.,kurnool,ii.,cholera,22,0,18/02/15,19/02/15,,2015/9th_wk15.pdf,cholera,1
467,gujarat,ahmedabad,vii.,cholera,24,1,24/02/15,24/02/15,,2015/9th_wk15.pdf,cholera,1
468,telangana,mahabubnagar,xxi.,cholera,31,0,27/02/15,27/02/15,,2015/9th_wk15.pdf,cholera,1


## extract and clean start dates

In [25]:
# next to where cholera outbreak cases are mentioned there should be a date
def _date_candidate(token):
    """Return `token` without '(', ')' and '*' if it may be a date, else None.

    A token is rejected when its string form contains an ASCII letter or is
    shorter than four characters.
    """
    as_text = str(token)
    if re.search('[a-zA-Z]', as_text) is not None:
        return None
    if len(as_text) < 4:
        return None
    for unwanted in ('(', ')', '*'):
        token = token.replace(unwanted, '')
    return token


for col in outbreaks.columns[5:8]:
    outbreaks[col + '_clean'] = outbreaks[col].apply(_date_candidate)

In [26]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1,,,28.02.10
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1,28.02.10,,
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1,,,03.05.10
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10


In [27]:
# Per row, take the first cleaned date candidate in column order; rows without
# any candidate get None.
# The candidate columns are selected by name rather than by position, so
# re-running this cell after further columns have been appended to `outbreaks`
# still only inspects the date candidates.
date_candidate_cols = ['col5_clean', 'col6_clean', 'col7_clean']

start_dates = []

for _, row in outbreaks[date_candidate_cols].iterrows():
    start_date = next((value for value in row if value is not None), None)
    start_dates.append(start_date)

In [28]:
outbreaks['start_date'] = start_dates

In [29]:
outbreaks.start_date.unique()

array(['28.02.10', '12.05.10', '03.05.10', '23.04.10', '28.12.09',
       '10.05.10', '27.05.10', '30.05.10', '04.06.10', '15.06.10',
       '22.06.10', '03.07.10', '24.06.10', '29.06.10', '27.06.10',
       '13.07.10', '12.07.10', '16.07.10', '20.07.10', '06.07.10',
       '29.07.10', '17.08.10', '20.08.10', '18.08.10', '16.08.10',
       '06.08.10', '10.09.10', None, '06.09.10', '21.09.10', '24.09.10',
       '22.10.10', '03.10.10', '24.10.10', '04.11.10', '29.11.10',
       '11.12.10', '19.12.10', '24.01.10', '03.03.11', '08.03.11',
       '20.03.11', '07.04.11', '20.04.11', '18.04.11', '23.04.11',
       '24.04.11', '04.05.11', '08.05.11', '18.05.11', '24.05.11',
       '02.06.11', '06.06.11', '14.06.11', '20.06.11', '27.06.11',
       '26.06.11', '03.07.11', '21.06.11', '08.07.11', '11.07.11',
       '23.07.11', '03.01.11', '22.07.11', '13.07.11', '14.08.11',
       '04.08.11', '11.08.11', '22.08.11', '18.08.11', '20.08.11',
       '16.08.11', '21.08.11', '01.08.11', '03.09.11', '

In [30]:
outbreaks = outbreaks[outbreaks.start_date.notnull()].copy().reset_index(drop=True)

In [31]:
def _year_token(date_text):
    """Return the third field of a '.', '/' or '-' separated date.

    Separators are tried in that order; text without any of them is returned unchanged.
    """
    for separator in ('.', '/', '-'):
        if separator in date_text:
            return date_text.split(separator)[2]
    return date_text


outbreaks['year'] = outbreaks['start_date'].apply(_year_token)

In [32]:
# Expand short year tokens to four digits: left-fill to three characters with '0',
# then to four characters with '2' (e.g. '10' -> '010' -> '2010'), then convert to integers.
outbreaks['year'] = (
    outbreaks['year']
    .str.rjust(3, fillchar='0')
    .str.rjust(4, fillchar='2')
    .astype(np.int64)
)

In [33]:
outbreaks['year'].unique()

array([2010, 2009, 2011, 2012, 2013, 2014, 2015, 2019])

In [34]:
outbreaks[outbreaks.year == 2019]

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,year
390,karnataka,bagalkot,vii.,cholera,66,0,20-09-19,25-09-19,,2015/39th_wk15.pdf,cholera,1,,20-09-19,25-09-19,20-09-19,2019


In [35]:
# The only record with year 2019 was extracted from a 2015 report
# (2015/39th_wk15.pdf, start date '20-09-19'), so its year is overridden to 2015.
# NOTE(review): presumably a typo in the source report — confirm against the PDF.
outbreaks.loc[outbreaks.year == 2019, 'year'] = 2015

In [36]:
def _month_token(date_text):
    """Return the second field of a '.', '/' or '-' separated date.

    Separators are tried in that order; text without any of them is returned unchanged.
    """
    for separator in ('.', '/', '-'):
        if separator in date_text:
            return date_text.split(separator)[1]
    return date_text


outbreaks['month'] = outbreaks['start_date'].apply(_month_token)

In [37]:
outbreaks['month'] = outbreaks.month.astype(np.int64)

In [38]:
outbreaks['month'].unique()

array([ 2,  5,  4, 12,  6,  7,  8,  9, 10, 11,  1,  3])

In [39]:
outbreaks.shape

(400, 18)

In [40]:
outbreaks = outbreaks[outbreaks.year.isin(years)].copy().reset_index(drop=True)

In [41]:
outbreaks.shape

(399, 18)

In [42]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,year,month
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1,,,28.02.10,28.02.10,2010,2
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1,28.02.10,,,28.02.10,2010,2
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1,,,03.05.10,03.05.10,2010,5
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5


In [43]:
outbreaks.tail()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,year,month
394,bengal,purulia,xv.,cholera,31,0,29-11-15,30-11-15,,2015/49th_wk15.pdf,cholera,1,,29-11-15,30-11-15,29-11-15,2015,11
395,rajasthan,jaipur,xviii.,cholera,7,0,14-12-15,22-12-15,,2015/51st_wk15.pdf,cholera,1,,14-12-15,22-12-15,14-12-15,2015,12
396,given.,kurnool,ii.,cholera,22,0,18/02/15,19/02/15,,2015/9th_wk15.pdf,cholera,1,,18/02/15,19/02/15,18/02/15,2015,2
397,gujarat,ahmedabad,vii.,cholera,24,1,24/02/15,24/02/15,,2015/9th_wk15.pdf,cholera,1,,24/02/15,24/02/15,24/02/15,2015,2
398,telangana,mahabubnagar,xxi.,cholera,31,0,27/02/15,27/02/15,,2015/9th_wk15.pdf,cholera,1,,27/02/15,27/02/15,27/02/15,2015,2


## map states, districts and location (geometry)

In [44]:
!wget --recursive --no-directories --no-clobber --directory-prefix=../data/cholera_outbreaks https://biogeo.ucdavis.edu/data/gadm3.6/shp/gadm36_IND_shp.zip

File ‘../data/cholera_outbreaks/gadm36_IND_shp.zip’ already there; not retrieving.



In [45]:
!unzip -u -d ../data/cholera_outbreaks/gadm36_IND_shp ../data/cholera_outbreaks/gadm36_IND_shp.zip

Archive:  ../data/cholera_outbreaks/gadm36_IND_shp.zip


In [46]:
path = '../data/cholera_outbreaks/gadm36_IND_shp/'

In [47]:
file = 'gadm36_IND_2.shp'

In [48]:
india = gpd.read_file(path+file)

In [49]:
india.shape

(666, 14)

In [50]:
india.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 666 entries, 0 to 665
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   GID_0      666 non-null    object  
 1   NAME_0     666 non-null    object  
 2   GID_1      666 non-null    object  
 3   NAME_1     666 non-null    object  
 4   NL_NAME_1  0 non-null      object  
 5   GID_2      666 non-null    object  
 6   NAME_2     666 non-null    object  
 7   VARNAME_2  190 non-null    object  
 8   NL_NAME_2  0 non-null      object  
 9   TYPE_2     666 non-null    object  
 10  ENGTYPE_2  666 non-null    object  
 11  CC_2       0 non-null      object  
 12  HASC_2     626 non-null    object  
 13  geometry   666 non-null    geometry
dtypes: geometry(1), object(13)
memory usage: 73.0+ KB


In [51]:
india.head()

Unnamed: 0,GID_0,NAME_0,GID_1,NAME_1,NL_NAME_1,GID_2,NAME_2,VARNAME_2,NL_NAME_2,TYPE_2,ENGTYPE_2,CC_2,HASC_2,geometry
0,IND,India,IND.1_1,Andaman and Nicobar,,IND.1.1_1,Nicobar Islands,,,District,District,,IN.AN.NI,"MULTIPOLYGON (((93.78773 6.85264, 93.78849 6.8..."
1,IND,India,IND.1_1,Andaman and Nicobar,,IND.1.2_1,North and Middle Andaman,,,District,District,,IN.AN.NM,"MULTIPOLYGON (((92.93898 12.22386, 92.93916 12..."
2,IND,India,IND.1_1,Andaman and Nicobar,,IND.1.3_1,South Andaman,,,District,District,,IN.AN.SA,"MULTIPOLYGON (((92.47972 10.52056, 92.47945 10..."
3,IND,India,IND.2_1,Andhra Pradesh,,IND.2.1_1,Anantapur,"Anantpur, Ananthapur",,District,District,,IN.AD.AN,"POLYGON ((77.71420 13.76079, 77.71314 13.75074..."
4,IND,India,IND.2_1,Andhra Pradesh,,IND.2.2_1,Chittoor,Chitoor|Chittor,,District,District,,IN.AD.CH,"POLYGON ((78.46293 12.63537, 78.46190 12.63228..."


In [52]:
states_districts = india[['NAME_1', 'NAME_2', 'geometry']].copy()

In [53]:
states_districts.columns = ['state', 'district', 'geometry']

In [54]:
states_districts['state'] = states_districts['state'].str.lower()
states_districts['district'] = states_districts['district'].str.lower()

In [55]:
districts = states_districts.district.unique().tolist()

In [56]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,year,month
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1,,,28.02.10,28.02.10,2010,2
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1,28.02.10,,,28.02.10,2010,2
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1,,,03.05.10,03.05.10,2010,5
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5


In [57]:
# Strip every non-word character (punctuation, spaces, hyphens, ...) from the three
# tokens preceding the cholera mention and discard results shorter than three characters.
# `regex=True` is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the tokens untouched. The raw string avoids
# the invalid '\W' escape sequence.
for col in ['col0', 'col1', 'col2']:
    stripped = outbreaks[col].str.replace(r'\W', '', regex=True)
    # missing values (non-strings) and too-short tokens both become None
    outbreaks[col + '_clean'] = stripped.apply(
        lambda x: x if isinstance(x, str) and len(x) >= 3 else None
    )

In [58]:
# Spelling corrections: district name as it appears in the reports -> the name it
# should be matched under. Applied in order, one exact-match replacement per entry.
# NOTE(review): the original cell also contained a second 'sibsagar' rule mapping to
# 'kabeerdham'; it could never match because 'sibsagar' had already been rewritten to
# 'sivasagar' one line earlier (looks like a copy-paste slip). It is omitted here —
# confirm which reported spelling, if any, was meant to map to 'kabeerdham'.

# correct district names in col1_clean
col1_corrections = {
    'mahabubnagar': 'mahbubnagar',
    'ahmedabad': 'ahmadabad',
    'howrah': 'haora',
    'hooghly': 'hugli',
    'hoogly': 'hugli',
    'villupuram': 'viluppuram',
    'haridwar': 'hardwar',
    'davangere': 'davanagere',
    'davengere': 'davanagere',
    'davangare': 'davanagere',
    'tiruchirapalli': 'tiruchirappalli',
    'darang': 'darrang',
    'virudhunager': 'virudunagar',
    'chikkaballapur': 'chikballapura',
    'gondia': 'gondiya',
    'purulia': 'puruliya',
    'kalaburagi': 'gulbarga',
    'kalburgi': 'gulbarga',
    'berhampur': 'ganjam',
    'sholapur': 'solapur',
    'raigad': 'raigarh',
    'panchmahal': 'panch mahals',
    'sibsagar': 'sivasagar',
    'banaskantha': 'banas kantha',
    'chamarajnagar': 'chamrajnagar',
    'khargaon': 'west nimar',
    'mysuru': 'mysore',
    'mohali': 'sahibzada ajit singh nagar',
    'delhi': 'west',
    'kawardha': 'kabeerdham',
    'chirtadurga': 'chitradurga',
    'budgam': 'badgam',
    'gulburga': 'gulbarga',
    'jangir': 'janjgir-champa',
    'kancheepuramsaidapet': 'kancheepuram',
}

# correct district names in col2_clean
col2_corrections = {
    'thiruvannamalai': 'tiruvannamalai',
    'raigad': 'raigarh',
    'davangere': 'davanagere',
    'sibsagar': 'sivasagar',
    'sabarkantha': 'sabar kantha',
}

for column, corrections in (('col1_clean', col1_corrections), ('col2_clean', col2_corrections)):
    for reported_name, corrected_name in corrections.items():
        outbreaks.loc[outbreaks[column] == reported_name, column] = corrected_name

In [59]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,year,month,col0_clean,col1_clean,col2_clean
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1,,,28.02.10,28.02.10,2010,2,hygiene,solapur,viii
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1,28.02.10,,,28.02.10,2010,2,,maharashtra,solapur
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,gujarat,panch mahals,
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1,,,03.05.10,03.05.10,2010,5,practices,salem,xviii
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,nadu,theni,xix


In [60]:
outbreaks.columns[18:]

Index(['col0_clean', 'col1_clean', 'col2_clean'], dtype='object')

In [61]:
def _match_district(name):
    """Map a cleaned token to an entry of `districts`, or None if nothing matches.

    An exact match wins. Only when there is none does the lookup fall back to the
    first district whose name contains the token as a substring. Without the exact
    check, a short name such as 'west' would resolve to whichever longer district
    name containing it happens to come first in `districts`.
    """
    if name is None:
        return None
    if name in districts:
        return name
    return next((d for d in districts if name in str(d)), None)


# Columns are selected by name so that re-running the cell does not also process
# the '*_district' columns it has just created.
for col in ['col0_clean', 'col1_clean', 'col2_clean']:
    outbreaks[col + '_district'] = outbreaks[col].apply(_match_district)

In [62]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,year,month,col0_clean,col1_clean,col2_clean,col0_clean_district,col1_clean_district,col2_clean_district
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1,,,28.02.10,28.02.10,2010,2,hygiene,solapur,viii,,solapur,
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1,28.02.10,,,28.02.10,2010,2,,maharashtra,solapur,,,solapur
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,gujarat,panch mahals,,,panch mahals,
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1,,,03.05.10,03.05.10,2010,5,practices,salem,xviii,,salem,
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,nadu,theni,xix,,theni,


In [63]:
district_cols = [col for col in outbreaks.columns if 'district' in col]

In [64]:
# For each outbreak, keep the first non-None match across the district candidate
# columns (in the order given by `district_cols`); None when no column matched.
outbreaks_districts = [
    next((candidate for candidate in candidates if candidate is not None), None)
    for candidates in outbreaks[district_cols].itertuples(index=False, name=None)
]

In [65]:
outbreaks['district'] = outbreaks_districts

In [66]:
# manually map missing districts
# Each entry pairs the raw column values that identify a record with the district to
# assign to it. Entries are applied in order; all listed columns must match exactly.
manual_districts = [
    ({'col4': '110', 'col6': '25/10/13'}, 'bankura'),
    ({'col4': '50', 'col6': '25/10/13', 'col7': '28/10/13'}, 'bankura'),
    ({'col4': '23', 'col6': '23/10/13', 'col7': '23/10/13'}, 'puruliya'),
    ({'col4': '13', 'col6': '31/07/13', 'col7': '31/07/13'}, 'puruliya'),
    ({'col4': '33', 'col6': '01/06/13', 'col7': '08/06/13'}, 'chitradurga'),
    ({'col4': '88', 'col6': '05/05/13'}, 'sangli'),
    ({'col4': '60', 'col6': '01/05/13', 'col7': '02/05/13'}, 'puruliya'),
    ({'col4': '36', 'col6': '19/04/13'}, 'davanagere'),
    ({'col4': '161', 'col6': '27/08/12'}, 'nagpur'),
    ({'col4': '73', 'col6': '22/07/12', 'col7': '23/07/12'}, 'bankura'),
    ({'col4': '19', 'col6': '21/07/12', 'col7': '21/07/12'}, 'chikmagalur'),
    ({'col4': '21', 'col6': '06/07/12', 'col7': '12/07/12'}, 'birbhum'),
    ({'col4': '59', 'col6': '25/05/12', 'col7': '25/05/12'}, 'tumkur'),
    ({'col4': '8', 'col6': '03/05/12', 'col7': '07/05/12'}, 'wayanad'),
    ({'col4': '26', 'col7': '16.08.11'}, 'birbhum'),
    ({'col4': '86', 'col6': '25/10/13', 'col7': '28/10/13'}, 'bankura'),
    ({'col4': '25', 'col6': '19/05/12'}, 'mandya'),
    ({'col4': '12', 'col6': '13.03.12', 'col7': '14.03.12'}, 'mysore'),
]

for criteria, district_name in manual_districts:
    matches = pd.Series(True, index=outbreaks.index)
    for column, expected in criteria.items():
        matches &= outbreaks[column] == expected
    outbreaks.loc[matches, 'district'] = district_name

In [67]:
outbreaks.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,year,month,col0_clean,col1_clean,col2_clean,col0_clean_district,col1_clean_district,col2_clean_district,district
0,hygiene.,sholapur,viii.,cholera,176,/,0,28.02.10,,2010/10th_wk10.pdf,cholera,1,,,28.02.10,28.02.10,2010,2,hygiene,solapur,viii,,solapur,,solapur
1,3.,maharashtra,solapur,cholera,218/0*,28.02.10,under,surveillance,,2010/11th_wk10.pdf,cholera,1,28.02.10,,,28.02.10,2010,2,,maharashtra,solapur,,,solapur,solapur
2,gujarat,panchmahal,v.,cholera,6,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,gujarat,panch mahals,,,panch mahals,,panch mahals
3,practices.,salem,xviii.,cholera?,82,/,0,03.05.10,,2010/19th_wk10.pdf,cholera,1,,,03.05.10,03.05.10,2010,5,practices,salem,xviii,,salem,,salem
4,nadu,theni,xix.,cholera,23,/,0,12.05.10,,2010/19th_wk10.pdf,cholera,1,,,12.05.10,12.05.10,2010,5,nadu,theni,xix,,theni,,theni


In [68]:
outbreaks.tail()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,year,month,col0_clean,col1_clean,col2_clean,col0_clean_district,col1_clean_district,col2_clean_district,district
394,bengal,purulia,xv.,cholera,31,0,29-11-15,30-11-15,,2015/49th_wk15.pdf,cholera,1,,29-11-15,30-11-15,29-11-15,2015,11,bengal,puruliya,,,puruliya,,puruliya
395,rajasthan,jaipur,xviii.,cholera,7,0,14-12-15,22-12-15,,2015/51st_wk15.pdf,cholera,1,,14-12-15,22-12-15,14-12-15,2015,12,rajasthan,jaipur,xviii,,jaipur,,jaipur
396,given.,kurnool,ii.,cholera,22,0,18/02/15,19/02/15,,2015/9th_wk15.pdf,cholera,1,,18/02/15,19/02/15,18/02/15,2015,2,given,kurnool,,,kurnool,,kurnool
397,gujarat,ahmedabad,vii.,cholera,24,1,24/02/15,24/02/15,,2015/9th_wk15.pdf,cholera,1,,24/02/15,24/02/15,24/02/15,2015,2,gujarat,ahmadabad,vii,,ahmadabad,,ahmadabad
398,telangana,mahabubnagar,xxi.,cholera,31,0,27/02/15,27/02/15,,2015/9th_wk15.pdf,cholera,1,,27/02/15,27/02/15,27/02/15,2015,2,telangana,mahbubnagar,xxi,,mahbubnagar,,mahbubnagar


In [69]:
outbreaks.shape

(399, 25)

In [70]:
outbreaks = outbreaks[outbreaks.district.notnull()]

In [71]:
outbreaks.shape

(398, 25)

In [72]:
# Attach state and geometry to every outbreak via the district name. The right join
# keeps all outbreak rows; a district name that exists in more than one state
# produces one row per state.
output_columns = ['state', 'district', 'year', 'month', 'outbreak', 'geometry']
outbreaks_mapped = pd.merge(states_districts, outbreaks, how='right', on='district')
outbreaks_mapped = outbreaks_mapped[output_columns].reset_index(drop=True)

In [73]:
outbreaks_mapped.shape

(403, 6)

In [74]:
outbreaks_mapped

Unnamed: 0,state,district,year,month,outbreak,geometry
0,maharashtra,solapur,2010,2,1,"POLYGON ((74.90098 17.23968, 74.90394 17.24113..."
1,maharashtra,solapur,2010,2,1,"POLYGON ((74.90098 17.23968, 74.90394 17.24113..."
2,gujarat,panch mahals,2010,5,1,"POLYGON ((73.73135 22.28985, 73.72839 22.28884..."
3,tamil nadu,salem,2010,5,1,"POLYGON ((78.22656 11.90686, 78.22643 11.90152..."
4,tamil nadu,salem,2010,5,1,"POLYGON ((78.22656 11.90686, 78.22643 11.90152..."
...,...,...,...,...,...,...
398,karnataka,udupi,2015,8,1,"MULTIPOLYGON (((74.67014 13.19958, 74.67014 13..."
399,odisha,nabarangapur,2015,8,1,"POLYGON ((82.84877 19.19666, 82.84850 19.18833..."
400,karnataka,bagalkot,2015,9,1,"POLYGON ((75.85240 15.93246, 75.84651 15.93122..."
401,rajasthan,jaipur,2015,12,1,"POLYGON ((75.79135 26.55370, 75.78448 26.55061..."


In [75]:
# District names that occur more than once in the shapefile (same name in several states).
district_counts = states_districts.district.value_counts()
duplicate_districts = district_counts[district_counts > 1].index.tolist()

In [76]:
outbreaks_mapped[outbreaks_mapped.district.isin(duplicate_districts)].sort_values('district')

Unnamed: 0,state,district,year,month,outbreak,geometry
283,chhattisgarh,bijapur,2012,7,1,"POLYGON ((81.12064 19.25221, 81.12340 19.24678..."
284,karnataka,bijapur,2012,7,1,"POLYGON ((76.41283 16.58693, 76.42019 16.57418..."
285,chhattisgarh,bijapur,2012,8,1,"POLYGON ((81.12064 19.25221, 81.12340 19.24678..."
286,karnataka,bijapur,2012,8,1,"POLYGON ((76.41283 16.58693, 76.42019 16.57418..."
176,chhattisgarh,raigarh,2011,7,1,"POLYGON ((83.48308 21.63960, 83.48334 21.63717..."
177,maharashtra,raigarh,2011,7,1,"MULTIPOLYGON (((73.03819 18.05208, 73.03819 18..."
178,chhattisgarh,raigarh,2011,7,1,"POLYGON ((83.48308 21.63960, 83.48334 21.63717..."
179,maharashtra,raigarh,2011,7,1,"MULTIPOLYGON (((73.03819 18.05208, 73.03819 18..."
180,chhattisgarh,raigarh,2015,3,1,"POLYGON ((83.48308 21.63960, 83.48334 21.63717..."
181,maharashtra,raigarh,2015,3,1,"MULTIPOLYGON (((73.03819 18.05208, 73.03819 18..."


In [77]:
outbreaks[(outbreaks.district == 'bijapur') & (outbreaks.month == 7)] # karnataka

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,year,month,col0_clean,col1_clean,col2_clean,col0_clean_district,col1_clean_district,col2_clean_district,district
160,imparted.,bijapur,xi,cholera,32,0,08/07/12,12/07/12,,2012/28th_wk12.pdf,cholera,1,,08/07/12,12/07/12,08/07/12,2012,7,imparted,bijapur,,,bijapur,,bijapur


In [78]:
outbreaks[(outbreaks.district == 'bijapur') & (outbreaks.month == 8)] # karnataka

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,year,month,col0_clean,col1_clean,col2_clean,col0_clean_district,col1_clean_district,col2_clean_district,district
184,given.,bijapur,xvi,cholera,85,0,20/08/12,20/08/12,,2012/34th_wk12.pdf,cholera,1,,20/08/12,20/08/12,20/08/12,2012,8,given,bijapur,xvi,,bijapur,,bijapur


In [79]:
outbreaks[(outbreaks.district == 'raigarh') & (outbreaks.month == 7)] # maharashtra

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,year,month,col0_clean,col1_clean,col2_clean,col0_clean_district,col1_clean_district,col2_clean_district,district
79,maharashtra,raigad,xxii,cholera,67,/,0,11.07.11,,2011/29th_wk11.pdf,cholera,1,,,11.07.11,11.07.11,2011,7,maharashtra,raigarh,xxii,,raigarh,,raigarh
84,week.,11,raigad,cholera,94,/,0,11.07.11,,2011/30th_wk11.pdf,cholera,1,,,11.07.11,11.07.11,2011,7,week,,raigarh,,,raigarh,raigarh


In [80]:
outbreaks[(outbreaks.district == 'raigarh') & (outbreaks.month == 3)] # maharashtra

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col3_clean,outbreak,col5_clean,col6_clean,col7_clean,start_date,year,month,col0_clean,col1_clean,col2_clean,col0_clean_district,col1_clean_district,col2_clean_district,district
353,maharashtra,raigad,xxxi.,cholera,22,0,30/03/15,under,,2015/16th_wk15.pdf,cholera,1,,30/03/15,,30/03/15,2015,3,maharashtra,raigarh,xxxi,,raigarh,,raigarh


In [81]:
# The merge duplicated the bijapur and raigarh outbreaks across states; the records
# inspected above were marked as karnataka / maharashtra, so the chhattisgarh copies are removed.
is_wrong_state_copy = (
    (outbreaks_mapped.state == 'chhattisgarh')
    & (outbreaks_mapped.district.isin(['bijapur', 'raigarh']))
)
outbreaks_mapped = outbreaks_mapped.drop(outbreaks_mapped[is_wrong_state_copy].index)

In [82]:
outbreaks_mapped.shape

(398, 6)

In [83]:
outbreaks_mapped.drop_duplicates().shape

(329, 6)

In [84]:
outbreaks_mapped.drop_duplicates().reset_index(drop=True).to_file('../data/cholera_outbreaks/monthly_cholera_outbreaks_per_district_2010_2015.shp')