# preprocess cholera outbreaks

In [None]:
import os
import numpy as np
import pandas as pd
import PyPDF2
import re
import geopandas as gpd
import requests
import zipfile
import io

## download cholera outbreaks data

In [None]:
!python download_cholera_outbreaks_data.py

## load pdfs and extract relevant parts

In [None]:
# PDF file names that are excluded from the extraction loop below
# NOTE(review): the "_old" suffix suggests superseded report versions - confirm
files_to_skip = ['52nd_wk_old.pdf',
                 '11th_wk14_old.pdf']

In [None]:
# report years to process, kept as strings: they are used as sub-directory names
# and are later compared with the extracted start-date years
years = [str(year) for year in range(2010, 2016)]

In [None]:
path = '../data/cholera_outbreaks/'

In [None]:
%%time

# Collect, for every token containing 'cholera' in every report page, a fixed
# window of surrounding tokens. Each record has 10 entries:
#   positions 0-2: the three tokens before the match
#   position  3  : the matching token
#   positions 4-7: the four tokens after the match
#   position  8  : always None (padding to 9 entries)
#   position  9  : '<year>/<file name>' of the source PDF
outbreaks_raw = []

for year in years:
    print('Processing year: {}...'.format(year))
    (_, _, file_names) = next(os.walk(path+year))
    # drop files to skip from file names (setdiff1d also sorts and de-duplicates)
    file_names = np.setdiff1d(file_names, files_to_skip)

    for file in file_names:
        print('Processing file: {}...'.format(file))
        document = PyPDF2.PdfFileReader(path+year+'/'+file)
        pages = document.getNumPages()

        for page in range(pages):
            text = document.getPage(page).extractText()
            text = text.replace('\n', '') # remove line breaks
            # split on space, remove empty items and lower case the rest
            text = [i.lower() for i in text.split(' ') if i != '']

            for index, token in enumerate(text):
                if 'cholera' not in token:
                    continue

                outbreak_info = []
                for position in range(index-3, index+5):
                    if position < 0:
                        # Before the start of the page: a negative index would
                        # silently wrap around to the END of the page. An empty
                        # string keeps the column alignment and is safe for the
                        # later cleaning of col0-col2, which calls len() on it.
                        outbreak_info.append('')
                    elif position < len(text):
                        outbreak_info.append(text[position])
                    else:
                        # past the end of the page
                        outbreak_info.append(None)

                while len(outbreak_info) < 9:
                    outbreak_info.append(None)
                outbreak_info.append(year+'/'+file)
                outbreaks_raw.append(outbreak_info)

In [None]:
outbreaks_raw = pd.DataFrame(outbreaks_raw)

In [None]:
outbreaks_raw.shape

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
outbreaks_raw.head()

In [None]:
# name the positional columns col0, col1, ... so they can be addressed by attribute
column_names = ['col'+str(position) for position in range(outbreaks_raw.shape[1])]
outbreaks_raw.columns = column_names

In [None]:
outbreaks_raw.head()

In [None]:
# report for every column position whether the exact token 'cholera' occurs in it
for position in range(len(outbreaks_raw.columns)):
    distinct_values = outbreaks_raw.iloc[:, position].unique()
    print('Column {} contains cholera: {}'.format(position, 'cholera' in distinct_values))

In [None]:
outbreaks_raw[outbreaks_raw.col0 == 'cholera']

In [None]:
outbreaks_raw[outbreaks_raw.col3 == 'cholera']

In [None]:
outbreaks_raw[outbreaks_raw.col4 == 'cholera']

## clean main cholera column

In [None]:
outbreaks_raw.col3.value_counts()

In [None]:
def _normalise_cholera(token):
    # every token that contains 'cholera' is reduced to the plain word
    if 'cholera' in token:
        return 'cholera'
    return token

outbreaks_raw['col3_clean'] = outbreaks_raw.col3.apply(_normalise_cholera)

In [None]:
outbreaks_raw.col3_clean.value_counts()

## filter outbreaks

In [None]:
# next to where a cholera outbreak is mentioned there should be a number indicating the cases
def _is_outbreak(value):
    # 0 when the token following 'cholera' contains an ASCII letter, otherwise 1;
    # a missing token is stringified first, so it contains letters and yields 0
    if re.search('[a-zA-Z]', str(value)):
        return 0
    return 1

outbreaks_raw['outbreak'] = outbreaks_raw.col4.apply(_is_outbreak)

In [None]:
# keep only the rows flagged as outbreaks; copy so later column assignments do not touch outbreaks_raw
outbreaks = outbreaks_raw[outbreaks_raw.outbreak == 1].copy().reset_index(drop=True)

In [None]:
outbreaks.shape

In [None]:
outbreaks.head()

In [None]:
outbreaks.tail()

## extract and clean start dates

In [None]:
# next to where cholera outbreak cases are mentioned there should be a date
def _clean_date_token(value):
    # tokens with ASCII letters or fewer than 4 characters are no date candidates;
    # the remaining tokens lose their '(', ')' and '*' characters
    as_text = str(value)
    if re.search('[a-zA-Z]', as_text) is not None:
        return None
    if len(as_text) < 4:
        return None
    return value.replace('(', '').replace(')', '').replace('*', '')

for col in outbreaks.columns[5:8]:
    outbreaks[col+'_clean'] = outbreaks[col].apply(_clean_date_token)

In [None]:
outbreaks.head()

In [None]:
# per row, take the first value from column position 12 onwards that is not None
# (at this point these are the cleaned date candidate columns created above)
start_dates = [
    next((candidate for candidate in row[12:] if candidate is not None), None)
    for _, row in outbreaks.iterrows()
]

In [None]:
outbreaks['start_date'] = start_dates

In [None]:
outbreaks.start_date.unique()

In [None]:
# drop rows for which no start date candidate was found
outbreaks = outbreaks[outbreaks.start_date.notnull()].copy().reset_index(drop=True)

In [None]:
def _date_component(date, position):
    """Return one component of a date string.

    The date is split on the first separator it contains, checked in the
    order '.', '/', '-'. A date without any of these separators is returned
    unchanged. If the split yields too few components, None is returned
    instead of raising IndexError, so one malformed date no longer aborts
    the whole column.
    """
    for separator in ('.', '/', '-'):
        if separator in date:
            parts = date.split(separator)
            return parts[position] if position < len(parts) else None
    return date

# the year is the third component of the start date
outbreaks['start_date_year'] = outbreaks.start_date.apply(lambda x: _date_component(x, 2))

In [None]:
# normalise years to four characters in two steps: left-pad with '0' to three
# characters, then left-pad with '2' to four characters ('13' -> '013' -> '2013');
# values that already have four or more characters are left unchanged
outbreaks['start_date_year'] = outbreaks.start_date_year.str.pad(3, side='left',fillchar='0')
outbreaks['start_date_year'] = outbreaks.start_date_year.str.pad(4, side='left',fillchar='2')

In [None]:
outbreaks['start_date_year'].unique()

In [None]:
outbreaks[outbreaks.start_date_year == '2019']

In [None]:
# reassign start year '2019' to '2015'
# NOTE(review): presumably a typo in the source reports (only 2010-2015 are processed) - confirm
outbreaks.loc[outbreaks.start_date_year == '2019', 'start_date_year'] = '2015'

In [None]:
def _start_month(date):
    # second component of a date separated by '.', '/' or '-' (checked in that order);
    # a date without any of these separators is returned unchanged
    if '.' in date:
        return date.split('.')[1]
    if '/' in date:
        return date.split('/')[1]
    if '-' in date:
        return date.split('-')[1]
    return date

outbreaks['start_date_month'] = outbreaks.start_date.apply(_start_month)

In [None]:
outbreaks['start_date_month'] = outbreaks.start_date_month.str.pad(2, side='left',fillchar='0')

In [None]:
outbreaks['start_date_month'].unique()

In [None]:
outbreaks.shape

In [None]:
# keep only outbreaks whose (string) start year is one of the processed report years
outbreaks = outbreaks[outbreaks.start_date_year.isin(years)].copy().reset_index(drop=True)

In [None]:
outbreaks.shape

In [None]:
outbreaks.head()

In [None]:
outbreaks.tail()

## map start date month to season

In [None]:
# season per zero-padded month number, written down as one month group per season
_months_per_season = [('winter', ['01', '02']),
                      ('pre_monsoon', ['03', '04', '05']),
                      ('monsoon', ['06', '07', '08', '09']),
                      ('post_monsoon', ['10', '11', '12'])]

seasons = {month: season
           for season, months in _months_per_season
           for month in months}

In [None]:
outbreaks['season'] = outbreaks.start_date_month.map(seasons)

In [None]:
outbreaks.head()

In [None]:
outbreaks.tail()

## map states, districts and location (geometry)

In [None]:
!wget --mirror --continue --no-host-directories https://biogeo.ucdavis.edu/data/gadm3.6/shp/gadm36_IND_shp.zip --directory-prefix=../data/cholera_outbreaks

In [None]:
!mv ../data/cholera_outbreaks/data/gadm3.6/shp/* ../data/cholera_outbreaks

In [None]:
!rm -rf ../data/cholera_outbreaks/data

In [None]:
!unzip -d ../data/cholera_outbreaks gadm36_IND_shp.zip

In [None]:
path = '../data/cholera_outbreaks/gadm36_IND_shp/'

In [None]:
file = 'gadm36_IND_2.shp'

In [None]:
india = gpd.read_file(path+file)

In [None]:
india.shape

In [None]:
india.info()

In [None]:
india.head()

In [None]:
states_districts = india[['NAME_1', 'NAME_2', 'geometry']].copy()

In [None]:
states_districts.columns = ['state', 'district', 'geometry']

In [None]:
# lower-case the names so they are comparable with the lower-cased tokens extracted from the PDFs
states_districts['state'] = states_districts['state'].str.lower()
states_districts['district'] = states_districts['district'].str.lower()

In [None]:
districts = states_districts.district.unique().tolist()

In [None]:
outbreaks.head()

In [None]:
# clean the three tokens that precede 'cholera' (candidate location names)
for col in outbreaks.columns[0:3]:
    # Remove every non-word character. The pattern is a regular expression, so it
    # is written as a raw string and regex=True is passed explicitly: pandas
    # versions that default to regex=False would otherwise look for the literal
    # text '\W' and clean nothing.
    outbreaks[col+'_clean'] = outbreaks[col].str.replace(r'\W', '', regex=True)
    # discard tokens shorter than 3 characters; missing values also become None
    # instead of failing in len()
    outbreaks[col+'_clean'] = outbreaks[col+'_clean'].apply(
        lambda x: x if isinstance(x, str) and len(x) >= 3 else None)

In [None]:
# Spelling corrections: reported name -> name used in the district list the
# tokens are matched against. Kept as tables so that a reported name can only
# be mapped once.
# NOTE(review): the original code assigned 'sibsagar' twice ('sivasagar', then
# 'kabeerdham'); the second assignment could never match and was removed. It
# may have been meant for another spelling of kabeerdham - confirm.

# correct district names in col1_clean
col1_corrections = {
    'mahabubnagar': 'mahbubnagar',
    'ahmedabad': 'ahmadabad',
    'howrah': 'haora',
    'hooghly': 'hugli',
    'hoogly': 'hugli',
    'villupuram': 'viluppuram',
    'haridwar': 'hardwar',
    'davangere': 'davanagere',
    'davengere': 'davanagere',
    'davangare': 'davanagere',
    'tiruchirapalli': 'tiruchirappalli',
    'darang': 'darrang',
    'virudhunager': 'virudunagar',
    'chikkaballapur': 'chikballapura',
    'gondia': 'gondiya',
    'purulia': 'puruliya',
    'kalaburagi': 'gulbarga',
    'kalburgi': 'gulbarga',
    'berhampur': 'ganjam',
    'sholapur': 'solapur',
    'raigad': 'raigarh',
    'panchmahal': 'panch mahals',
    'sibsagar': 'sivasagar',
    'banaskantha': 'banas kantha',
    'chamarajnagar': 'chamrajnagar',
    'khargaon': 'west nimar',
    'mysuru': 'mysore',
    'mohali': 'sahibzada ajit singh nagar',
    'delhi': 'west',
    'kawardha': 'kabeerdham',
    'chirtadurga': 'chitradurga',
    'budgam': 'badgam',
    'gulburga': 'gulbarga',
    'jangir': 'janjgir-champa',
    'kancheepuramsaidapet': 'kancheepuram',
}

# correct district names in col2_clean
col2_corrections = {
    'thiruvannamalai': 'tiruvannamalai',
    'raigad': 'raigarh',
    'davangere': 'davanagere',
    'sibsagar': 'sivasagar',
    'sabarkantha': 'sabar kantha',
}

for column, corrections in (('col1_clean', col1_corrections),
                            ('col2_clean', col2_corrections)):
    for reported, corrected in corrections.items():
        outbreaks.loc[outbreaks[column] == reported, column] = corrected

In [None]:
outbreaks.head()

In [None]:
outbreaks.columns[18:]

In [None]:
def _match_district(name, known_districts):
    """Return the known district that belongs to a cleaned token, or None.

    An exact match wins. Only when there is none, the first known district
    whose name contains the token is returned. Taking the first containing
    name directly would map a token such as 'west' to whichever longer
    district name happens to come first in the list. Values that are not
    strings (None, NaN) yield None.
    """
    if not isinstance(name, str):
        return None
    if name in known_districts:
        return name
    return next((d for d in known_districts if name in str(d)), None)

# Only the cleaned location tokens are matched. The previous positional slice
# columns[18:] also included the 'season' column and would grow on a re-run.
for col in ['col0_clean', 'col1_clean', 'col2_clean']:
    outbreaks[col+'_district'] = outbreaks[col].apply(lambda x: _match_district(x, districts))

In [None]:
outbreaks.head()

In [None]:
district_cols = [col for col in outbreaks.columns if 'district' in col]

In [None]:
# per row, keep the first district candidate that is not None
# (candidates are checked in the order of district_cols)
outbreaks_districts = [
    next((candidate for candidate in row[district_cols] if candidate is not None), None)
    for _, row in outbreaks.iterrows()
]

In [None]:
outbreaks['district'] = outbreaks_districts

In [None]:
# manually map missing districts
# each entry: (col4 value, col6 value, col7 value, district); None means that
# the column is not part of the condition for that entry
manual_districts = [
    ('110', '25/10/13', None, 'bankura'),
    ('50', '25/10/13', '28/10/13', 'bankura'),
    ('23', '23/10/13', '23/10/13', 'puruliya'),
    ('13', '31/07/13', '31/07/13', 'puruliya'),
    ('33', '01/06/13', '08/06/13', 'chitradurga'),
    ('88', '05/05/13', None, 'sangli'),
    ('60', '01/05/13', '02/05/13', 'puruliya'),
    ('36', '19/04/13', None, 'davanagere'),
    ('161', '27/08/12', None, 'nagpur'),
    ('73', '22/07/12', '23/07/12', 'bankura'),
    ('19', '21/07/12', '21/07/12', 'chikmagalur'),
    ('21', '06/07/12', '12/07/12', 'birbhum'),
    ('59', '25/05/12', '25/05/12', 'tumkur'),
    ('8', '03/05/12', '07/05/12', 'wayanad'),
    ('26', None, '16.08.11', 'birbhum'),
    ('86', '25/10/13', '28/10/13', 'bankura'),
    ('25', '19/05/12', None, 'mandya'),
    ('12', '13.03.12', '14.03.12', 'mysore'),
]

# entries are applied in the listed order
for col4_value, col6_value, col7_value, district_name in manual_districts:
    selection = outbreaks.col4 == col4_value
    if col6_value is not None:
        selection = selection & (outbreaks.col6 == col6_value)
    if col7_value is not None:
        selection = selection & (outbreaks.col7 == col7_value)
    outbreaks.loc[selection, 'district'] = district_name

In [None]:
outbreaks.head()

In [None]:
outbreaks.tail()

In [None]:
outbreaks.shape

In [None]:
# drop outbreaks for which no district could be determined
outbreaks = outbreaks[outbreaks.district.notnull()]

In [None]:
outbreaks.shape

In [None]:
# attach state and geometry to every outbreak via the district name; the right
# join keeps every outbreak row, and a district name that occurs more than once
# in states_districts yields one row per occurrence
output_columns = ['state', 'district', 'start_date_year', 'start_date_month', 'season', 'outbreak', 'geometry']
outbreaks_mapped = pd.merge(states_districts, outbreaks, how='right', on='district')
outbreaks_mapped = outbreaks_mapped[output_columns].reset_index(drop=True)

In [None]:
outbreaks_mapped.shape

In [None]:
outbreaks_mapped

In [None]:
# district names that occur more than once in states_districts
district_counts = states_districts.district.value_counts()
duplicate_districts = district_counts[district_counts > 1].index.tolist()

In [None]:
outbreaks_mapped[outbreaks_mapped.district.isin(duplicate_districts)].sort_values('district')

In [None]:
outbreaks[(outbreaks.district == 'bijapur') & (outbreaks.start_date_month == '07')] # karnataka

In [None]:
outbreaks[(outbreaks.district == 'bijapur') & (outbreaks.start_date_month == '08')] # karnataka

In [None]:
outbreaks[(outbreaks.district == 'raigarh') & (outbreaks.start_date_month == '07')] # maharashtra

In [None]:
outbreaks[(outbreaks.district == 'raigarh') & (outbreaks.start_date_month == '03')] # maharashtra

In [None]:
# the bijapur and raigarh outbreaks inspected above were attributed to karnataka
# and maharashtra, so the chhattisgarh rows with the same district names are dropped
in_chhattisgarh = outbreaks_mapped.state == 'chhattisgarh'
ambiguous_name = outbreaks_mapped.district.isin(['bijapur', 'raigarh'])
rows_to_drop = outbreaks_mapped[in_chhattisgarh & ambiguous_name].index
outbreaks_mapped.drop(rows_to_drop, inplace=True)

In [None]:
outbreaks_mapped.shape

In [None]:
outbreaks_mapped.drop_duplicates().shape

In [None]:
# write the de-duplicated outbreaks (one row per state, district, year, month) as a shapefile
# NOTE(review): shapefile field names are limited to 10 characters, so 'start_date_year' and
# 'start_date_month' will presumably be truncated on write - confirm what readers of this file expect
outbreaks_mapped.drop_duplicates().reset_index(drop=True).to_file('../data/cholera_outbreaks/monthly_cholera_outbreaks_per_district_2010_2015.shp')