# load pdfs and extract relevant text

In [None]:
import os
import numpy as np
import pandas as pd
import PyPDF2
import re

In [None]:
# PDF file names that are excluded from processing in every year folder.
files_to_skip = [
    '52nd_wk_old.pdf',
    '11th_wk14_old.pdf',
]

In [None]:
# Year folders to process, as strings (2010 through 2015 inclusive).
years = [str(year_number) for year_number in range(2010, 2016)]

In [None]:
# Base folder of the PDFs; a year string is appended to it to locate that
# year's sub-folder (relative to the current working directory).
path = 'data/cholera_outbreaks/'

In [None]:
%%time

# One entry per 'cholera' hit: the surrounding words followed by the
# 'year/file' the hit was found in.
outbreaks_raw = []

# Number of words kept on either side of each hit.
words_before = 5
words_after = 10

for year in years:
    print('Processing year: {}...'.format(year))
    # Only the files directly inside the year folder are considered.
    (_, _, file_names) = next(os.walk(path+year))
    file_names = np.setdiff1d(file_names, files_to_skip)

    for file in file_names:
        print('Processing file: {}...'.format(file))
        document = PyPDF2.PdfFileReader(path+year+'/'+file)
        pages = document.getNumPages()

        for page in range(pages):
            text = document.getPage(page).extractText()
            text = re.sub('\n', '', text)
            # Lower-cased, non-empty, space-separated tokens of the page.
            text = [word.lower() for word in text.split(' ') if word != '']

            indices = [i for i, s in enumerate(text) if 'cholera' in s]

            for index in indices:
                # Leading context. Positions before the start of the page are
                # padded with None rather than looked up: a negative position
                # would otherwise wrap around and pull words from the END of
                # the page. Padding also keeps the hit at position 5.
                outbreak_info = [text[i] if i >= 0 else None
                                 for i in range(index - words_before, index)]
                # The hit itself plus the following words; the slice simply
                # stops at the end of the page, so these rows are shorter.
                outbreak_info.extend(text[index:index + words_after + 1])
                outbreak_info.append(year+'/'+file)
                outbreaks_raw.append(outbreak_info)

In [None]:
# Turn the collected context windows into a table, one row per 'cholera' hit.
# Rows can differ in length (windows are cut short at the end of a page), so
# shorter rows end up with missing values in their trailing columns.
outbreaks_raw = pd.DataFrame(outbreaks_raw)

In [None]:
outbreaks_raw.shape

In [None]:
outbreaks_raw.head()

In [None]:
# Positional column labels: 'col0', 'col1', ... one per column of the table.
column_count = outbreaks_raw.shape[1]
column_names = ['col' + str(position) for position in range(column_count)]

In [None]:
outbreaks_raw.columns = column_names

In [None]:
outbreaks_raw.columns

In [None]:
outbreaks_raw.head()

In [None]:
# Collapse every token that merely contains 'cholera' (e.g. with attached
# punctuation) to the bare word, cell by cell.
# The previous version referenced an undefined `df`; and DataFrame.apply hands
# whole columns to the function, so the substring test never ran per cell.
# Non-string cells (missing values) are passed through unchanged.
# NOTE(review): the source-file entry stored in each row is a string too; if a
# PDF name ever contains 'cholera' it would be collapsed as well - confirm.
outbreaks_raw = outbreaks_raw.apply(
    lambda column: column.map(
        lambda x: 'cholera' if isinstance(x, str) and 'cholera' in x else x))

In [None]:
# Report, for every column position, whether the exact value 'cholera'
# occurs among that column's distinct values.
for position in range(outbreaks_raw.shape[1]):
    distinct_values = outbreaks_raw.iloc[:, position].unique()
    has_cholera = 'cholera' in distinct_values
    print('Column {} contains cholera: {}'.format(position, has_cholera))

In [None]:
outbreaks_raw[outbreaks_raw.col1 == 'cholera']

In [None]:
outbreaks_raw[outbreaks_raw.col2 == 'cholera']

In [None]:
outbreaks_raw[outbreaks_raw.col6 == 'cholera']

In [None]:
outbreaks_raw[outbreaks_raw.col12 == 'cholera']

In [None]:
outbreaks_raw[outbreaks_raw.col14 == 'cholera']

In [None]:
outbreaks_raw[outbreaks_raw.col15 == 'cholera']

In [None]:
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)

# clean cholera column

In [None]:
outbreaks_raw[outbreaks_raw.col5 == 'cholera'].shape

In [None]:
outbreaks_raw.col5.value_counts()

In [None]:
def _collapse_cholera(token):
    """Return 'cholera' for any token containing it, else the token itself."""
    if 'cholera' in token:
        return 'cholera'
    return token


# Cleaned copy of col5 with every cholera-containing token collapsed.
outbreaks_raw['col5_clean'] = outbreaks_raw['col5'].apply(_collapse_cholera)

In [None]:
outbreaks_raw.col5_clean.unique()

In [None]:
outbreaks_raw[outbreaks_raw.col5_clean == 'cholera'].shape

# filter outbreaks

In [None]:
def _has_no_letters(token):
    """Return 1 when the token contains no ASCII letter, otherwise 0."""
    if re.search('[a-zA-Z]', token):
        return 0
    return 1


# Flag rows whose token right after the hit (col6) is free of letters.
outbreaks_raw['outbreak'] = outbreaks_raw['col6'].apply(_has_no_letters)

In [None]:
# Independent copy of the flagged rows, so later column additions do not
# touch outbreaks_raw.
is_outbreak = outbreaks_raw['outbreak'] == 1
outbreaks = outbreaks_raw[is_outbreak].copy()

In [None]:
outbreaks.shape

# extract start dates

In [None]:
outbreaks.columns

In [None]:
def _token_to_date_digits(token):
    """Reduce a token to its digits and keep it only if it has 6-8 of them.

    A 6-digit result gets '20' inserted after its fourth digit (presumably
    expanding a ddmmyy date to ddmmyyyy - confirm against the PDFs); 7- and
    8-digit results are returned as they are; anything else yields None.
    """
    digits = re.sub('[^0-9]', '', str(token))
    if not 6 <= len(digits) <= 8:
        return None
    if len(digits) == 6:
        return digits[:4] + '20' + digits[4:]
    return digits


# Add a '<name>_clean' column of candidate date strings for each of the
# columns at positions 6 to 16.
# NOTE(review): for full-length rows position 16 appears to hold the
# 'year/file' entry, whose digits can also pass the 6-8 digit filter - confirm.
for col in outbreaks.columns[6:17]:
    outbreaks[col+'_clean'] = outbreaks[col].apply(_token_to_date_digits)

In [None]:
outbreaks.columns

In [None]:
# For every row, take the first non-None value among the columns from
# position 19 onwards; None when the row has no such value.
start_dates = []
for _, record in outbreaks.iterrows():
    candidates = record[19:]
    first_found = None
    for value in candidates:
        if value is not None:
            first_found = value
            break
    start_dates.append(first_found)

In [None]:
outbreaks['start_date'] = start_dates

In [None]:
outbreaks.start_date.unique()

# map outbreak month to season

In [None]:
# Characters 2-3 of the start date are taken as the month.
# Rows for which no start date was found hold None; they are given a None
# month instead of raising a TypeError on the slice.
# NOTE(review): a 7-digit start date does not have the month at this
# position - confirm how such values should be handled.
outbreaks['outbreak_month'] = outbreaks.start_date.apply(
    lambda x: x[2:4] if isinstance(x, str) else None)

In [None]:
outbreaks.head()

In [None]:
# Season -> two-digit month strings; inverted below into the month -> season
# lookup that is used for mapping.
_months_by_season = (
    ('winter', ('01', '02')),
    ('pre_monsoon', ('03', '04', '05')),
    ('monsoon', ('06', '07', '08', '09')),
    ('post_monsoon', ('10', '11', '12')),
)

seasons = {month: season
           for season, months in _months_by_season
           for month in months}

In [None]:
outbreaks['season'] = outbreaks.outbreak_month.map(seasons)

In [None]:
outbreaks.head()

# map states and districts

In [1]:
import geopandas as gpd
import requests
import zipfile
import io

In [None]:
import os

In [8]:
os.listdir()

['extract_outbreaks_from_pdfs.ipynb',
 'data',
 '.ipynb_checkpoints',
 'download_cholera_outbreaks_pdfs.py',
 'ijerph-17-09378-v2.pdf',
 'download_cholera_outbreaks_pdfs.ipynb']

In [9]:
# NOTE: this changes the working directory for the rest of the session, so
# relative paths that start with 'data/' (such as `path` defined earlier in
# this notebook) no longer resolve if those cells are re-run afterwards.
os.chdir('data')

In [10]:
os.listdir()

['gadm36_IND_shp', 'gadm36_IND_shp.zip', 'cholera_outbreaks']

In [11]:
directory = 'gadm36_IND_shp/'

In [12]:
file = 'gadm36_IND_2.shp'

In [14]:
india = gpd.read_file(directory+file)

In [15]:
# Print the table's dimensions and coordinate reference system, then show
# its last rows.
for template, value in (("Shape of the dataframe: {}", india.shape),
                        ("Projection of dataframe: {}", india.crs)):
    print(template.format(value))
india.tail()  # last 5 records in dataframe

Shape of the dataframe: (666, 14)
Projection of dataframe: {'init': 'epsg:4326'}


Unnamed: 0,GID_0,NAME_0,GID_1,NAME_1,NL_NAME_1,GID_2,NAME_2,VARNAME_2,NL_NAME_2,TYPE_2,ENGTYPE_2,CC_2,HASC_2,geometry
661,IND,India,IND.36_1,West Bengal,,IND.36.16_1,Pashchim Medinipur,Paschim Medinipur,,District,District,,IN.WB.WM,"POLYGON ((87.22874 21.95608, 87.22562 21.95712..."
662,IND,India,IND.36_1,West Bengal,,IND.36.17_1,Purba Medinipur,Purba Medinipur,,District,District,,IN.WB.EM,"MULTIPOLYGON (((87.98972 22.21750, 87.99028 22..."
663,IND,India,IND.36_1,West Bengal,,IND.36.18_1,Puruliya,,,District,District,,IN.WB.PU,"POLYGON ((85.88916 23.15176, 85.88641 23.15335..."
664,IND,India,IND.36_1,West Bengal,,IND.36.19_1,South 24 Parganas,,,District,District,,IN.WB.PS,"MULTIPOLYGON (((88.01861 21.57278, 88.01889 21..."
665,IND,India,IND.36_1,West Bengal,,IND.36.20_1,Uttar Dinajpur,,,District,District,,IN.WB.UD,"MULTIPOLYGON (((88.45434 25.66317, 88.45428 25..."


In [16]:
# Keep only the two name columns of the shapefile table: NAME_1 and NAME_2
# (relabelled 'State' and 'District' in a later cell).
states_districts = india[['NAME_1', 'NAME_2']]

In [19]:
states_districts.columns = ['State', 'District']

In [20]:
states_districts.State.nunique()

36

In [21]:
states_districts.District.nunique()

659

In [18]:
states_districts.drop_duplicates()

Unnamed: 0,NAME_1,NAME_2
0,Andaman and Nicobar,Nicobar Islands
1,Andaman and Nicobar,North and Middle Andaman
2,Andaman and Nicobar,South Andaman
3,Andhra Pradesh,Anantapur
4,Andhra Pradesh,Chittoor
...,...,...
661,West Bengal,Pashchim Medinipur
662,West Bengal,Purba Medinipur
663,West Bengal,Puruliya
664,West Bengal,South 24 Parganas


In [22]:
states_districts.State.value_counts()

Uttar Pradesh             75
Madhya Pradesh            51
Bihar                     38
Maharashtra               36
Gujarat                   33
Rajasthan                 33
Tamil Nadu                32
Odisha                    30
Karnataka                 30
Assam                     27
Chhattisgarh              27
Jharkhand                 24
Jammu and Kashmir         22
Punjab                    22
Haryana                   21
West Bengal               20
Arunachal Pradesh         18
Kerala                    14
Uttarakhand               13
Andhra Pradesh            13
Himachal Pradesh          12
Nagaland                  11
Telangana                 10
Meghalaya                 10
Manipur                    9
Mizoram                    8
Tripura                    8
Sikkim                     4
Puducherry                 4
Andaman and Nicobar        3
Goa                        2
Daman and Diu              2
NCT of Delhi               1
Dadra and Nagar Haveli     1
Chandigarh    

In [23]:
states_districts.District.value_counts()

Balrampur       2
Pratapgarh      2
Raigarh         2
Aurangabad      2
Hamirpur        2
               ..
Yadgir          1
West Tripura    1
Kishanganj      1
Tapi            1
Pilibhit        1
Name: District, Length: 659, dtype: int64

In [25]:
states_districts[states_districts.District == 'Balrampur']

Unnamed: 0,State,District
102,Chhattisgarh,Balrampur
569,Uttar Pradesh,Balrampur


In [None]:
# TODO: map states and districts (concatenate col0 - col4, then remove strings that are neither a state nor a district name)
# TODO: aggregate outbreaks by district and month