# Preprocess cholera outbreaks

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import geopandas as gpd

In [2]:
years = list(np.arange(2010, 2019))

In [3]:
path = '../data/cholera_outbreaks'

## Identify files supposedly containing outbreaks

In [4]:
%%time

files_with_outbreaks = []

# loop through years
for year in years:
    
    print('Processing {}...'.format(year))
    
    # set counter
    counter = 0
    
    # get all text file paths
    files = glob.glob(os.path.join(path, str(year), '*.txt'))
    
    # loop through files
    for file in files:
        
        # read text file
        with open(file) as f:
            lines = f.readlines()
            
        # loop through lines and check whether cholera is mentioned
        for line in lines:
            if 'cholera' in line.lower():
                files_with_outbreaks.append(file)
                counter += 1
                break
    
    print('Found {} files with outbreaks.'.format(counter))

Processing 2010...
Found 39 files with outbreaks.
Processing 2011...
Found 45 files with outbreaks.
Processing 2012...
Found 42 files with outbreaks.
Processing 2013...
Found 46 files with outbreaks.
Processing 2014...
Found 41 files with outbreaks.
Processing 2015...
Found 44 files with outbreaks.
Processing 2016...
Found 52 files with outbreaks.
Processing 2017...
Found 52 files with outbreaks.
Processing 2018...
Found 47 files with outbreaks.
CPU times: user 153 ms, sys: 11.3 ms, total: 164 ms
Wall time: 163 ms


In [5]:
len(files_with_outbreaks)

408

In [6]:
strings_to_remove = ['xi.',
                     'xii.',
                     'vi.',
                     'iii.',
                     'xiv.',
                     'xliv.',
                     'ix.',
                     'v.',
                     'i.',
                     'xv.',
                     'ii.',
                     'xxxiv.',
                     'xlv.',
                     'i',
                     'ix',
                     'xxxvii.',
                     'xvii.',
                     'x',
                     'xviii.',
                     'xvi.',
                     'xliii.',
                     'xxxii.',
                     'xxix',
                     'xiii.',
                     'xxix.',
                     'ii',
                     'xxi.',
                     'x.',
                     'viii.',
                     'xxxii',
                     'iv.',
                     'xxxi.',
                     'xxx.']

In [7]:
%%time

outbreaks_raw = pd.DataFrame(columns=['district', 'cholera_mention', 'start_date', 'cholera_index', 'file'])

# loop through files with outbreaks
for file in files_with_outbreaks:
    
    # read text file
    with open(file) as f:
        lines = f.readlines()
    
    # remove any leading and trailing whitespace
    lines = [line.strip() for line in lines]
    
    # remove empty elements
    lines = [line for line in lines if line != '']
    
    # remove certain strings
    lines = [line for line in lines if line not in strings_to_remove]
    
    # get indices where cholera is mentioned
    indices = [index for index, line in enumerate(lines) if 'cholera' in line.lower()]
    
    # loop through extracted indices and extract data
    for index in indices:
        
        outbreak = []
        
        # district is typically 1 line above cholera mention
        district_index = index - 1
        
        # start date is typically 2 lines below cholera mention for data from 2010 and 2011 and 3 lines below for data since 2012
        if int(file[-8:-4]) in [2010, 2011]:
            start_date_index = index + 2
        else:
            start_date_index = index + 3
        
        # district
        outbreak.append(lines[district_index].lower())
        
        # cholera mention
        outbreak.append(lines[index].lower())
        
        # start date (check whether text file is long enough)
        if start_date_index < len(lines):
            outbreak.append(lines[start_date_index])
        else:
            outbreak.append('')
        
        # cholera index
        outbreak.append(index)
        
        # file
        outbreak.append(file.split('/')[-1])
        
        # append data to existing dataframe
        outbreaks_raw.loc[len(outbreaks_raw)] = outbreak

CPU times: user 10.6 s, sys: 22.4 ms, total: 10.6 s
Wall time: 10.6 s


In [8]:
outbreaks_raw.shape

(2075, 5)

In [9]:
outbreaks_raw

Unnamed: 0,district,cholera_mention,start_date,cholera_index,file
0,and one stool sample collected. all three wate...,unfit for drinking and stool sample was negati...,"Outbreak reported from G.Hosakote village, ID ...",89,42nd_2010.txt
1,the outbreak. for laboratory confirmation thre...,found to be not potable and two stool samples ...,"Cases reported from Manmad PHC, Talatal due to...",205,42nd_2010.txt
2,investigated the outbreak. out of four stool s...,tested positive for cholera. daily house to ho...,disinfection of water sources and overhead tan...,182,8th_2010.txt
3,control,cholera,03.07.10,90,27th_2010.txt
4,locality and sent for lab testing on daily bas...,"karnal, out of which one found positive for vi...",distributed in the locality and villagers advi...,136,27th_2010.txt
...,...,...,...,...,...
2070,sent to dphl nadiad; was negative for,cholera. cases are due to food consumed,informed for necessary actions. Health camp,166,17th_2018.txt
2071,report awaited. stool samples were negative for,cholera and all water samples were nonpotable.,Relevant health education given.,188,17th_2018.txt
2072,north district,cholera,17-04-18,905,17th_2018.txt
2073,akola,cholera,17-04-18,993,17th_2018.txt


## Filter outbreaks

In [10]:
characters_to_remove = ['?', '.', ' ', 'i', 'v', 'x']
for character in characters_to_remove:
    outbreaks_raw['cholera_mention'] = outbreaks_raw['cholera_mention'].apply(lambda x: x.replace(character, ''))

In [11]:
outbreaks_raw['cholera_mention'] = outbreaks_raw['cholera_mention'].apply(lambda x: x[1:] if x[0] == 'l' else x)

In [12]:
outbreaks_raw['cholera_mention'] = outbreaks_raw['cholera_mention'].apply(lambda x: x[:-1] if x[-1] == 'e' else x)

In [13]:
# manually inspect later
outbreaks_raw[outbreaks_raw['cholera_mention'].str.len() > len('cholera')].to_csv('manual_inspection_1.csv', index=False)
outbreaks_raw[outbreaks_raw['cholera_mention'].str.len() > len('cholera')].shape

(1235, 5)

In [14]:
outbreaks_filtered = outbreaks_raw[outbreaks_raw['cholera_mention'].str.len() == len('cholera')].reset_index(drop=True)
outbreaks_filtered

Unnamed: 0,district,cholera_mention,start_date,cholera_index,file
0,control,cholera,03.07.10,90,27th_2010.txt
1,navsari,cholera,24.06.10,447,27th_2010.txt
2,puducherry,cholera,29.06.10,473,27th_2010.txt
3,nadia,cholera,27.06.10,481,27th_2010.txt
4,control,cholera,06.07.10,147,32nd_2010.txt
...,...,...,...,...,...
835,chickenpox,cholera,Dengue,41,40th_2018.txt
836,viral fever,cholera,Chickenpox,64,40th_2018.txt
837,hoshiarpur,cholera,02-10-18,223,40th_2018.txt
838,north district,cholera,17-04-18,905,17th_2018.txt


## Clean start dates

In [15]:
outbreaks_filtered['has_start_date'] = outbreaks_filtered['start_date'].apply(lambda x: 1 if sum(c.isdigit() for c in x) > 4 else 0)

In [16]:
# manually inspect later
outbreaks_filtered[outbreaks_filtered['has_start_date'] == 0].drop('has_start_date', axis=1).to_csv('manual_inspection_2.csv', index=False)
outbreaks_filtered[outbreaks_filtered['has_start_date'] == 0].shape

(428, 6)

In [17]:
outbreaks_start_dates = outbreaks_filtered[outbreaks_filtered['has_start_date'] == 1].drop('has_start_date', axis=1).reset_index(drop=True)
outbreaks_start_dates

Unnamed: 0,district,cholera_mention,start_date,cholera_index,file
0,control,cholera,03.07.10,90,27th_2010.txt
1,navsari,cholera,24.06.10,447,27th_2010.txt
2,puducherry,cholera,29.06.10,473,27th_2010.txt
3,nadia,cholera,27.06.10,481,27th_2010.txt
4,control,cholera,06.07.10,147,32nd_2010.txt
...,...,...,...,...,...
407,morena,cholera,27-08-18,317,35th_2018.txt
408,ludhiana,cholera,29-08-18,580,35th_2018.txt
409,hoshiarpur,cholera,02-10-18,223,40th_2018.txt
410,north district,cholera,17-04-18,905,17th_2018.txt


In [18]:
outbreaks_start_dates['start_date'] = outbreaks_start_dates['start_date'].apply(lambda x: x.replace('/', '.').replace('-', '.'))

In [19]:
outbreaks_start_dates['month'] = outbreaks_start_dates['start_date'].apply(lambda x: x.split('.')[1])
outbreaks_start_dates['month'] = outbreaks_start_dates['month'].apply(lambda x: x[1] if x[0] == '0' else x)
outbreaks_start_dates['month'] = outbreaks_start_dates['month'].astype(int)

In [20]:
sorted(outbreaks_start_dates['month'].unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [21]:
outbreaks_start_dates['year'] = outbreaks_start_dates['start_date'].apply(lambda x: x.split('.')[2])
outbreaks_start_dates['year'] = outbreaks_start_dates['year'].str.pad(width=3, side='left', fillchar='0')
outbreaks_start_dates['year'] = outbreaks_start_dates['year'].str.pad(width=4, side='left', fillchar='2')
outbreaks_start_dates['year'] = outbreaks_start_dates['year'].astype(int)

In [22]:
sorted(outbreaks_start_dates['year'].unique())

[2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]

In [23]:
outbreaks_start_dates = outbreaks_start_dates[outbreaks_start_dates['year'] > 2009].reset_index(drop=True)

In [24]:
outbreaks_start_dates

Unnamed: 0,district,cholera_mention,start_date,cholera_index,file,month,year
0,control,cholera,03.07.10,90,27th_2010.txt,7,2010
1,navsari,cholera,24.06.10,447,27th_2010.txt,6,2010
2,puducherry,cholera,29.06.10,473,27th_2010.txt,6,2010
3,nadia,cholera,27.06.10,481,27th_2010.txt,6,2010
4,control,cholera,06.07.10,147,32nd_2010.txt,7,2010
...,...,...,...,...,...,...,...
406,morena,cholera,27.08.18,317,35th_2018.txt,8,2018
407,ludhiana,cholera,29.08.18,580,35th_2018.txt,8,2018
408,hoshiarpur,cholera,02.10.18,223,40th_2018.txt,10,2018
409,north district,cholera,17.04.18,905,17th_2018.txt,4,2018


## Clean districts

In [25]:
outbreaks_start_dates['district'] = outbreaks_start_dates['district'].apply(lambda x: x.replace('(', '').replace(')', ''))

In [26]:
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'ahmedabad', 'district'] = 'ahmadabad'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'ahmednagar', 'district'] = 'ahmadnagar'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'badwani', 'district'] = 'barwani'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'balasore', 'district'] = 'baleshwar'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'banaskantha', 'district'] = 'banas kantha'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'bangalore urban', 'district'] = 'bangalore'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'bardhaman', 'district'] = 'barddhaman'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'belagavi', 'district'] = 'belgaum'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'berhampur', 'district'] = 'ganjam'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'davangere', 'district'] = 'davanagere'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'davengere', 'district'] = 'davanagere'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'gandhi nagar', 'district'] = 'gandhinagar'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'gondia', 'district'] = 'gondiya'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'haridwar', 'district'] = 'hardwar'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'haveli', 'district'] = 'dadra and nagar haveli'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'hooghly', 'district'] = 'hugli'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'hoogly', 'district'] = 'hugli'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'jangir', 'district'] = 'janjgir-champa'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'janjgir', 'district'] = 'janjgir-champa'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'kannada', 'district'] = 'uttara kannada'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'khandwa', 'district'] = 'east nimar'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'mysuru', 'district'] = 'mysore'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'nagar haveli', 'district'] = 'dadra and nagar haveli'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'north delhi', 'district'] = 'west'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'north district', 'district'] = 'west'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'parganas', 'district'] = 'north 24 parganas'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'pargans', 'district'] = 'north 24 parganas'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'purlia', 'district'] = 'puruliya'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'purulia', 'district'] = 'puruliya'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'raigad', 'district'] = 'raigarh'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'rampurhat', 'district'] = 'birbhum'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'sas nagar', 'district'] = 'sahibzada ajit singh nagar'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'sibsagar', 'district'] = 'sivasagar'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'uttar kannada', 'district'] = 'uttara kannada'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'west delhi', 'district'] = 'west'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'gundlupet', 'district'] = 'chamrajnagar'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'howrah', 'district'] = 'haora'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'd&n haveli', 'district'] = 'dadra and nagar haveli'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'gulburga', 'district'] = 'gulbarga'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'villupuram', 'district'] = 'viluppuram'
outbreaks_start_dates.loc[outbreaks_start_dates['district'] == 'chikkaballapur', 'district'] = 'chikballapura'

## Merge outbreaks and district polygons

In [27]:
path = '../data/cholera_outbreaks/gadm36_IND_shp'

In [28]:
file = 'gadm36_IND_2.shp'

In [29]:
india = gpd.read_file(os.path.join(path, file))
india.shape

(666, 14)

In [30]:
# select districts and geometry
districts = india[['NAME_2', 'geometry']].copy()

In [31]:
# rename district column
districts = districts.rename(columns={'NAME_2': 'district'})

In [32]:
# make districts lowercase to simplify the mapping
districts['district'] = districts['district'].str.lower()

In [33]:
districts.shape

(666, 2)

In [34]:
districts['district'].nunique()

659

In [35]:
unique_districts = districts[['district']].drop_duplicates()
unique_districts['match'] = 1
unique_districts.shape

(659, 2)

In [36]:
outbreaks_districts = pd.merge(outbreaks_start_dates, unique_districts, how='left', on='district')
outbreaks_districts

Unnamed: 0,district,cholera_mention,start_date,cholera_index,file,month,year,match
0,control,cholera,03.07.10,90,27th_2010.txt,7,2010,
1,navsari,cholera,24.06.10,447,27th_2010.txt,6,2010,1.0
2,puducherry,cholera,29.06.10,473,27th_2010.txt,6,2010,1.0
3,nadia,cholera,27.06.10,481,27th_2010.txt,6,2010,1.0
4,control,cholera,06.07.10,147,32nd_2010.txt,7,2010,
...,...,...,...,...,...,...,...,...
406,morena,cholera,27.08.18,317,35th_2018.txt,8,2018,1.0
407,ludhiana,cholera,29.08.18,580,35th_2018.txt,8,2018,1.0
408,hoshiarpur,cholera,02.10.18,223,40th_2018.txt,10,2018,1.0
409,west,cholera,17.04.18,905,17th_2018.txt,4,2018,1.0


In [37]:
# manually inspect later
outbreaks_districts[pd.isnull(outbreaks_districts['match'])].drop('match', axis=1).to_csv('manual_inspection_3.csv', index=False)
outbreaks_districts[pd.isnull(outbreaks_districts['match'])].shape

(110, 8)

In [38]:
outbreaks_districts = outbreaks_districts[pd.notnull(outbreaks_districts['match'])].drop('match', axis=1).reset_index(drop=True)
outbreaks_districts

Unnamed: 0,district,cholera_mention,start_date,cholera_index,file,month,year
0,navsari,cholera,24.06.10,447,27th_2010.txt,6,2010
1,puducherry,cholera,29.06.10,473,27th_2010.txt,6,2010
2,nadia,cholera,27.06.10,481,27th_2010.txt,6,2010
3,indore,cholera,21.09.10,217,40th_2010.txt,9,2010
4,solapur,cholera,28.02.10,288,11th_2010.txt,2,2010
...,...,...,...,...,...,...,...
296,morena,cholera,27.08.18,317,35th_2018.txt,8,2018
297,ludhiana,cholera,29.08.18,580,35th_2018.txt,8,2018
298,hoshiarpur,cholera,02.10.18,223,40th_2018.txt,10,2018
299,west,cholera,17.04.18,905,17th_2018.txt,4,2018


The original paper identifies 630 outbreaks from July 2009 to December 2019.

This notebook identifies about 50% of all outbreaks automatically. The rest needs to be manually inspected.