In [35]:
import pandas as pd
import numpy as np
from collections import defaultdict

This is the case-by-case data of Table 7, given in the appendix of "On the Mode of Communication of Cholera" by Dr. Snow. I copied the data into a text file, and the following code converts that data into a tabulated form.

In [42]:
# Path to the plain-text copy of Table 7 (one block of cases per sub-district).
file_name = 'table_7_death_data'

# Read the whole file into memory as a single string.
# The encoding is given explicitly because the text contains non-ASCII
# characters (e.g. "½"); without it, decoding depends on the platform locale.
# NOTE(review): assumes the file was saved as UTF-8 — adjust if it was not.
with open(file_name, "r", encoding="utf-8") as file:
    content = file.read()

In [43]:
# Mapping whose missing keys start out as a fresh empty list.
# `list` is used as the factory instead of a lambda: same behaviour, idiomatic,
# and the resulting defaultdict can be pickled.
case_dict = defaultdict(list)

In [44]:
# Number of chunks that follow a 'District:' marker (the piece before the
# first marker is not counted).
len(content.split('District:')) - 1

32

So we have data for all 32 sub-districts (12 supplied only by the Southwark and Vauxhall company, 16 by both companies, and 4 only by the Lambeth company).

In [45]:
# Rebind `content` from the raw file text to a list with one chunk per
# sub-district (the piece before the first 'District:' marker is discarded).
# The isinstance guard makes this cell safe to re-run: once `content` is
# already a list, calling .split() on it would raise AttributeError.
if isinstance(content, str):
    content = content.split('District:')[1:]

In [46]:
# Peek at the first chunk: a "district: sub-district:" header, then the
# individual cases separated by blank lines.
content[0]

'St. Saviour, Southwark: Christchurch:\n\nAt 34, Charlotte Street, on 29th July, a stock-maker, aged 29, "Asiatic cholera 18 hours". Lambeth.\n\nAt 45, Gravel Lane, on 1st August, the widow of a farmer, aged 48, "cholera 12 hours". Southwark and Vauxhall.\n\nAt 1, Alpha Place, on 1st August, a barrister\'s clerk, aged 57, "cholera 24 hours". Southwark and Vauxhall.\n\n'

In [47]:
# The district name is the text before the first ':' of each chunk.
districts = [chunk.split(':', 1)[0] for chunk in content]
districts

['St. Saviour, Southwark',
 'St. Saviour, Southwark',
 'St. Olave, Southwark',
 'St. Olave, Southwark',
 'Bermondsey',
 'Bermondsey',
 'Bermondsey',
 'St. George, Southwark',
 'St. George, Southwark',
 'St. George, Southwark',
 'Newington',
 'Newington',
 'Newington',
 'Lambeth',
 'Lambeth',
 'Lambeth',
 'Lambeth',
 'Lambeth',
 'Lambeth',
 'Lambeth',
 'Lambeth',
 'Wandsworth',
 'Wandsworth',
 'Wandsworth',
 'Wandsworth',
 'Wandsworth',
 'Camberwell',
 'Camberwell',
 'Camberwell',
 'Camberwell',
 'Rotherhithe',
 'Lewisham']

Now for subdistricts:

In [54]:
# The sub-district name is the second ':'-separated field of each chunk.
# Surrounding whitespace is stripped rather than slicing off the first
# character, so a header without a leading space does not lose a real letter.
sub_districts = [chunk.split(':')[1].strip() for chunk in content]
sub_districts

['Christchurch',
 'St. Saviour',
 'St. Olave',
 'St. John, Horsleydown',
 'St. James',
 'St. Mary Magdalen',
 'Leather Market',
 'Kent Road',
 'Borough Road',
 'London Road',
 'Trinity',
 'St. Peter, Walworth',
 'St. Mary',
 'Waterloo (First Part)',
 'Waterloo (Second Part)',
 'Lambeth Church (First Part)',
 'Lambeth Church (Second Part)',
 'Kennington (First Part)',
 'Kennington (2nd Part)',
 'Brixton',
 'Norwood',
 'Clapham',
 'Battersea',
 'Wandsworth',
 'Putney',
 'Streatham',
 'Dulwich',
 'Camberwell',
 'Peckham',
 'St. George',
 'Rotherhithe',
 'Sydenham']

So we now have the districts and sub-districts.

In [57]:
# Pair each district with its sub-district, position by position.
dis_sub_dis = tuple(
    (district_name, sub_district_name)
    for district_name, sub_district_name in zip(districts, sub_districts)
)
dis_sub_dis

(('St. Saviour, Southwark', 'Christchurch'),
 ('St. Saviour, Southwark', 'St. Saviour'),
 ('St. Olave, Southwark', 'St. Olave'),
 ('St. Olave, Southwark', 'St. John, Horsleydown'),
 ('Bermondsey', 'St. James'),
 ('Bermondsey', 'St. Mary Magdalen'),
 ('Bermondsey', 'Leather Market'),
 ('St. George, Southwark', 'Kent Road'),
 ('St. George, Southwark', 'Borough Road'),
 ('St. George, Southwark', 'London Road'),
 ('Newington', 'Trinity'),
 ('Newington', 'St. Peter, Walworth'),
 ('Newington', 'St. Mary'),
 ('Lambeth', 'Waterloo (First Part)'),
 ('Lambeth', 'Waterloo (Second Part)'),
 ('Lambeth', 'Lambeth Church (First Part)'),
 ('Lambeth', 'Lambeth Church (Second Part)'),
 ('Lambeth', 'Kennington (First Part)'),
 ('Lambeth', 'Kennington (2nd Part)'),
 ('Lambeth', 'Brixton'),
 ('Lambeth', 'Norwood'),
 ('Wandsworth', 'Clapham'),
 ('Wandsworth', 'Battersea'),
 ('Wandsworth', 'Wandsworth'),
 ('Wandsworth', 'Putney'),
 ('Wandsworth', 'Streatham'),
 ('Camberwell', 'Dulwich'),
 ('Camberwell', 'Cam

Now let's make a case dictionary.

In [60]:
# Paragraphs of the first chunk, dropping the header paragraph (index 0) and
# the trailing element left by the chunk's final blank line.
content[0].split('\n\n')[1:-1]

['At 34, Charlotte Street, on 29th July, a stock-maker, aged 29, "Asiatic cholera 18 hours". Lambeth.',
 'At 45, Gravel Lane, on 1st August, the widow of a farmer, aged 48, "cholera 12 hours". Southwark and Vauxhall.',
 'At 1, Alpha Place, on 1st August, a barrister\'s clerk, aged 57, "cholera 24 hours". Southwark and Vauxhall.']

In [63]:
# Header paragraph of the second chunk, split on ':': district, sub-district
# (still carrying its leading space) and a trailing empty string.
content[1].split('\n\n')[0].split(':')

['St. Saviour, Southwark', ' St. Saviour', '']

In [68]:
# Flat list of every case paragraph, each prefixed with "district:sub-district:".
new_case_list = []

for case_list in content:
    # Split the chunk once: the first paragraph is the header, the rest are cases.
    header, *paragraphs = case_list.split('\n\n')
    # The header looks like "district: sub-district:"; keep the first two fields
    # exactly as they are (the sub-district keeps its leading space).
    district, sub_district = header.split(':')[:2]
    # Skip blank paragraphs instead of unconditionally dropping the last one,
    # so a chunk that does not end with a blank line does not lose its final case.
    cases = [paragraph for paragraph in paragraphs if paragraph.strip()]
    for case in cases:
        new_case_list.append(district + ':' + sub_district + ':' + case)
    
    

In [70]:
# Total number of labelled entries collected from all chunks.
len(new_case_list)

337

This should be 334, but 3 of the entries are not cases: they are placeholder lines that say "No death from cholera."

In [77]:
# Show the placeholder entries for sub-districts with no recorded deaths.
for case in new_case_list:
    if "No death from cholera" not in case:
        continue
    print(case)

Lambeth: Norwood:No death from cholera.
Wandsworth: Streatham:No death from cholera.
Camberwell: Dulwich:No death from cholera.


In [78]:
# Keep only real cases by discarding the "No death from cholera" placeholders.
only_cases = [
    entry
    for entry in new_case_list
    if "No death from cholera" not in entry
]
len(only_cases)

334

So we now have the text of every case.

Let's see the general format of the cases:

In [79]:
# First five labelled cases, to inspect the common sentence structure.
only_cases[:5]

['St. Saviour, Southwark: Christchurch:At 34, Charlotte Street, on 29th July, a stock-maker, aged 29, "Asiatic cholera 18 hours". Lambeth.',
 'St. Saviour, Southwark: Christchurch:At 45, Gravel Lane, on 1st August, the widow of a farmer, aged 48, "cholera 12 hours". Southwark and Vauxhall.',
 'St. Saviour, Southwark: Christchurch:At 1, Alpha Place, on 1st August, a barrister\'s clerk, aged 57, "cholera 24 hours". Southwark and Vauxhall.',
 'St. Saviour, Southwark: St. Saviour:At 1, Park Street, on 25th July, the wife of a labourer, aged 35, "Asiatic cholera 14½ hours". Southwark and Vauxhall.',
 'St. Saviour, Southwark: St. Saviour:At 40, Bankside, on 25th July, the son of a locksmith, aged 5 years, "cholera 12 hours". Southwark and Vauxhall.']