## Visualizing the History of Pandemics

<b>Topics Covered</b>
<ul>
  <li>Web Scraping</li>
  <li>Data Cleaning using regex</li>
  <li>Data Analysis</li>
  <li>Data Plotting using plotly</li>
</ul>

### Importing the Libraries

In [423]:
import requests
import pandas as pd      
from bs4 import BeautifulSoup as bs
import re
import plotly.express as px

### Washington Post

https://www.washingtonpost.com/graphics/2020/local/retropolis/coronavirus-deadliest-pandemics/

In [2]:
wp_page = requests.get("https://www.washingtonpost.com/graphics/2020/local/retropolis/coronavirus-deadliest-pandemics/")

In [3]:
wp_page.status_code

200

In [568]:
wp_soup = bs(wp_page.content, 'html.parser')

In [5]:
wp_virus_list=[item.get_text() for item in wp_soup.find_all('h2')][1:-1]

In [6]:
[names.get_text() for  names in wp_soup.find_all('h4')]

['Deaths: 5 million • Cause: Measles and smallpox',
 'Deaths: 30-50 million • Source: Rats and fleas',
 'Deaths: 75-200 million • Source: Rats and fleas',
 'Deaths: 25-55 million • Cause: Variola virus',
 'Deaths: 75,000-100,000 • Source: Rats and fleas',
 'Deaths: 1 million • Cause: V. cholerae bacteria',
 'Deaths: 150,000 • Source: Mosquitoes',
 'Deaths: 50 million • Cause: H1N1',
 'Deaths: 1 million • Cause: H2N2',
 'Deaths: 200,000 • Cause: H1N1']

In [7]:
wp_pandemicname_list=[]
wp_pandemicname_regex=re.compile('([0-9a-zA-Z ,-]+)')
for names in wp_soup.find_all('h2')[1:-1]:
    if (wp_pandemicname_regex.search(names.text)!= None):
        wp_pandemicname_list.append(wp_pandemicname_regex.search(names.text).group(1))

In [8]:
wp_pandemicname_list

['Antonine Plague',
 'Plague of Justinian',
 'Black Death',
 'New World smallpox',
 'Great Plague of London',
 'The cholera pandemics',
 'Yellow fever',
 'The 1918 flu',
 'Asian flu',
 'Swine flu']

In [9]:
wp_death_cause_regex=re.compile('Deaths:\s([0-9a-zA-Z ,-]+)\u2022\s[a-zA-Z]+:\s([a-zA-Z0-9. ]+)')

In [10]:
wp_death_list=[]
wp_cause_list=[]

for names in wp_soup.find_all('h4'):
    if (wp_death_cause_regex.search(names.text)!= None):
        wp_death_list.append(wp_death_cause_regex.search(names.text).group(1))
        wp_cause_list.append(wp_death_cause_regex.search(names.text).group(2))

In [11]:
wp_death_list,wp_cause_list

(['5 million ',
  '30-50 million ',
  '75-200 million ',
  '25-55 million ',
  '75,000-100,000 ',
  '1 million ',
  '150,000 ',
  '50 million ',
  '1 million ',
  '200,000 '],
 ['Measles and smallpox',
  'Rats and fleas',
  'Rats and fleas',
  'Variola virus',
  'Rats and fleas',
  'V. cholerae bacteria',
  'Mosquitoes',
  'H1N1',
  'H2N2',
  'H1N1'])

In [12]:
wp_year_from_list=[]
wp_year_to_list=[]
wp_year_regex=re.compile('([0-9]+)-?([0-9a-z]+)?')
for names in wp_soup.find_all('h5'):
    if (wp_year_regex.search(names.text)!= None):
        wp_year_from_list.append(wp_year_regex.search(names.text).group(1))
        wp_year_to_list.append(wp_year_regex.search(names.text).group(2))

In [13]:
wp_year_from_list,wp_year_to_list

(['165',
  '541',
  '1347',
  '1520',
  '1665',
  '1817',
  '1800',
  '1918',
  '1957',
  '2009'],
 ['180', '542', '1352', 'unknown', None, '1923', 's', '1920', '1958', None])

In [14]:
wp_dict_data={'Pandemic Name':wp_pandemicname_list,'Cause':wp_cause_list,
              'From':wp_year_from_list,'To':wp_year_to_list,'Death Count':wp_death_list}

In [15]:
pandemic_df=pd.DataFrame(wp_dict_data)

In [16]:
pandemic_df

Unnamed: 0,Pandemic Name,Cause,From,To,Death Count
0,Antonine Plague,Measles and smallpox,165,180,5 million
1,Plague of Justinian,Rats and fleas,541,542,30-50 million
2,Black Death,Rats and fleas,1347,1352,75-200 million
3,New World smallpox,Variola virus,1520,unknown,25-55 million
4,Great Plague of London,Rats and fleas,1665,,"75,000-100,000"
5,The cholera pandemics,V. cholerae bacteria,1817,1923,1 million
6,Yellow fever,Mosquitoes,1800,s,150000
7,The 1918 flu,H1N1,1918,1920,50 million
8,Asian flu,H2N2,1957,1958,1 million
9,Swine flu,H1N1,2009,,200000


In [17]:
#Cleaning "Death Count" Column for data manipulation
pandemic_df["Death Count"]= pandemic_df["Death Count"].str.replace('million','000000')
#Keeping the minimum values
pandemic_df["Death Count"]= pandemic_df["Death Count"].str.split("-", expand = True)
#Appending the 0s to index 1-3
pandemic_df["Death Count"][1:4]=pandemic_df["Death Count"][1:4]+'000000'
#Removing commas and spaces
pandemic_df["Death Count"]= pandemic_df["Death Count"].str.replace(",","")
pandemic_df["Death Count"]= pandemic_df["Death Count"].str.replace(" ","").astype(float)

In [18]:
#Replace text data to empty from "To" column
pandemic_df["To"]= pandemic_df["To"].str.replace(r'[a-zA-Z]+',"",regex=True)
pandemic_df["To"].fillna(value="", inplace=True)

In [19]:
pandemic_df

Unnamed: 0,Pandemic Name,Cause,From,To,Death Count
0,Antonine Plague,Measles and smallpox,165,180.0,5000000.0
1,Plague of Justinian,Rats and fleas,541,542.0,30000000.0
2,Black Death,Rats and fleas,1347,1352.0,75000000.0
3,New World smallpox,Variola virus,1520,,25000000.0
4,Great Plague of London,Rats and fleas,1665,,75000.0
5,The cholera pandemics,V. cholerae bacteria,1817,1923.0,1000000.0
6,Yellow fever,Mosquitoes,1800,,150000.0
7,The 1918 flu,H1N1,1918,1920.0,50000000.0
8,Asian flu,H2N2,1957,1958.0,1000000.0
9,Swine flu,H1N1,2009,,200000.0


### Wikipedia

https://en.wikipedia.org/wiki/List_of_epidemics"

In [22]:
wiki_page = requests.get("https://en.wikipedia.org/wiki/List_of_epidemics")

In [23]:
wiki_page.status_code

200

In [24]:
wiki_soup = bs(wiki_page.content, 'html.parser')

In [25]:
[type(item) for item in list(wiki_soup.children)]

[bs4.element.NavigableString,
 bs4.element.Doctype,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString]

In [54]:
html = list(wiki_soup.children)[3]

In [32]:
[item.get_text() for item in html.find_all('th')][0:5]

['Event\n', 'Date\n', 'Location\n', 'Disease\n', 'Death toll (estimate)\n']

In [137]:
wiki_pandemicname_list=[]
#Extracting nth element from td tags
wiki_pandemicname_list=[item.get_text() for item in html.find_all('td')][0::6]
#Striping \n
wiki_pandemicname_list=[item.strip() for item in wiki_pandemicname_list if str(item)][:-8]

In [138]:
len(wiki_pandemicname_list)

244

In [635]:
wiki_pandemicname_list

['Influenza epidemic',
 'Plague of Athens',
 '412 BC epidemic',
 'Antonine Plague',
 'Plague of Cyprian',
 'Plague of Justinian',
 'Roman Plague of 590',
 'Plague of Sheroe',
 'Plague of 664',
 'Plague of 698–701',
 '735–737 Japanese smallpox epidemic',
 'Plague of 746–747',
 'Black Death (start of the Second plague pandemic)',
 'Sweating sickness (multiple outbreaks)',
 '1489 Spain typhus epidemic',
 '1510 influenza pandemic',
 '1520 Mexico smallpox epidemic',
 'Cocoliztli Epidemic of 1545–1548',
 '1561 Chile smallpox epidemic',
 '1563 London plague',
 'Cocoliztli epidemic of 1576',
 '1582 Tenerife plague epidemic',
 '1592–1596 Seneca nation measles epidemic',
 '1592–93 Malta plague epidemic',
 '1592–93 London plague',
 '1596-1602 Spain plague epidemic',
 '1600–1650 South America malaria epidemic',
 '1603 London plague epidemic',
 '1609 Egyptian plague epidemic',
 '1616 New England infections epidemic',
 '1629–1631 Italian plague',
 '1632–1635 Augsburg plague epidemic',
 'Massachusett

In [139]:
#Extracting To Year
wiki_To_list=[]
wiki_To_regex=re.compile('([0-9,]+)')
for names in html.find_all('td')[1::6]:
    if (wiki_To_regex.search(names.text)!= None):
        wiki_To_list.append(wiki_To_regex.search(names.text).group(1))

In [124]:
#Extracting nth element from td tags
wiki_Death_list=[item.get_text() for item in html.find_all('td')][4::6]
#Striping \n
wiki_Death_list=[item.strip() for item in wiki_Death_list if str(item)][:-8]

In [126]:
len(wiki_Death_list)

244

In [395]:
wiki_dict_data={'Pandemic Name':wiki_pandemicname_list,'Year':wiki_To_list[:-1],
              'Death Count':wiki_Death_list}

In [396]:
wiki_pandemic_df=pd.DataFrame(wiki_dict_data)

In [398]:
wiki_pandemic_df.head()

Unnamed: 0,Pandemic Name,Year,Death Count
0,Influenza epidemic,1200,Unknown
1,Plague of Athens,429,"75,000–100,000"
2,412 BC epidemic,412,Unknown
3,Antonine Plague,165,5–10 million
4,Plague of Cyprian,250,1 million+


### Data Cleaning

In [399]:
#Cleaning "Death Count" Column for data manipulation
#Eliminating lower death count by Replacing – with upper death number only
wiki_pandemic_df["Death Count"]= wiki_pandemic_df["Death Count"].str.replace(r'[0-9]+–',"",regex=True)
wiki_pandemic_df["Death Count"]= wiki_pandemic_df["Death Count"].str.replace(r'[0-9]+-',"",regex=True)

In [401]:
#replacing millions with 0s
wiki_pandemic_df["Death Count"]= wiki_pandemic_df["Death Count"].str.replace('million','000000')

In [403]:
#Removing commas and +
wiki_pandemic_df["Death Count"]= wiki_pandemic_df["Death Count"].str.replace(",","")
wiki_pandemic_df["Death Count"]= wiki_pandemic_df["Death Count"].str.replace("+","")

In [409]:
#Replace data after numbers with "" 
# Exmaple 498 (377 in Angola 121 in Congo) keep 486 only
wiki_pandemic_df["Death Count"]= wiki_pandemic_df["Death Count"].str.replace("\s+.+","")
#Remove spaces
wiki_pandemic_df["Death Count"]= wiki_pandemic_df["Death Count"].str.replace(" ","")

In [411]:
#Filtering out the unknown
wiki_pandemic_df=wiki_pandemic_df[wiki_pandemic_df['Death Count']!='Unknown'].reset_index(drop=True)

In [414]:
# Keeping only the numbers from and rejecting text
wiki_pandemic_df=wiki_pandemic_df[wiki_pandemic_df['Death Count'].str.match(r'([0-9]+)')==True]

In [415]:
for i in wiki_pandemic_df['Death Count']:
    print(i)

75100000
10
1
100
2
200
10000
17000
8
15
20100
2.5
59000
3000
19900
600700000
40000
1
280000
13712
1000
1525000
10400
500000
1250000
24148
100000
40000
11300
1000
76000
83000
3100
520
500
1300
18000
164000
100000
844
2200
20000
50000
711700
20000
4050000
940
8000
50000
2
11000
5000
60000
2955000
300000
300000
4500
60000
65000
100000
700
520000
100000
19000
2800
150000
17000
3498
20000
3400
10000
7970
1
4737
616
3000
12
6000
32000
80000
600000
748
1326200
500000
40000
8000
20000
4046
13000
298600
3164
1
20788
500000
132
800000
119
200300000
22
4
60000
40000
1.5
7130
100
2.5
30
500
538
43
1627
10277
4
30000
500
4
5
35
1027
24
15000
32
64
5600
145
89432
56
178
10000
105
40
400
139
774
1
0
658
2
7
27
1200
61
17
50
50
1000
394
187
684
10
183
37
2
67
407
123
3322
115
18
172
4293
18
49
1
1100
151575400
10075
4500
170
350
171
862
8
142
11323
183
292
36
2035
53
498
3886
69
1317
61000
17
2271
6400
2
338
15
83
3930
381247


In [383]:
#Convert to Float
wiki_pandemic_df['Death Count']=wiki_pandemic_df['Death Count'].astype('float64')

In [569]:
#Get Summary Statistics
wiki_pandemic_df['Death Count'].describe()

count    1.940000e+02
mean     5.395447e+06
std      4.686891e+07
min      0.000000e+00
25%      6.750000e+01
50%      1.308500e+03
75%      2.000000e+04
max      6.007000e+08
Name: Death Count, dtype: float64

In [574]:
wiki_pandemic_df[wiki_pandemic_df['Death Count']==max(wiki_pandemic_df['Death Count'])]

Unnamed: 0,Pandemic Name,Year,Death Count
15,1596-1602 Spain plague epidemic,1596,600700000.0


In [572]:
factor=max(wiki_pandemic_df['Death Count'])/381247
factor

1575.6189556901431

In [417]:
wiki_pandemic_df.tail()

Unnamed: 0,Pandemic Name,Year,Death Count
190,2019 Philippines measles outbreak,2019,338.0
191,2019 Kuala Koh measles outbreak,2019,15.0
192,2019 Samoa measles outbreak,2019,83.0
193,2019–20 dengue fever epidemic,2019,3930.0
194,COVID-19 pandemic,2019,381247.0


In [636]:
wiki_pandemic_df.to_excel(r'C:/Users/Owner/Downloads/College/Practice/Pandemics/pandemics.xlsx', sheet_name='wikipedia', index = False)

## Plotting the Data Frame

In [520]:
#Limiting the data according to choice
df = wiki_pandemic_df[(wiki_pandemic_df['Death Count']>=100000) & (wiki_pandemic_df['Death Count']<7000000)]

In [521]:
#Divide data by 1Million
df['Death Count']=df['Death Count'].apply(lambda x: round(x/1000000,2))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [567]:
#plotting data based on requirement
fig = px.scatter(df, x="Year", y="Death Count", size="Death Count", color="Pandemic Name",text = "Pandemic Name", 
                 log_x=False, size_max=60,width=1000, height=1000)
fig.update_layout(showlegend=False)
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_layout(
    height=800,
    title_text='History of Pandemics-Ancient Era to Modern Times',
    yaxis_title="Death Toll (per 1 Million)",
    font=dict(family="Calibri, monospace",size=18,color="#7f7f7f")
)
fig.update_traces(textposition='middle center')
fig.update_traces(textfont_size=8)

fig.add_annotation(
            x=2019,
            y=.38,
            text="Covid19 Deaths: .38M",bgcolor='floralwhite',font=dict(
                family="Calibri, monospace",
                size=15,
                color="black"
            ))

fig.add_annotation(
            x=2019,
            y=4,
            text="<b>pan·dem·ic</b><br>(of a disease) prevalent over a <br>whole country or the world.",
font=dict(
                family="Calibri, monospace",
                size=15,
                color="black"
            ))
fig.show()

### Magnitude

In [605]:
#Creating new column by dividing the death count of covid with others.
wiki_pandemic_df['Magnitude']=wiki_pandemic_df['Death Count']/381247

In [623]:
df_magnitude=wiki_pandemic_df[wiki_pandemic_df['Magnitude']>=2.0].reset_index(drop=True)
df_magnitude['Axis']=0

In [624]:
df_magnitude

Unnamed: 0,Pandemic Name,Year,Death Count,Magnitude,Axis
0,Plague of Athens,429,75100000.0,196.985157,0
1,1596-1602 Spain plague epidemic,1596,600700000.0,1575.618956,0
2,1634–1640 Wyandot people epidemic of infections,1634,1525000.0,4.000031,0
3,Naples Plague,1656,1250000.0,3.278714,0
4,1743 Sicily plague epidemic,1743,4050000.0,10.623034,0
5,1802–1803 Saint-Domingue yellow fever epidemic,1802,2955000.0,7.750881,0
6,1871 Buenos Aires yellow fever epidemic,1871,1326200.0,3.478585,0
7,Sixth cholera pandemic,1899,800000.0,2.098377,0
8,1900–1920 Uganda African trypanosomiasis epidemic,1900,200300000.0,525.381183,0
9,2009 swine flu pandemic,2009,151575400.0,397.577948,0


In [634]:
#plotting data based on requirement
fig = px.scatter(df_magnitude, x="Axis", y="Magnitude", size="Death Count", color="Pandemic Name",
                 size_max=60,width=1000, height=1000)
fig.update_layout(
    height=800,
    title_text='Covid-19 vs Other Pandemics',
    yaxis_title="Magnitude",
     xaxis = go.XAxis(
        title = "",
        showticklabels=False),
    font=dict(family="Calibri, monospace",size=18,color="#7f7f7f")
)
fig.show()

Thank you! I Hope you enjoyed it.