In [1]:
# Import libraries
import os
import urllib
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline 

In [2]:
# Remote location of the WHO global COVID-19 CSV
url = "https://covid19.who.int/WHO-COVID-19-global-data.csv"

# Relative directory that will hold the downloaded copy
file_path = os.path.join(
    "data",
    "covid",
)

In [3]:
# Download the WHO CSV into the local data directory.
# `urlretrieve` lives in the `urllib.request` submodule, which a bare
# `import urllib` does not load; the imports cell imports it explicitly.
os.makedirs(file_path, exist_ok=True)

# Name the local copy after the last path segment of the source URL so the
# file name is not repeated as a second literal that can drift out of sync.
csv_path = os.path.join(file_path, url.rsplit("/", 1)[-1])

# Returns (local file name, response headers); shown as the cell output.
urllib.request.urlretrieve(url, csv_path)

('data/covid/WHO-COVID-19-global-data.csv',
 <http.client.HTTPMessage at 0x7fac01e4ffa0>)

In [4]:
# Load the downloaded CSV, then preview its first and last rows
df = pd.read_csv(csv_path)
display(df.head(), df.tail())

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
0,2020-02-24,AF,Afghanistan,EMRO,5,5,0,0
1,2020-02-25,AF,Afghanistan,EMRO,0,5,0,0
2,2020-02-26,AF,Afghanistan,EMRO,0,5,0,0
3,2020-02-27,AF,Afghanistan,EMRO,0,5,0,0
4,2020-02-28,AF,Afghanistan,EMRO,0,5,0,0


Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
39534,2020-08-31,ZW,Zimbabwe,AFRO,6,6412,0,196
39535,2020-09-01,ZW,Zimbabwe,AFRO,85,6497,6,202
39536,2020-09-02,ZW,Zimbabwe,AFRO,62,6559,1,203
39537,2020-09-03,ZW,Zimbabwe,AFRO,79,6638,3,206
39538,2020-09-04,ZW,Zimbabwe,AFRO,40,6678,0,206


In [5]:
# Structural overview of the frame: row index, dimensions,
# column labels and per-column dtypes (each rendered separately)
display(
    df.index,
    df.shape,
    df.columns,
    df.dtypes,
)

RangeIndex(start=0, stop=39539, step=1)

(39539, 8)

Index(['Date_reported', ' Country_code', ' Country', ' WHO_region',
       ' New_cases', ' Cumulative_cases', ' New_deaths', ' Cumulative_deaths'],
      dtype='object')

Date_reported         object
 Country_code         object
 Country              object
 WHO_region           object
 New_cases             int64
 Cumulative_cases      int64
 New_deaths            int64
 Cumulative_deaths     int64
dtype: object

In [6]:
# Remove the leading/trailing whitespace from every column name
df.columns = df.columns.str.strip()
display(df.columns)

Index(['Date_reported', 'Country_code', 'Country', 'WHO_region', 'New_cases',
       'Cumulative_cases', 'New_deaths', 'Cumulative_deaths'],
      dtype='object')

In [7]:
# Info related to data
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in display() only rendered a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39539 entries, 0 to 39538
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Date_reported      39539 non-null  object
 1   Country_code       39364 non-null  object
 2   Country            39539 non-null  object
 3   WHO_region         39539 non-null  object
 4   New_cases          39539 non-null  int64 
 5   Cumulative_cases   39539 non-null  int64 
 6   New_deaths         39539 non-null  int64 
 7   Cumulative_deaths  39539 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 2.4+ MB


None

In [8]:
# Summary statistics as produced by DataFrame.describe()
summary_stats = df.describe()
display(summary_stats)

Unnamed: 0,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
count,39539.0,39539.0,39539.0,39539.0
mean,668.08417,41710.34,22.010825,1830.673639
std,4080.904164,258995.7,128.610816,9866.995021
min,-2461.0,1.0,-514.0,0.0
25%,0.0,76.0,0.0,1.0
50%,10.0,927.0,0.0,17.0
75%,146.5,8895.0,3.0,190.0
max,86432.0,6095007.0,6409.0,185687.0


In [9]:
# Count the missing values in each column
missing_per_column = df.isna().sum()
display(missing_per_column)

Date_reported          0
Country_code         175
Country                0
WHO_region             0
New_cases              0
Cumulative_cases       0
New_deaths             0
Cumulative_deaths      0
dtype: int64

In [10]:
# Array of the distinct country names found in the data
Country = df["Country"].unique()

# Show the names, then how many there are
display(Country)
display(Country.shape)

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Anguilla', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia (Plurinational State of)',
       'Bonaire, Sint Eustatius and Saba', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'British Virgin Islands',
       'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi',
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Cayman Islands',
       'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia',
       'Comoros', 'Congo', 'Costa Rica', 'Côte d’Ivoire', 'Croatia',
       'Cuba', 'Curaçao', 'Cyprus', 'Czechia',
       'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Es

(216,)

In [11]:
# All rows reported for the United States
df.loc[df["Country"].eq("United States of America")]

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
37893,2020-01-20,US,United States of America,AMRO,5,5,0,0
37894,2020-01-21,US,United States of America,AMRO,0,5,0,0
37895,2020-01-22,US,United States of America,AMRO,0,5,0,0
37896,2020-01-23,US,United States of America,AMRO,0,5,0,0
37897,2020-01-24,US,United States of America,AMRO,1,6,0,0
...,...,...,...,...,...,...,...,...
38118,2020-09-01,US,United States of America,AMRO,37068,5936572,473,182162
38119,2020-09-02,US,United States of America,AMRO,31808,5968380,423,182585
38120,2020-09-03,US,United States of America,AMRO,42662,6011042,1025,183610
38121,2020-09-04,US,United States of America,AMRO,39402,6050444,1004,184614


In [12]:
# Rows with more than 1000 new deaths: show the count and the country
df.loc[
    df["New_deaths"].gt(1000),
    ["New_deaths", "Country"],
]

Unnamed: 0,New_deaths,Country
4996,1179,Brazil
4998,1188,Brazil
4999,1001,Brazil
5003,1039,Brazil
5004,1086,Brazil
...,...,...
38114,1229,United States of America
38115,1155,United States of America
38120,1025,United States of America
38121,1004,United States of America


In [13]:
# US rows with more than 1000 new deaths, shown together with the
# reporting date and the cumulative death count
df.loc[
    df["New_deaths"].gt(1000) & df["Country_code"].eq("US"),
    ["Date_reported", "Country", "New_deaths", "Cumulative_deaths"],
]

Unnamed: 0,Date_reported,Country,New_deaths,Cumulative_deaths
37968,2020-04-04,United States of America,1061,5854
37969,2020-04-05,United States of America,1166,7020
37970,2020-04-06,United States of America,1338,8358
37971,2020-04-07,United States of America,1201,9559
37972,2020-04-08,United States of America,1286,10845
...,...,...,...,...
38114,2020-08-28,United States of America,1229,178561
38115,2020-08-29,United States of America,1155,179716
38120,2020-09-03,United States of America,1025,183610
38121,2020-09-04,United States of America,1004,184614


In [14]:
# Maximum and minimum number of new cases in the US
# Select the US slice of the New_cases column once and reuse it for both.
us_new_cases = df.loc[df["Country_code"] == "US", ["New_cases"]]

print("Maximum number of new cases in the US: ")
display(us_new_cases.max())

print("Minimum number of new cases in the US: ")
display(us_new_cases.min())

Maximum number of new cases in the US: 


New_cases    74354
dtype: int64

Minimum number of new cases in the US: 


New_cases    0
dtype: int64

In [15]:
# Show the total of the US new cases next to the largest US cumulative
# count so the two figures can be compared by eye (nothing is asserted)
us_rows = df[df["Country_code"] == "US"]
display(us_rows[["New_cases"]].sum())
display(us_rows[["Cumulative_cases"]].max())

New_cases    6095007
dtype: int64

Cumulative_cases    6095007
dtype: int64

In [16]:
# Index label of the row holding the largest New_deaths value
df["New_deaths"].idxmax()

37981

In [17]:
# Date, country and death counts for the row with the most new deaths
peak_deaths_idx = df["New_deaths"].idxmax()
df.loc[
    peak_deaths_idx,
    ["Date_reported", "Country", "New_deaths", "Cumulative_deaths"],
]

Date_reported                      2020-04-17
Country              United States of America
New_deaths                               6409
Cumulative_deaths                       32280
Name: 37981, dtype: object

In [18]:
# Rows whose New_deaths value is negative (candidate data errors)
df.loc[df["New_deaths"] < 0]

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
194,2020-09-05,AF,Afghanistan,EMRO,16,38304,-1,1409
536,2020-08-04,DZ,Algeria,AFRO,507,31972,-2,1229
1942,2020-06-04,AU,Australia,WPRO,8,7229,-1,102
2583,2020-08-30,BS,Bahamas,AMRO,37,2057,-10,40
2751,2020-08-03,BH,Bahrain,EMRO,346,41536,-1,147
5753,2020-07-13,BF,Burkina Faso,AFRO,13,1033,-1,53
8115,2020-06-09,CG,Congo,AFRO,0,683,-2,20
8909,2020-08-15,CU,Cuba,AMRO,55,3229,-1,88
9412,2020-07-05,CZ,Czechia,EURO,121,12440,-1,351
9413,2020-07-06,CZ,Czechia,EURO,75,12515,-3,348


In [19]:
# Add `pct_cases`: each row's new cases as a percentage of that same row's
# cumulative cases.
# The CSV is re-downloaded on every run, so a zero cumulative count cannot be
# ruled out; map zeros to NaN first so the division yields NaN (undefined)
# for those rows instead of +/-inf.
cumulative_nonzero = df['Cumulative_cases'].replace(0, np.nan)
df['pct_cases'] = df['New_cases'].div(cumulative_nonzero).mul(100)
df

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths,pct_cases
0,2020-02-24,AF,Afghanistan,EMRO,5,5,0,0,100.000000
1,2020-02-25,AF,Afghanistan,EMRO,0,5,0,0,0.000000
2,2020-02-26,AF,Afghanistan,EMRO,0,5,0,0,0.000000
3,2020-02-27,AF,Afghanistan,EMRO,0,5,0,0,0.000000
4,2020-02-28,AF,Afghanistan,EMRO,0,5,0,0,0.000000
...,...,...,...,...,...,...,...,...,...
39534,2020-08-31,ZW,Zimbabwe,AFRO,6,6412,0,196,0.093575
39535,2020-09-01,ZW,Zimbabwe,AFRO,85,6497,6,202,1.308296
39536,2020-09-02,ZW,Zimbabwe,AFRO,62,6559,1,203,0.945266
39537,2020-09-03,ZW,Zimbabwe,AFRO,79,6638,3,206,1.190118
