In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px

## Importing the Data

### Cases and Deaths

In [2]:
cases_death_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/jhu/full_data.csv')
cases_death_df.head()

Unnamed: 0,date,location,new_cases,new_deaths,total_cases,total_deaths,weekly_cases,weekly_deaths,biweekly_cases,biweekly_deaths
0,2020-02-24,Afghanistan,5.0,,5.0,,,,,
1,2020-02-25,Afghanistan,0.0,,5.0,,,,,
2,2020-02-26,Afghanistan,0.0,,5.0,,,,,
3,2020-02-27,Afghanistan,0.0,,5.0,,,,,
4,2020-02-28,Afghanistan,0.0,,5.0,,,,,


In [3]:
cases_death_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244188 entries, 0 to 244187
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   date             244188 non-null  object 
 1   location         244188 non-null  object 
 2   new_cases        243829 non-null  float64
 3   new_deaths       224387 non-null  float64
 4   total_cases      244179 non-null  float64
 5   total_deaths     224506 non-null  float64
 6   weekly_cases     242625 non-null  float64
 7   weekly_deaths    223201 non-null  float64
 8   biweekly_cases   240877 non-null  float64
 9   biweekly_deaths  221446 non-null  float64
dtypes: float64(8), object(2)
memory usage: 18.6+ MB


In [4]:
len(cases_death_df['location'].unique())

231

### Vacinations

In [5]:
vaccinations_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv')
vaccinations_df.head()

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,,0.0,0.0,,,,,
1,Afghanistan,AFG,2021-02-23,,,,,,1367.0,,,,,33.0,1367.0,0.003
2,Afghanistan,AFG,2021-02-24,,,,,,1367.0,,,,,33.0,1367.0,0.003
3,Afghanistan,AFG,2021-02-25,,,,,,1367.0,,,,,33.0,1367.0,0.003
4,Afghanistan,AFG,2021-02-26,,,,,,1367.0,,,,,33.0,1367.0,0.003


In [6]:
vaccinations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155889 entries, 0 to 155888
Data columns (total 16 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   location                             155889 non-null  object 
 1   iso_code                             155889 non-null  object 
 2   date                                 155889 non-null  object 
 3   total_vaccinations                   72365 non-null   float64
 4   people_vaccinated                    69263 non-null   float64
 5   people_fully_vaccinated              66612 non-null   float64
 6   total_boosters                       41131 non-null   float64
 7   daily_vaccinations_raw               60049 non-null   float64
 8   daily_vaccinations                   154811 non-null  float64
 9   total_vaccinations_per_hundred       72365 non-null   float64
 10  people_vaccinated_per_hundred        69263 non-null   float64
 11  people_fully_

In [7]:
len(vaccinations_df['location'].unique())

235

### Vaccinations by Manufacturers

In [8]:
vaccinations_by_manufacturers_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations-by-manufacturer.csv')
vaccinations_by_manufacturers_df.head()

Unnamed: 0,location,date,vaccine,total_vaccinations
0,Argentina,2020-12-29,Oxford/AstraZeneca,1
1,Argentina,2020-12-29,Sinopharm/Beijing,1
2,Argentina,2020-12-29,Sputnik V,20490
3,Argentina,2020-12-30,Sputnik V,40591
4,Argentina,2020-12-31,Sputnik V,43397


In [9]:
vaccinations_by_manufacturers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56058 entries, 0 to 56057
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   location            56058 non-null  object
 1   date                56058 non-null  object
 2   vaccine             56058 non-null  object
 3   total_vaccinations  56058 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.7+ MB


In [10]:
len(vaccinations_by_manufacturers_df['location'].unique())

44

### Locations

In [11]:
locations_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/jhu/locations.csv')
locations_df.head()

Unnamed: 0,Country/Region,location,continent,population_year,population
0,Afghanistan,Afghanistan,Asia,2022.0,41128772.0
1,Albania,Albania,Europe,2022.0,2842318.0
2,Algeria,Algeria,Africa,2022.0,44903228.0
3,Andorra,Andorra,Europe,2022.0,79843.0
4,Angola,Angola,Africa,2022.0,35588996.0


In [12]:
locations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country/Region   219 non-null    object 
 1   location         219 non-null    object 
 2   continent        218 non-null    object 
 3   population_year  218 non-null    float64
 4   population       218 non-null    float64
dtypes: float64(2), object(3)
memory usage: 8.7+ KB


## Data Pre-processing

### Merge Cases and Vaccinations

In [13]:
cases_deaths_vaccinations_df = cases_death_df.merge(vaccinations_df, how='outer', on=['location', 'date']).merge(locations_df, how='left', on='location')
cases_deaths_vaccinations_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 252918 entries, 0 to 252917
Data columns (total 28 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   date                                 252918 non-null  object 
 1   location                             252918 non-null  object 
 2   new_cases                            243829 non-null  float64
 3   new_deaths                           224387 non-null  float64
 4   total_cases                          244179 non-null  float64
 5   total_deaths                         224506 non-null  float64
 6   weekly_cases                         242625 non-null  float64
 7   weekly_deaths                        223201 non-null  float64
 8   biweekly_cases                       240877 non-null  float64
 9   biweekly_deaths                      221446 non-null  float64
 10  iso_code                             155889 non-null  object 
 11  total_vaccina

In [18]:
country_filter = cases_deaths_vaccinations_df['Country/Region'] == cases_deaths_vaccinations_df['Country/Region']
cases_deaths_vaccinations_df[country_filter]['Country/Region'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Anguilla', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia', 'Bonaire, Sint Eustatius and Saba',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cabo Verde',
       'Cayman Islands', 'Central African Republic', 'Chad', 'Chile',
       'China', 'Colombia', 'Comoros', 'Congo (Brazzaville)',
       'Cook Islands', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba',
       'Curacao', 'Cyprus', 'Czechia', 'Congo (Kinshasa)', 'Denmark',
       'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Faroe I

Checking for non-country locations

In [19]:
cases_deaths_vaccinations_df[cases_deaths_vaccinations_df['Country/Region'] != cases_deaths_vaccinations_df['Country/Region']]['location'].unique()

array(['Africa', 'Asia', 'Europe', 'European Union', 'High income',
       'Low income', 'Lower middle income', 'North America', 'Oceania',
       'South America', 'Upper middle income', 'World', 'England',
       'Guernsey', 'Jersey', 'Niue', 'Northern Cyprus',
       'Northern Ireland', 'Pitcairn', 'Scotland',
       'Sint Maarten (Dutch part)', 'Tokelau', 'Turkmenistan', 'Wales'],
      dtype=object)

In [None]:
income_stream = ['High income', 'Upper middle income', 'Lower middle income']
regions = ['Europe', 'Asia', 'North America', 'South America', 'Africa']
organizations = ['World', 'European Union', 'Oceania']

In [147]:
vaccinations_by_manufacturers_df = vaccinations_by_manufacturers_df.merge(locations_df, how='left', on='location')

## Exploratory Data Analysis

In [20]:
countries_only_df = cases_deaths_vaccinations_df[country_filter]

### Cases and Deaths

#### Top 25 Countries with most COVID-19 cases 

In [30]:
cases_per_country = cases_deaths_vaccinations_df[country_filter].groupby('Country/Region').sum().reset_index()

fig = px.bar(cases_per_country.sort_values('total_cases', ascending=False).head(25), x='Country/Region', y='total_cases', color='total_cases', color_continuous_scale='orrd')
fig.update_layout(title='Top 25 countries with most COVID-19 cases', title_x=0.5, title_font=dict(size=18, color='Darkred'))
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



#### Top 25 Countries with most COVID-19 deaths 

In [31]:
cases_per_country = cases_deaths_vaccinations_df[country_filter].groupby('Country/Region').sum().reset_index()

fig = px.bar(cases_per_country.sort_values('total_deaths', ascending=False).head(25), x='Country/Region', y='total_deaths', color='total_deaths', color_continuous_scale='orrd')
fig.update_layout(title='Top 25 countries with most COVID-19 cases', title_x=0.5, title_font=dict(size=18, color='Darkred'))
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



#### Top Continents with Most COVID-19 Cases

In [32]:
cases_per_continent = cases_deaths_vaccinations_df[country_filter].groupby('Country/Region').sum().reset_index()

fig = px.bar(cases_per_continent.sort_values('total_cases', ascending=False).head(25), x='Country/Region', y='total_cases', color='total_cases', color_continuous_scale='orrd')
fig.update_layout(title='Total COVID-19 Cases per Continent', title_x=0.5, title_font=dict(size=18, color='Darkred'))
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



#### Top Continents with Most New COVID-19 Cases

In [33]:
cases_per_continent = cases_deaths_vaccinations_df[country_filter].groupby('Country/Region').sum().reset_index()

fig = px.bar(cases_per_continent.sort_values('new_cases', ascending=False).head(25), x='Country/Region', y='new_cases', color='new_cases', color_continuous_scale='orrd')
fig.update_layout(title='New COVID-19 Cases per Continent', title_x=0.5, title_font=dict(size=18, color='Darkred'))
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



#### Top Continent with Most COVID-19 Deaths

In [34]:
cases_per_continent = cases_deaths_vaccinations_df[country_filter].groupby('Country/Region').sum().reset_index()

fig = px.bar(cases_per_continent.sort_values('total_deaths', ascending=False).head(25), x='Country/Region', y='total_deaths', color='total_deaths', color_continuous_scale='orrd')
fig.update_layout(title='Total COVID-19 Deaths per Continent', title_x=0.5, title_font=dict(size=18, color='Darkred'))
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



#### Top Continent with Most New COVID-19 cases

In [27]:
cases_per_continent = cases_deaths_vaccinations_df[country_filter].groupby('continent').sum().reset_index()

fig = px.bar(cases_per_continent.sort_values('new_deaths', ascending=False).head(25), x='continent', y='new_deaths', color='new_deaths', color_continuous_scale='orrd')
fig.update_layout(title='New COVID-19 Deaths per Continent', title_x=0.5, title_font=dict(size=18, color='Darkred'))
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



#### COVID-19 Cases Map

In [36]:
cases_per_country = cases_deaths_vaccinations_df[country_filter].groupby('Country/Region').sum().reset_index()

fig = px.choropleth(cases_per_country, 
                    locations='Country/Region',
                    locationmode='country names',
                    color='total_cases',
                    hover_name='Country/Region',
                    color_continuous_scale='orrd')

fig.update_layout(title='COVID-19 Cases',
                  title_x=0.5,
                  title_font=dict(size=18, color='Darkred'),
                  geo=dict(showframe=False,
                             showcoastlines=False,
                             projection_type='equirectangular'))

fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



#### COVID-19 Deaths Map

In [37]:
cases_per_country = cases_deaths_vaccinations_df[country_filter].groupby('Country/Region').sum().reset_index()

fig = px.choropleth(cases_per_country, 
                    locations='Country/Region',
                    locationmode='country names',
                    color='total_deaths',
                    hover_name='Country/Region',
                    color_continuous_scale='orrd')

fig.update_layout(title='COVID-19 Deaths',
                  title_x=0.5,
                  title_font=dict(size=18, color='Darkred'),
                  geo=dict(showframe=False,
                             showcoastlines=False,
                             projection_type='equirectangular'))

fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



### Vaccinations

In [39]:
vaccinations_per_country = cases_deaths_vaccinations_df[country_filter].groupby('Country/Region').sum().reset_index()

fig = px.choropleth(cases_per_country, 
                    locations='Country/Region',
                    locationmode='country names',
                    color='total_vaccinations',
                    hover_name='Country/Region',
                    color_continuous_scale='orrd')

fig.update_layout(title='COVID-19 Deaths',
                  title_x=0.5,
                  title_font=dict(size=18, color='Darkred'),
                  geo=dict(showframe=False,
                             showcoastlines=False,
                             projection_type='equirectangular'))

fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



### Vaccinations by Manufacturers

In [127]:
countries_only_filter = vaccinations_by_manufacturers_df['continent'] == vaccinations_by_manufacturers_df['continent']
total_vaccinations = vaccinations_by_manufacturers_df[countries_only_filter].groupby('vaccine').sum().reset_index()

fig = px.bar(total_vaccinations.sort_values('total_vaccinations', ascending=False).head(25), x='vaccine', y='total_vaccinations', color='total_vaccinations', color_continuous_scale='orrd')
fig.update_layout(title='Total Vaccinations per Manufacturers', title_x=0.5, title_font=dict(size=18, color='Darkred'))
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [133]:
vaccinations_by_country = vaccinations_by_manufacturers_df[countries_only_filter].groupby('location').sum().reset_index()
fig = px.bar(vaccinations_by_country.sort_values('total_vaccinations', ascending=False).head(25), x='location', y='total_vaccinations', color='total_vaccinations', color_continuous_scale='orrd')
fig.update_layout(title='Total Vaccinations per Country', title_x=0.5, title_font=dict(size=18, color='Darkred'))
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [138]:
fig = px.choropleth(vaccinations_by_country, 
                    locations='location',
                    locationmode='country names',
                    color='total_vaccinations',
                    hover_name='location',
                    color_continuous_scale='orrd')

fig.update_layout(title='COVID-19 Cases',
                  title_x=0.5,
                  title_font=dict(size=18, color='Darkred'),
                  geo=dict(showframe=False,
                             showcoastlines=False,
                             projection_type='equirectangular'))

fig.show()