In [1]:
import math
import numpy as np 
import pandas as pd 
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime

In [2]:
# Read the three CSV inputs in one pass: the vaccination records, the
# per-country summary, and the daily case/death records.
vaccine_df, summary_df, daily_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "VaccineData.csv",
        "worldometer_coronavirus_summary_data.csv",
        "worldometer_coronavirus_daily_data.csv",
    )
)

## Let's look at the data

In [3]:
# Preview the vaccination records (one row per country and date); the rendered
# table is truncated to the first and last rows.
vaccine_df

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Albania,ALB,2021-01-10,0.0,0.0,,,,0.00,0.00,,,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
1,Albania,ALB,2021-01-11,,,,,64.0,,,,22.0,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
2,Albania,ALB,2021-01-12,128.0,128.0,,,64.0,0.00,0.00,,22.0,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
3,Albania,ALB,2021-01-13,188.0,188.0,,60.0,63.0,0.01,0.01,,22.0,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
4,Albania,ALB,2021-01-14,266.0,266.0,,78.0,66.0,0.01,0.01,,23.0,Pfizer/BioNTech,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2671,USA,USA,2021-02-07,41210937.0,31579100.0,9147185.0,2172973.0,1441091.0,12.32,9.44,2.74,4309.0,"Moderna, Pfizer/BioNTech",Centers for Disease Control and Prevention,https://covid.cdc.gov/covid-data-tracker/#vacc...
2672,USA,USA,2021-02-08,42417617.0,32340146.0,9518015.0,1206680.0,1456459.0,12.68,9.67,2.85,4355.0,"Moderna, Pfizer/BioNTech",Centers for Disease Control and Prevention,https://covid.cdc.gov/covid-data-tracker/#vacc...
2673,USA,USA,2021-02-09,43206190.0,32867213.0,9840429.0,788573.0,1489333.0,12.92,9.83,2.94,4453.0,"Moderna, Pfizer/BioNTech",Centers for Disease Control and Prevention,https://covid.cdc.gov/covid-data-tracker/#vacc...
2674,USA,USA,2021-02-10,44769970.0,33783384.0,10469514.0,1563780.0,1555959.0,13.39,10.10,3.13,4652.0,"Moderna, Pfizer/BioNTech",Centers for Disease Control and Prevention,https://covid.cdc.gov/covid-data-tracker/#vacc...


In [4]:
# Preview the per-country summary table (totals, tests and population); the
# rendered table is truncated to the first and last rows.
summary_df

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population
0,Afghanistan,Asia,55492,2427.0,48392.0,4673.0,1012.0,1406,61.0,273745.0,6936.0,39468080
1,Albania,Europe,91987,1543.0,55983.0,34461.0,37.0,31986,537.0,402000.0,139785.0,2875837
2,Algeria,Africa,110513,2935.0,75816.0,31762.0,42.0,2492,66.0,,,44339845
3,Andorra,Europe,10463,107.0,9868.0,488.0,15.0,135282,1383.0,193595.0,2503103.0,77342
4,Angola,Africa,20329,491.0,18790.0,1048.0,7.0,607,15.0,178539.0,5331.0,33492335
...,...,...,...,...,...,...,...,...,...,...,...,...
214,Wallis And Futuna Islands,Australia/Oceania,9,,5.0,4.0,,810,,1202.0,108142.0,11115
215,Western Sahara,Africa,10,1.0,8.0,1.0,,16,2.0,,,606369
216,Yemen,Asia,2136,616.0,1430.0,90.0,23.0,71,20.0,17404.0,576.0,30230495
217,Zambia,Africa,68454,940.0,62575.0,4939.0,369.0,3661,50.0,1010701.0,54049.0,18699850


In [5]:
# Preview the daily case/death records (one row per date and country); the
# rendered table is truncated to the first and last rows.
daily_df

Unnamed: 0,date,country,cumulative_total_cases,daily_new_cases,active_cases,cumulative_total_deaths,daily_new_deaths
0,2020-2-15,Afghanistan,0.0,,0.0,0.0,
1,2020-2-16,Afghanistan,0.0,,0.0,0.0,
2,2020-2-17,Afghanistan,0.0,,0.0,0.0,
3,2020-2-18,Afghanistan,0.0,,0.0,0.0,
4,2020-2-19,Afghanistan,0.0,,0.0,0.0,
...,...,...,...,...,...,...,...
79954,2021-2-09,Zimbabwe,34781.0,123.0,4139.0,1353.0,14.0
79955,2021-2-10,Zimbabwe,34864.0,83.0,4029.0,1364.0,11.0
79956,2021-2-11,Zimbabwe,34949.0,85.0,4096.0,1382.0,18.0
79957,2021-2-12,Zimbabwe,35045.0,96.0,3693.0,1393.0,11.0


## We need a per-country aggregate of the data; we can get it by defining a function

In [6]:
def aggregate(df: pd.DataFrame, agg_col: str) -> pd.DataFrame:
    """Return the per-country maximum of a single column.

    Parameters
    ----------
    df : pd.DataFrame
        Table containing a ``country`` column and the column named by
        ``agg_col``.
    agg_col : str
        Name of the column to reduce.

    Returns
    -------
    pd.DataFrame
        One row per country (index named ``country``) with a single column
        named ``agg_col`` holding that country's maximum value. Missing
        values are skipped; a country with no non-missing value gets NaN.

    Raises
    ------
    KeyError
        If ``country`` or ``agg_col`` is not a column of ``df``.
    """
    # max() is the reducer; for running totals that never decrease it is also
    # the most recently reported value.
    return df.groupby("country")[agg_col].max().to_frame()

In [30]:
# NOTE(review): in the visible notebook `data` is first assigned in the later
# "Question 2" cell, and this cell's execution count (30) is out of sequence.
# On a fresh kernel run top-to-bottom this line would raise NameError —
# confirm whether the cell should move below that assignment or be removed.
data

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,...,population,vaccines,people_vaccinated,people_vaccinated_per_hundred,people_fully_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,total_vaccinations,percentage_vaccinated,tested_positive
0,Afghanistan,Asia,55492,2427.0,48392.0,4673.0,1012.0,1406,61.0,273745.0,...,39468080,,,,,,,,,20.271420
1,Albania,Europe,91987,1543.0,55983.0,34461.0,37.0,31986,537.0,402000.0,...,2875837,Pfizer/BioNTech,689.0,0.02,438.0,0.02,0.04,1127.0,0.039189,22.882338
2,Algeria,Africa,110513,2935.0,75816.0,31762.0,42.0,2492,66.0,,...,44339845,Sputnik V,,,,,0.00,30.0,0.000068,
3,Andorra,Europe,10463,107.0,9868.0,488.0,15.0,135282,1383.0,193595.0,...,77342,Pfizer/BioNTech,1291.0,1.67,,,1.67,1291.0,1.669209,5.404582
4,Angola,Africa,20329,491.0,18790.0,1048.0,7.0,607,15.0,178539.0,...,33492335,,,,,,,,,11.386308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,Uzbekistan,Asia,79416,622.0,77890.0,904.0,153.0,2352,18.0,1377915.0,...,33769261,,,,,,,,,5.763490
146,Venezuela,South America,132743,1273.0,124734.0,6736.0,110.0,4676,45.0,2781130.0,...,28385636,,,,,,,,,4.772988
147,Yemen,Asia,2136,616.0,1430.0,90.0,23.0,71,20.0,17404.0,...,30230495,,,,,,,,,12.273041
148,Zambia,Africa,68454,940.0,62575.0,4939.0,369.0,3661,50.0,1010701.0,...,18699850,,,,,,,,,6.772923


In [8]:
# Vaccination columns that are reduced to one value per country and attached
# to the summary table.
cols_to_summarize = ['people_vaccinated', 
                     'people_vaccinated_per_hundred', 
                     'people_fully_vaccinated', 
                     'people_fully_vaccinated_per_hundred', 
                     'total_vaccinations_per_hundred', 
                     'total_vaccinations']

# Index the summary by country so per-country vaccine tables can be joined on.
summary = summary_df.set_index("country")

# Keep exactly one `vaccines` label per country. De-duplicating on the
# (country, vaccines) pair would leave several rows for any country whose
# label differs between records, and the join below would then repeat that
# country's summary row (inflating later sums and charts).
# keep='last' takes the final record in file order — presumably the most
# recent date; verify the CSV is sorted by date within each country.
vaccines = (
    vaccine_df[['country', 'vaccines']]
    .drop_duplicates(subset='country', keep='last')
    .set_index('country')
)
summary = summary.join(vaccines)

In [9]:
# Remove any columns this cell produced on a previous run. Without this,
# re-running the cell fails because join() refuses overlapping column names.
summary = summary.drop(
    columns=cols_to_summarize + ['percentage_vaccinated', 'tested_positive'],
    errors='ignore',
)

# Attach the per-country maximum of each vaccination column.
for col in cols_to_summarize:
    summary = summary.join(aggregate(vaccine_df, col))

# Doses administered per 100 inhabitants.
# NOTE(review): despite its name this column is computed from total doses, not
# from people vaccinated, so it can overstate coverage when a person receives
# more than one dose — confirm the intended definition.
summary['percentage_vaccinated'] = summary['total_vaccinations'] / summary['population'] * 100

# Confirmed cases per 100 tests performed (NaN where total_tests is missing).
summary['tested_positive'] = summary['total_confirmed'] / summary['total_tests'] * 100

## The data is now ready for analysis

### Question 1: Which countries have administered the most vaccine doses?

In [38]:
def visualize(data: pd.DataFrame, height: int, xcolumn: str, ycolumn: str, title: str, colors: str, n=None):
    """Show a bar chart of the rows with the largest values in ``ycolumn``.

    Parameters
    ----------
    data : pd.DataFrame
        Table containing ``xcolumn`` and ``ycolumn``. It is not modified.
    height : int
        Figure height passed to the plotly layout.
    xcolumn : str
        Column providing the bar labels (x axis).
    ycolumn : str
        Column providing the bar heights; it also drives the bar colours.
        Rows where it is missing are dropped.
    title : str
        Figure title.
    colors : str
        Name of the plotly colorscale applied to the bars.
    n : int, optional
        Keep only the ``n`` largest rows. ``None`` (the default) plots every
        row that has a value.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    # Largest values first; rows without a value cannot be drawn.
    ranked = data.sort_values(ycolumn, ascending=False).dropna(subset=[ycolumn])

    if n is not None:
        ranked = ranked.iloc[:n]

    # Label the axis with the number of bars actually drawn, so the title
    # never reads "Top None ..." and never claims more rows than exist.
    x_label = xcolumn.title()
    if n is not None:
        x_label = f"Top {len(ranked)} {x_label}"

    fig = go.Figure(
        go.Bar(
            # 'skip' suppresses the hover tooltip for the bars.
            hoverinfo='skip',
            x=ranked[xcolumn],
            y=ranked[ycolumn],
            marker=dict(
                color=ranked[ycolumn],
                colorscale=colors,
            ),
        ),
    )

    fig.update_layout(
        title=title,
        xaxis_title=x_label,
        # Human-readable y label derived from the column name.
        yaxis_title=ycolumn.replace("_", " ").title(),
        plot_bgcolor='rgba(0,0,0,0)',
        hovermode="x",
        height=height,
    )
    fig.show()

In [53]:
# Top 20 countries by total vaccinations; `country` is restored as a regular
# column so it can be used for the x axis.
visualize(
    data=summary.reset_index(),
    height=500,
    xcolumn='country',
    ycolumn="total_vaccinations",
    title='Number of Vaccines by Country',
    colors="Blugrn",
    n=20,
)

### Question 2: How many people are in serious or critical condition in each country?

In [47]:
# Countries that report a serious/critical count, with `country` restored as
# a regular column for plotting.
data = summary.dropna(subset=['serious_or_critical']).reset_index()

visualize(
    data=data,
    height=500,
    xcolumn='country',
    ycolumn="serious_or_critical",
    title='Number of people in Serious Condition',
    colors="Reds",
    n=20,
)

### Question 3: What are the most popular vaccines?

In [44]:
# Total doses per `vaccines` label, using only countries that have a label.
# The filter is chained into the groupby so its result is actually used
# (previously it was assigned and then immediately overwritten).
# Each distinct label string is its own group, so a combination such as
# "Moderna, Pfizer/BioNTech" is counted separately from its components.
data = (
    summary.dropna(subset=['vaccines'])
    .groupby('vaccines')['total_vaccinations']
    .sum()
    .reset_index()
)
visualize(data, 600, 'vaccines', "total_vaccinations", 'Number of Vaccines', "blugrn", n=20)