In [1]:
import pandas as pd

# reading csv
DATA_PATH  = "country_vaccination_stats.csv"
df = pd.read_csv(DATA_PATH)

In [2]:
# Imputation

# Group by country and find the minimum daily vaccination number
min_vaccinations = df.groupby("country")["daily_vaccinations"].min().reset_index()

# Replace missing data with the minimum daily vaccination number
df_filled = df.merge(min_vaccinations, on="country", suffixes=("", "_min"))
df_filled["daily_vaccinations"] = df_filled["daily_vaccinations"].fillna(df_filled["daily_vaccinations_min"])

# Fill missing data for countries without any valid vaccination number with 0
df_filled["daily_vaccinations"].fillna(0, inplace=True)

# Convert daily_vaccinations column to integers after handling NaN
df_filled["daily_vaccinations"] = df_filled["daily_vaccinations"].astype(int)

# Drop the temporary column
df_filled.drop(columns=["daily_vaccinations_min"], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filled["daily_vaccinations"].fillna(0, inplace=True)


In [3]:
DATE = '2021-01-06'

# Convert date column to datetime type
df_filled['date'] = pd.to_datetime(df_filled['date'])

# Filter the dataset for records with the date 1/6/2021
vaccinations_1_6_2021 = df_filled[df_filled['date'] == DATE]

# Calculate the total number of vaccinations on 1/6/2021
total_vaccinations_1_6_2021 = vaccinations_1_6_2021['daily_vaccinations'].sum()

total_vaccinations_1_6_2021

1485255