In [None]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Fetch the Our World in Data COVID-19 CSV straight from its public URL
url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
covid_df = pd.read_csv(url)

# Quick sanity report: size, column names, and the 20 columns with the most gaps
missing_per_column = covid_df.isnull().sum().sort_values(ascending=False)
print("Dataset shape:", covid_df.shape)
print("\nColumns:", list(covid_df.columns))
print("\nMissing values:\n", missing_per_column.head(20))
print("\nSample data:")
# Bare last expression so the notebook renders the preview as a rich table
covid_df.head()

In [None]:
# Convert date column to datetime. Safe to re-run: values that are already
# datetimes pass through pd.to_datetime unchanged.
covid_df['date'] = pd.to_datetime(covid_df['date'])

# Select key columns and countries of interest
key_columns = ['date', 'location', 'total_cases', 'new_cases', 'total_deaths',
               'new_deaths', 'total_vaccinations', 'people_vaccinated',
               'population', 'life_expectancy']

countries = ['Kenya', 'United States', 'India', 'United Kingdom', 'Brazil', 'South Africa']

# Filter data. Sorting by (location, date) guarantees the forward fill below
# carries an older observation into a newer row, and that a later
# groupby('location').last() really picks the most recent value.
# sort_values/reset_index return a new frame, so the column assignments below
# never write into a slice of covid_df.
clean_df = (
    covid_df.loc[covid_df['location'].isin(countries), key_columns]
    .sort_values(['location', 'date'])
    .reset_index(drop=True)
)

# Handle missing values - forward fill within each country.
# GroupBy.ffill is used instead of groupby(...).apply(lambda x: x.ffill()):
# apply's handling of the grouping column differs between pandas versions
# (it is deprecated and later excluded), which can silently drop 'location'.
value_columns = [col for col in key_columns if col != 'location']
clean_df[value_columns] = clean_df.groupby('location')[value_columns].ffill()
# NOTE(review): this also forward-fills the daily columns (new_cases,
# new_deaths), repeating the last reported daily count on gap days - the same
# as the original behaviour; confirm that is intended before using them.

# Calculate additional metrics. Zero denominators are masked to NaN so the
# ratios never become +/-inf (which would distort plots and aggregates).
clean_df['death_rate'] = (
    clean_df['total_deaths'] / clean_df['total_cases'].where(clean_df['total_cases'] > 0)
)
clean_df['vaccination_rate'] = (
    clean_df['people_vaccinated'] / clean_df['population'].where(clean_df['population'] > 0)
)

In [None]:
# Plot total cases over time.
# The y-axis shows raw cumulative counts on a logarithmic scale, so the label
# says exactly that (the values are not expressed in millions).
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=clean_df, x='date', y='total_cases', hue='location', ax=ax)
ax.set_title('Total COVID-19 Cases Over Time')
ax.set_ylabel('Total Cases (log scale)')
ax.set_yscale('log')
plt.show()

# Plot total deaths over time (linear scale)
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=clean_df, x='date', y='total_deaths', hue='location', ax=ax)
ax.set_title('Total COVID-19 Deaths Over Time')
ax.set_ylabel('Total Deaths')
plt.show()

# Compare death rates.
# groupby(...).last() takes the last non-null value of every column per
# country; latest_data is reused by the vaccination cell further down.
latest_data = clean_df.groupby('location').last().reset_index()
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=latest_data, x='location', y='death_rate', ax=ax)
ax.set_title('Case Fatality Rate by Country')
ax.set_ylabel('Death Rate (deaths/cases)')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Plot vaccination progress.
# vaccination_rate is people_vaccinated / population, i.e. a 0-1 fraction.
# The chart is labelled as a percentage, so plot the value scaled by 100.
vaccination_progress = clean_df.assign(
    vaccination_pct=clean_df['vaccination_rate'] * 100
)
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=vaccination_progress, x='date', y='vaccination_pct',
             hue='location', ax=ax)
ax.set_title('Vaccination Progress Over Time')
ax.set_ylabel('Percentage of Population Vaccinated')
plt.show()

# Latest vaccination status, highest coverage first.
# latest_data comes from the death-rate comparison cell above; the extra
# vaccination_pct column is added on a new frame so latest_data is untouched.
latest_vacc = (
    latest_data
    .sort_values('vaccination_rate', ascending=False)
    .assign(vaccination_pct=lambda frame: frame['vaccination_rate'] * 100)
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=latest_vacc, x='location', y='vaccination_pct', ax=ax)
ax.set_title('Percentage of Population Vaccinated by Country')
ax.set_ylabel('Vaccination Rate (% of population)')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Prepare data for choropleth.
# Rows whose iso_code starts with 'OWID_' are dataset-defined aggregates or
# special entities (e.g. World, continents), not ISO-3 country codes, so they
# cannot be drawn on an ISO-3 choropleth and are dropped up front.
# NOTE(review): confirm the 'OWID_' prefix convention against the dataset codebook.
is_aggregate = covid_df['iso_code'].str.startswith('OWID_', na=False)

# Sort by date so that .last() (last non-null value per column) really yields
# the most recent figure per country instead of relying on the CSV row order.
latest_global = (
    covid_df.loc[~is_aggregate]
    .sort_values('date')
    .groupby(['iso_code', 'location'])
    .last()
    .reset_index()
)

# One entry per map: (column to colour by, colour scale, figure title)
choropleth_specs = [
    ("total_cases_per_million", px.colors.sequential.Plasma,
     "Total COVID-19 Cases Per Million People"),
    ("people_vaccinated_per_hundred", px.colors.sequential.Viridis,
     "Percentage of Population Vaccinated"),
]

# Create one choropleth per metric; both share the same layout and hover label
for color_column, color_scale, map_title in choropleth_specs:
    fig = px.choropleth(latest_global,
                        locations="iso_code",
                        color=color_column,
                        hover_name="location",
                        color_continuous_scale=color_scale,
                        title=map_title)
    fig.show()