In [1]:
# Download the dataset
url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
!wget -qO covid_confirmed.csv $url

In [2]:
import pandas as pd

df_raw = pd.read_csv("covid_confirmed.csv")
print(df_raw.head())

  Province/State Country/Region       Lat       Long  1/22/20  1/23/20  \
0            NaN    Afghanistan  33.93911  67.709953        0        0   
1            NaN        Albania  41.15330  20.168300        0        0   
2            NaN        Algeria  28.03390   1.659600        0        0   
3            NaN        Andorra  42.50630   1.521800        0        0   
4            NaN         Angola -11.20270  17.873900        0        0   

   1/24/20  1/25/20  1/26/20  1/27/20  ...  2/28/23  3/1/23  3/2/23  3/3/23  \
0        0        0        0        0  ...   209322  209340  209358  209362   
1        0        0        0        0  ...   334391  334408  334408  334427   
2        0        0        0        0  ...   271441  271448  271463  271469   
3        0        0        0        0  ...    47866   47875   47875   47875   
4        0        0        0        0  ...   105255  105277  105277  105277   

   3/4/23  3/5/23  3/6/23  3/7/23  3/8/23  3/9/23  
0  209369  209390  209406  2

In [3]:
# Drop unnecessary columns and melt the date columns
df = df_raw.drop(columns=["Lat", "Long", "Province/State"])

# Wide to long format
df_long = df.melt(id_vars=["Country/Region"], var_name="Date", value_name="Confirmed")

# Convert Date to datetime
df_long["Date"] = pd.to_datetime(df_long["Date"])

# Group by country and date (some countries have multiple entries)
df_grouped = df_long.groupby(["Country/Region", "Date"]).sum().reset_index()
df_grouped.rename(columns={"Country/Region": "Country"}, inplace=True)

print(df_grouped.head())


       Country       Date  Confirmed
0  Afghanistan 2020-01-22          0
1  Afghanistan 2020-01-23          0
2  Afghanistan 2020-01-24          0
3  Afghanistan 2020-01-25          0
4  Afghanistan 2020-01-26          0


  df_long["Date"] = pd.to_datetime(df_long["Date"])


In [4]:
# Add 'Month' column for monthly grouping
df_grouped["Month"] = df_grouped["Date"].dt.to_period("M").dt.to_timestamp()

# Get monthly last available value (cumulative confirmed)
df_monthly = df_grouped.sort_values("Date").groupby(["Country", "Month"]).last().reset_index()

# Calculate monthly increase
df_monthly["Monthly_Increase"] = df_monthly.groupby("Country")["Confirmed"].diff().fillna(0)

# Calculate growth rate (avoid division by zero)
df_monthly["Growth_Rate"] = df_monthly.groupby("Country")["Confirmed"].pct_change().fillna(0).replace([float('inf'), -float('inf')], 0)

print(df_monthly.head())


       Country      Month       Date  Confirmed  Monthly_Increase  Growth_Rate
0  Afghanistan 2020-01-01 2020-01-31          0               0.0     0.000000
1  Afghanistan 2020-02-01 2020-02-29          5               5.0     0.000000
2  Afghanistan 2020-03-01 2020-03-31        166             161.0    32.200000
3  Afghanistan 2020-04-01 2020-04-30       1827            1661.0    10.006024
4  Afghanistan 2020-05-01 2020-05-31      15180           13353.0     7.308703


In [6]:
# to SQLite

import sqlite3

# Connect to SQLite and save table
conn = sqlite3.connect("covid_etl.db")
df_monthly.to_sql("covid_data", conn, if_exists="replace", index=False)

print("Data loaded to SQLite.")


Data loaded to SQLite.


In [9]:
# Top 5 Countries by Confirmed Cases in a Selected Month
month = "2021-01-01"  # Change as needed

query_top5 ="SELECT Country, Confirmed FROM covid_data WHERE Month = '{month}' ORDER BY Confirmed DESC LIMIT 5;"


top5_df = pd.read_sql(query_top5, conn)
print(top5_df)


Empty DataFrame
Columns: [Country, Confirmed]
Index: []


In [12]:
# Compare Monthly Growth Rates for Selected Countries
countries = ("India", "Brazil", "Russia")

query_growth = f"""
SELECT Country, Month, ROUND(Growth_Rate * 100, 2) AS GrowthPercent
FROM covid_data
WHERE Country IN {countries}
ORDER BY Country, Month;
"""

growth_df = pd.read_sql(query_growth, conn)
print(growth_df.head())



  Country                Month  GrowthPercent
0  Brazil  2020-01-01 00:00:00           0.00
1  Brazil  2020-02-01 00:00:00           0.00
2  Brazil  2020-03-01 00:00:00      285750.00
3  Brazil  2020-04-01 00:00:00        1425.05
4  Brazil  2020-05-01 00:00:00         491.99


In [13]:
# Identify countries with zero reported cases for given time periods.

month_zero = "2020-03-01"

query_zero = f"""
SELECT Country
FROM covid_data
WHERE Month = '{month_zero}' AND Confirmed = 0;
"""

zero_df = pd.read_sql(query_zero, conn)
print(zero_df)


Empty DataFrame
Columns: [Country]
Index: []
