<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/mthree-422-srilatha/etlpractice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
!pip install -q pandas numpy sqlalchemy
import pandas as pd
import numpy as np
from sqlalchemy import create_engine


In [39]:
# Load data directly from the URL
url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
df_raw = pd.read_csv(url)

# Quick check
df_raw.head()


Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/28/23,3/1/23,3/2/23,3/3/23,3/4/23,3/5/23,3/6/23,3/7/23,3/8/23,3/9/23
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,209322,209340,209358,209362,209369,209390,209406,209436,209451,209451
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,334391,334408,334408,334427,334427,334427,334427,334427,334443,334457
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,271441,271448,271463,271469,271469,271477,271477,271490,271494,271496
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,47866,47875,47875,47875,47875,47875,47875,47875,47890,47890
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,105255,105277,105277,105277,105277,105277,105277,105277,105288,105288


In [40]:
# Step 4: Clean and Reshape

# Safe fillna assignment
df_raw['Country/Region'] = df_raw['Country/Region'].fillna('Unknown')

# Drop unnecessary columns
df = df_raw.drop(columns=['Lat', 'Long', 'Province/State'])

# Group by country and sum provinces
df_grouped = df.groupby('Country/Region').sum().reset_index()

# Reshape from wide to long
df_long = df_grouped.melt(id_vars=["Country/Region"], var_name="Date", value_name="Confirmed")

# Convert 'Date' safely
df_long["Date"] = pd.to_datetime(df_long["Date"], format='%m/%d/%y')  # known format

# Step 5: Aggregate monthly and engineer features

# Extract month
df_long["Month"] = df_long["Date"].dt.to_period("M").dt.to_timestamp()

# Aggregate confirmed cases per country per month
df_monthly = df_long.groupby(["Country/Region", "Month"])["Confirmed"].sum().reset_index()

# Calculate Monthly Increase
df_monthly["Monthly_Increase"] = df_monthly.groupby("Country/Region")["Confirmed"].diff().fillna(0)

# Calculate Growth Rate
df_monthly["Growth_Rate"] = df_monthly["Monthly_Increase"] / df_monthly.groupby("Country/Region")["Confirmed"].shift(1)
df_monthly["Growth_Rate"] = df_monthly["Growth_Rate"].fillna(0)



In [41]:
# Sort for lag calculation
df_monthly.sort_values(by=["Country/Region", "Month"], inplace=True)

# Monthly case increase
df_monthly["Monthly_Increase"] = df_monthly.groupby("Country/Region")["Confirmed"].diff().fillna(0)

# Growth rate: increase / previous total
df_monthly["Growth_Rate"] = df_monthly["Monthly_Increase"] / df_monthly.groupby("Country/Region")["Confirmed"].shift(1)
df_monthly["Growth_Rate"].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_monthly["Growth_Rate"].fillna(0, inplace=True)


In [42]:
from sqlalchemy import create_engine

# Create SQLite engine
engine = create_engine('sqlite:///covid_global.db', echo=False)

# Save to SQL table
df_monthly.to_sql("covid_monthly", con=engine, if_exists="replace", index=False)


7839

In [43]:
query = """
SELECT [Country/Region], Month, Confirmed
FROM covid_monthly
WHERE Month = '2021-04-01'
ORDER BY Confirmed DESC
LIMIT 5;
"""
pd.read_sql(query, con=engine)


Unnamed: 0,Country/Region,Month,Confirmed


In [44]:
selected_countries = ('India', 'United States', 'Brazil')

query = f"""
SELECT [Country/Region], Month, Growth_Rate
FROM covid_monthly
WHERE [Country/Region] IN {selected_countries}
ORDER BY [Country/Region], Month;
"""
pd.read_sql(query, con=engine)


Unnamed: 0,Country/Region,Month,Growth_Rate
0,Brazil,2020-01-01 00:00:00.000000,0.000000
1,Brazil,2020-02-01 00:00:00.000000,inf
2,Brazil,2020-03-01 00:00:00.000000,7424.400000
3,Brazil,2020-04-01 00:00:00.000000,27.241064
4,Brazil,2020-05-01 00:00:00.000000,6.619181
...,...,...,...
73,India,2022-11-01 00:00:00.000000,-0.031419
74,India,2022-12-01 00:00:00.000000,0.033569
75,India,2023-01-01 00:00:00.000000,0.000120
76,India,2023-02-01 00:00:00.000000,-0.096705


In [45]:
query = """
SELECT [Country/Region], Month
FROM covid_monthly
WHERE Confirmed = 0 AND Month = '2020-03-01';
"""
pd.read_sql(query, con=engine)


Unnamed: 0,Country/Region,Month
