# Day 1: COVID-19 Data Exploration

This notebook loads and explores the COVID-19 dataset from Our World in Data.

In [None]:
# Optional setup: uncomment the %pip line below if pandas, matplotlib or seaborn is not installed yet
# %pip install pandas matplotlib seaborn

In [None]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display configuration
sns.set(style='whitegrid')  # seaborn theme with the white-grid axes style
pd.options.display.max_columns = None  # do not truncate columns when a frame is rendered

In [None]:
# Read the OWID COVID-19 CSV straight from its URL, then show its (rows, columns) shape
url = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
df = pd.read_csv(filepath_or_buffer=url)
df.shape

In [None]:
# Peek at the first five rows of the raw frame
df.head(n=5)

In [None]:
# Print column dtypes / non-null counts, then list the ten columns with the most missing values
df.info()
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).iloc[:10]

In [None]:
# Keep only the columns used in the rest of the analysis
columns_of_interest = [
    'location', 'date', 'total_cases', 'new_cases', 'total_deaths',
    'people_vaccinated', 'population', 'tests_per_case', 'reproduction_rate'
]
# Build the subset and convert 'date' in a single expression. .assign() returns a
# new DataFrame, so nothing is written into a frame derived from `df` by indexing
# (the pattern that triggers pandas' SettingWithCopyWarning), and the cell can be
# re-run safely.
df_subset = df[columns_of_interest].assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
df_subset.head()

In [None]:
# Narrow the subset down to the two countries being compared (India and the United States)
countries = ['India', 'United States']
is_selected_country = df_subset['location'].isin(countries)
df_sample = df_subset.loc[is_selected_country]
df_sample.head()

In [None]:
# Plot total cases over time, one line per country (India and USA).
# An explicit Figure/Axes pair is used instead of pyplot's implicit "current axes"
# so every call below acts on this figure only.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df_sample, x='date', y='total_cases', hue='location', ax=ax)
ax.set_title('Total COVID-19 Cases Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Cases')
ax.tick_params(axis='x', labelrotation=45)  # slant the date labels so they do not overlap
fig.tight_layout()
plt.show()

In [None]:
# Save the column subset (all locations) as a CSV file.
# Manual follow-up steps, done outside this notebook: upload the file to the
# Azure Blob Storage container and create the Unity Catalog metastore.
output_path = Path('data-samples/raw/owid_subset.csv')
# to_csv does not create missing folders, so make sure the target directory exists first
output_path.parent.mkdir(parents=True, exist_ok=True)
df_subset.to_csv(output_path, index=False)

I used the following video to create the metastore needed to proceed to Day 2:

https://www.youtube.com/watch?v=4uKRzDf0zIc