In [None]:
# 🦠 COVID-19 Data Analysis & Visualization
### A complete exploratory analysis of the global COVID-19 dataset

This notebook includes:
- Data loading & cleaning  
- Missing value handling  
- Country name standardization  
- Trend analysis  
- Daily case evolution  
- Country comparison (India vs USA etc.)  
- Mortality & recovery rates  
- Global top-10 visualizations  
- Time-series trend plots  

In [None]:
### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import country_converter as coco
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
### 📌 Step 1 — Load & Inspect Data

In [None]:
# Load the raw dataset from the CSV file located next to the notebook.
covid_df = pd.read_csv("covid_19_data.csv")

# Only the last expression of a cell is rendered automatically, so the shape
# has to be printed explicitly; as a bare expression it was silently discarded.
print(covid_df.shape)
covid_df.info()
covid_df.head()

In [None]:
### 📌 Step 2 — Find Missing Values

In [None]:
def missing_values(df):
    """Count missing entries per column, keeping only columns that have any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is the number of missing cells in
        that column. Columns without missing cells are left out.
    """
    null_counts = df.isnull().sum()
    has_gaps = null_counts.gt(0)
    return null_counts.loc[has_gaps]

# Show which columns of the loaded dataset contain missing values, and how many.
missing_values(covid_df)

In [None]:
### 📌 Step 3 — Standardize Country Names

In [None]:
cc = coco.CountryConverter()

# Manual overrides applied before the automatic conversion.
mapping_country = {
    'Others':'Unknown',
    # Northern Ireland is part of the United Kingdom, not the Republic of Ireland.
    'North Ireland': 'United Kingdom',
    'Channel Islands': 'United Kingdom',
    'Diamond Princess': 'Japan',
    'MS Zaandam': 'Netherlands'
}

covid_df['Country/Region'] = covid_df['Country/Region'].replace(mapping_country)

# Convert each distinct label once instead of converting every row.
unique_countries = covid_df['Country/Region'].unique()
converted = cc.convert(names=unique_countries, to='name_short', not_found='Unknown')

# convert() returns a bare string (not a list) when it is given a single name;
# wrap it so zip() pairs label -> name instead of label -> first character.
if isinstance(converted, str):
    converted = [converted]

country_map = dict(zip(unique_countries, converted))

covid_df['Country/Region'] = covid_df['Country/Region'].map(country_map)

In [None]:
### 📌 Step 4 — Compare Total Confirmed Cases Between Two Countries

In [None]:
def compare_country_conf_cases(df,col):
    """Return the latest confirmed total for each of two countries.

    For every country, 'Confirmed' is summed per observation date and the sum
    for the most recent date is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'ObservationDate', 'Country/Region' and 'Confirmed'.
        The frame is not modified.
    col : sequence of str
        Exactly two country names; matching is case-insensitive.

    Returns
    -------
    tuple or str
        ``(total_first, total_second)`` when two names are given, otherwise
        the message ``"Please provide exactly 2 countries"``.

    Raises
    ------
    IndexError
        If one of the countries has no rows in ``df``.
    """
    if len(col) != 2:
        return "Please provide exactly 2 countries"

    # Parse into local Series so the caller's frame is left untouched.
    dates = pd.to_datetime(df['ObservationDate'])
    countries = df['Country/Region'].str.lower()

    def latest_total(country):
        mask = countries == country.lower()
        # groupby sorts its keys, so the last entry belongs to the latest date.
        per_day = df.loc[mask, 'Confirmed'].groupby(dates[mask]).sum()
        if per_day.empty:
            # IndexError is kept (the old .iloc[-1] raised it) but now says why.
            raise IndexError(f"No rows found for country {country!r}")
        return per_day.iloc[-1]

    return latest_total(col[0]), latest_total(col[1])

In [None]:
### 📌 Step 5 — COVID-19 Cases Over Time (Country Level)

In [None]:
def covid_over_time(df,country):
    """Plot the per-date sum of confirmed cases for one country.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'ObservationDate', 'Country/Region' and 'Confirmed'.
        The frame is not modified.
    country : str
        Country name; matching is case-insensitive. A name with no matching
        rows produces an empty plot.
    """
    # Parse into a local Series instead of overwriting the caller's column.
    dates = pd.to_datetime(df['ObservationDate'])
    mask = df['Country/Region'].str.lower() == country.lower()
    grouped = df.loc[mask, 'Confirmed'].groupby(dates[mask]).sum()

    plt.figure(figsize=(10, 5))
    plt.plot(grouped.index, grouped.values)
    # The dash in the title was mis-encoded; a real em dash is used here.
    plt.title(f"COVID-19 Confirmed Cases Over Time — {country}")
    plt.xlabel("Date")
    plt.ylabel("Confirmed Cases")
    plt.show()

In [None]:
### 📌 Step 6 — Top 10 Countries by Deaths

In [None]:
def top_deaths(df):
    """Bar-plot the ten countries with the highest death totals.

    The counts are treated as running totals per observation date, which is
    how the rest of this notebook reads them (the latest per-date sum is the
    total; daily figures come from differencing). Adding up every date would
    count each death once per day it has been on record, so each country's
    total is taken from its most recent observation date instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Country/Region' and 'Deaths'. When 'ObservationDate' is
        present it is used to pick the latest snapshot per country; without
        it the frame is treated as a single snapshot and summed as-is.
        The frame is not modified.
    """
    if 'ObservationDate' in df.columns:
        dates = pd.to_datetime(df['ObservationDate'])
        # Sum provinces within each (country, date), then keep the latest date.
        per_day = df.groupby(['Country/Region', dates])['Deaths'].sum()
        totals = per_day.groupby(level=0).last()
    else:
        totals = df.groupby('Country/Region')['Deaths'].sum()

    deaths = totals.sort_values(ascending=False).head(10)
    deaths.plot(kind='bar', figsize=(8,4), color='red')
    plt.title("Top 10 Countries by Deaths")
    plt.xlabel("Country")
    plt.ylabel("Deaths")
    plt.show()

In [None]:
### 📌 Step 7 — Recovery Rate & Mortality Rate

In [None]:
def _country_rate(df, numerator_col):
    """Percentage of ``numerator_col`` relative to 'Confirmed', per country.

    The counts are treated as running totals per observation date, so the
    rate is computed from each country's most recent date rather than from
    the sum over all dates. A frame without an 'ObservationDate' column is
    treated as a single snapshot and summed as-is.

    Rows whose country is missing are skipped. Countries with no confirmed
    cases get a rate of 0. The input frame is not modified.
    """
    cols = [numerator_col, 'Confirmed']
    if 'ObservationDate' in df.columns:
        dates = pd.to_datetime(df['ObservationDate'])
        # Sum provinces within each (country, date), then keep the latest date.
        per_day = df.groupby(['Country/Region', dates])[cols].sum()
        totals = per_day.groupby(level=0).last()
    else:
        totals = df.groupby('Country/Region')[cols].sum()

    confirmed = totals['Confirmed']
    rate = totals[numerator_col] / confirmed * 100
    # Division by a zero total yields inf/NaN; report 0 for those countries.
    return rate.where(confirmed > 0, 0).to_dict()


def recovery_rate(df):
    """Return ``{country: recovered / confirmed * 100}`` for every country.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Country/Region', 'Recovered' and 'Confirmed';
        'ObservationDate' is used when present to select the latest snapshot.

    Returns
    -------
    dict
        Recovery percentage per country; 0 where nothing was confirmed.
    """
    return _country_rate(df, 'Recovered')


def mortality_rate(df):
    """Return ``{country: deaths / confirmed * 100}`` for every country.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Country/Region', 'Deaths' and 'Confirmed';
        'ObservationDate' is used when present to select the latest snapshot.

    Returns
    -------
    dict
        Mortality percentage per country; 0 where nothing was confirmed.
    """
    return _country_rate(df, 'Deaths')

In [None]:
### 📌 Step 8 — Daily New Confirmed Cases (Global)

In [None]:
def daily_cases(df):
    """Plot day-over-day increases of the global confirmed total.

    'Confirmed' is summed per calendar day and differenced; negative
    differences are clipped to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'ObservationDate' and 'Confirmed'. Values of 'Confirmed'
        that cannot be parsed as numbers are ignored. The frame is not
        modified.
    """
    # Work on parsed copies so the caller's columns keep their original dtype.
    frame = pd.DataFrame({
        'ObservationDate': pd.to_datetime(df['ObservationDate']),
        'Confirmed': pd.to_numeric(df['Confirmed'], errors='coerce'),
    })

    # min_count=1 leaves days without data as NaN instead of 0; carrying the
    # previous total forward stops a gap from showing up as a drop to zero
    # followed by a spike equal to the whole running total.
    totals = (
        frame.resample('D', on='ObservationDate')['Confirmed']
        .sum(min_count=1)
        .ffill()
    )
    daily = totals.diff().clip(lower=0)

    daily.plot(figsize=(10,4))
    plt.title("Daily Global New COVID-19 Cases")
    plt.xlabel("Date")
    plt.ylabel("New Confirmed Cases")
    plt.show()

In [None]:
# 📌 Step 9 — Predict COVID-19 Cases for the Next 14 Days (Machine Learning)

In this section, we use a simple Linear Regression model to forecast future confirmed cases.
This demonstrates:

- Feature creation  
- Model training  
- Prediction  
- Visualization of future trends  

In [None]:
# Parse the observation dates so they support date arithmetic.
covid_df['ObservationDate'] = pd.to_datetime(covid_df['ObservationDate'])

# One row per observation date holding the worldwide sum of 'Confirmed'.
daily_confirmed = (
    covid_df
    .groupby('ObservationDate')['Confirmed']
    .sum()
    .reset_index()
)

# Numeric feature: whole days elapsed since the earliest observation date.
elapsed = daily_confirmed['ObservationDate'] - daily_confirmed['ObservationDate'].min()
daily_confirmed['Day'] = elapsed.dt.days

# Model inputs: a single-column feature frame and the target series.
y = daily_confirmed['Confirmed']
X = daily_confirmed[['Day']]

In [None]:
# Fit a straight line to the confirmed totals as a function of the day number.
# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X, y)

print("Model trained successfully!")

In [None]:
# Length of the forecast window, in days.
FORECAST_DAYS = 14

# Last available day number
last_day = daily_confirmed['Day'].max()

# Day numbers of the forecast window as a column vector, one row per day.
future_days = np.arange(last_day + 1, last_day + 1 + FORECAST_DAYS).reshape(-1, 1)

# The model was fitted on a DataFrame, so predict on a frame carrying the same
# column name(s); passing the bare array makes scikit-learn warn that the
# input lacks the feature names seen during fit.
future_features = pd.DataFrame(future_days, columns=X.columns)
future_predictions = model.predict(future_features)

future_df = pd.DataFrame({
    "Day_Number": future_days.flatten(),
    "Predicted_Confirmed": future_predictions
})

future_df

In [None]:
# Turn each forecast day number into a calendar date by offsetting it from
# the last observed date.
last_observed_date = daily_confirmed['ObservationDate'].max()
day_offsets = future_df['Day_Number'] - last_day
future_dates = last_observed_date + pd.to_timedelta(day_offsets, unit='D')

plt.figure(figsize=(12,6))

# Observed history
plt.plot(
    daily_confirmed['ObservationDate'],
    daily_confirmed['Confirmed'],
    label="Actual Confirmed Cases",
    linewidth=2,
)

# Forecast window, drawn dashed in red to set it apart from the history
plt.plot(
    future_dates,
    future_df['Predicted_Confirmed'],
    label="Predicted Next 14 Days",
    linestyle='--',
    color='red',
    linewidth=2,
)

plt.title("COVID-19 Confirmed Cases Forecast (Next 14 Days)", fontsize=14)
plt.xlabel("Date")
plt.ylabel("Confirmed Cases")
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend()
plt.show()