# Importing Modules

In [None]:
import numpy as np # type: ignore
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import warnings as wn

In [None]:
# Silence every warning for the rest of the session (blanket "ignore" filter).
wn.filterwarnings("ignore")

# Preparing and Exploring Dataframe

In [None]:
# Load the CSV export into the `covid` dataframe and preview the first rows.
# The path is a raw string: a plain literal would treat "\p" and "\o" as
# (invalid) escape sequences, which newer Python versions warn about.
covid = pd.read_csv(r"D:\python\py files\owid-covid-data (1).csv")
covid.head()

In [None]:
# Display the dataframe dimensions as (rows, columns).
covid.shape

In [None]:
# Display pandas' descriptive summary statistics for the dataframe.
covid.describe()

In [None]:
# List every column name as a plain Python list.
covid.columns.tolist()

In [None]:
# Remove fully duplicated rows, then print how many duplicates are left
# as a sanity check.
covid = covid.drop_duplicates()
duplicate_rows_left = covid.duplicated().sum()
print(duplicate_rows_left)

In [None]:
# Dropping unnecessary columns from the dataframe: every column from
# "reproduction_rate" up to, but not including, "tests_units".
column_list = covid.columns.tolist()
del_index_1 = column_list.index("reproduction_rate")
del_index_2 = column_list.index("tests_units")

# Hand `drop` the column labels themselves. Passing a sliced DataFrame only
# worked because iterating a DataFrame happens to yield its column names.
columns_to_drop = column_list[del_index_1:del_index_2]
covid.drop(columns=columns_to_drop, inplace=True)
covid.shape

In [None]:
# Give the identifying columns capitalised names, then list the distinct
# values found in the Continent column.
renamed_columns = {
    'date': 'Date',
    'location': 'Country',
    'continent': 'Continent',
    'iso_code': 'ISO_code',
}
covid.rename(columns=renamed_columns, inplace=True)
continent_unique = list(covid['Continent'].unique())
continent_unique

In [None]:
# Count the missing values in each column of the dataframe.
covid.isna().sum()

In [None]:
# Fill missing values with a constant and wrap the result back into a
# dataframe that reuses the original column names.
# NOTE(review): `covid` mixes text and numeric columns, so the imputed frame
# may lose the numeric dtypes - confirm. `covid2` is not used by the cells
# visible below.
imputer = SimpleImputer(strategy='constant')
imputed_values = imputer.fit_transform(covid)
covid2 = pd.DataFrame(imputed_values, columns=covid.columns)

In [None]:
# Coerce the three total_* columns to numeric (unparseable entries become
# NaN), then sum them for every (Date, Country) pair.
value_columns = ['total_cases', 'total_deaths', 'total_vaccinations']
for column_name in value_columns:
    covid[column_name] = pd.to_numeric(covid[column_name], errors='coerce')

df = covid.groupby(['Date', 'Country'])[value_columns].sum().reset_index()
df

# Identifying Outliers Using IQR

In [None]:
# Quartiles of total_cases and the interquartile range between them.
Q1 = covid['total_cases'].quantile(0.25)
Q3 = covid['total_cases'].quantile(0.75)
IQR = Q3 - Q1

# Rows lying more than 1.5 * IQR outside the quartiles are flagged as outliers.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
case_counts = covid['total_cases']
covid["is_outlier"] = (case_counts < lower_fence) | (case_counts > upper_fence)

# Keep only the rows that were not flagged, then show the new shape.
covid = covid[~covid["is_outlier"]]
covid.shape

# Plot countries where total_deaths is greater than 1000000

In [None]:
# Rows of the grouped frame whose total_deaths exceeds 1,000,000, and the
# distinct countries those rows belong to.
exceeds_million_deaths = df['total_deaths'] > 1000000
df2 = df[exceeds_million_deaths]
countries = df2['Country'].unique()
len(countries)

In [None]:
# Keep the selected country names as a plain Python list for later cells.
country_deaths_greaterthan1000000 = countries.tolist()
country_deaths_greaterthan1000000

In [None]:
# One scatter chart per selected country, plotting the three totals against
# the row position within that country's slice of df2.
# NOTE(review): df2 only holds rows with total_deaths > 1000000, so x = 0 is
# the first such row for the country - confirm the x-axis label still fits.
series_styles = (
    ("total_cases", "blue"),
    ("total_deaths", "red"),
    ("total_vaccinations", "green"),
)
for country in countries:
    C = df2[df2['Country'] == country].reset_index()
    row_positions = np.arange(len(C))
    for column_name, point_color in series_styles:
        plt.scatter(row_positions, C[column_name], color=point_color, label=column_name)
    plt.title(country)
    plt.xlabel("Number of days since first suspect")
    plt.ylabel("Number of cases")
    plt.legend()
    plt.show()

# How have the numbers of COVID-19 cases and deaths evolved over time?

In [None]:
# Build the per-date totals of cases and deaths across all rows.
col = covid[['Date', 'total_cases', 'total_deaths']]

# Convert this frame's own Date column. Taking the dates from `df` would
# align on index labels, and `df` was re-indexed after grouping, so rows in
# `covid` would receive unrelated (or missing) dates.
covid['Date'] = pd.to_datetime(covid['Date'])

# Rows without both totals cannot contribute to the trend lines.
covid = covid.dropna(subset=['total_cases', 'total_deaths'])

# NOTE(review): this sums every row that shares a date; if the data also
# contains aggregate locations, they are counted too - confirm.
global_trends = covid.groupby('Date').agg({
    'total_cases': 'sum',
    'total_deaths': 'sum'
}).reset_index()

In [None]:
# Draw the global totals as two lines on a single wide figure.
plt.figure(figsize=(14, 7))

trend_lines = (
    ('total_cases', 'Total Cases', 'blue'),
    ('total_deaths', 'Total Deaths', 'red'),
)
for column_name, line_label, line_color in trend_lines:
    plt.plot(global_trends['Date'], global_trends[column_name], label=line_label, color=line_color)

plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Global COVID-19 Cases and Deaths Over Time')
plt.legend()
plt.show()

# Plotting trends for the countries that have more than 1,000,000 deaths

In [None]:
# Plot cases and deaths over time for each country in
# country_deaths_greaterthan1000000, one figure per country.
for country_name in country_deaths_greaterthan1000000:
    country_df = covid[covid['Country'] == country_name]

    # Nothing to draw if no rows for this country remain in `covid`.
    if country_df.empty:
        continue

    # Aggregate per date for this country.
    country_trends = country_df.groupby('Date').agg({
        'total_cases': 'sum',
        'total_deaths': 'sum'
    }).reset_index()

    plt.figure(figsize=(14, 7))

    plt.plot(country_trends['Date'], country_trends['total_cases'], label='Total Cases', color='blue')
    plt.plot(country_trends['Date'], country_trends['total_deaths'], label='Total Deaths', color='red')

    plt.xlabel('Date')
    plt.ylabel('Count')
    # Title names the country being plotted rather than a fixed one.
    plt.title(f'COVID-19 Cases and Deaths in {country_name} Over Time')
    plt.legend()
    plt.grid(True)
    plt.show()

# What is the distribution of COVID-19 cases across different countries?

In [None]:
# Country / total_cases pairs with missing counts and repeated pairs removed.
dist = (
    covid[['Country', 'total_cases']]
    .dropna(subset=['total_cases'])
    .drop_duplicates()
)

# Largest total_cases value per country; this is treated as the latest
# reported figure on the assumption that the column is cumulative.
latest_cases = dist.groupby('Country')['total_cases'].max().reset_index()

In [None]:
# Horizontal bar chart of total cases per country, largest value first.
latest_cases_sorted = latest_cases.sort_values(by='total_cases', ascending=False)

# Tall figure so that every country label gets its own row.
plt.figure(figsize=(16, 45))
sns.barplot(
    data=latest_cases_sorted,
    x='total_cases',
    y='Country',
    palette='viridis',
)

plt.title('Distribution of COVID-19 Cases Across Different Countries/Regions')
plt.xlabel('Total Cases')
plt.ylabel('Country/Region')
plt.show()