In [None]:
!pip install prophet

In [None]:
!pip install tslearn

# Introduction

The COVID-19 pandemic has profoundly impacted the world, presenting unprecedented challenges in healthcare, economy, and social dynamics. Understanding the spread and impact of the virus through data-driven approaches has become crucial for effective decision-making and policy formulation. This project aims to visualize and explore COVID-19 time series data, employing clustering techniques to identify patterns and insights across different regions. By analyzing data from Johns Hopkins University, a leading source for COVID-19 tracking, this project seeks to uncover trends and correlations that can aid in understanding the pandemic's trajectory.

# Background

Since the outbreak of COVID-19 in late 2019, data scientists and researchers have focused on tracking and analyzing the virus's spread globally. Johns Hopkins University has been at the forefront, providing comprehensive datasets that detail daily reports of COVID-19 cases, recoveries, and deaths across countries and regions. These datasets have become invaluable resources for researchers and policymakers, offering insights into the dynamics of the pandemic.

The project leverages this data to perform time series analysis, utilizing visualization techniques to present the data intuitively and engagingly. Furthermore, clustering algorithms are applied to group regions with similar pandemic trends, enabling a deeper understanding of how different factors may influence the virus's spread. By exploring patterns in the data, this project aims to contribute to the broader efforts in managing and mitigating the impacts of COVID-19.

**Key Stakeholders:**

**Healthcare Providers:** Need insights to manage hospital resources and patient care.

**Government Agencies:** Use data to inform public health policies and restrictions.

**Researchers and Academics:** Require data for ongoing studies and publications.

**General Public:** Benefits from understanding the spread and impact of COVID-19.

# Import libraries

In [None]:
# @title
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import plotly.express as px
from plotly.offline import init_notebook_mode,iplot
import plotly.graph_objs as go

import pandas as pd
import random
import math
import time
import datetime
import operator

from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tslearn.clustering import silhouette_score
from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.datasets import CachedDatasets
from sklearn.decomposition import PCA



from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly ,add_changepoints_to_plot

#decomposed
from statsmodels.tsa.seasonal import seasonal_decompose
%matplotlib inline
import warnings

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old aliases, so use whichever name this install provides
# (falling back to the default style if neither exists).
_poster_style = next(
    (name for name in ('seaborn-v0_8-poster', 'seaborn-poster')
     if name in plt.style.available),
    'default',
)
plt.style.use(_poster_style)
# Suppress all warnings in the cell output.
warnings.filterwarnings("ignore")
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)


# Import data From Github

In [None]:
# @title
# Remote CSV files published in the CSSEGISandData/COVID-19 GitHub repository:
# two global time-series tables and one daily report.
_DATA_URLS = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/01-15-2023.csv',
)

# Download in the same order as the URLs are listed above.
confirmed_df, deaths_df, latest_data = (pd.read_csv(_url) for _url in _DATA_URLS)

# Exploring and Preparing Data

In [None]:
# @title
#Exploring Confirmed Data
# Preview the first rows of the confirmed-cases table.
confirmed_df.head()

In [None]:
# @title
# Preview the first rows of the deaths table.
deaths_df.head()

In [None]:
# @title
# Preview the first rows of the 01-15-2023 daily report.
latest_data.head()

Get all Time Series data from COVID-19

In [None]:
# @title
# Collapse the province-level rows into one cumulative series per country:
# drop the coordinate/province columns, then sum every date column per country.
confirmed_ts_by_country, death_ts_by_country = (
    _frame.drop(['Long','Lat','Province/State'], axis=1).groupby('Country/Region').sum()
    for _frame in (confirmed_df, deaths_df)
)

# Worldwide totals: sum over all countries for each date column.
global_ts_confirmed = confirmed_ts_by_country.sum()
global_ts_deaths = death_ts_by_country.sum()

In [None]:
# @title
import matplotlib.dates as mdates

# Helper functions
def get_time_series(ts):
    """
    Return a copy of a Pandas Series whose index is converted to DateTime and sorted.

    The index labels are parsed with the '%m/%d/%y' format. The input object is
    left untouched: previously the index was replaced and sorted in place, so a
    caller's series silently changed type after the first call and later plots
    depended on the order in which notebook cells had been executed.

    Parameters:
    - ts: Pandas Series indexed by '%m/%d/%y' date strings (or by datetimes)

    Returns:
    - A new Series with a sorted DatetimeIndex
    """
    converted = ts.copy()
    converted.index = pd.to_datetime(converted.index, format='%m/%d/%y')
    return converted.sort_index()

def get_rate_of_change(ts):
    """
    Return the day-over-day difference of a Pandas Series.

    The series is first passed through get_time_series; each value then has
    the previous value subtracted from it, so the first entry is NaN.
    The result is named 'Difference'.
    """
    ordered = get_time_series(ts)
    previous = ordered.shift(1)
    change = ordered - previous
    change.name = 'Difference'
    return change


def plotly_plot(ts_list, chart, title, x_title='Date', y_title='Person', ma=False, dir_plot=False, identifier=None, data=True):
    """
    Plot one or more time series, and optionally their moving averages, using Plotly.

    Parameters:
    - ts_list: A Pandas Series, or a list of Series, indexed by date
      ('%m/%d/%y' strings or datetimes)
    - chart: Type of chart to plot ('line' or 'bar')
    - title: Title of the chart
    - x_title: Title of the x-axis
    - y_title: Title of the y-axis
    - ma: If True, also plots a 14-point rolling mean of each series
    - dir_plot: If True, plots the values as given; otherwise plots their
      day-over-day rate of change
    - identifier: Legend label(s): a list with one label per series, or a single
      string used for every series (defaults to 'Data')
    - data: If True, draws the series themselves; if False only the moving
      averages (when ma=True) are drawn

    Raises:
    - ValueError: if chart is not 'line' or 'bar', or if fewer labels than
      series are supplied
    """
    # Validate up front so an invalid chart type is reported even when data=False,
    # and so data=False no longer trips the "unsupported chart" error.
    if chart not in ('line', 'bar'):
        raise ValueError("Unsupported chart type. Use 'line' or 'bar'.")

    if not isinstance(ts_list, list):
        ts_list = [ts_list]

    if identifier is None:
        identifier = ['Data'] * len(ts_list)
    elif isinstance(identifier, str):
        # A bare string would otherwise be indexed character by character.
        identifier = [identifier] * len(ts_list)
    if len(identifier) < len(ts_list):
        raise ValueError("identifier must provide one label per time series.")

    # Create the plot
    fig = go.Figure()

    for label, ts in zip(identifier, ts_list):
        if dir_plot:
            # Convert the index to real dates so the date tick format applies;
            # work on a copy so the caller's series is never modified here.
            ts = get_time_series(ts.copy())
        else:
            ts = get_rate_of_change(ts)

        # Prepare data for plotting
        df = pd.DataFrame(ts)
        df.reset_index(inplace=True)
        df.columns = ['Date', 'Value']

        # Plot main data
        if data:
            if chart == 'line':
                fig.add_trace(go.Scatter(x=df['Date'], y=df['Value'], mode='lines', name=f'{label}'))
            else:
                fig.add_trace(go.Bar(x=df['Date'], y=df['Value'], name=f'{label}', opacity=1))

        # Plot moving average if specified
        if ma:
            df['Moving Average'] = df['Value'].rolling(window=14).mean()
            fig.add_trace(go.Scatter(x=df['Date'], y=df['Moving Average'], mode='lines', name=f'{label} MA14'))

    # Customize layout
    fig.update_layout(
        title=title,
        xaxis_title=x_title,
        yaxis_title=y_title,
        xaxis=dict(
            tickformat='%Y-%m-%d',
            tickangle=90
        ),
        legend=dict(
            x=0.05,
            y=0.95,
            bgcolor='rgba(255, 255, 255, 0)',
            bordercolor='rgba(255, 255, 255, 0)'
        ),
        margin=dict(l=0, r=0, t=50, b=100),
        height=600,
        template='plotly_white'
    )

    # Show plot
    fig.show()


## Worldwide Overview

Investigate what happened to the number of deaths on 2023-01-15

In [None]:
# @title
# Compare the cumulative deaths of each country on two consecutive days.
# .copy() makes this an independent frame, so adding the 'diff' column below
# is not an assignment onto a slice of death_ts_by_country.
wanted_days = death_ts_by_country[['1/14/23','1/15/23']].copy()
wanted_days['diff'] = wanted_days['1/15/23'] - wanted_days['1/14/23']
# Countries with the largest one-day increase first.
wanted_days.sort_values('diff',ascending=False).head()

So in China there is a huge jump in the number of deaths between 1/14/23 and 1/15/23.
More investigation is needed to find out which province/state is responsible.

In [None]:
# @title
# Same comparison, restricted to the province-level rows of China.
# Selecting rows and columns in one .loc call and copying the result avoids
# assigning the 'diff' column onto a chained slice of deaths_df.
china_day15_1_23 = deaths_df.loc[
    deaths_df['Country/Region']=='China',
    ['Province/State','1/14/23','1/15/23'],
].copy()
china_day15_1_23['diff'] = china_day15_1_23['1/15/23'] - china_day15_1_23['1/14/23']
# Provinces with the largest one-day increase first.
china_day15_1_23.sort_values('diff',ascending=False).head()

In [None]:
# @title
# Plot the values of the row at position 89 from column position 4 onward.
# NOTE(review): presumably this is the China row with the largest jump found
# above — confirm; the positional index points at a different row once a row
# has been dropped from deaths_df.
deaths_df.iloc[89,4:].plot()

There appears to be a problem with this data that needs further investigation. For now there is no explanation; it may be a delayed report, but the state is unknown, so I am going to delete this time series.

In [None]:
# @title
# Remove the anomalous series by its index LABEL rather than by position.
# The positional form (deaths_df.index[89]) removed a different row every time
# this cell was re-run; dropping label 89 with errors='ignore' is idempotent.
# NOTE(review): assumes deaths_df still has the default 0..n-1 index from
# read_csv, so label 89 is the same row the positional lookup first pointed at.
deaths_df.drop(index=89, errors='ignore', inplace=True)

# Rebuild the per-country and worldwide death series without that row.
death_ts_by_country = deaths_df.drop(['Long','Lat','Province/State'],axis=1).groupby('Country/Region').sum()
global_ts_deaths = death_ts_by_country.sum()

In [None]:
# @title
# (series, cumulative-chart title, rate-chart title) for each worldwide metric.
_worldwide_views = [
    (global_ts_confirmed, 'Cumulative Confirmed Cases Worldwide', 'Rate of Confirmed Cases Worldwide'),
    (global_ts_deaths, 'Cumulative Death Cases Worldwide', 'Rate of Death Cases Worldwide'),
]

# Cumulative totals as lines, plotted as-is (no differencing).
print('Change Over Time')
for _series, _cumulative_title, _ in _worldwide_views:
    plotly_plot(_series, 'line', title=_cumulative_title, ma=True, dir_plot=True)

# Day-over-day changes as bars.
print('Rate of Change')
for _series, _, _rate_title in _worldwide_views:
    plotly_plot(_series, 'bar', title=_rate_title, ma=True)

## Checking Seasonality and Trend

### Confirmed Cases

In [None]:
# @title
# Daily worldwide change in confirmed cases, arranged in the two columns
# Prophet works with: 'y' (value) and 'ds' (date).
_daily_confirmed = get_rate_of_change(global_ts_confirmed)
sample_confirmed = pd.DataFrame({'y': _daily_confirmed, 'ds': _daily_confirmed.index})

# Fit an additive model and predict over the observed dates only (periods=0),
# since the goal here is the component decomposition, not a forecast.
c_m = Prophet(seasonality_mode='additive', changepoint_prior_scale=0.6)
c_m.fit(sample_confirmed)
future = c_m.make_future_dataframe(periods=0)
forecast = c_m.predict(future)

# Display the fitted components.
plot_components_plotly(c_m, forecast, figsize=(900, 400))

In [None]:
# @title
# Daily worldwide change in deaths, arranged in the two columns Prophet works
# with: 'y' (value) and 'ds' (date).
# NOTE: this cell reuses the names sample_confirmed / c_m / future / forecast,
# so after it runs they hold the DEATHS data and model.
_daily_deaths = get_rate_of_change(global_ts_deaths)
sample_confirmed = pd.DataFrame({'y': _daily_deaths, 'ds': _daily_deaths.index})

# Fit an additive model and predict over the observed dates only (periods=0).
c_m = Prophet(seasonality_mode='additive', changepoint_prior_scale=0.5)
c_m.fit(sample_confirmed)
future = c_m.make_future_dataframe(periods=0)
forecast = c_m.predict(future)

# Display the fitted components.
plot_components_plotly(c_m, forecast, figsize=(900, 400))

# Analysis of COVID-19 Confirmed Case Trends

## 1. Initial Phase of the Pandemic
In the early stages of the pandemic, the number of confirmed cases was relatively low. This may be attributed to several factors:
- **Detection Challenges**: During this period, the methods for identifying and diagnosing COVID-19 were still being developed.
- **Data Reporting Issues**: Some countries may have been underreporting or concealing actual case numbers.
- **Virus Spread**: The virus had not yet had sufficient time to spread widely.

## 2. Seasonality Observed from 2021
Starting from early 2021, there is a noticeable seasonal pattern in the rate of confirmed cases. We observe an increase in the number of confirmed cases approximately every 3-4 months. This recurring trend suggests a cyclical pattern in the spread of the virus.

## 3. Significant Surge in Confirmed Cases
There was a marked increase in the number of confirmed cases during December 2021 and April 2022. This spike warrants further investigation to understand the underlying causes, which could include factors such as new variants, changes in public health policies, or seasonal effects.

## 4. Decrease in Seasonality Post-April 2022
After April 2022, the seasonal pattern in case rates seems to diminish. This reduction in seasonality could indicate a stabilization in the spread of the virus or a shift in the pandemic dynamics.

# COVID-19 Death Trends Analysis

## 1. Early Death Rates
- **Initial Surge**: Death rates initially spiked due to limited knowledge about the virus and its symptoms. Early on, the virus was often mistaken for a common flu, leading to delayed and inadequate responses.

## 2. Seasonality of Death Rates
- **Recurring Patterns**: Similar to confirmed case numbers, death rates also showed seasonal fluctuations approximately every 4 months. This seasonality may be linked to changes in weather patterns in certain regions and inadequate preparedness for these changes.

## 3. Trends from 2020 to 2022
- **Rising and Falling Rates**: From January 2020 to January 2021, there was a notable upward trend in death rates. This was followed by a gradual decline from January 2021 to February 2022. The decrease in death rates can be attributed to several factors:
  - **Global Awareness**: Increased global awareness about the pandemic led to better prevention strategies.
  - **Vaccination Impact**: Although vaccines were available since late 2020, the decline in death rates was initially slow due to various reasons:
    - **Slow Vaccine Rollout**:
      - **Limited Supply**: Vaccine production and distribution were initially slow.
      - **Logistical Challenges**: Setting up vaccination sites and scheduling appointments took time.
    - **Vaccine Hesitancy**:
      - **Public Concerns**: Concerns about vaccine safety and misinformation caused reluctance.
      - **Access Issues**: Vaccine access was limited in some areas.
    - **New Variants**:
      - **Increased Spread**: Variants like Delta spread rapidly, complicating control efforts.
      - **Reduced Effectiveness**: Some variants reduced vaccine effectiveness, necessitating booster shots.
    - **Delayed Benefits**:
      - **Herd Immunity**: Achieving sufficient vaccination coverage for significant impact took time.
      - **Data Lag**: Analyzing the impact of vaccines took time.
    - **Healthcare System Stress**:
      - **Overwhelmed Hospitals**: The healthcare system was strained, affecting mortality rates.

## 4. Decline in Death Rates After February 2022
- **Increased Vaccination Coverage**:
  - **Higher Rates**: By 2022, a larger portion of the global population was vaccinated, including booster doses, which increased immunity and reduced severe cases.
  - **Effective Vaccines**: Vaccines proved highly effective in preventing severe illness and deaths.
- **Widespread Immunity**:
  - **Herd Immunity**: Higher vaccination rates and natural immunity from previous infections contributed to reduced virus spread.
- **Improved Treatments**:
  - **Advanced Therapies**: Enhanced medical treatments improved the management of severe cases and reduced mortality.
- **Adaptation to Variants**:
  - **Updated Vaccines**: New vaccines and boosters targeted emerging variants, improving protection.
  - **Adapted Strategies**: Public health strategies were updated based on new data.
- **Public Health Measures**:
  - **Ongoing Precautions**: Continued use of masks, social distancing, and hygiene measures helped reduce transmission.
- **Behavioral Changes**:
  - **Increased Awareness**: Greater public awareness led to better adherence to preventive guidelines.


# By Country

In [None]:
# @title
# Latest cumulative confirmed cases per country. The last column is taken by
# position and named directly, instead of renaming a hard-coded date label
# ('3/9/23') that silently stops matching if the last date ever differs.
sample = confirmed_ts_by_country.iloc[:, -1].to_frame('total_cases')
fig = px.treemap(sample, path=[sample.index], values='total_cases')
print('Tree Map For Number of confirmed cases By Country')
fig.show()

In [None]:
# @title
# Latest cumulative deaths per country. The last column is taken by position
# and named directly, instead of renaming a hard-coded date label ('3/9/23')
# that silently stops matching if the last date ever differs.
sample = death_ts_by_country.iloc[:, -1].to_frame('total_deaths')

fig = px.treemap(sample, path=[sample.index], values='total_deaths')
print('Tree Map For Number of Death By Country')

fig.show()

Spatial Analysis for confirmed cases animated over time

In [None]:
# Reshape the wide per-country table into long form with one row per
# (date, country) and columns ['date', 'confirmed', 'country'].
# The per-country frames are collected in a list and concatenated ONCE:
# concatenating inside the loop re-copied the growing result on every
# iteration and started from an empty object-dtype frame.
_country_frames = []
for _country, _row in confirmed_ts_by_country.iterrows():
    _frame = get_time_series(_row).to_frame(name='confirmed')
    # Move the date index into a regular 'date' column.
    _frame = _frame.rename_axis('date').reset_index()
    _frame['country'] = _country
    _country_frames.append(_frame)

if _country_frames:
    spatial_df = pd.concat(_country_frames)
else:
    # pd.concat rejects an empty list; keep the expected columns instead.
    spatial_df = pd.DataFrame(columns=['date','confirmed','country'])

In [None]:
# Cast the confirmed counts to plain integers.
spatial_df.confirmed = spatial_df.confirmed.astype(int)

In [None]:
!pip install -U kaleido


In [None]:
import plotly.express as px

# Colour stops (position in [0, 1], colour). The stops are packed towards the
# low end so that small values are still distinguishable from zero.
color_scale = [
    [0, 'rgba(255,255,255,1)'],     # white
    [0.01, 'rgba(255,255,204,1)'],  # light yellow
    [0.05, 'rgba(255,204,153,1)'],  # light orange
    [0.2, 'rgba(255,153,51,1)'],    # orange
    [0.5, 'rgba(255,102,0,1)'],     # dark orange
    [0.8, 'rgba(255,0,0,1)'],       # red
    [1, 'rgba(139,0,0,1)'],         # dark red
]

# One animation frame per date, countries matched by name.
fig = px.choropleth(
    spatial_df,
    locations='country',
    locationmode='country names',
    color='confirmed',
    hover_name='country',
    animation_frame='date',
    title='COVID-19 Confirmed Cases Over Time',
    width=1400,
    height=800,
    color_continuous_scale=color_scale,
)

# "Play" animates from the current frame with no delay between frames.
_play_button = {
    "args": [None, {"frame": {"duration": 0, "redraw": True}, "fromcurrent": True}],
    "label": "Play",
    "method": "animate",
}

# "Pause" animates to [None] in immediate mode, which halts the animation.
_pause_button = {
    "args": [[None], {"frame": {"duration": 0, "redraw": True}, "mode": "immediate", "transition": {"duration": 0}}],
    "label": "Pause",
    "method": "animate",
}

# Button group placed under the bottom-left corner of the map.
_animation_menu = {
    "buttons": [_play_button, _pause_button],
    "direction": "left",
    "pad": {"r": 10, "t": 87},
    "showactive": False,
    "type": "buttons",
    "x": 0.1,
    "xanchor": "right",
    "y": 0,
    "yanchor": "top",
}

fig.update_layout(updatemenus=[_animation_menu])

fig.show()




The pandemic's impact varied across countries due to several key factors, with more shared factors leading to greater effects. Some of these factors include:

- **Population Density**: Densely populated areas experienced faster virus spread.
- **Travel Connectivity**: High levels of international travel led to early outbreaks.
- **Healthcare Capacity**: Limited infrastructure resulted in higher mortality rates.
- **Government Response**: Timely measures controlled the spread, while delays worsened it.
- **Public Compliance**: Adherence to guidelines and vaccine acceptance influenced outcomes.
- **Socio-Economic Factors**: Economic disparities affected the ability to follow restrictions.
- **Vaccine Rollout**: The speed and efficiency of distribution impacted control efforts.
- **Emerging Variants**: New, more transmissible variants complicated containment.
- **Public Health Infrastructure**: Testing and tracing capabilities were crucial.
- **Cultural Attitudes**: Views on authority and health influenced compliance.

### Some Reasons for top 5 countries

- **United States**:
  - **High Population Density**: Urban areas like New York City saw rapid transmission due to dense populations.
  - **International Travel**: As a major global hub, exposure to international travelers was significant.
  - **Health Inequalities**: Disparities in healthcare access and pre-existing conditions contributed to higher mortality rates.
  - **Tourism and Trade Movement**: The U.S. had high levels of both international tourism and trade, increasing the potential for virus spread.

- **India**:
  - **Population Size**: Being highly populous, controlling the virus's spread across diverse regions was challenging.
  - **Healthcare System Strain**: The pandemic overwhelmed infrastructure, especially during the 2021 second wave.
  - **Economic Factors**: Lockdowns severely impacted the economy, complicating response efforts.
  - **Tourism and Trade Movement**: India experienced substantial international travel and trade, which facilitated the virus's spread.

- **France**:
  - **Population Density**: High population density in urban areas like Paris facilitated the virus's spread.
  - **Healthcare System**: France's healthcare system faced significant pressure, especially in major cities.
  - **Government Response**: Early and strict lockdown measures were implemented, which initially helped control the spread but faced challenges with subsequent waves.
  - **Tourism and Trade Movement**: France is a major tourist destination and trade hub, with extensive international travel contributing to the spread.

- **Germany**:
  - **Effective Early Response**: Germany implemented early and effective containment measures, including widespread testing and contact tracing.
  - **Healthcare Capacity**: The country maintained a relatively robust healthcare system but faced challenges with rising cases in later waves.
  - **Economic Impact**: The pandemic's economic impact was significant, influencing public compliance and response measures.
  - **Tourism and Trade Movement**: Germany's significant role in global trade and tourism increased the virus's potential for widespread impact.

- **Brazil**:
  - **Government Response**: Delayed and inconsistent measures led to rapid virus spread.
  - **Urbanization**: Cities like São Paulo and Rio de Janeiro experienced high transmission due to crowded conditions.
  - **Variants**: A hotspot for new variants, increasing transmission and severity.
  - **Tourism and Trade Movement**: Brazil's trade and tourism activities contributed to the virus's rapid spread.


## Now I will explore the data for some countries: I will choose the top 5 by number of deaths and confirmed cases

In [None]:
# @title
# Rank countries by their latest cumulative deaths, read straight from
# death_ts_by_country. Previously this used the global `sample`, which only
# held deaths if the deaths tree-map cell happened to be the last one run.
_latest_deaths = death_ts_by_country.iloc[:, -1]
top_5_by_death = list(_latest_deaths.sort_values(ascending=False).head().index)

# Legend labels and the matching death / confirmed series, in ranking order.
id_list = [f'{country} Bars' for country in top_5_by_death]
ts_list = [death_ts_by_country.loc[country] for country in top_5_by_death]
ts_list2 = [confirmed_ts_by_country.loc[country] for country in top_5_by_death]

print('Death')
plotly_plot(ts_list,'line',title='Death increase over time',ma=True,identifier=id_list,dir_plot=True)
plotly_plot(ts_list,'bar',title='Rate of Death',ma=True,identifier=id_list)

print('Confimed cases')
plotly_plot(ts_list2,'line',title='cases increase over time',ma=True,identifier=id_list,dir_plot=True)
plotly_plot(ts_list2,'bar',title='Rate of cases',ma=True,identifier=id_list)

### 1. Variation in Seasonality by Country

Each country shows different seasonality in confirmed COVID-19 cases due to:

1. **Climate and Weather**: Local conditions influence virus spread.
2. **Government Policies**: Varying effectiveness and timing of measures like lockdowns.
3. **Healthcare Capacity**: Differences in managing peak cases.
4. **Variants**: Impact of new, more transmissible strains varies.
5. **Vaccination**: Differences in rollout speed and public acceptance.

### 2. Case and Death Rate Trends (December 2021 - March 2022)

During this period, many countries saw a rise in confirmed cases but lower death rates due to:

1. **Omicron Variant**: More transmissible but generally less severe.
2. **Widespread Vaccination**: Reduced severity and prevented many severe cases.
3. **Natural Immunity**: Previous exposure led to some level of immunity, reducing severe outcomes.
4. **Improved Treatments**: Enhanced medical protocols and treatments.
5. **Public Health Measures**: Continued use of masks and social distancing helped mitigate impacts.

### 3. Seasonal Patterns: Global vs. Country-Level

Globally, confirmed cases show seasonality every 3-4 months, while country-specific data shows patterns every 10-12 months due to:

1. **Global vs. Local Variability**: Global averages smooth out local trends, showing more frequent seasonality.
2. **Diverse Climatic and Social Conditions**: Local factors create longer-term seasonal effects.
3. **Data Averaging**: Global data aggregates information, reflecting more frequent patterns.
4. **Public Health Measures**: Variations in measures impact local seasonal cycles.
5. **Vaccination and Immunity**: Global vaccination rates and immunity affect patterns differently at the country level.

### 4. Similar Patterns Post-January 2022

After January 2022, similar patterns emerged among top countries due to:

1. **Adaptation**: Adjustments to lockdowns and home-based activities influenced virus spread.
2. **Public Health Measures**: Similar global responses impacted transmission patterns.
3. **Behavioral Changes**: Common behaviors due to restrictions led to similar infection patterns.
4. **Vaccination and Immunity**: Increased global vaccination and immunity contributed to parallel trends across diverse locations.


# Investigate China Data

In [None]:
# Cumulative confirmed cases and deaths for China.
China_con = confirmed_ts_by_country.loc['China']
China_death = death_ts_by_country.loc['China']

print('Death')
plotly_plot(China_death,'line',title='Death increase over time',ma=True,dir_plot=True)
# No identifier is passed here: the previous identifier=id_list labelled
# China's trace with the first entry of the top-5 list built in another cell.
plotly_plot(China_death,'bar',title='Rate of Death',ma=True)

print('Case')
plotly_plot(China_con,'line',title='cases increase over time',ma=True,dir_plot=True)
plotly_plot(China_con,'bar',title='Rate of cases',ma=True)

# Analysis of China's COVID-19 Data Reporting

The data from China shows unusual patterns, with constant numbers of deaths and cases over two years, which seems improbable. After investigating, it appears that China's initial reporting policies contributed to this anomaly. Here’s a summary of the key factors:

## 1. Initial Reporting Delays
- **Early Stages**: In the early stages of the pandemic, there were delays in reporting and limited public information. This resulted in underreporting of both cases and deaths.

## 2. Information Control
- **Censorship**: The Chinese government imposed censorship and restrictions on information about the virus, including suppression of early warnings and criticism of the government's response.
- **Media Restrictions**: Journalists and independent observers faced limitations, affecting the accuracy and flow of information.

## 3. Changes in Reporting Policies
- **Increased Transparency**: As the pandemic progressed, China revised its reporting policies and increased transparency.
- **Data Revisions**: There were significant adjustments to reported figures as new information became available.

## 4. International Criticism
- **Global Scrutiny**: The international community criticized China for its initial handling of the outbreak and the impact on global transparency, focusing on the accuracy and timeliness of the reported data.


# Checking Case_Fatality_Ratio

Calculate Case_Fatality_Ratio per state

In [None]:
# Keep the US rows of the daily report and total Confirmed/Deaths per state.
_us_rows = latest_data[latest_data['Country_Region']=='US']
_us_counts = _us_rows[['Province_State','Confirmed','Deaths']]
State_Case_Fatality_Ratio = _us_counts.groupby('Province_State').sum()

# Case fatality ratio in percent: deaths divided by confirmed cases, times 100.
_deaths_per_case = State_Case_Fatality_Ratio['Deaths'].div(State_Case_Fatality_Ratio['Confirmed'])
State_Case_Fatality_Ratio['Case_Fatality_Ratio'] = _deaths_per_case.mul(100)

# Let's investigate the US data in more depth

In [None]:
# @title
# Select and sort data
# Rank states from highest to lowest case fatality ratio, then keep the ten
# rows that follow the top-ranked one (positions 1..10).
_ranked_states = State_Case_Fatality_Ratio.sort_values('Case_Fatality_Ratio', ascending=False)
sorted_df = _ranked_states.iloc[1:11]

# Shade the ratio column with a red gradient.
styled_df = sorted_df.style.background_gradient(cmap='Reds', subset=['Case_Fatality_Ratio'])

# Render the styled table as the cell output.
styled_df

In [None]:
# @title

# Pie chart of the case fatality ratio of the selected US states.
# The title previously read "by Country", but sorted_df holds one row per
# US state (Province_State), so it is labelled accordingly.
fig = px.pie(
    sorted_df,
    names=sorted_df.index,
    values='Case_Fatality_Ratio',
    title='Case Fatality Ratio by US State',
    color='Case_Fatality_Ratio',
    color_discrete_sequence=px.colors.qualitative.Plotly
)

# Update layout to increase the size of the pie chart
fig.update_layout(
    width=800,  # Adjust the width as needed
    height=800  # Adjust the height as needed
)

# Show the pie chart
fig.show()

# Classification

I will perform cluster classification based on the shape of the time series of the change in confirmed cases over days

Now I will scale the data because there is a huge difference in scales

In [None]:
# Show (number of countries, number of date columns).
confirmed_ts_by_country.shape

We have 201 time series, each with a length of T=1143. Determining the optimal number of clusters can be computationally intensive, as it requires multiple iterations over the entire dataset. To streamline this process, I will use PCA to reduce the dimensionality of the time series. This will help identify the optimal number of clusters efficiently. Once the optimal number of clusters is determined, I will perform cluster analysis on the original time series data to ensure accurate and meaningful results.

Create a DataFrame of the daily differences for all countries (the code below uses confirmed cases)

# I removed noise in two steps: first using a Savitzky-Golay filter, and second using a 50-day moving average after differencing

In [None]:
from scipy.signal import savgol_filter

def smooth_time_series(series, window_length=50, polyorder=2):
    """
    Smooth a time series using the Savitzky-Golay filter.

    The requested window is adjusted so the filter can always be applied:
    an even window is widened to the next odd number (older SciPy releases
    reject even windows), and a window longer than the series is shrunk to
    the largest odd length that fits. The polynomial order is lowered if it
    would not be smaller than the effective window.

    Parameters:
        series (pd.Series): Time series data with a datetime index.
        window_length (int): Requested length of the filter window.
        polyorder (int): The order of the polynomial used to fit the samples.

    Returns:
        pd.Series: Smoothed time series with the same index as the input.

    Raises:
        ValueError: If window_length is smaller than 1.
    """
    n_points = len(series)
    if n_points == 0:
        # Nothing to smooth; savgol_filter cannot handle an empty input.
        return pd.Series([], index=series.index, dtype=float)

    window = int(window_length)
    if window < 1:
        raise ValueError("window_length must be a positive integer.")

    # Use an odd window so the filter is centred on each sample.
    if window % 2 == 0:
        window += 1
    # The default 'interp' mode requires the window to fit inside the data.
    if window > n_points:
        window = n_points if n_points % 2 == 1 else n_points - 1

    # savgol_filter requires polyorder < window_length.
    order = min(polyorder, window - 1)

    # Apply Savitzky-Golay filter
    smoothed_data = savgol_filter(series, window_length=window, polyorder=order)

    # Return the smoothed series as a Pandas Series
    return pd.Series(smoothed_data, index=series.index)

In [None]:

def get_rate_of_change_with_ma(ts, window=50):
    """
    Return the rolling mean of the day-over-day change of a Pandas Series.

    The series is passed through get_time_series, differenced, and the
    differences are averaged over a rolling window. Positions where the
    window is not yet full are set to 0; the very first date, which has no
    previous value to difference against, stays NaN.

    Parameters:
    - ts: Pandas Series of cumulative values indexed by date
    - window: Number of observations in the rolling mean (default 50)

    Returns:
    - A Series on the same dates as the input, carrying the input's name
    """
    name = ts.name
    series = get_time_series(ts)

    # Day-over-day change; the first entry is NaN and is left out of the mean.
    difference = series - series.shift(1)
    smoothed = difference.dropna().rolling(window=window).mean().fillna(0)

    # Put the result back on the full date index (first date becomes NaN).
    result = smoothed.reindex(series.index)
    result.name = name
    return result

In [None]:
# One row per country, one column per date, holding the smoothed daily change
# in confirmed cases. The series are collected first and combined in a single
# concat, instead of re-concatenating (and re-labelling) the growing frame on
# every iteration.
countries = confirmed_ts_by_country.index

_smoothed_by_country = [
    get_rate_of_change_with_ma(confirmed_ts_by_country.loc[country])
    for country in countries
]

if _smoothed_by_country:
    # Series become columns; transpose so that countries are the rows.
    confirmed_diff_df = pd.concat(_smoothed_by_country, axis=1).T.fillna(0)
    confirmed_diff_df.index = list(countries)
else:
    # pd.concat rejects an empty list.
    confirmed_diff_df = pd.DataFrame()


* The StandardScaler from sklearn is used to standardize the data before applying PCA. This scaling is done to ensure that each feature (in this case, each time point across all time series) has a mean of 0 and a standard deviation of 1.

* Reason: The goal here is to prepare the data for PCA, which is sensitive to the variances of the original variables. Standardizing the data before PCA ensures that the principal components are not biased toward features with larger scales.

## Anomaly Detection



For countries that exhibit no significant rate of change or show a nearly straight line after 100 days, I will classify them as anomalies and exclude them from the clustering analysis.

**Algorithm Explanation:**

1. **Calculate Slope**: The algorithm calculates the slope between two points in the time series. Each point is defined by its timestamp and value. The slope is computed as the change in value divided by the change in time (in seconds).

2. **Check for Anomalies**: For each country, the algorithm calculates slopes between points separated by a given period. It then scales these slopes to a range of [-1, 1] using MinMaxScaler. If the slope is below a certain threshold, it is considered significant. The slopes are checked for anomalies based on their scaled values.

3. **Determine Longest Sequence of Anomalies**: The algorithm converts the list of slopes into binary values (1 if the slope is below the threshold, otherwise 0). It then finds the longest consecutive sequence of 1s (which represent anomalies).

4. **Update Anomaly DataFrame**: If the longest sequence of anomalies for a country is greater than or equal to a specified length (e.g., 390), the country is flagged as an anomaly and included in the final results.


In [None]:
from sklearn.preprocessing import MinMaxScaler

def calculate_slope(point1, point2):
    """
    Return the slope of the straight line through two time-series points.

    Parameters:
    - point1: (time, value) pair for the start point; time is a timestamp.
    - point2: (time, value) pair for the end point; time is a timestamp.

    Returns:
    - Change in value per second of elapsed time between the two points.

    Raises:
    - ValueError: if both points share the same timestamp.
    """
    start_time, start_value = point1
    end_time, end_value = point2

    # Elapsed time between the two points, expressed in seconds.
    elapsed_seconds = (end_time - start_time).total_seconds()

    # Guard clause: a zero time span would divide by zero below.
    if elapsed_seconds == 0:
        raise ValueError("x2 and x1 cannot be the same value (would cause division by zero).")

    return (end_value - start_value) / elapsed_seconds

def check_anomaly(ts, period, threshold):
    """
    Compute min-max scaled slopes of a time series over a fixed look-ahead window.

    Parameters:
    - ts: The raw series; it is first passed through get_time_series.
    - period: Number of positions between the two points of each slope.
    - threshold: Not read anywhere in this function; callers apply their own cut-off
      to the returned values.

    Returns:
    - A 1-D array with the slopes for start positions 101 and later, rescaled to [-1, 1].
    """
    ts = get_time_series(ts)

    # Slope of every (start, start + period) pair, ordered by start position.
    all_slopes = [
        calculate_slope(
            (ts.index[start], ts.iloc[start]),
            (ts.index[start + period], ts.iloc[start + period]),
        )
        for start in range(len(ts) - period)
    ]

    # Only start positions strictly greater than 100 take part in the scaling.
    kept_slopes = all_slopes[101:]

    # MinMaxScaler works on columns, so reshape to a single column and flatten afterwards.
    slope_column = np.array(kept_slopes).reshape(-1, 1)
    scaled_column = MinMaxScaler(feature_range=(-1, 1)).fit_transform(slope_column)
    return scaled_column.reshape(-1,)

def longest_sequence_of_ones(lst):
    """
    Return the length of the longest unbroken run of 1s.

    Parameters:
    - lst: An iterable of values; every element equal to 1 extends the current run,
      any other element ends it.

    Returns:
    - The length of the longest run found (0 when there is none).
    """
    best_run = 0
    run_length = 0

    for value in lst:
        # Extend the run on a 1, otherwise start counting again from zero.
        run_length = run_length + 1 if value == 1 else 0
        best_run = max(best_run, run_length)

    return best_run

# Result table with one row per index label of confirmed_diff_df:
#   is_anomaly - 0/1 flag, initialised to 0
#   long_seq   - longest run of consecutive below-cut-off slopes
anomaly_df_confirm = pd.DataFrame(index=confirmed_diff_df.index, columns=['is_anomaly', 'long_seq'])
anomaly_df_confirm['is_anomaly'] = 0

# Loop through each country in the time series data
countries = confirmed_diff_df.index
for country in countries:
    # Scaled slopes over a 10-step window.
    # NOTE(review): the third argument (0.0001) is passed as `threshold`, which
    # check_anomaly does not read; the effective cut-off is the 0.05 used below.
    slope_list = check_anomaly(confirmed_diff_df.loc[country], 10, 0.0001)

    # 1 where the scaled slope is below 0.05, otherwise 0
    binary_list = [1 if x < 0.05 else 0 for x in slope_list]

    # Find the length of the longest sequence of 1s
    long_seq = longest_sequence_of_ones(binary_list)

    # Update the DataFrame with the longest sequence length
    anomaly_df_confirm.loc[country, 'long_seq'] = long_seq

    # Mark as anomaly if the longest sequence is greater than or equal to 390
    if long_seq >= 390:
        anomaly_df_confirm.loc[country, 'is_anomaly'] = 1


In [None]:
# Display the rows whose is_anomaly flag is 1.
anomaly_df_confirm[anomaly_df_confirm.is_anomaly==1]

In [None]:
# Index labels of the rows flagged as anomalies.
_flagged_index = anomaly_df_confirm[anomaly_df_confirm.is_anomaly == 1].index

# Split the data into the flagged rows and everything else.
anomalies_countries_df = confirmed_diff_df.loc[_flagged_index]
_is_flagged = confirmed_diff_df.index.isin(_flagged_index)
healthy_confirmed_df = confirmed_diff_df[~_is_flagged]

# Transpose so that every flagged row becomes one line in the chart.
ax = anomalies_countries_df.T.plot(kind='line')
ax.set_title('Anomalies Countries')

# The legend is created and immediately hidden.
ax.legend().set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the non-anomalous series. MinMaxScaler rescales each column
# independently, so transposing first means every row of healthy_confirmed_df
# is rescaled on its own.
# NOTE(review): despite its name, `standardized_data` is min-max scaled, not z-scored.
scaler = MinMaxScaler()
standardized_data = scaler.fit_transform(healthy_confirmed_df.T)


In [None]:
# Display the shape of the scaled array after transposing it back.
standardized_data.T.shape

In [None]:

# Apply PCA
# First pass with 20 components; the input is the transposed scaled array,
# i.e. one row per row of healthy_confirmed_df.
n_components = 20
pca = PCA(n_components=n_components)
pca_transformed_data = pca.fit_transform(standardized_data.T)

Then check for the amount of variance explained by each component:

In [None]:
# Running total of the variance explained by the fitted components.
cumsum_explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

# 1-based component numbers, shared by the bars, the line and the x ticks.
_component_ids = range(1, pca_transformed_data.shape[1] + 1)

fig, ax1 = plt.subplots(figsize=(20, 6))

# Left y-axis: explained variance of each individual component, drawn as bars.
ax1.bar(_component_ids, pca.explained_variance_ratio_, alpha=0.7, color='b', label='Explained Variance Ratio')
ax1.set_title('PCA Explained Variance Ratio')
ax1.set_xlabel('Principal Component')
ax1.set_ylabel('Explained Variance Ratio', color='b')
ax1.set_xticks(_component_ids)
ax1.tick_params(axis='x', labelsize=10)
ax1.tick_params(axis='y', labelcolor='b')
ax1.grid(axis='y')

# Right y-axis (twin of the first): cumulative explained variance, drawn as a line.
ax2 = ax1.twinx()
ax2.plot(_component_ids, cumsum_explained_variance_ratio, marker='o', color='r', label='Cumulative Sum')
ax2.set_ylabel('Cumulative Sum', color='r')
ax2.tick_params(axis='y', labelcolor='r')

# Each axis carries its own legend.
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

plt.show()

In [None]:
# Refit PCA with 6 components; this overwrites n_components, pca and
# pca_transformed_data from the 20-component pass.
n_components = 6
pca = PCA(n_components=n_components)
pca_transformed_data = pca.fit_transform(standardized_data.T)

In [None]:
# Display the shape of the PCA output once a trailing axis of length 1 is added
# (rows, components, 1); nothing is assigned here.
pca_transformed_data.reshape(pca_transformed_data.shape[0],pca_transformed_data.shape[1],1).shape

In [None]:
# Disabled experiment kept as a bare string literal, so none of it runs: a sweep over
# k = 3..19 using TimeSeriesKMeans with the DTW metric on the reshaped PCA output,
# recording inertia and silhouette score per k.
'''
Sum_of_squared_distances = []
silhouette_scores = []

K = range(3,20)
for k in K:

    km = TimeSeriesKMeans(n_clusters=k, metric="dtw",init="k-means++")
    km.fit(pca_transformed_data.reshape(pca_transformed_data.shape[0],pca_transformed_data.shape[1],1))
    labels = km.predict(pca_transformed_data.reshape(pca_transformed_data.shape[0],pca_transformed_data.shape[1],1))
    Sum_of_squared_distances.append(km.inertia_)
    score = silhouette_score(pca_transformed_data.reshape(pca_transformed_data.shape[0],pca_transformed_data.shape[1],1), labels)
    silhouette_scores.append(score)
    print(f'N Cluster:{k}, inertia: {km.inertia_}, silhouette_scores: {score}')
'''

In [None]:
from sklearn.cluster import KMeans
# Per-k results of the sweep: inertia (within-cluster sum of squares) and silhouette score.
Sum_of_squared_distances = []
silhouette_scores = []

# Try every cluster count from 3 to 19 on the PCA-reduced data.
K = range(3,20)
for k in K:

    # NOTE(review): no random_state is passed, so the numbers printed here may
    # differ from one run to the next.
    km = KMeans(n_clusters=k,init="k-means++")
    km.fit(pca_transformed_data)
    labels = km.predict(pca_transformed_data)
    Sum_of_squared_distances.append(km.inertia_)
    score = silhouette_score(pca_transformed_data, labels)
    silhouette_scores.append(score)
    print(f'N Cluster:{k}, inertia: {km.inertia_}, silhouette_scores: {score}')

Using elbow method to get number of clusters

In [None]:
# Inertia against the number of clusters; the "elbow" is read off this curve by eye.
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Display the shape of the PCA-reduced data.
pca_transformed_data.shape

In [None]:
from sklearn.cluster import KMeans

# Cluster the PCA scores into 17 groups with tslearn's TimeSeriesKMeans (DTW metric).
# The estimator is fed a 3-D array, hence the reshape to (rows, components, 1).
# NOTE(review): the KMeans import at the top of this cell is not used below, and no
# random_state is passed, so labels may change between runs.
n_clusters = 17
km = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw",init="k-means++")
km.fit(pca_transformed_data.reshape(pca_transformed_data.shape[0],pca_transformed_data.shape[1],1))
labels = km.predict(pca_transformed_data.reshape(pca_transformed_data.shape[0],pca_transformed_data.shape[1],1))
# PCA scores as a frame (columns 0..5) plus the assigned cluster label.
mat = pd.DataFrame(pca_transformed_data)
mat['cluster'] = pd.Series(labels)

import matplotlib.patches as mpatches

sns.set_style("white")
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 2.5})

# Fixed colour per cluster label; 31 entries, of which only the first n_clusters are used.
LABEL_COLOR_MAP = {
    0:'r',1: 'tan', 2: 'b', 3: 'k', 4: 'c', 5: 'g', 6: 'deeppink', 7: 'skyblue', 8: 'darkcyan', 9: 'orange',
    10: 'yellow', 11: 'tomato', 12: 'seagreen', 13: 'purple', 14: 'olive', 15: 'magenta', 16: 'navy',
    17: 'gold', 18: 'coral', 19: 'lime', 20: 'crimson', 21: 'teal', 22: 'orchid', 23: 'salmon',
    24: 'sienna', 25: 'peru', 26: 'lavender', 27: 'turquoise', 28: 'maroon', 29: 'chocolate', 30: 'indigo'
}
label_color = [LABEL_COLOR_MAP[l] for l in mat['cluster']]

fig = plt.figure(figsize = (12,10))
plt.subplots_adjust(left=0.1, bottom=0.05, right=0.85, top=0.95, wspace=0.2, hspace=0.4)
# Scatter every pair of components (ix < iy) on a 4x3 grid. Six components give
# 15 pairs but the grid has 12 slots, so the loops stop after the 12th pair and
# the pairs (4,5), (4,6) and (5,6) (1-based) are not drawn.
increment = 0
for ix in range(6):
    for iy in range(ix+1, 6):
        increment += 1
        ax = fig.add_subplot(4,3,increment)
        ax.scatter(mat[ix], mat[iy], c= label_color, alpha=0.5)
        # plt.xlabel / plt.ylabel act on the current axes, i.e. the subplot just added.
        plt.ylabel('PCA {}'.format(iy+1), fontsize = 12)
        plt.xlabel('PCA {}'.format(ix+1), fontsize = 12)
        ax.yaxis.grid(color='lightgray', linestyle=':')
        ax.xaxis.grid(color='lightgray', linestyle=':')
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)

        if increment == 12: break
    if increment == 12: break

#_______________________________________________

# One legend patch per cluster, placed in figure coordinates to the right of the grid.
comp_handler = []
for i in range(n_clusters):
    comp_handler.append(mpatches.Patch(color = LABEL_COLOR_MAP[i], label = i))

plt.legend(handles=comp_handler, bbox_to_anchor=(1.1, 0.9),
           title='Cluster', facecolor = 'lightgrey',
           shadow = True, frameon = True, framealpha = 1,
           fontsize = 13, bbox_transform = plt.gcf().transFigure)

plt.tight_layout()
plt.show()

PCA did not help much in determining the number of clusters.

In [None]:
#first scale the data

#scaler_ts = TimeSeriesScalerMeanVariance()
#scaled_data = scaler_ts.fit_transform(healthy_confirmed_df)


Using MinMaxScaler instead of TimeSeriesScalerMeanVariance because it gives better results.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale each row of healthy_confirmed_df on its own: MinMaxScaler works
# column by column, so the frame is transposed before scaling.
# The result keeps the transposed orientation (one column per original row).
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(healthy_confirmed_df.T)

In [None]:
#squeezed_array = np.squeeze(scaled_data, axis=-1)

# Convert to a DataFrame, transposing back so each row matches a row of healthy_confirmed_df
scaled_data_df = pd.DataFrame(scaled_data.T)

Using a dendrogram to estimate the number of clusters needed.

In [None]:
from tslearn.metrics import cdist_dtw
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, dendrogram

# One row per series (copy of the transposed scaled array).
data_array = scaled_data.T.copy()

# Square matrix of pairwise DTW distances between all series.
distance_matrix = cdist_dtw(data_array, data_array)

# squareform converts the square matrix to the condensed form that linkage expects.
# NOTE(review): SciPy documents the 'ward' method as correctly defined only for
# Euclidean distances; here it is fed DTW distances, so merge heights should be
# read with caution - confirm this combination is intended.
linked = linkage(squareform(distance_matrix), method='ward')


In [None]:

# Plot dendrogram of the hierarchical clustering stored in `linked`
plt.figure(figsize=(18, 7))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title("Dendrogram")
plt.xlabel("Sample Index")
plt.ylabel("Distance")
plt.show()


Applying clustering with the Euclidean metric: DTW was tried first but performed poorly, and Euclidean distance gave better results.

In [None]:
# Display the shape of the scaled array (still in its transposed orientation).
scaled_data.shape

In [None]:
# Clustering: final model with 8 clusters, Euclidean metric, fixed seed and 30 initialisations.
# The input is reshaped to (series, time points, 1) before fitting.
kmeans_ts_final  = TimeSeriesKMeans(n_clusters=8, metric="euclidean",init="k-means++",random_state=10,n_init=30)
final_clusters  = kmeans_ts_final.fit_predict(scaled_data.T.reshape(scaled_data.shape[1],scaled_data.shape[0],1))

In [None]:
# Label the scaled rows with the index of healthy_confirmed_df and attach the
# cluster assignment as the last column.
scaled_data_df.index=healthy_confirmed_df.index
scaled_data_df['cluster'] = final_clusters
# Distinct cluster labels, in order of first appearance.
clusters = scaled_data_df['cluster'].unique()


## Overview of the Algorithm

1. **Initialization:**
   - Create a DataFrame to track country names associated with different cluster configurations.

2. **Iterate Over Possible Cluster Counts:**
   - For each potential number of clusters (from 2 up to a specified maximum depth), perform clustering on the data.

3. **Perform Clustering:**
   - For each cluster count:
     - Apply the KMeans clustering algorithm to the data.
     - Assign cluster labels to each data point based on the clustering result.
     - Associate each data point with its respective cluster and country name.

4. **Aggregate Cluster Data:**
   - Group the data by cluster labels and compile a list of countries for each cluster.
   - Update the DataFrame to store concatenated strings of countries for each cluster configuration.

5. **Concatenate and Evaluate:**
   - Create a new column that concatenates values from different cluster configurations for each data point, separated by underscores.
   - Determine the second most frequent country (or cluster) for each data point based on the concatenated strings.

6. **Assign Core Time Series:**
   - For each cluster:
     - Identify the most common core time series (the second most frequent value from the concatenation).
     - Assign this core time series to the data points in that cluster.

7. **Close Vote:**
   - Determine which core time series is closest to the values assigned to each data point by checking the frequency of occurrence.
   - Update the DataFrame to include this closest vote for each data point.

8. **Return Improved Data:**
   - Return the updated DataFrame with improved cluster assignments and core time series information, removing intermediate columns used for calculations.

## Summary

The algorithm enhances clustering results by exploring various cluster counts, aggregating and analyzing data to identify the most representative clusters, and refining cluster assignments based on a voting mechanism. This process aims to identify more meaningful clusters and improve overall clustering accuracy.

In [None]:
from collections import Counter

def improving_clusters(df,max_depth=100):
  """
  Re-assign cluster labels by voting over many KMeans runs.

  Parameters:
  - df: DataFrame whose index labels are used as country names and which holds the
    time-series values plus a 'cluster' column with the initial labels. Helper
    columns are added to it in place; the call below this function passes a copy.
  - max_depth: Largest number of clusters tried; KMeans is run for every k from 2
    to max_depth inclusive.

  Returns:
  - A new DataFrame without the original 'cluster' and 'Core Time Series' columns,
    in which the 'Close Vote' column is renamed to 'cluster'.
  """
  # One row per index label of df; a column is added for every k tried.
  vote_df = pd.DataFrame(data=df.index,columns=['countries'])
  for i in range(2,max_depth+1):
    # Progress output: the k currently being fitted.
    print(i)
    # NOTE(review): 1143 is hard-coded, presumably the number of time points - confirm.
    # With fewer value columns the 'cluster' column would end up among the features.
    temp_data = df.iloc[:,:1143].copy()
    n_cluster=i
    km = KMeans(n_clusters=n_cluster,init="k-means++",random_state=10,n_init=30)
    km.fit(temp_data.values)
    labels = km.predict(temp_data.values)
    temp_data['cluster'] = labels
    temp_data['countries'] = temp_data.index
    # For each cluster label, the list of countries assigned to it at this k.
    df_countries = temp_data[['cluster','countries']].groupby('cluster').agg(list).copy()
    # Store, for every row, the '_'-joined names of all members of its cluster.
    # NOTE(review): this iterates over every sample's label, so each cluster's rows
    # are rewritten once per member; iterating the distinct labels would give the
    # same result.
    for label in labels:
      cluster_index = temp_data.cluster==label
      vote_df.loc[cluster_index.values,f'{n_cluster}'] = '_'.join(df_countries.loc[label].countries)


  # Join the per-k member strings of each row into one long '_'-separated string.
  vote_df['Concatenated'] = vote_df.drop('countries',axis=1).apply(lambda row: '_'.join(row), axis=1)
  def voting(x):
    # Return the (name, count) pair of the second most frequent token.
    # NOTE(review): presumably the most frequent token is the row's own country,
    # since it is a member of its own cluster at every k - confirm. Names that
    # contain '_' would be split into several tokens, and fewer than two distinct
    # tokens would raise IndexError.
    my_list = x.split('_')
    counter = Counter(my_list)
    most_common = counter.most_common()
    return most_common[1]
  # 'voting' first holds the (name, count) pair, then is split into score and name.
  vote_df['voting'] = vote_df['Concatenated'].apply(voting)
  vote_df['voting_score'] = vote_df['voting'].apply(lambda x: x[1])
  vote_df['voting'] = vote_df['voting'].apply(lambda x: x[0])
  vote_df.index=vote_df['countries']
  vote_df.drop('countries',axis=1,inplace=True)

  clusters = df.cluster.unique()
  for n in clusters:
    cluster_index = df.cluster==n
    # Among the members of original cluster n, sum voting_score per voted name and
    # store the name with the largest total as the cluster's 'Core Time Series'.
    df.loc[cluster_index.values,'Core Time Series'] = vote_df.loc[df[df.cluster==n].index][['voting','voting_score']].groupby('voting').sum().sort_values('voting_score',ascending=False).index[0]

  # Distinct core names across all original clusters.
  clusters_votes = df['Core Time Series'].unique()
  def close_vote(x):
    # Return all tokens with their counts, most frequent first.
    my_list = x.split('_')
    counter = Counter(my_list)
    most_common = counter.most_common()
    return most_common
  vote_df['Close Vote'] = None
  df['Close Vote'] = None
  for r,row in vote_df.iterrows():
    temp_list = row['Concatenated']
    temp_vote = close_vote(temp_list)
    # Walk the tokens from most to least frequent and stop at the first one that is
    # a core name; the row is then assigned the (first) original cluster label that
    # has this core name. Rows with no matching token keep 'Close Vote' = None.
    for i in temp_vote:
      if i[0] in clusters_votes:
        vote_df.at[r,'Close Vote'] = i[0]
        df.at[r,'Close Vote'] = df[df['Core Time Series'] == i[0]]['cluster'].unique()[0]
        break
  return df.drop(['cluster','Core Time Series'],axis=1).rename(columns={'Close Vote':'cluster'})
# Run the refinement on a copy so scaled_data_df keeps its original 'cluster' column.
improved_clusters = improving_clusters(scaled_data_df.copy())

# Visualize clusters

In [None]:

# Sort both frames by cluster label so rows are grouped per cluster when plotted.
scaled_data_df = scaled_data_df.sort_values('cluster')
improved_clusters = improved_clusters.sort_values('cluster')

clusters = scaled_data_df['cluster'].unique()
num_clusters = len(clusters)

# One row of subplots per original cluster: left = original assignment,
# right = improved assignment.
# squeeze=False keeps `axes` two-dimensional even when there is only one cluster;
# without it plt.subplots returns a 1-D array for a single row and axes[i, 0] fails.
fig, axes = plt.subplots(num_clusters, 2, figsize=(20, 6 * num_clusters), sharex=True, squeeze=False)

# Iterate over clusters and plot each one
for i, cluster in enumerate(clusters):
    # Plot for original scaled_data_df clusters
    ax = axes[i, 0]  # Access the subplot in the ith row, first column
    cluster_data = scaled_data_df[scaled_data_df['cluster'] == cluster]

    # Centroid of this cluster from the fitted model, flattened to 1-D for plotting
    centroid = kmeans_ts_final.cluster_centers_[cluster].ravel()
    ax.plot(centroid, "r-", linewidth=2, label='Centroid')

    for index, row in cluster_data.iterrows():
        # Keep only the time-series values; drop the label columns if present
        time_series = row.drop(['cluster', 'Core Time Series'], errors='ignore')
        ax.plot(time_series, label=f'Row {index}', alpha=.3, linewidth=2)

    ax.set_title(f'Original Cluster {cluster}')
    ax.set_ylabel('Value')

    # Plot for improved_clusters
    ax = axes[i, 1]  # Access the subplot in the ith row, second column
    improved_cluster_data = improved_clusters[improved_clusters['cluster'] == cluster]

    # Same original centroid for reference, plus the median series of the rows
    # that carry this label after the improvement step
    ax.plot(centroid, "r-", linewidth=2, label='Centroid')
    median_row = improved_cluster_data.drop('cluster',axis=1).median()
    ax.plot(median_row, "b-", linewidth=2, label='meidan')
    for index, row in improved_cluster_data.iterrows():
        # Keep only the time-series values; drop the label column if present
        time_series = row.drop(['cluster'], errors='ignore')
        ax.plot(time_series, label=f'Row {index}', alpha=.3, linewidth=2)

    ax.set_title(f'Improved Cluster {cluster}')
    ax.set_ylabel('Value')

# Add x-axis label to the bottom subplot
for ax in axes[-1, :]:
    ax.set_xlabel('Time Points')

plt.tight_layout()
plt.show()


The improved clustering method found that cluster 6 is not needed: its members fit into other clusters.

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

def calculate_wcss(df):
    """
    Compute the Within-Cluster Sum of Squares (WCSS) of a labelled DataFrame.

    Parameters:
    - df: DataFrame whose last column holds the cluster labels and whose remaining
      columns hold the time series values.

    Returns:
    - The sum, over all clusters, of the squared deviations of each member from
      its cluster's column-wise mean.
    """
    values = df.iloc[:, :-1].values
    assignments = df.iloc[:, -1].values

    def _cluster_spread(cluster_id):
        # Squared distance of every member to the mean of its own cluster.
        members = values[assignments == cluster_id]
        return np.sum((members - members.mean(axis=0)) ** 2)

    return sum(_cluster_spread(cluster_id) for cluster_id in np.unique(assignments))

def evaluate_clustering(df):
    """
    Compute clustering quality metrics and draw a histogram of per-sample silhouette values.

    A new figure is created and left open (no plt.show() here); the histogram is
    drawn in the left half of a 1x2 grid.

    Parameters:
    - df: DataFrame with time series data where the last column is cluster labels.

    Returns:
    - Dictionary with the keys 'WCSS', 'Silhouette Score', 'Davies-Bouldin Index'
      and 'Calinski-Harabasz Index'.
    """
    # silhouette_samples is not among the names imported at the top of this cell,
    # so it is imported locally.
    from sklearn.metrics import silhouette_samples

    X = df.iloc[:, :-1].values  # Time series data
    labels = df.iloc[:, -1].values  # Cluster labels

    metrics = {
        'WCSS': calculate_wcss(df),
        'Silhouette Score': silhouette_score(X, labels),
        'Davies-Bouldin Index': davies_bouldin_score(X, labels),
        'Calinski-Harabasz Index': calinski_harabasz_score(X, labels),
    }

    # Create Silhouette Plot
    plt.figure(figsize=(14, 6))
    sns.set(style="whitegrid")

    # One silhouette value per sample. silhouette_score would only give their mean,
    # which would collapse the histogram below to a single bar.
    silhouette_vals = silhouette_samples(X, labels)

    # Histogram of the per-sample values with the average marked by a dashed line
    plt.subplot(1, 2, 1)
    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.hist(silhouette_vals, bins=20, edgecolor='k', alpha=0.7)
    plt.title('Silhouette Plot')
    plt.xlabel('Silhouette Score')
    plt.ylabel('Frequency')

    return metrics

def compare_clustering(df1, df2):
    """
    Print and plot clustering quality for two labelled DataFrames side by side.

    Parameters:
    - df1: DataFrame with time series data where the last column is cluster labels
      (original clustering).
    - df2: DataFrame with time series data where the last column is cluster labels
      (improved clustering).

    Returns:
    - Nothing; the metrics are printed and the figure is shown.
    """
    # Original clustering: evaluate, then report every metric.
    print("Evaluating Clustering for Original Clustering:")
    original_metrics = evaluate_clustering(df1)
    for metric, value in original_metrics.items():
        print(f"{metric} for Original Clustering: {value}")

    # Improved clustering: same procedure.
    print("\nEvaluating Clustering for Improved Clustering:")
    improved_metrics = evaluate_clustering(df2)
    for metric, value in improved_metrics.items():
        print(f"{metric} for Improved Clustering: {value}")

    # Bar chart of the two silhouette scores in the right half of the current figure.
    # NOTE(review): evaluate_clustering opens a new figure on each call, so this panel
    # lands on the figure created for df2 - confirm that is the intended layout.
    bar_names = ['Original Clustering', 'Improved Clustering']
    bar_heights = [original_metrics['Silhouette Score'], improved_metrics['Silhouette Score']]
    plt.subplot(1, 2, 2)
    plt.bar(bar_names, bar_heights, color=['blue', 'green'])
    plt.title('Silhouette Comparison')
    plt.ylabel('Silhouette Score')

    plt.tight_layout()
    plt.show()

# Compare the original assignment (scaled_data_df) with the refined one (improved_clusters).
compare_clustering(scaled_data_df, improved_clusters)


In [None]:
# Bar chart of how many rows carry each improved cluster label, ordered by label.
improved_clusters['cluster'].value_counts().sort_index().plot(kind='bar')
plt.xlabel('Cluster')
plt.ylabel('Count')
plt.title('Cluster Distribution')
plt.show()

Visualize clusters on the world map

In [None]:
# String copy of the cluster label, so the map colours clusters as discrete categories.
improved_clusters['categ'] = improved_clusters['cluster'].astype(str)
improved_clusters.sort_values('cluster',inplace=True)
# NOTE(review): locations/hover_name refer to a column called 'index', which is what
# reset_index() produces when the frame's index has no name - confirm that holds here.
fig = px.choropleth(
    improved_clusters.reset_index(),
    locations='index',
    locationmode='country names',
    color='categ',
    hover_name='index',
    title='Choropleth Map of Clusters by Country',
    color_discrete_sequence=px.colors.qualitative.Set3  # Use a discrete color set
)

fig.update_geos(projection_type="natural earth")
# Fixed-size figure without the map frame and coastlines.
fig.update_layout(
    geo=dict(showframe=False, showcoastlines=False),
    width=1200,
    height=800
)
fig.show()

Adjacent countries tend to fall into the same cluster, which supports the quality of the clustering.

### Future Improvements

To enhance the accuracy and reliability of the clustering results, future improvements could include:

- **Incorporate Dynamic Time Warping (DTW):** Implementing DTW for distance measurement could improve the capture of temporal patterns in the data. Unlike Euclidean distance, DTW is better suited for aligning time series data that may vary in speed or timing, making it a more accurate method for identifying similar patterns in COVID-19 case data.

- **Implement Cross-Validation Techniques:** To ensure the robustness of the clustering results, applying cross-validation techniques can help validate the model by testing its performance on different subsets of data. This would help in verifying that the clustering model generalizes well to unseen data and isn't overfitted to the specific dataset used.

By implementing these enhancements, the project could yield even more reliable insights, making it better suited for real-world applications and further research.
