<a href="https://colab.research.google.com/github/awhitehouse1/DS4002-Project-1/blob/main/Testing_country_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Pre-processing the data: load the time-series CSV from the Colab working directory
df = pd.read_csv('/content/Final_time_series_data.csv')

# DataFrame.info() prints its report itself and returns None, so call it directly;
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108336 entries, 0 to 108335
Data columns (total 7 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Year                    108336 non-null  int64  
 1   Month                   108336 non-null  int64  
 2   new_deaths_per_million  108335 non-null  float64
 3   Country                 108336 non-null  object 
 4   Date                    108336 non-null  object 
 5   GDP_per_Capita          107217 non-null  float64
 6   Democracy score         108336 non-null  float64
dtypes: float64(3), int64(2), object(2)
memory usage: 5.8+ MB
None


In [None]:
# Build the working frame in one chain:
#   1. turn +/-Inf into NaN so they are treated as missing values,
#   2. make 'Country' a categorical column,
#   3. append 'Country_Codes' holding the integer code of each category.
data = df.replace([np.inf, -np.inf], np.nan).assign(
    Country=lambda frame: frame['Country'].astype('category'),
    Country_Codes=lambda frame: frame['Country'].cat.codes,
)

# Lookup table: one row per country with its numeric code
data.loc[:, ['Country_Codes', 'Country']].drop_duplicates()



Unnamed: 0,Country_Codes,Country
0,0,Afghanistan
732,1,Albania
1464,2,Algeria
2196,3,Angola
2928,4,Argentina
...,...,...
104676,143,Uruguay
105408,144,Uzbekistan
106140,145,World
106872,146,Zambia


In [None]:
# Count missing values per column before dropping them (1120 NaN values in total)
print(data.isna().sum())

# Keep complete rows only, then discard the columns that are not modelled:
# the categorical 'Country' (represented by 'Country_Codes') plus 'Month' and 'Year'
data_cleaned = data.dropna().drop(columns=['Country', 'Month', 'Year'])

print(data_cleaned.dtypes)

numeric_cols = ['new_deaths_per_million', 'Democracy score', 'GDP_per_Capita', 'Country_Codes']

# Parse 'Date' into datetimes and make it the index of the cleaned frame
data_cleaned = data_cleaned.assign(
    Date=pd.to_datetime(data_cleaned['Date'])
).set_index('Date')

# Inspect the result: first rows, final dtypes and summary statistics
print(data_cleaned.head())
print(data_cleaned.dtypes)
data_cleaned.describe()


Year                         0
Month                        0
new_deaths_per_million       1
Country                      0
Date                         0
GDP_per_Capita            1119
Democracy score              0
Country_Codes                0
dtype: int64
new_deaths_per_million    float64
Date                       object
GDP_per_Capita            float64
Democracy score           float64
Country_Codes               int16
dtype: object
            new_deaths_per_million  GDP_per_Capita  Democracy score  \
Date                                                                  
2020-01-22                     0.0      512.055098             2.85   
2020-01-23                     0.0      512.055098             2.85   
2020-01-24                     0.0      512.055098             2.85   
2020-01-25                     0.0      512.055098             2.85   
2020-01-26                     0.0      512.055098             2.85   

            Country_Codes  
Date                       
2

Unnamed: 0,new_deaths_per_million,GDP_per_Capita,Democracy score,Country_Codes
count,107216.0,107216.0,107216.0,107216.0
mean,1.495689,16282.073009,5.542656,73.850256
std,9.057929,22767.321268,2.178896,42.804152
min,0.0,216.827417,0.32,0.0
25%,0.0,1993.424478,3.54,37.0
50%,0.0,6036.447274,5.86,74.0
75%,0.0,20381.85578,7.18,111.0
max,497.31,133711.7944,9.81,147.0


In [None]:
# Vector AutoRegression (VAR) Model

In [None]:
!pip install statsmodels



In [None]:
from statsmodels.tsa.api import VAR

# NOTE(review): data_cleaned appears to stack the daily rows of every country into
# one frame (the Date index would then repeat per country) and it still contains
# 'Country_Codes', so the VAR treats the country code as a fourth endogenous
# series. Confirm that a pooled fit like this is intended - TODO verify.

# Fit the VAR model with the cleaned data; the lag order (up to 15) is chosen by AIC
model = VAR(data_cleaned)
results = model.fit(maxlags=15, ic='aic')

# Print model summary
print(results.summary())

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Wed, 23, Oct, 2024
Time:                     18:47:52
--------------------------------------------------------------------
No. of Equations:         4.00000    BIC:                    5.33980
Nobs:                     107202.    HQIC:                   5.32559
Log likelihood:          -893351.    FPE:                    204.266
AIC:                      5.31942    Det(Omega_mle):         203.832
--------------------------------------------------------------------
Results for equation new_deaths_per_million
                                coefficient       std. error           t-stat            prob
---------------------------------------------------------------------------------------------
const                             -0.031618         0.033099           -0.955           0.339
L1.new_deaths_per_million         -0.001517         0.003063           -0.495        

In [None]:
# Correlation matrix of residuals shows the correlations between the residuals
# (errors) of the different variables
# new_deaths_per_million and GDP_per_Capita: 0.015304
# Very close to zero, which suggests that the errors of these two variables are
# almost uncorrelated, meaning there is no strong linear relationship between the
# prediction errors of these variables

# new_deaths_per_million and democracy score: 0.028133
# Also close to zero which suggests a weak or non-existent correlation between
# the errors of these two variables

# GDP_per_Capita and democracy score: 0.481471
# Moderate positive correlation which indicates that when the model makes errors
# in predicting GDP_per_Capita, those errors are moderately correlated with errors
# in predicting democracy score

# You want the residuals of different variables to be uncorrelated (close to 0)
# which indicates that the model is doing a good job of capturing the relationships
# between the variables, and the remaining prediction errors are random

# The moderate residual correlation between GDP_per_Capita and democracy score
# suggests that the model might not be fully capturing the relationship between
# these variables

In [None]:
# Granger Causality Test
from statsmodels.tsa.stattools import grangercausalitytests

# Specify the maxlag (based on your VAR model)
max_lag = 15

# Display label -> column name of each candidate predictor of new_deaths_per_million
granger_predictors = {
    'GDP per Capita': 'GDP_per_Capita',
    'Democracy score': 'Democracy score',
}

# grangercausalitytests checks whether the SECOND column helps predict the FIRST,
# so the target column always comes first. The per-lag results are kept in
# granger_results (keyed by predictor column) instead of being left as the cell's
# last expression, which would dump a large nested dict of result objects into
# the notebook output. The test prints its own per-lag report.
granger_results = {}
for position, (label, column) in enumerate(granger_predictors.items()):
    # Blank line between consecutive reports, matching the original output layout
    separator = "\n" if position else ""
    print(f"{separator}Granger Causality Test: {label} -> new_deaths_per_million")
    granger_results[column] = grangercausalitytests(
        data_cleaned[['new_deaths_per_million', column]], max_lag
    )

Granger Causality Test: GDP per Capita -> new_deaths_per_million

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=58.9570 , p=0.0000  , df_denom=107212, df_num=1
ssr based chi2 test:   chi2=58.9587 , p=0.0000  , df=1
likelihood ratio test: chi2=58.9425 , p=0.0000  , df=1
parameter F test:         F=58.9570 , p=0.0000  , df_denom=107212, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=31.2097 , p=0.0000  , df_denom=107209, df_num=2
ssr based chi2 test:   chi2=62.4223 , p=0.0000  , df=2
likelihood ratio test: chi2=62.4041 , p=0.0000  , df=2
parameter F test:         F=31.2097 , p=0.0000  , df_denom=107209, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=23.9481 , p=0.0000  , df_denom=107206, df_num=3
ssr based chi2 test:   chi2=71.8489 , p=0.0000  , df=3
likelihood ratio test: chi2=71.8249 , p=0.0000  , df=3
parameter F test:         F=23.9481 , p=0.0000  , df_denom=107206, df_num=3

Granger Ca

{1: ({'ssr_ftest': (606.6308396088929, 1.4257742912945901e-133, 107212.0, 1),
   'ssr_chi2test': (606.6478143180562, 5.995973000586673e-134, 1),
   'lrtest': (604.9379826710792, 1.4117434389264817e-133, 1),
   'params_ftest': (606.6308396083098,
    1.4257742917049255e-133,
    107212.0,
    1.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f097068d300>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f097068c8b0>,
   array([[0., 1., 0.]])]),
 2: ({'ssr_ftest': (322.78845890900294, 1.7184667937477025e-140, 107209.0, 2),
   'ssr_chi2test': (645.607026153958, 6.430067495626328e-141, 2),
   'lrtest': (643.6709790957393, 1.6928668815044586e-140, 2),
   'params_ftest': (322.78845890882127,
    1.7184667940560893e-140,
    107209.0,
    2.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f097068d750>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f097068de70>,
   array([[0., 0., 1., 0., 0.],

In [None]:
# GDP per capita - New Deaths per million
# At lags 1-6, the p-value is 0.0000, meaning that GDP per capita Granger-causes
# new deaths per million with strong statistical significance
# F-test and chi-squared values confirm that GDP per capita can help predict changes
# in new deaths per million at these lags
# From lag 7 onward, the p-value becomes very high, indicating GDP per capita no longer
# Granger-causes new deaths per million

# Democracy score - New deaths per million
# At all lags 1-5, the p-values are 0.000, meaning democracy score strongly
# Granger-causes new deaths per million
# High F-test and chi-squared values indicate strong predictive power of democracy
# score on new deaths per million

In [None]:
# SARIMAX Model
pip install statsmodels pandas matplotlib



In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# SARIMAX raises "MissingDataError: exog contains inf or nans" when the exogenous
# regressors contain missing or infinite values, and the raw frame has NaNs in
# GDP_per_Capita. Build the model inputs from rows where the target and both
# regressors are present and finite, so y and exog stay aligned row for row.
sarimax_cols = ['new_deaths_per_million', 'GDP_per_Capita', 'Democracy score']
sarimax_data = (
    df.replace([np.inf, -np.inf], np.nan)
    .dropna(subset=sarimax_cols)
    .reset_index(drop=True)  # contiguous integer index after dropping rows
)

# Select the target and exogenous variables
y = sarimax_data['new_deaths_per_million']
exog = sarimax_data[['GDP_per_Capita', 'Democracy score']]

# NOTE(review): the rows of all countries are modelled as one continuous series
# here, and the order / seasonal_order values are placeholders - confirm both
# before interpreting the fit.
# Define the SARIMAX model (replace order and seasonal_order with appropriate values)
model = SARIMAX(y, exog=exog, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))

# Fit the model
sarimax_model = model.fit(disp=False)

# Print model summary
print(sarimax_model.summary())


MissingDataError: exog contains inf or nans