# **Multivariate Analysis for Forecasting the US Unemployment Rate: A Comparative Study of Time Series Models (VAR, ARIMA, Dynamic Regression)**

### Import Libraries

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.tsa.api import VAR
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.api import acf, pacf, ccf, graphics, adfuller, coint
from sklearn.metrics import mean_squared_error as mse


### Load Data

In [11]:
# Read the three raw CSV series, in order, into one DataFrame each.
unemploy, gdp, cpi = (
    pd.read_csv(csv_path)
    for csv_path in ("../raw/UNEMPLOY.csv", "../raw/GDP.csv", "../raw/CPIAUCSL.csv")
)

#### Check column types of the `unemploy` table

In [14]:
# Preview the first rows to see the raw column layout before any type conversion.
unemploy.head()

Unnamed: 0,observation_date,UNEMPLOY
0,1948-01-01,2254.0
1,1948-04-01,2239.0
2,1948-07-01,2288.0
3,1948-10-01,2324.0
4,1949-01-01,2825.0


In [17]:
# List each column's dtype and non-null count to see whether the date column still needs parsing.
unemploy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304 entries, 0 to 303
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   observation_date  304 non-null    object 
 1   UNEMPLOY          303 non-null    float64
dtypes: float64(1), object(1)
memory usage: 4.9+ KB


In [18]:
# Parse the date strings and promote them to the index.
# Guarded on the index name so the cell is idempotent: once the column has been
# moved to the index, re-running the unguarded version fails with KeyError.
# A genuinely missing column still raises, because the guard only skips when
# the index already carries the expected name.
if unemploy.index.name != "observation_date":
    unemploy = (
        unemploy
        .assign(observation_date=lambda frame: pd.to_datetime(frame["observation_date"]))
        .set_index("observation_date")
    )

In [24]:
# Re-inspect after indexing: the index should now be a DatetimeIndex with only the value column left.
unemploy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 304 entries, 1948-01-01 to 2023-10-01
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   UNEMPLOY  303 non-null    float64
dtypes: float64(1)
memory usage: 4.8 KB


#### Check column types of the `gdp` table

In [21]:
# Preview the first rows to see the raw column layout before any type conversion.
gdp.head()

Unnamed: 0,observation_date,GDP
0,1947-01-01,243.164
1,1947-04-01,245.968
2,1947-07-01,249.585
3,1947-10-01,259.745
4,1948-01-01,265.742


In [22]:
# List each column's dtype and non-null count to see whether the date column still needs parsing.
gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307 entries, 0 to 306
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   observation_date  307 non-null    object 
 1   GDP               307 non-null    float64
dtypes: float64(1), object(1)
memory usage: 4.9+ KB


In [23]:
# Parse the date strings and promote them to the index.
# Guarded on the index name so the cell is idempotent: once the column has been
# moved to the index, re-running the unguarded version fails with KeyError.
# A genuinely missing column still raises, because the guard only skips when
# the index already carries the expected name.
if gdp.index.name != "observation_date":
    gdp = (
        gdp
        .assign(observation_date=lambda frame: pd.to_datetime(frame["observation_date"]))
        .set_index("observation_date")
    )

In [26]:
# Re-inspect after indexing: the index should now be a DatetimeIndex with only the value column left.
gdp.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 307 entries, 1947-01-01 to 2023-07-01
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GDP     307 non-null    float64
dtypes: float64(1)
memory usage: 4.8 KB


#### Check column types of the `cpi` table

In [27]:
# Preview the first rows to see the raw column layout before any type conversion.
cpi.head()

Unnamed: 0,observation_date,CPIAUCSL
0,1947-01-01,21.7
1,1947-04-01,22.01
2,1947-07-01,22.49
3,1947-10-01,23.127
4,1948-01-01,23.617


In [28]:
# List each column's dtype and non-null count to see whether the date column still needs parsing.
cpi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   observation_date  308 non-null    object 
 1   CPIAUCSL          307 non-null    float64
dtypes: float64(1), object(1)
memory usage: 4.9+ KB


In [29]:
# Parse the date strings and promote them to the index.
# Guarded on the index name so the cell is idempotent: once the column has been
# moved to the index, re-running the unguarded version fails with KeyError.
# A genuinely missing column still raises, because the guard only skips when
# the index already carries the expected name.
if cpi.index.name != "observation_date":
    cpi = (
        cpi
        .assign(observation_date=lambda frame: pd.to_datetime(frame["observation_date"]))
        .set_index("observation_date")
    )

In [30]:
# Re-inspect after indexing: the index should now be a DatetimeIndex with only the value column left.
cpi.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 308 entries, 1947-01-01 to 2023-10-01
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   CPIAUCSL  307 non-null    float64
dtypes: float64(1)
memory usage: 4.8 KB


### Concatenate the Data

In [31]:
# Place the three date-indexed series side by side, aligned on their index;
# dates missing from one source are left as NaN in that column.
df_concat = pd.concat(
    [unemploy, gdp, cpi],
    axis="columns",
)

In [32]:
# Preview the combined frame; dates absent from one source appear as NaN in that column.
df_concat.head()

Unnamed: 0_level_0,UNEMPLOY,GDP,CPIAUCSL
observation_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1947-01-01,,243.164,21.7
1947-04-01,,245.968,22.01
1947-07-01,,249.585,22.49
1947-10-01,,259.745,23.127
1948-01-01,2254.0,265.742,23.617


### Handle Missing Values

In [34]:
# Count missing values per column of the combined frame.
df_concat.isna().sum()

UNEMPLOY    5
GDP         1
CPIAUCSL    1
dtype: int64

In [37]:
# Keep only the rows in which all three series have a value.
df_concat = df_concat.dropna()

In [38]:
# Re-count missing values to confirm none remain after the drop.
df_concat.isna().sum()

UNEMPLOY    0
GDP         0
CPIAUCSL    0
dtype: int64

In [40]:
# (rows, columns) of the frame that remains after removing incomplete rows.
df_concat.shape

(303, 3)

### Check for Duplicates

In [42]:
# Duplicate check: every observation date must occur exactly once in the index.
# Fail loudly here instead of relying on eyeballing the displayed frame.
assert df_concat.index.is_unique, "df_concat contains duplicate observation dates"
df_concat

Unnamed: 0_level_0,UNEMPLOY,GDP,CPIAUCSL
observation_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1948-01-01,2254.0,265.742,23.617
1948-04-01,2239.0,272.567,23.993
1948-07-01,2288.0,279.196,24.397
1948-10-01,2324.0,280.366,24.173
1949-01-01,2825.0,275.034,23.943
