# Traditional Statistical models

In this notebook we will try to fit various statistical models to our dataset for predictive analysis.

In [1]:
# Import the required libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.api import VAR

In [2]:
# Load the processed training split from the project repository and
# use the TimeStamp column as the row index.
train = (
    pd.read_csv("https://raw.githubusercontent.com/amulyaprasanth/the_great_indian_hiring_hackathon/refs/heads/main/Dataset/processed/train.csv")
    .set_index("TimeStamp")
)
train.head()

Unnamed: 0_level_0,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Traffic_Vol,Holiday_No,Holiday_Yes,Weather_Airborne particles,Weather_Airborne smoke,Weather_Clear skies,...,Weather_Desc_Strong drizzle,Weather_Desc_Sudden windstorm,Weather_Desc_Torrential downpour,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
TimeStamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-02-10 09:00:00,260.1769,0.693147,0.693147,40,5555.0,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2008-02-10 10:00:00,292.7521,0.693147,0.693147,75,4525.0,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2008-02-10 11:00:00,293.4369,0.693147,0.693147,90,4772.0,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2008-02-10 12:00:00,295.1524,0.693147,0.693147,90,5031.0,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2008-02-10 13:00:00,360.6201,0.693147,0.693147,75,4928.0,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [3]:
# Load the processed test split from the project repository and
# use the TimeStamp column as the row index.
test = (
    pd.read_csv("https://raw.githubusercontent.com/amulyaprasanth/the_great_indian_hiring_hackathon/refs/heads/main/Dataset/processed/test.csv")
    .set_index("TimeStamp")
)
test.head()

Unnamed: 0_level_0,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Traffic_Vol,Holiday_No,Holiday_Yes,Weather_Airborne particles,Weather_Airborne smoke,Weather_Clear skies,...,Weather_Desc_Strong drizzle,Weather_Desc_Sudden windstorm,Weather_Desc_Torrential downpour,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
TimeStamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-07 00:00:00,24.5,0.693147,0.0,1,,True,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
2014-01-07 01:00:00,24.3,0.693147,0.0,1,,True,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
2014-01-07 02:00:00,23.6,0.693147,0.0,1,,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2014-01-07 03:00:00,23.27,0.693147,0.0,1,,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2014-01-07 04:00:00,22.41,0.693147,0.0,40,,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


#### Function to test the stationarity of the time series

In [4]:
def adf_test(series, title='', alpha=0.05):
    """
    Run an Augmented Dickey-Fuller test on a time series and print a report.

    Parameters
    ----------
    series : pandas.Series
        The time series to test. Missing values are dropped before testing.
    title : str, optional
        Label printed in the report header.
    alpha : float, optional
        Significance level the p-value is compared against when deciding
        whether to reject the unit-root null hypothesis. Defaults to 0.05.

    Returns
    -------
    pandas.Series
        The printed report: test statistic, p-value, number of lags used,
        number of observations, and the critical values reported by
        ``adfuller``.
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    result = adfuller(series.dropna(), autolag='AIC')  # .dropna() handles differenced data

    # The first four entries of the result are scalars; the fifth is a
    # mapping of significance level -> critical value.
    labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
    values = list(result[0:4])
    for key, val in result[4].items():
        labels.append(f'critical value ({key})')
        values.append(val)
    out = pd.Series(values, index=labels)

    print(out.to_string())          # .to_string() removes the line "dtype: float64"
    if result[1] <= alpha:
        print("Strong evidence against the null hypothesis")
        print("Reject the null hypothesis")
        print("Data has no unit root and is stationary")
    else:
        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")

    # Hand the report back so callers can collect or compare results
    # instead of only reading the printed text.
    return out

### 1. Check for stationarity in every feature

In [None]:
# Print an ADF stationarity report for each column of the training frame.
for col, column_values in train.items():
    adf_test(column_values, title=f"{col}")

Augmented Dickey-Fuller Test: Temperature
ADF test statistic     -1.377187e+01
p-value                 9.648743e-26
# lags used             5.400000e+01
# observations          3.831800e+04
critical value (1%)    -3.430521e+00
critical value (5%)    -2.861615e+00
critical value (10%)   -2.566810e+00
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary
Augmented Dickey-Fuller Test: Rainfall_last_hour
ADF test statistic        -23.473628
p-value                     0.000000
# lags used                39.000000
# observations          38333.000000
critical value (1%)        -3.430521
critical value (5%)        -2.861615
critical value (10%)       -2.566810
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary
Augmented Dickey-Fuller Test: Snowfall_last_hour
ADF test statistic        -21.662320
p-value                     0.000000
# lags used                52.000000
# observa

### Splitting the dataset

In [None]:
# Separate the predictors from the target column (Traffic_Vol) as NumPy arrays
features = train.drop(columns="Traffic_Vol").to_numpy()
target = train["Traffic_Vol"].to_numpy()

# Positional 80/20 split (no shuffling): the leading rows form the training
# set and the remaining rows form the validation set
split_size = int(0.8 * len(train))

X_train, X_val = features[:split_size], features[split_size:]
y_train, y_val = target[:split_size], target[split_size:]

X_train.shape, y_train.shape, X_val.shape, y_val.shape

### Grid Search for Order P

In [None]:
# Grid search over the VAR lag order p, printing AIC and BIC for each candidate.
#
# The training frame mixes boolean one-hot columns with numeric columns, which
# converts to an object-dtype array; statsmodels refuses object-dtype input, so
# cast every column to float before building the model.
#
# NOTE(review): the complete one-hot groups (e.g. Day_*, Holiday_No/Holiday_Yes)
# appear to sum to 1 per row and would then be collinear with the intercept --
# consider dropping one level per group or restricting the VAR to the
# continuous series before trusting these AIC/BIC values. TODO confirm.
model = VAR(train.astype(float))  # independent of the lag order, so build it once

for i in range(1, 11):
    results = model.fit(i)
    print('Order =', i)
    print('AIC: ', results.aic)
    print('BIC: ', results.bic)
    print()