# Modelling Experiments

In this notebook, we will fit various models to our dataset.

According to the hackathon rules, the evaluation metric is **Root Mean Squared Error** (RMSE).

In [None]:
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the processed training data.
# TimeStamp is parsed as datetimes: read as plain strings, the index later
# built from this column would not be a DatetimeIndex.
train = pd.read_csv("../Dataset/processed/train.csv", parse_dates=["TimeStamp"])

train.head()

Unnamed: 0,TimeStamp,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Traffic_Vol,Holiday_No,Holiday_Yes,Weather_Airborne particles,Weather_Airborne smoke,...,Weather_Desc_Strong drizzle,Weather_Desc_Sudden windstorm,Weather_Desc_Torrential downpour,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
0,2008-02-10 09:00:00,260.1769,0.693147,0.693147,40,5555.0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,2008-02-10 10:00:00,292.7521,0.693147,0.693147,75,4525.0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,2008-02-10 11:00:00,293.4369,0.693147,0.693147,90,4772.0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,2008-02-10 12:00:00,295.1524,0.693147,0.693147,90,5031.0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,2008-02-10 13:00:00,360.6201,0.693147,0.693147,75,4928.0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [3]:
# Set the timestamp as the index.
# The guard keeps this cell safe to re-run: once TimeStamp has been moved into
# the index it is no longer a column, and a second set_index would raise KeyError.
if "TimeStamp" in train.columns:
    train = train.set_index("TimeStamp")

train.head()

Unnamed: 0_level_0,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Traffic_Vol,Holiday_No,Holiday_Yes,Weather_Airborne particles,Weather_Airborne smoke,Weather_Clear skies,...,Weather_Desc_Strong drizzle,Weather_Desc_Sudden windstorm,Weather_Desc_Torrential downpour,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
TimeStamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-02-10 09:00:00,260.1769,0.693147,0.693147,40,5555.0,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2008-02-10 10:00:00,292.7521,0.693147,0.693147,75,4525.0,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2008-02-10 11:00:00,293.4369,0.693147,0.693147,90,4772.0,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2008-02-10 12:00:00,295.1524,0.693147,0.693147,90,5031.0,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2008-02-10 13:00:00,360.6201,0.693147,0.693147,75,4928.0,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


## Data Splitting

In [4]:
# Separate the target column (Traffic_Vol) from the predictor columns
target = train["Traffic_Vol"]
features = train.drop(columns="Traffic_Vol")

features.shape, target.shape

((38373, 58), (38373,))

In [5]:
# Hold-out split by row position: the first 80% of the rows form the training
# set and the remaining 20% the validation set (no shuffling, file order kept)
split_size = int(len(train) * 0.8)

X_train, X_val = features.iloc[:split_size], features.iloc[split_size:]
y_train, y_val = target.iloc[:split_size], target.iloc[split_size:]

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((30698, 58), (30698,), (7675, 58), (7675,))

In [6]:
# Scale the dataset.
# The scaler is fitted on the training split only and then applied unchanged to
# the validation split, so validation statistics never influence the fit.
scaler = StandardScaler()

# NOTE: X_train / X_val are rebound to the scaler's outputs from here on.
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

## Modelling

In this section, we will work with two types of models:

1. Traditional Statistical Methods
2. Machine learning models