# Modelling Trials with Machine Learning Models

In [1]:
# Import the required libraries
import numpy as np
import pandas as pd
import  matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


In [2]:
# Load the pre-processed training data from disk
train_csv_path = "../Dataset/processed/train.csv"
train = pd.read_csv(train_csv_path)

train.head()

Unnamed: 0,TimeStamp,Holiday,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Weather_Desc,Traffic_Vol,Weather_Airborne particles,Weather_Airborne smoke,Weather_Clear skies,Weather_Cloudy skies,Weather_Dense fog,Weather_Light fog,Weather_Light rain,Weather_Rainfall,Weather_Snowfall,Weather_Stormy weather,Weather_Sudden windstorm
0,2008-02-10 09:00:00,0.0,289.28,0.0,0.0,40,19,5555,False,False,False,True,False,False,False,False,False,False,False
1,2008-02-10 10:00:00,0.0,290.26,0.0,0.0,75,9,4525,False,False,False,True,False,False,False,False,False,False,False
2,2008-02-10 11:00:00,0.0,290.28,0.0,0.0,90,10,4772,False,False,False,True,False,False,False,False,False,False,False
3,2008-02-10 12:00:00,0.0,290.33,0.0,0.0,90,10,5031,False,False,False,True,False,False,False,False,False,False,False
4,2008-02-10 13:00:00,0.0,292.14,0.0,0.0,75,9,4928,False,False,False,True,False,False,False,False,False,False,False


In [3]:
# Set the timestamp as the index.
# `set_index` removes "TimeStamp" from the columns, so re-running this cell
# unguarded raises a KeyError. Only convert while the column still exists,
# which keeps the cell safe to execute more than once.
if "TimeStamp" in train.columns:
    train = train.set_index("TimeStamp")

train.head()

Unnamed: 0_level_0,Holiday,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Weather_Desc,Traffic_Vol,Weather_Airborne particles,Weather_Airborne smoke,Weather_Clear skies,Weather_Cloudy skies,Weather_Dense fog,Weather_Light fog,Weather_Light rain,Weather_Rainfall,Weather_Snowfall,Weather_Stormy weather,Weather_Sudden windstorm
TimeStamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2008-02-10 09:00:00,0.0,289.28,0.0,0.0,40,19,5555,False,False,False,True,False,False,False,False,False,False,False
2008-02-10 10:00:00,0.0,290.26,0.0,0.0,75,9,4525,False,False,False,True,False,False,False,False,False,False,False
2008-02-10 11:00:00,0.0,290.28,0.0,0.0,90,10,4772,False,False,False,True,False,False,False,False,False,False,False
2008-02-10 12:00:00,0.0,290.33,0.0,0.0,90,10,5031,False,False,False,True,False,False,False,False,False,False,False
2008-02-10 13:00:00,0.0,292.14,0.0,0.0,75,9,4928,False,False,False,True,False,False,False,False,False,False,False


## Data Splitting

In [4]:
# Separate the target column from the predictor columns
target = train["Traffic_Vol"]
features = train.drop(columns="Traffic_Vol")

features.shape, target.shape

((38373, 17), (38373,))

In [5]:
# Split into training and validation sets.
# The rows are taken in their stored order (no shuffling): the first
# TRAIN_FRACTION of the rows is used for fitting and the remainder for validation.
TRAIN_FRACTION = 0.8
split_size = int(TRAIN_FRACTION * len(train))

# Use .iloc so the slices are explicitly positional rather than relying on
# how `[]` interprets an integer slice on a label-indexed object.
X_train, y_train = features.iloc[:split_size], target.iloc[:split_size]
X_val, y_val = features.iloc[split_size:], target.iloc[split_size:]

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((30698, 17), (30698,), (7675, 17), (7675,))

In [6]:
# Scale the dataset
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit the scaler on the training rows only and reuse those statistics for the
# validation rows, so nothing from the validation set influences the scaling.
#
# The inputs are sliced from the untouched `features` frame instead of reading
# X_train / X_val: this cell overwrites those two names, so feeding them back in
# would refit the scaler on already-scaled data whenever the cell is re-run.
# NOTE: these slices must stay in sync with the train/validation split above.
X_train = scaler.fit_transform(features.iloc[:split_size])
X_val = scaler.transform(features.iloc[split_size:])


## Modelling

According to the hackathon rules, the evaluation metric is the **Root Mean Squared Error (RMSE)**.

In [7]:
from sklearn.metrics import root_mean_squared_error


In [8]:
# Candidate regressors, keyed by the display name used when reporting scores.
models = {
    # Linear family, all with default settings
    "Linear": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    # Tree-based models, seeded with a fixed random_state
    "Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42, n_jobs=-1),
}

In [9]:
# Print each configured estimator to confirm its settings before fitting.
# Note: `model` stays bound at notebook scope to the last estimator in
# `models` once this loop finishes.
for model in models.values():
    print(model)

LinearRegression()
Ridge()
Lasso()
DecisionTreeRegressor(random_state=42)
RandomForestRegressor(n_jobs=-1, random_state=42)


In [10]:
def fit_and_score_model(models: dict, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val):
    """Fit every estimator in ``models`` and report its RMSE on the validation data.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        ``fit`` is called on the objects stored in the dict; no copy is made.
    X_train, y_train
        Features and target passed to ``fit``.
    X_val, y_val
        Features passed to ``predict`` and the target the predictions are
        scored against.

    Returns
    -------
    dict
        ``{name: RMSE rounded to 2 decimals}``, in the iteration order of ``models``.

    Notes
    -----
    The data defaults are evaluated once, when this ``def`` statement runs.
    If the notebook-level ``X_train`` / ``y_train`` / ``X_val`` / ``y_val`` are
    rebound afterwards, re-run this cell or pass the data explicitly.
    """
    rmse_by_name = {}

    for label, estimator in models.items():
        # Train on the training split
        estimator.fit(X_train, y_train)

        # Predict on the validation split
        val_predictions = estimator.predict(X_val)

        # Score the predictions and keep two decimals for the report
        rmse = root_mean_squared_error(y_val, val_predictions)
        rmse_by_name[label] = round(rmse, 2)

    return rmse_by_name


scores = fit_and_score_model(models)

In [13]:
# Turn the {model name: score} mapping into a table, lowest score first
scores_df = (
    pd.DataFrame(scores.items(), columns=["Model Name", "Score"])
    .sort_values("Score")
)
scores_df

Unnamed: 0,Model Name,Score
2,Lasso,1959.27
0,Linear,1959.35
1,Ridge,1959.42
4,Random Forest,2195.25
3,Tree,2590.57
