In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Problem 3: Metro Interstate Traffic Volume

In [118]:
# Read the traffic CSV from GitHub; the `date_time` column becomes the (parsed) index
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/metro.csv'
data = pd.read_csv(
    url,
    parse_dates=True,
    index_col='date_time',
)
data.head()

Unnamed: 0_level_0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-01 00:00:00,New Years Day,265.94,0.0,0.0,90,Haze,haze,1513
2016-01-01 00:00:00,New Years Day,265.94,0.0,0.0,90,Snow,light snow,1513
2016-01-01 01:00:00,,266.0,0.0,0.0,90,Snow,light snow,1550
2016-01-01 03:00:00,,266.01,0.0,0.0,90,Snow,light snow,719
2016-01-01 04:00:00,,264.8,0.0,0.0,90,Clouds,overcast clouds,533


This dataset contains information about the hourly traffic volume on the westbound lane of I-94 (**MN DoT ATR station 301**), roughly midway between Minneapolis and St. Paul, MN. The dataset includes hourly weather and temperature reports from 2016 to 2018.

<table><tr>
<td> <img src="i-94a.png" alt="Drawing" style="width: 500px;"/> </td>
<td> <img src="i-94b.png" alt="Drawing" style="width: 500px;"/> </td>
</tr></table>

| Column | Description |
| :-- | --: | 
| `date_time` (index) | Hour at which the data was collected, in local time (CST) |
| `holiday` | US National holidays plus regional holiday (Minnesota State Fair) | 
| `temp` | Average temp (in kelvin) |
| `rain_1h` | Amount in mm of rain that occurred in the hour |
| `snow_1h` | Amount in mm of snow that occurred in the hour |
| `clouds_all` | Percentage of cloud cover |
| `weather_main` | Short textual description of the current weather |
| `weather_description` | Longer textual description of the current weather |
| `traffic_volume` | Hourly I-94 ATR 301 reported westbound traffic volume |

In [119]:
# Display the loaded frame; the rendered view is truncated to its first and last rows
data

Unnamed: 0_level_0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-01 00:00:00,New Years Day,265.94,0.0,0.0,90,Haze,haze,1513
2016-01-01 00:00:00,New Years Day,265.94,0.0,0.0,90,Snow,light snow,1513
2016-01-01 01:00:00,,266.00,0.0,0.0,90,Snow,light snow,1550
2016-01-01 03:00:00,,266.01,0.0,0.0,90,Snow,light snow,719
2016-01-01 04:00:00,,264.80,0.0,0.0,90,Clouds,overcast clouds,533
...,...,...,...,...,...,...,...,...
2018-09-30 19:00:00,,283.45,0.0,0.0,75,Clouds,broken clouds,3543
2018-09-30 20:00:00,,282.76,0.0,0.0,90,Clouds,overcast clouds,2781
2018-09-30 21:00:00,,282.73,0.0,0.0,90,Thunderstorm,proximity thunderstorm,2159
2018-09-30 22:00:00,,282.09,0.0,0.0,90,Clouds,overcast clouds,1450


In [130]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Custom transformer that derives calendar features from the frame's index
class TimeFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Append ``hour``, ``day_of_week`` and ``month`` columns read from the index.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only reads the ``hour``, ``dayofweek`` and ``month`` attributes of
    ``X.index``, so the index must expose those attributes.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` extended with the three index-derived columns."""
        timestamps = X.index
        # ``assign`` builds a new frame, so the caller's data is left untouched
        return X.assign(
            hour=timestamps.hour,
            day_of_week=timestamps.dayofweek,
            month=timestamps.month,
        )
    

# Work on the frame loaded earlier under a more descriptive name (alias, not a reload)
traffic_data = data

# Remove exact duplicate records. `DataFrame.drop_duplicates` ignores the index,
# so the timestamp is moved into a regular column first; otherwise observations
# from different hours that happen to share identical values would be discarded.
traffic_data_cleaned = (
    traffic_data
    .reset_index()
    .drop_duplicates()
    .set_index('date_time')
)

# Separate the features and the target variable
X = traffic_data_cleaned.drop(columns=['traffic_volume'])
y = traffic_data_cleaned['traffic_volume']

# Chronological split: the first 80% of the rows train the model and the last 20%
# test it. No `random_state` is passed because it has no effect when shuffle=False.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# One-hot encode the categorical columns and pass every other column through unchanged
categorical_features = ['holiday', 'weather_main', 'weather_description']
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first'), categorical_features)
    ],
    remainder='passthrough',  # passthrough features not listed in transformers
    sparse_threshold=0,       # always emit a dense array for the scaler that follows
)

# Create the pipeline
pipeline = Pipeline(steps=[
    ('time_features', TimeFeaturesExtractor()),  # Time-based features extraction
    ('preprocessor', preprocessor),               # One-hot encoding
    ('scaler', StandardScaler()),                 # Feature scaling (not required by tree models; harmless)
    ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1))  # Model
])

# Fit the whole pipeline on the training portion only
pipeline.fit(X_train, y_train)

# Predict and evaluate. The RMSE is computed as the square root of the MSE because
# the `squared=False` argument of `mean_squared_error` has been removed from recent
# scikit-learn releases.
y_pred_pipeline = pipeline.predict(X_test)
rmse_pipeline = mean_squared_error(y_test, y_pred_pipeline) ** 0.5
print('RMSE for Random Forest with Pipeline: {:.4f}'.format(rmse_pipeline))

RMSE for Random Forest with Pipeline: 500.9921


Your **goal** is to **train** a regression pipeline that predicts the hourly traffic volume, **tune** the pipeline hyperparameters, and **test** its performance.