In [1]:
# Package imports
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score

In [2]:
# Import data and split
modeling_data = pd.read_csv("data/final_modeling_data.csv")

only_cols = ['delay_minutes', 'dispatch_hour', 'dispatch_day', 'job_type_encoded', 'traffic_level_encoded', 'temperature_normalized', 'precipitation_normalized']
modeling_data = modeling_data[only_cols]

X = modeling_data.drop('delay_minutes', axis=1)
y = modeling_data['delay_minutes']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [4]:
# Quick RF
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [5]:
# Model Evaluation
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae}, R^2: {r2}")

MAE: 13.57793666666667, R^2: -0.02107416134720208


In [6]:
# CV scores
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Average RMSE: {(-scores.mean())**0.5}")

Average RMSE: 17.422621933279924


In [8]:
# Feature Importance
importances = model.feature_importances_

importances

array([0.35471302, 0.05927142, 0.17967381, 0.07812173, 0.16511617,
       0.16310386])