In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## load train data

In [2]:
pd_df_train = pd.read_csv("data/preprocessing_data/task1_preprocess_training_data.csv") 

In [3]:
pd_df_train.head()

Unnamed: 0,intersection_id,tollgate_id,time_window,average_travl_time,lag1,lag2,lag3,lag4,lag5,lag6,...,length,link_count,lane1_length,lane1_count,lane2_length,lane2_count,lane3_length,lane3_count,lane4_length,lane4_count
0,B,3,"[2016-07-19 00:00:00,2016-07-19 00:20:00)",70.85,119.353219,119.350868,119.3441,119.343203,119.342306,119.340349,...,477,5,255,2,78.0,1.0,0.0,0.0,144.0,2.0
1,A,2,"[2016-07-19 00:20:00,2016-07-19 00:40:00)",58.05,70.85,119.350868,119.3441,119.343203,119.342306,119.340349,...,384,6,142,2,0.0,0.0,242.0,4.0,0.0,0.0
2,B,1,"[2016-07-19 00:20:00,2016-07-19 00:40:00)",79.76,58.05,70.85,119.3441,119.343203,119.342306,119.340349,...,821,9,275,4,222.0,2.0,180.0,1.0,144.0,2.0
3,B,3,"[2016-07-19 00:20:00,2016-07-19 00:40:00)",148.79,79.76,58.05,70.85,119.343203,119.342306,119.340349,...,477,5,255,2,78.0,1.0,0.0,0.0,144.0,2.0
4,B,1,"[2016-07-19 00:40:00,2016-07-19 01:00:00)",137.98,148.79,79.76,58.05,70.85,119.342306,119.340349,...,821,9,275,4,222.0,2.0,180.0,1.0,144.0,2.0


## Grouping and creating one hot encoding for intersection_id and time_window for train data

In [4]:
intersection_id_index = pd_df_train.groupby('intersection_id').ngroup()

In [5]:
intersection_id_class_vec = pd.get_dummies(intersection_id_index, prefix= "intesection_id_index")

In [6]:
time_window_index = pd_df_train.groupby('time_window').ngroup()

In [7]:
time_window_class_vec =  pd.get_dummies(time_window_index)

In [8]:
pd_df_train = pd.concat([pd_df_train, intersection_id_index, intersection_id_class_vec, 
                         time_window_index, time_window_class_vec], axis=1)

In [9]:
pd_df_train.head()

Unnamed: 0,intersection_id,tollgate_id,time_window,average_travl_time,lag1,lag2,lag3,lag4,lag5,lag6,...,6436,6437,6438,6439,6440,6441,6442,6443,6444,6445
0,B,3,"[2016-07-19 00:00:00,2016-07-19 00:20:00)",70.85,119.353219,119.350868,119.3441,119.343203,119.342306,119.340349,...,0,0,0,0,0,0,0,0,0,0
1,A,2,"[2016-07-19 00:20:00,2016-07-19 00:40:00)",58.05,70.85,119.350868,119.3441,119.343203,119.342306,119.340349,...,0,0,0,0,0,0,0,0,0,0
2,B,1,"[2016-07-19 00:20:00,2016-07-19 00:40:00)",79.76,58.05,70.85,119.3441,119.343203,119.342306,119.340349,...,0,0,0,0,0,0,0,0,0,0
3,B,3,"[2016-07-19 00:20:00,2016-07-19 00:40:00)",148.79,79.76,58.05,70.85,119.343203,119.342306,119.340349,...,0,0,0,0,0,0,0,0,0,0
4,B,1,"[2016-07-19 00:40:00,2016-07-19 01:00:00)",137.98,148.79,79.76,58.05,70.85,119.342306,119.340349,...,0,0,0,0,0,0,0,0,0,0


In [10]:
train_set_X = pd_df_train.drop(["intersection_id", "time_window"], axis=1)

In [11]:
train_set_X = train_set_X.rename(columns={"average_travl_time":"label"}) 

In [12]:
train_set_X.head()

Unnamed: 0,tollgate_id,label,lag1,lag2,lag3,lag4,lag5,lag6,lag7,Friday,...,6436,6437,6438,6439,6440,6441,6442,6443,6444,6445
0,3,70.85,119.353219,119.350868,119.3441,119.343203,119.342306,119.340349,119.338391,0,...,0,0,0,0,0,0,0,0,0,0
1,2,58.05,70.85,119.350868,119.3441,119.343203,119.342306,119.340349,119.338391,0,...,0,0,0,0,0,0,0,0,0,0
2,1,79.76,58.05,70.85,119.3441,119.343203,119.342306,119.340349,119.338391,0,...,0,0,0,0,0,0,0,0,0,0
3,3,148.79,79.76,58.05,70.85,119.343203,119.342306,119.340349,119.338391,0,...,0,0,0,0,0,0,0,0,0,0
4,1,137.98,148.79,79.76,58.05,70.85,119.342306,119.340349,119.338391,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
train_set_X = train_set_X.loc[:,~train_set_X.columns.duplicated()]

In [14]:
train_set_y = train_set_X.label

In [15]:
pca = PCA(n_components=10)
features_pca=pca.fit_transform(train_set_X.drop("label", axis=1))

In [16]:
scaler=StandardScaler()
features_scled=scaler.fit_transform(features_pca)
features_scled.shape

(25648, 10)

## load test data

In [17]:
pd_df_test = pd.read_csv("data/preprocessing_data/task1_preprocess_test_data.csv") 

In [18]:
pd_df_test.head()

Unnamed: 0,intersection_id,tollgate_id,time_window,average_travl_time,lag1,lag2,lag3,lag4,lag5,lag6,...,length,link_count,lane1_length,lane1_count,lane2_length,lane2_count,lane3_length,lane3_count,lane4_length,lane4_count
0,A,2,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",0,0,0,0,0,0,0,...,384,6,142,2,0.0,0.0,242.0,4.0,0.0,0.0
1,A,3,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",0,0,0,0,0,0,0,...,852,8,404,3,0.0,0.0,448.0,5.0,0.0,0.0
2,B,1,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",0,0,0,0,0,0,0,...,821,9,275,4,222.0,2.0,180.0,1.0,144.0,2.0
3,B,3,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",0,0,0,0,0,0,0,...,477,5,255,2,78.0,1.0,0.0,0.0,144.0,2.0
4,C,1,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",0,0,0,0,0,0,0,...,1550,12,217,3,144.0,1.0,1045.0,6.0,144.0,2.0


## Grouping and creating one hot encoding for intersection_id and time_window for test data

In [19]:
intersection_id_index_test = pd_df_test.groupby('intersection_id').ngroup()

In [20]:
intersection_id_class_vec_test = pd.get_dummies(intersection_id_index_test, prefix= "intesection_id_index")

In [21]:
time_window_index_test = pd_df_test.groupby('time_window').ngroup()

In [22]:
time_window_class_vec_test =  pd.get_dummies(time_window_index_test)

In [23]:
pd_df_test = pd.concat([pd_df_test, intersection_id_index_test, intersection_id_class_vec_test, 
                         time_window_index_test, time_window_class_vec_test], axis=1)

In [24]:
pd_df_test.tail()

Unnamed: 0,intersection_id,tollgate_id,time_window,average_travl_time,lag1,lag2,lag3,lag4,lag5,lag6,...,74,75,76,77,78,79,80,81,82,83
499,C,3,"[2016-10-23 09:40:00,2016-10-23 10:00:00)",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
500,C,3,"[2016-10-24 08:00:00,2016-10-24 08:20:00)",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
501,C,3,"[2016-10-24 09:40:00,2016-10-24 10:00:00)",0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
502,C,3,"[2016-10-22 17:00:00,2016-10-22 17:20:00)",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
503,C,3,"[2016-10-24 18:20:00,2016-10-24 18:40:00)",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [25]:
test_set = pd_df_test.drop(["intersection_id", "time_window"], axis=1)

In [26]:
pca = PCA(n_components=10)
test_features_pca=pca.fit_transform(test_set.drop("average_travl_time", axis=1))

In [27]:
scaler=StandardScaler()
test_features_scled=scaler.fit_transform(test_features_pca)
test_features_scled.shape

(504, 10)

## Split data train set for train and cross validation

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(features_scled, 
                                                    train_set_y, 
                                                    test_size=0.30)

## Implementing Random Forest Regressor

In [30]:
from sklearn.ensemble import RandomForestRegressor

In [31]:
r_model = RandomForestRegressor(n_estimators=10)

In [32]:
r_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [33]:
y_pred = r_model.predict(X_test)

## Evaluation metrics

In [34]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# MAE,R^2:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

## MAPE Calculation method

In [35]:
def Mean_Absolute_Percentage_Error(labl,predction):
    labl, predction = np.array(labl), np.array(predction)
    return np.mean(np.abs((labl - predction) / labl))

## MAPE for Random Forest Regressor

In [36]:
print("MAPE for RF", Mean_Absolute_Percentage_Error(y_test, y_pred))

MAPE for RF 0.25916502549085335


## Mean Abosute error for Random Forest Regressor

In [37]:
mae_rf=mean_absolute_error(y_test, y_pred)
print("MAE for RF", mae_rf)

MAE for RF 27.737714113366522


## r2_score for Random Forest Regressor

In [38]:
r2_rf=r2_score(y_test,y_pred)
print("r2_score for RF", r2_rf)

r2_score for RF 0.49803893021280965


## Root Mean Squared Error For Random Forest Regressor

In [39]:
#root mean square error metric:

rms = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE for RF", rms)

RMSE for RF 40.56701046608705


In [40]:
#Not needed, unusually large value to compare the error performance metric.

mse = mean_squared_error(y_test, y_pred)
print("MSE for RF", mse)

MSE for RF 1645.682338155616


## Implementing Gradient Boosting Regressor

In [41]:
from sklearn.ensemble import GradientBoostingRegressor

In [42]:
gb_reg = GradientBoostingRegressor()

In [43]:
gb_reg.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [44]:
gb_pred = gb_reg.predict(X_test)

## Evaluation metrics

In [45]:
# mean squared error

mse = mean_squared_error(y_test, gb_pred)
print("MSE for GBR", mse)

MSE for GBR 1462.6099866108539


In [46]:
# root mean squared error

rms_gbt=np.sqrt(mean_squared_error(y_test, gb_pred))
print("RMSE for GBR", rms_gbt)

RMSE for GBR 38.244084334846534


## MAPE for Gradient Boosting Regressor

In [47]:
print("MAPE for GBR", Mean_Absolute_Percentage_Error(y_test, gb_pred))

MAPE for GBR 0.23597822282799474


## MAE for Gradient Boosting Regressor

In [48]:
mae_gbr=mean_absolute_error(y_test, gb_pred)
print("MAE for GBR", mae_gbr)

MAE for GBR 25.572250855541068


## r_2 score for Gradient Boosting Regressor

In [49]:
r2_gb=r2_score(y_test,gb_pred)
print("r2_score for GBR", r2_gb)

r2_score for GBR 0.5538791074446174


## Implementing XGBOOST

In [50]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

## Evaluation of XGBoost performance

In [51]:
print("MAPE for XGB", Mean_Absolute_Percentage_Error(y_test, y_pred_xgb))

MAPE for XGB 0.24239814506453292


In [52]:
print("MAE for XBG", mean_absolute_error(y_test, y_pred_xgb))

MAE for XBG 26.203313664564647


In [53]:
print("RMSE for XGB", np.sqrt(mean_squared_error(y_test, y_pred_xgb)))

RMSE for XGB 38.86529705796881


In [54]:
print("Mean Squared Error XGB", mean_squared_error(y_test, y_pred_xgb))

Mean Squared Error XGB 1510.5113154041592


In [55]:
print("r_2 score XGB", r2_score(y_test,y_pred_xgb))

r_2 score XGB 0.5392683884207605


## Conclusion

Based on the above experiment we choose Gradient Boosting Regressor as the best model as it has the lowest MAPE score 

## Predicting on test data

In [56]:
pred = gb_reg.predict(test_features_scled)

In [57]:
pd_df_test["avg_travel_time"] = pred

In [58]:
f_data = pd_df_test[["intersection_id","tollgate_id","time_window","avg_travel_time"]]
f_data.head()

Unnamed: 0,intersection_id,tollgate_id,time_window,avg_travel_time
0,A,2,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",105.288172
1,A,3,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",98.217986
2,B,1,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",165.363807
3,B,3,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",137.58093
4,C,1,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",124.256278


In [59]:
f_data.to_csv(path_or_buf="data/task1_submission.csv", index=False)