In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, max_error, mean_squared_error,mean_absolute_error, r2_score
import pylab as pl

from sklearn.decomposition import PCA
from sklearn.svm import SVR

## load data

In [2]:
data = pd.read_csv("data/preprocessing_data/task_2_master_table_training.csv")
data = data.drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)
data.head()

Unnamed: 0,tollgate_id,direction,precipitation,pressure,rel_humidity,sea_pressure,temperature,wind_direction,wind_speed,volume,...,start_date_year,start_date_month,start_date_day,end_date_year,end_date_month,end_date_day,end_time_hour,end_time_min,start_time_hour,start_time_min
0,2,0,0.0,1013.9,90,1018.9,19.3,322,2.5,18,...,2016.0,10.0,17.0,2016.0,10.0,18.0,0.0,0.0,23.0,40.0
1,3,0,0.0,1013.9,90,1018.9,19.3,322,2.5,15,...,2016.0,10.0,17.0,2016.0,10.0,18.0,0.0,0.0,23.0,40.0
2,3,1,0.0,1013.9,90,1018.9,19.3,322,2.5,22,...,2016.0,10.0,17.0,2016.0,10.0,18.0,0.0,0.0,23.0,40.0
3,1,0,0.0,1013.9,90,1018.9,19.3,322,2.5,2,...,2016.0,10.0,17.0,2016.0,10.0,18.0,0.0,0.0,23.0,40.0
4,1,1,0.0,1013.9,90,1018.9,19.3,322,2.5,10,...,2016.0,10.0,17.0,2016.0,10.0,18.0,0.0,0.0,23.0,40.0


In [3]:
features = data.drop(["volume"], axis=1)

In [4]:
label = data["volume"]

## Scaling the features

In [5]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
features_scled=scaler.fit_transform(features)
features_scled

array([[-3.42242472e-04, -8.40918619e-01, -2.61233539e-01, ...,
        -1.22608150e+00,  1.65755005e+00,  1.22579495e+00],
       [ 1.09825609e+00, -8.40918619e-01, -2.61233539e-01, ...,
        -1.22608150e+00,  1.65755005e+00,  1.22579495e+00],
       [ 1.09825609e+00,  1.18917572e+00, -2.61233539e-01, ...,
        -1.22608150e+00,  1.65755005e+00,  1.22579495e+00],
       ...,
       [ 1.09825609e+00,  1.18917572e+00, -2.61233539e-01, ...,
        -1.52592595e-03, -1.73439681e+00, -1.22503145e+00],
       [-1.09894058e+00, -8.40918619e-01, -2.61233539e-01, ...,
        -1.52592595e-03, -1.73439681e+00, -1.22503145e+00],
       [-1.09894058e+00,  1.18917572e+00, -2.61233539e-01, ...,
        -1.52592595e-03, -1.73439681e+00, -1.22503145e+00]])

## Spliting features for training and cross validation

In [6]:
X_train, X_test, y_train, y_test = train_test_split(features_scled, label, test_size = 0.20)

In [7]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape, y_test.shape)

(7704, 20)
(7704,)
(1926, 20) (1926,)


## Implementing SVR

In [8]:
clf_rbf = SVR(kernel='rbf', C=100, gamma=0.005, epsilon=.5)

In [9]:
clf_rbf.fit(X_train,y_train)

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.5, gamma=0.005,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [10]:
y_pred_rbf = clf_rbf.predict(X_test)

## Evaluation Matrics

In [11]:
print("explained variance score SVR", explained_variance_score(y_test,y_pred_rbf)) 

explained variance score SVR 0.41475895422867715


In [12]:
print("max error SVR", max_error(y_test,y_pred_rbf)) 

max error SVR 236.91023811112922


In [13]:
print("root mean squared error SVR", np.sqrt(mean_squared_error(y_test, y_pred_rbf)))

root mean squared error SVR 37.57634751044892


## MAPE Calculation method

In [14]:
def Mean_Absolute_Percentage_Error(labl, predction):
    labl, predction = np.array(labl), np.array(predction)
    return np.mean(np.abs((labl - predction) / labl))

In [15]:
mae_rbf = mean_absolute_error(y_test, y_pred_rbf)
print("MAE score for SVR(rbf):", mae_rbf)

MAE score for SVR(rbf): 24.424531336086456


In [16]:
x = Mean_Absolute_Percentage_Error(y_test, y_pred_rbf)
print("MAPE for SVR", x)

MAPE for SVR 1.2621947504162723


In [17]:
r2_rbf=r2_score(y_test,y_pred_rbf)
print("R2 score for SVR(rbf):",r2_rbf)

R2 score for SVR(rbf): 0.38307644463994783


## Implementing XGBOOST

In [18]:
import xgboost as xgb

In [19]:
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

## Evaluation Metrics

In [20]:
print("explained variance score XGB", explained_variance_score(y_test,y_pred))

explained variance score XGB 0.9258814626352956


In [21]:
print("max error XGB", max_error(y_test,y_pred))

max error XGB 166.06109619140625


In [22]:
mse = np.sqrt(mean_squared_error(y_test, y_pred))
print("root mean squared error XGB", mse)

root mean squared error XGB 13.027745113282748


In [23]:
mae_xgb=mean_absolute_error(y_test, y_pred)
print("MAE XGBoost", mae_xgb)

MAE XGBoost 8.344170715686936


In [24]:
x = Mean_Absolute_Percentage_Error(y_test,y_pred)
print("MAPE XGB", x)

MAPE XGB 0.32746105917304824


In [25]:
r2_xgb=r2_score(y_test,y_pred)
print("R2 score for XGB:",r2_xgb)

R2 score for XGB: 0.9258449500684762


## Implementing Random Forest Regressor

In [26]:
from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor(max_depth=40)
regr.fit(X_train, y_train)
y_pred_rg=regr.predict(X_test)



## Evaluation Metrics

In [27]:
print("explained variance score RF", explained_variance_score(y_test, y_pred_rg))

explained variance score RF 0.9023975025311388


In [28]:
print("max error RF", max_error(y_test, y_pred_rg))

max error RF 229.0


In [29]:
mse_regr=np.sqrt(mean_squared_error(y_test, y_pred_rg))
print("root mean Squared Error RF", mse_regr)

root mean Squared Error RF 14.952195544787177


In [30]:
mae_rf=mean_absolute_error(y_test, y_pred_rg)
print("MAE score for Random Forest:",mae_rf)

MAE score for Random Forest: 8.5073727933541


In [31]:
x = Mean_Absolute_Percentage_Error(y_test, y_pred_rg)
print("MAPE for RF", x)

MAPE for RF 0.3124226838372659


In [32]:
r2_rf=r2_score(y_test,y_pred_rg)
print("R2 score for Random Forest:",r2_rf)

R2 score for Random Forest: 0.9023185355877335


## Implementing Ada Boost Regressor

In [33]:
from sklearn.ensemble import AdaBoostRegressor

regar = AdaBoostRegressor(n_estimators=10,learning_rate=1)
regar.fit(X_train, y_train)
y_pred_regar=regar.predict(X_test)

## Evaluation Metrics

In [34]:
mse_regar=np.sqrt(mean_squared_error(y_test,y_pred_regar))
print("root mean squared error Adaboost", mse_regar)

root mean squared error Adaboost 32.566549747872244


In [35]:
mae_ada=mean_absolute_error(y_test, y_pred_regar)
print("MAE score for Adaboost:",mae_ada)

MAE score for Adaboost: 23.827251433839184


In [36]:
x = Mean_Absolute_Percentage_Error(y_test,y_pred_regar)
print("MAPE for Adaboost", x)

MAPE for Adaboost 1.4302398181478535


In [37]:
r2_ada=r2_score(y_test,y_pred_regar)
print("R2 score for Adaboost:",r2_ada)

R2 score for Adaboost: 0.5366109946704889


## Adjusted R2

In [38]:
#Adjusted R2:
def adjusted_r2(r2):
    adj_r2 = (1 - (1 - r2) * ((data.shape[0] - 1) / 
          (data.shape[0] - data.shape[1] - 1)))
    return adj_r2

print("adjusted_r2 SVR", adjusted_r2(r2_rbf))
print("adjusted_r2 Ada Boost",adjusted_r2(r2_ada))

adjusted_r2 SVR 0.38172804802644233
adjusted_r2 Ada Boost 0.5355981752375247


## Conclusion

Based on the Above result Random Forest Regressor gives the lowest MAPE score

## Predicting on Test data

In [44]:
t_data = pd.read_csv("data/preprocessing_data/task_2_master_table_test.csv")
t_data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,tollgate_id,direction,precipitation,pressure,rel_humidity,sea_pressure,temperature,...,start_date_year,start_date_month,start_date_day,end_date_year,end_date_month,end_date_day,end_time_hour,end_time_min,start_time_hour,start_time_min
0,0,0,0,2,0,0.0,1008.0,91,1013.0,21.0,...,2016.0,10.0,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0
1,1,1,1,3,0,0.0,1008.0,91,1013.0,21.0,...,2016.0,10.0,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0
2,2,2,2,3,1,0.0,1008.0,91,1013.0,21.0,...,2016.0,10.0,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0
3,3,3,3,1,0,0.0,1008.0,91,1013.0,21.0,...,2016.0,10.0,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0
4,4,4,4,1,1,0.0,1008.0,91,1013.0,21.0,...,2016.0,10.0,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0


In [45]:
t_data = t_data.drop(["Unnamed: 0", "Unnamed: 0.1", "Unnamed: 0.1.1"], axis=1)

In [46]:
t_data.columns

Index(['tollgate_id', 'direction', 'precipitation', 'pressure', 'rel_humidity',
       'sea_pressure', 'temperature', 'wind_direction', 'wind_speed', 'volume',
       'weekday', 'start_date_year', 'start_date_month', 'start_date_day',
       'end_date_year', 'end_date_month', 'end_date_day', 'end_time_hour',
       'end_time_min', 'start_time_hour', 'start_time_min'],
      dtype='object')

In [47]:
t_data = t_data.drop("volume", axis=1)
scaler=StandardScaler()
features_scled=scaler.fit_transform(t_data)
features_scled

array([[ 0.        , -0.81649658, -0.71675983, ..., -1.22474487,
         1.10431526,  1.22474487],
       [ 1.11803399, -0.81649658, -0.71675983, ..., -1.22474487,
         1.10431526,  1.22474487],
       [ 1.11803399,  1.22474487, -0.71675983, ..., -1.22474487,
         1.10431526,  1.22474487],
       ...,
       [ 1.11803399,  1.22474487, -0.71675983, ...,  0.        ,
        -1.10431526, -1.22474487],
       [-1.11803399, -0.81649658, -0.71675983, ...,  0.        ,
        -1.10431526, -1.22474487],
       [-1.11803399,  1.22474487, -0.71675983, ...,  0.        ,
        -1.10431526, -1.22474487]])

In [48]:
f_pred = regr.predict(features_scled)

In [49]:
t_data["volume"] = f_pred

In [50]:
t_data.head()

Unnamed: 0,tollgate_id,direction,precipitation,pressure,rel_humidity,sea_pressure,temperature,wind_direction,wind_speed,weekday,...,start_date_month,start_date_day,end_date_year,end_date_month,end_date_day,end_time_hour,end_time_min,start_time_hour,start_time_min,volume
0,2,0,0.0,1008.0,91,1013.0,21.0,210,1.4,0,...,10.0,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0,37.8
1,3,0,0.0,1008.0,91,1013.0,21.0,210,1.4,0,...,10.0,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0,61.5
2,3,1,0.0,1008.0,91,1013.0,21.0,210,1.4,0,...,10.0,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0,33.3
3,1,0,0.0,1008.0,91,1013.0,21.0,210,1.4,0,...,10.0,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0,24.7
4,1,1,0.0,1008.0,91,1013.0,21.0,210,1.4,0,...,10.0,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0,38.3


In [51]:
time_window = list()
for idx, row in t_data.iterrows():
    s_y = str(row["start_date_year"]).split(".")[0]
    s_m = str(row["start_date_month"]).split(".")[0]
    s_d = str(row["start_date_day"]).split(".")[0]
    s_h = str(row["start_time_hour"]).split(".")[0]
    s_min = str(row["start_time_min"]).split(".")[0]
    if s_min == "0":
        s_min = "00"
    
    e_y = str(row["end_date_year"]).split(".")[0]
    e_m = str(row["end_date_month"]).split(".")[0]
    e_d = str(row["end_date_day"]).split(".")[0]
    e_h = str(row["end_time_hour"]).split(".")[0]
    e_min = str(row["end_time_min"]).split(".")[0]
    if e_min == "0":
        e_min = "00"
    
    temp = "["+s_y+"-"+s_m+"-"+s_d+" "+s_h+":"+s_min+":"+"00,"+e_y+"-"+e_m+"-"+e_d+" "+e_h+":"+e_min+":"+"00)"
    time_window.append(temp)
    
t_data["time_window"] = time_window

In [52]:
t_data.head()

Unnamed: 0,tollgate_id,direction,precipitation,pressure,rel_humidity,sea_pressure,temperature,wind_direction,wind_speed,weekday,...,start_date_day,end_date_year,end_date_month,end_date_day,end_time_hour,end_time_min,start_time_hour,start_time_min,volume,time_window
0,2,0,0.0,1008.0,91,1013.0,21.0,210,1.4,0,...,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0,37.8,"[2016-10-24 16:40:00,2016-10-24 17:00:00)"
1,3,0,0.0,1008.0,91,1013.0,21.0,210,1.4,0,...,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0,61.5,"[2016-10-24 16:40:00,2016-10-24 17:00:00)"
2,3,1,0.0,1008.0,91,1013.0,21.0,210,1.4,0,...,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0,33.3,"[2016-10-24 16:40:00,2016-10-24 17:00:00)"
3,1,0,0.0,1008.0,91,1013.0,21.0,210,1.4,0,...,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0,24.7,"[2016-10-24 16:40:00,2016-10-24 17:00:00)"
4,1,1,0.0,1008.0,91,1013.0,21.0,210,1.4,0,...,24.0,2016.0,10.0,24.0,17.0,0.0,16.0,40.0,38.3,"[2016-10-24 16:40:00,2016-10-24 17:00:00)"


In [53]:
f_data = t_data[["tollgate_id","time_window","direction","volume"]]
f_data.head()

Unnamed: 0,tollgate_id,time_window,direction,volume
0,2,"[2016-10-24 16:40:00,2016-10-24 17:00:00)",0,37.8
1,3,"[2016-10-24 16:40:00,2016-10-24 17:00:00)",0,61.5
2,3,"[2016-10-24 16:40:00,2016-10-24 17:00:00)",1,33.3
3,1,"[2016-10-24 16:40:00,2016-10-24 17:00:00)",0,24.7
4,1,"[2016-10-24 16:40:00,2016-10-24 17:00:00)",1,38.3


In [54]:
f_data.to_csv(path_or_buf="data/task2_submission.csv", index=False)