In [102]:
#Manipulation
import pandas as pd
import numpy as np 

#visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#Data processing
from sklearn.preprocessing import OneHotEncoder

#Data modelling
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn import ensemble
from xgboost import XGBRegressor
#from sklearn.xgboost import XGBRegressor
from xgboost import plot_importance, plot_tree
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb


#Split of train and test within train. PURPOSE:Train
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

#Evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error

import warnings
warnings.filterwarnings("ignore")


In [11]:
train_df = pd.read_csv("Train.csv")
test_df = pd.read_csv("Test.csv")

#Delete unnecessary columns
train_df.drop(['rain_p_h','snow_p_h','visibility_in_miles','weather_description','wind_direction'],inplace = True,axis = 1)

test_df.drop(['rain_p_h','snow_p_h','visibility_in_miles','weather_description','wind_direction'],inplace = True,axis = 1)

# Removing is holiday feature after filtering it out
#train_df=train_df[train_df['is_holiday']=='None']
train_df.drop(['is_holiday'],axis=1,inplace=True)

test_df.drop(['is_holiday'],axis=1,inplace=True)

# Converting the Float values of temperature to int
train_df['temperature']=np.round(train_df['temperature']).astype(int)
train_df=train_df[train_df['temperature']>0]

test_df['temperature']=np.round(test_df['temperature']).astype(int)


#Obtain hour as separate column and day as separate column

train_df['date_time'] = pd.to_datetime(train_df.date_time)
train_df.insert(1,"date",train_df['date_time'].index)
train_df.insert(2,"hour",train_df['date_time'].dt.hour)
train_df.insert(3,"days",train_df['date_time'].dt.weekday)
train_df.insert(4,"quarter",train_df['date_time'].dt.quarter)
train_df.insert(5,"month",train_df['date_time'].dt.month)
train_df.insert(6,"year",train_df['date_time'].dt.year)
train_df.insert(7,"dayofyear",train_df['date_time'].dt.dayofyear)
train_df.insert(8,"dayofmonth",train_df['date_time'].dt.day)
train_df.insert(9,"weekofyear",train_df['date_time'].dt.weekofyear)


train_df.drop(['date_time'],axis=1,inplace=True)

test_df['date_time'] = pd.to_datetime(test_df.date_time)
test_df.insert(1,"date",test_df['date_time'].index)
test_df.insert(2,"hour",test_df['date_time'].dt.hour)
test_df.insert(3,"days",test_df['date_time'].dt.weekday)
test_df.insert(4,"quarter",test_df['date_time'].dt.quarter)
test_df.insert(5,"month",test_df['date_time'].dt.month)
test_df.insert(6,"year",test_df['date_time'].dt.year)
test_df.insert(7,"dayofyear",test_df['date_time'].dt.dayofyear)
test_df.insert(8,"dayofmonth",test_df['date_time'].dt.day)
test_df.insert(9,"weekofyear",test_df['date_time'].dt.weekofyear)

test_df.drop(['date_time'],axis=1,inplace=True)

# Weather to Lower Case
#train_df['weather_type'] = train_df['weather_type'].str.lower()

#test_df['weather_type'] = test_df['weather_type'].str.lower()

#Filter the wind speed to rectify the abnormality found in describe()
train_df = train_df[train_df['wind_speed']<=9]

#Filter the temperature to rectify the abnormality found in describe()
train_df = train_df[train_df['temperature']>=200]

# Encoding the weather_type feature column
train_df['weather_type'] =np.where(train_df['weather_type']=='Clouds',0 , train_df['weather_type'])
test_df['weather_type'] =np.where(test_df['weather_type']=='Clouds',0 ,test_df['weather_type'])
train_df['weather_type'] =np.where(train_df['weather_type']=='Clear',1 , train_df['weather_type'])
test_df['weather_type'] =np.where(test_df['weather_type']=='Clear',1 ,test_df['weather_type'])
train_df['weather_type'] =np.where(train_df['weather_type']=='Rain',2 , train_df['weather_type'])
test_df['weather_type'] =np.where(test_df['weather_type']=='Rain',2 ,test_df['weather_type'])
train_df['weather_type'] =np.where(train_df['weather_type']=='Drizzle',3 , train_df['weather_type'])
test_df['weather_type'] =np.where(test_df['weather_type']=='Drizzle',3 ,test_df['weather_type'])
train_df['weather_type'] =np.where(train_df['weather_type']=='Mist',4 , train_df['weather_type'])
test_df['weather_type'] =np.where(test_df['weather_type']=='Mist',4 ,test_df['weather_type'])
train_df['weather_type'] =np.where(train_df['weather_type']=='Haze',5 , train_df['weather_type'])
test_df['weather_type'] =np.where(test_df['weather_type']=='Haze',5 ,test_df['weather_type'])
train_df['weather_type'] =np.where(train_df['weather_type']=='Fog',6 , train_df['weather_type'])
test_df['weather_type'] =np.where(test_df['weather_type']=='Fog',6 ,test_df['weather_type'])
train_df['weather_type'] =np.where(train_df['weather_type']=='Thunderstorm',7 , train_df['weather_type'])
test_df['weather_type'] =np.where(test_df['weather_type']=='Thunderstorm',7 ,test_df['weather_type'])
train_df['weather_type'] =np.where(train_df['weather_type']=='Snow',8 , train_df['weather_type'])
test_df['weather_type'] =np.where(test_df['weather_type']=='Snow',8 ,test_df['weather_type'])
train_df['weather_type'] =np.where(train_df['weather_type']=='Squall',9 , train_df['weather_type'])
test_df['weather_type'] =np.where(test_df['weather_type']=='Squall',9 ,test_df['weather_type'])
train_df['weather_type'] =np.where(train_df['weather_type']=='Smoke',10 , train_df['weather_type'])
test_df['weather_type'] =np.where(test_df['weather_type']=='Smoke',10 ,test_df['weather_type'])


In [27]:
lr = LinearRegression()
lr_params = {}
lr_grid = GridSearchCV(lr,lr_params,cv = 5,verbose = 5,scoring = 'neg_mean_absolute_error',return_train_score=True)
lr_grid.fit(train_df,train_df['traffic_volume'])
lr_score = lr_grid.cv_results_
print(lr_score)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] .............. , score=(train=-0.000, test=-0.000), total=   0.1s
[CV]  ................................................................
[CV] .............. , score=(train=-0.000, test=-0.000), total=   0.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] .............. , score=(train=-0.000, test=-0.000), total=   0.1s
[CV]  ................................................................
[CV] .............. , score=(train=-0.000, test=-0.000), total=   0.1s
[CV]  ................................................................
[CV] .............. , score=(train=-0.000, test=-0.000), total=   0.1s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s finished


{'mean_fit_time': array([0.05955062]), 'std_fit_time': array([0.01554077]), 'mean_score_time': array([0.00985012]), 'std_score_time': array([0.0002813]), 'params': [{}], 'split0_test_score': array([-3.1862444e-13]), 'split1_test_score': array([-1.4189031e-12]), 'split2_test_score': array([-2.33574338e-13]), 'split3_test_score': array([-1.25359227e-12]), 'split4_test_score': array([-1.5475358e-12]), 'mean_test_score': array([-9.54419382e-13]), 'std_test_score': array([5.62302036e-13]), 'rank_test_score': array([1], dtype=int32), 'split0_train_score': array([-3.12026101e-13]), 'split1_train_score': array([-1.40150078e-12]), 'split2_train_score': array([-2.29177996e-13]), 'split3_train_score': array([-1.27327542e-12]), 'split4_train_score': array([-1.48824338e-12]), 'mean_train_score': array([-9.40844736e-13]), 'std_train_score': array([5.52130755e-13])}


In [28]:
print(lr_grid.best_params_)
print("Test score",lr_score['mean_test_score'])
print(lr_score['split1_test_score'])
print(lr_score['params'])

{}
Test score [-9.54419382e-13]
[-1.4189031e-12]
[{}]


In [33]:
lasso = Lasso()
lasso_params = {'alpha':[0.001,0.01,0.1]}
lasso_grid = GridSearchCV(lasso,lasso_params,cv = 5,verbose = 10,scoring = 'neg_mean_absolute_error',return_train_score=True)
lasso_grid.fit(train_df,train_df['traffic_volume'])
lasso_score = lasso_grid.cv_results_
print(lasso_score)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] alpha=0.001 .....................................................
[CV] ... alpha=0.001, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ... alpha=0.001, score=(train=-0.001, test=-0.001), total=   0.2s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] ... alpha=0.001, score=(train=-0.000, test=-0.000), total=   0.2s
[CV] alpha=0.001 .....................................................
[CV] ... alpha=0.001, score=(train=-0.001, test=-0.001), total=   0.2s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.8s remaining:    0.0s


[CV] ... alpha=0.001, score=(train=-0.001, test=-0.001), total=   0.2s
[CV] alpha=0.01 ......................................................
[CV] .... alpha=0.01, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0.01 ......................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    1.2s remaining:    0.0s


[CV] .... alpha=0.01, score=(train=-0.000, test=-0.000), total=   0.2s
[CV] alpha=0.01 ......................................................
[CV] .... alpha=0.01, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0.01 ......................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.6s remaining:    0.0s


[CV] .... alpha=0.01, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0.01 ......................................................
[CV] .... alpha=0.01, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.8s remaining:    0.0s


[CV] ..... alpha=0.1, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0.1 .......................................................
[CV] ..... alpha=0.1, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0.1 .......................................................
[CV] ..... alpha=0.1, score=(train=-0.000, test=-0.000), total=   0.2s
[CV] alpha=0.1 .......................................................
[CV] ..... alpha=0.1, score=(train=-0.001, test=-0.000), total=   0.1s
[CV] alpha=0.1 .......................................................
[CV] ..... alpha=0.1, score=(train=-0.000, test=-0.001), total=   0.1s
{'mean_fit_time': array([0.1478199 , 0.13520384, 0.10452352]), 'std_fit_time': array([0.02448012, 0.03193214, 0.02879991]), 'mean_score_time': array([0.01125803, 0.01206903, 0.01425996]), 'std_score_time': array([9.76398573e-05, 1.64602449e-03, 2.59633764e-03]), 'param_alpha': masked_array(data=[0.001, 0.01, 0.1],
             mask=[False, False, False],
    

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    2.8s finished


In [34]:
print(lasso_grid.best_params_)
print(lasso_score['mean_test_score'])
print(lasso_score['params'])

{'alpha': 0.01}
[-7.68744500e-04 -4.41683847e-06 -2.75681907e-04]
[{'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}]


In [51]:
ridge = Ridge()
ridge_params = {'alpha':[0,0.5,1]}
ridge_grid = GridSearchCV(ridge,ridge_params,cv = 5,verbose = 10,scoring = 'neg_mean_absolute_error',return_train_score=True)
ridge_grid.fit(train_df,train_df['traffic_volume'])
ridge_score = ridge_grid.cv_results_
print(ridge_score)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] alpha=0 .........................................................
[CV] ....... alpha=0, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0 .........................................................
[CV] ....... alpha=0, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0 .........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] ....... alpha=0, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0 .........................................................
[CV] ....... alpha=0, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0 .........................................................
[CV] ....... alpha=0, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0.5 .......................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s remaining:    0.0s


[CV] ..... alpha=0.5, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0.5 .......................................................
[CV] ..... alpha=0.5, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0.5 .......................................................
[CV] ..... alpha=0.5, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0.5 .......................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.7s remaining:    0.0s


[CV] ..... alpha=0.5, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=0.5 .......................................................
[CV] ..... alpha=0.5, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=1 .........................................................
[CV] ....... alpha=1, score=(train=-0.000, test=-0.000), total=   0.0s
[CV] alpha=1 .........................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.8s remaining:    0.0s


[CV] ....... alpha=1, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=1 .........................................................
[CV] ....... alpha=1, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=1 .........................................................
[CV] ....... alpha=1, score=(train=-0.000, test=-0.000), total=   0.1s
[CV] alpha=1 .........................................................
[CV] ....... alpha=1, score=(train=-0.000, test=-0.000), total=   0.1s
{'mean_fit_time': array([0.04477463, 0.04402928, 0.04602542]), 'std_fit_time': array([0.00607089, 0.00447348, 0.01031177]), 'mean_score_time': array([0.01051626, 0.01105733, 0.0143538 ]), 'std_score_time': array([0.00081973, 0.00177577, 0.00596329]), 'param_alpha': masked_array(data=[0, 0.5, 1],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 0}, {'alpha': 0.5}, {'alpha': 1}], 'split0_test_score': array([-7.57904906e-12, -9.25766472e-0

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    1.4s finished


In [52]:
print(ridge_score['mean_test_score'])
print(ridge_score['mean_train_score'])
print(ridge_grid.best_params_)

[-4.02391492e-12 -8.99530595e-09 -1.79903086e-08]
[-3.05739809e-12 -8.97180092e-09 -1.79433746e-08]
{'alpha': 0}


In [47]:
#rf = RandomForestRegressor(n_estimators=100,min_samples_leaf=2,min_samples_split=4)
#y_pred = []
#y_true = []
#for i in range(5):
#    X_train,X_test,y_train,y_test = train_test_split(train_df,train_df['traffic_volume'],test_size=0.3,random_state=42*i)
#    rf.fit(X_train,y_train)
#    y_pred.append(rf.predict(X_test))
#    y_true.append(y_test)

In [54]:
#mae_func = lambda pred,true:mean_absolute_error(np.expm1(pred),np.expm1(true))
#mae = map(mae_func,y_pred,y_true)
#print(mae)

In [215]:
train_df.head(30)

Unnamed: 0,date,hour,days,quarter,month,year,dayofyear,dayofmonth,weekofyear,air_pollution_index,humidity,wind_speed,dew_point,temperature,clouds_all,weather_type,traffic_volume
0,0,9,1,4,10,2012,276,2,40,121,89,2,1,288,40,0,5545
1,1,10,1,4,10,2012,276,2,40,178,67,3,1,289,75,0,4516
2,2,11,1,4,10,2012,276,2,40,113,66,3,2,290,90,0,4767
3,3,12,1,4,10,2012,276,2,40,20,66,3,5,290,90,0,5026
4,4,13,1,4,10,2012,276,2,40,281,65,3,7,291,75,0,4918
5,5,14,1,4,10,2012,276,2,40,23,65,3,6,292,1,1,5181
6,6,15,1,4,10,2012,276,2,40,184,64,3,7,293,1,1,5584
7,7,16,1,4,10,2012,276,2,40,167,64,3,7,294,1,1,6015
8,8,17,1,4,10,2012,276,2,40,119,63,3,6,294,20,0,5791
9,9,18,1,4,10,2012,276,2,40,161,63,3,3,293,20,0,4770


In [222]:
X_train = train_df[['hour','days','month','humidity']]
y_train = train_df['traffic_volume']

X_test = test_df[['hour','days','month','humidity']]

In [223]:
#xgboost

In [224]:

params = {
 'learning_rate':0.06,\
 'max_depth':2,\
 'min_child_weight':6,\
 'gamma':0,\
 'subsample':0.75,\
 'colsample_bytree':0.8,\
 'nthread':4,\
 'scale_pos_weight':1,\
 'seed':27}

#K fold
#n_folds = 10
#early_stopping = 10
#xg_train = xgb.DMatrix(X_train, label=y_train);
#cv = xgb.cv(params, xg_train, 108, nfold=n_folds, early_stopping_rounds=early_stopping, verbose_eval=1,metrics="rmse")


In [225]:
model_xgb = XGBRegressor(learning_rate=0.06,\
 max_depth=2,\
 min_child_weight=6,\
 gamma=0,\
 subsample=0.75,\
 colsample_bytree=0.8,\
 nthread=4,\
 scale_pos_weight=1,\
 seed=27)

In [226]:
#print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
kfold = KFold(n_splits=10,random_state=42)
results = cross_val_score(model_xgb,X,y,cv=kfold,scoring='neg_mean_squared_log_error')

print(((-results.mean())))

0.23554550726659884


In [260]:
model_xgb.fit(X_train,y_train)
y_pred_xgb = model_xgb.predict(X_test)
y_pred_xgb
y_pred_xgb[y_pred_xgb<0]
#print(cv["test-rmse-mean"].tail())

#y_pred = reg.predict(X_test)
#np.sqrt(mean_squared_log_error(y_test, y_pred))
#y_pred[y_pred<0]



array([-29.96328 , -29.96328 , -29.96328 , -33.384335, -29.96328 ,
       -29.96328 , -29.96328 , -33.384335, -29.96328 , -29.96328 ,
       -29.96328 , -14.325878, -58.57238 , -58.57238 , -58.57238 ,
       -42.934967, -29.96328 , -29.96328 , -29.96328 , -14.325878],
      dtype=float32)

In [228]:
#GBR

In [232]:
params = {
    'n_estimators': 78,
    'max_depth': 8,
    'learning_rate': 0.06,
    'subsample': 0.8,
    'min_samples_leaf':60,
    'min_samples_split':1400
}
model_gbr = ensemble.GradientBoostingRegressor(**params)


In [233]:
kfold = KFold(n_splits=10,random_state=42)
results = cross_val_score(model_gbr,X,y,cv=kfold,scoring='neg_mean_squared_log_error')

print(((-results.mean())))

0.20621133380469064


In [234]:
model_gbr.fit(X_train,y_train)
y_pred_gbr = model_gbr.predict(X_test)
y_pred_gbr

array([ 716.37814839,  716.37814839,  716.37814839, ..., 2249.94157578,
       1896.43808019, 1396.92902134])

In [277]:
#EXPORTING

dft = pd.read_csv("Test.csv")
final = {'date_time':dft['date_time'],'traffic_volume':np.round(y_pred_xgb).astype(int)}
final['traffic_volume'] = np.round(((y_pred_xgb*0.1)+(y_pred_gbr*0.9)).astype(int))
final = pd.DataFrame(final)
final.to_csv('Submission.csv',index = False)

In [278]:
final
#final[final['traffic_volume']<0]

Unnamed: 0,date_time,traffic_volume
0,2017-05-18 00:00:00,738
1,2017-05-18 00:00:00,738
2,2017-05-18 00:00:00,738
3,2017-05-18 01:00:00,470
4,2017-05-18 01:00:00,470
5,2017-05-18 02:00:00,396
6,2017-05-18 02:00:00,396
7,2017-05-18 02:00:00,396
8,2017-05-18 03:00:00,436
9,2017-05-18 03:00:00,436
