## **IMPORT PACKAGES**

To install xgboost package: *!conda install -c conda-forge xgboost*

In [4]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

  from numpy.core.umath_tests import inner1d


## **READING PREPARED DATASET**

In [5]:
%%time
# Location of the prepared CSV. The TRAFFIC_DATA_CSV environment variable
# overrides it so the notebook can run on another machine; when it is unset,
# the original absolute path is used unchanged.
source = os.environ.get(
    'TRAFFIC_DATA_CSV',
    '/Users/antoniobravomunoz/Documents/DATA_SCIENCE_MASTER/TFM/Data-Science-Master-project/DATA/traffic_data_complete.csv',
)
# Load the whole comma-separated file into a single DataFrame.
df = pd.read_csv(source, sep=',')

CPU times: user 14.9 s, sys: 2.3 s, total: 17.2 s
Wall time: 17.6 s


In [6]:
# Preview the first rows of the loaded DataFrame.
df.head()

Unnamed: 0,id,intensidad,ocupacion,carga,vmed,periodo_integracion,Hora,Lat,Long,M30,URB,Mes,Dia,Minutos
0,1001,204,12,0,73,5,0,40.409729,-3.740786,1,0,1,1,0
1,1002,252,1,0,79,5,0,40.408029,-3.74376,1,0,1,1,0
2,1003,420,2,0,82,5,0,40.406824,-3.746834,1,0,1,1,0
3,1006,288,1,0,75,5,0,40.411894,-3.736324,1,0,1,1,0
4,1009,276,0,0,76,5,0,40.416233,-3.724909,1,0,1,1,0


In [7]:
# Show 5 randomly chosen rows; the fixed seed makes the preview identical on
# every run of the notebook.
df.sample(5, random_state=42)

Unnamed: 0,id,intensidad,ocupacion,carga,vmed,periodo_integracion,Hora,Lat,Long,M30,URB,Mes,Dia,Minutos
451496,3706,556,5,19,77,15,21,40.493856,-3.70004,1,0,1,11,45
1244116,6703,1821,8,40,51,15,11,40.483292,-3.70105,1,0,1,31,45
983199,6891,30,0,0,41,4,2,40.395205,-3.673227,1,0,1,25,30
6429089,6876,1068,2,0,70,5,14,40.388851,-3.684994,1,0,8,11,30
7018280,1021,1032,2,0,65,5,10,40.393484,-3.702727,1,0,8,26,30


In [8]:
# List the dtype of every column in the DataFrame.
df.dtypes

id                       int64
intensidad               int64
ocupacion                int64
carga                    int64
vmed                     int64
periodo_integracion      int64
Hora                     int64
Lat                    float64
Long                   float64
M30                      int64
URB                      int64
Mes                      int64
Dia                      int64
Minutos                  int64
dtype: object

## **SPLIT DATASET INTO X (OBSERVATIONS) AND Y (TARGET)**

In [9]:
# Feature matrix: every column of df except the target column 'carga'.
X = df.drop(columns=['carga'])

In [10]:
# Dimensions of the feature matrix.
X.shape

(9740545, 13)

In [11]:
# Target vector: the 'carga' column of df, selected for all rows.
y = df.loc[:, 'carga']

In [12]:
# Dimensions of the target vector.
y.shape

(9740545,)

## **ML REGRESSION ALGORITHMS**

In our case, we have chosen to use Decision Tree methods because they have some interesting advantages:
- They are easy to understand.
- They require little data preparation.
- They are robust.
- They work well with large datasets.


Using GridSearchCV, we search for the hyper-parameter combination that gives the best cross-validated score, and we use that combination as the parameters of the final model.

First, we test a **Decision Tree Regressor**.

In [13]:
%%time
# Decision Tree: coarse search over shallow trees (max_depth 2-4).
#
# Both tuned hyper-parameters come from param_grid, so the base estimator only
# carries a fixed random_state. scikit-learn permutes the features at every
# split, so ties between equally good splits can otherwise be broken
# differently from run to run.
#
# cv is set explicitly because GridSearchCV's default number of folds depends
# on the scikit-learn version (3 before 0.22, 5 from 0.22 on).
# NOTE(review): 3 folds is assumed to be the setting behind the recorded
# output of this cell - confirm against the environment it was run in.
reg_DeciTree = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid={
        "min_samples_leaf": [10, 20, 30, 40, 70, 100],
        "max_depth": range(2, 5),
    },
    scoring="neg_mean_squared_error",
    cv=3,
)
reg_DeciTree.fit(X, y)
print(reg_DeciTree.best_score_)            # best mean CV score (negated MSE)
print(np.sqrt(-reg_DeciTree.best_score_))  # square root of the best mean CV MSE
print(reg_DeciTree.best_params_)           # hyper-parameters of the best candidate

-70.60611753784305
8.402744643141492
{'max_depth': 4, 'min_samples_leaf': 10}
CPU times: user 11min 32s, sys: 1min 19s, total: 12min 51s
Wall time: 12min 9s


In [14]:
%%time
# Decision Tree v2: same leaf-size grid, deeper trees (max_depth 10-14).
#
# Both tuned hyper-parameters come from param_grid, so the base estimator only
# carries a fixed random_state. scikit-learn permutes the features at every
# split, so ties between equally good splits can otherwise be broken
# differently from run to run.
#
# cv is set explicitly because GridSearchCV's default number of folds depends
# on the scikit-learn version (3 before 0.22, 5 from 0.22 on).
# NOTE(review): 3 folds is assumed to be the setting behind the recorded
# output of this cell - confirm against the environment it was run in.
reg_DeciTree2 = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid={
        "min_samples_leaf": [10, 20, 30, 40, 70, 100],
        "max_depth": range(10, 15),
    },
    scoring="neg_mean_squared_error",
    cv=3,
)
reg_DeciTree2.fit(X, y)
print(reg_DeciTree2.best_score_)            # best mean CV score (negated MSE)
print(np.sqrt(-reg_DeciTree2.best_score_))  # square root of the best mean CV MSE
print(reg_DeciTree2.best_params_)           # hyper-parameters of the best candidate

-14.437166601055841
3.799627166059302
{'max_depth': 14, 'min_samples_leaf': 30}
CPU times: user 1h 4min 24s, sys: 6min 10s, total: 1h 10min 34s
Wall time: 1h 13min 4s


In [15]:
%%time
# Decision Tree v3: depth fixed at 19, only min_samples_leaf is searched.
#
# Both hyper-parameters come from param_grid, so the base estimator only
# carries a fixed random_state. scikit-learn permutes the features at every
# split, so ties between equally good splits can otherwise be broken
# differently from run to run.
#
# cv is set explicitly because GridSearchCV's default number of folds depends
# on the scikit-learn version (3 before 0.22, 5 from 0.22 on).
# NOTE(review): 3 folds is assumed to be the setting behind the recorded
# output of this cell - confirm against the environment it was run in.
reg_DeciTree3 = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid={
        "min_samples_leaf": [10, 20, 30, 40],
        "max_depth": [19],  # single value: depth is held constant here
    },
    scoring="neg_mean_squared_error",
    cv=3,
)
reg_DeciTree3.fit(X, y)
print(np.sqrt(-reg_DeciTree3.best_score_))  # square root of the best mean CV MSE
print(reg_DeciTree3.best_params_)           # hyper-parameters of the best candidate

2.5348422766054957
{'max_depth': 19, 'min_samples_leaf': 10}
CPU times: user 11min 55s, sys: 21.8 s, total: 12min 17s
Wall time: 12min 10s


In [None]:
%%time
# Random Forest: 100 trees per forest, searching max_depth 10-14 and
# min_samples_leaf.
#
# min_samples_leaf and max_depth come from param_grid, so they are not set on
# the base estimator. random_state fixes the forest's randomness so repeated
# runs give the same scores; n_jobs=-1 lets each forest use all available
# cores.
#
# cv is set explicitly because GridSearchCV's default number of folds depends
# on the scikit-learn version (3 before 0.22, 5 from 0.22 on).
reg_RF = GridSearchCV(
    RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    param_grid={
        "min_samples_leaf": [10, 20, 30, 40],
        "max_depth": range(10, 15),
    },
    scoring="neg_mean_squared_error",
    cv=3,
)
reg_RF.fit(X, y)
print(-reg_RF.best_score_)           # best mean CV MSE (sign flipped back)
print(np.sqrt(-reg_RF.best_score_))  # square root of the best mean CV MSE
print(reg_RF.best_params_)           # hyper-parameters of the best candidate

In [None]:
%%time
# Random Forest v2: 100 trees per forest, depth fixed at 19, only
# min_samples_leaf is searched.
#
# min_samples_leaf and max_depth come from param_grid, so they are not set on
# the base estimator. random_state fixes the forest's randomness so repeated
# runs give the same scores; n_jobs=-1 lets each forest use all available
# cores.
#
# cv is set explicitly because GridSearchCV's default number of folds depends
# on the scikit-learn version (3 before 0.22, 5 from 0.22 on).
reg_RF2 = GridSearchCV(
    RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    param_grid={
        "min_samples_leaf": [10, 20, 30, 40],
        "max_depth": [19],  # single value: depth is held constant here
    },
    scoring="neg_mean_squared_error",
    cv=3,
)
reg_RF2.fit(X, y)
print(-reg_RF2.best_score_)           # best mean CV MSE (sign flipped back)
print(np.sqrt(-reg_RF2.best_score_))  # square root of the best mean CV MSE
print(reg_RF2.best_params_)           # hyper-parameters of the best candidate

In [None]:
%%time
# Random Forest v3: 100 trees per forest, depth fixed at 29, only
# min_samples_leaf is searched.
#
# min_samples_leaf and max_depth come from param_grid, so they are not set on
# the base estimator. random_state fixes the forest's randomness so repeated
# runs give the same scores; n_jobs=-1 lets each forest use all available
# cores.
#
# cv is set explicitly because GridSearchCV's default number of folds depends
# on the scikit-learn version (3 before 0.22, 5 from 0.22 on).
reg_RF3 = GridSearchCV(
    RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    param_grid={
        "min_samples_leaf": [10, 20, 30, 40],
        "max_depth": [29],  # single value: depth is held constant here
    },
    scoring="neg_mean_squared_error",
    cv=3,
)
reg_RF3.fit(X, y)
print(-reg_RF3.best_score_)           # best mean CV MSE (sign flipped back)
print(np.sqrt(-reg_RF3.best_score_))  # square root of the best mean CV MSE
print(reg_RF3.best_params_)           # hyper-parameters of the best candidate

**Regression metrics** (names accepted by the `scoring` argument)
- `explained_variance`
- `neg_mean_absolute_error`
- `neg_mean_squared_error`
- `neg_mean_squared_log_error`
- `neg_median_absolute_error`

In [None]:
%%time
# XGBoost: 100 boosting rounds, depth fixed at 19, leaf-size control searched.
#
# `min_samples_leaf` is a scikit-learn tree parameter and is not one of
# XGBRegressor's parameters, so a grid over it does not change the fitted
# model. The xgboost counterpart is `min_child_weight` (minimum sum of
# instance weights required in a child); for squared-error regression the
# xgboost documentation describes it as the minimum number of instances per
# node, so the same candidate values are kept.
#
# cv is set explicitly because GridSearchCV's default number of folds depends
# on the scikit-learn version (3 before 0.22, 5 from 0.22 on).
reg_XGB = GridSearchCV(
    XGBRegressor(n_estimators=100),
    param_grid={
        "min_child_weight": [10, 20, 30, 40],
        "max_depth": [19],  # single value: depth is held constant here
    },
    scoring="neg_mean_squared_error",
    cv=3,
)
reg_XGB.fit(X, y)
print(reg_XGB.best_score_)            # best mean CV score (negated MSE)
print(np.sqrt(-reg_XGB.best_score_))  # square root of the best mean CV MSE
print(reg_XGB.best_params_)           # hyper-parameters of the best candidate

In [None]:
%%time
# XGBoost v2: 100 boosting rounds, depth fixed at 29, leaf-size control
# searched.
#
# `min_samples_leaf` is a scikit-learn tree parameter and is not one of
# XGBRegressor's parameters, so a grid over it does not change the fitted
# model. The xgboost counterpart is `min_child_weight` (minimum sum of
# instance weights required in a child); for squared-error regression the
# xgboost documentation describes it as the minimum number of instances per
# node, so the same candidate values are kept.
#
# cv is set explicitly because GridSearchCV's default number of folds depends
# on the scikit-learn version (3 before 0.22, 5 from 0.22 on).
reg_XGB2 = GridSearchCV(
    XGBRegressor(n_estimators=100),
    param_grid={
        "min_child_weight": [10, 20, 30, 40],
        "max_depth": [29],  # single value: depth is held constant here
    },
    scoring="neg_mean_squared_error",
    cv=3,
)
reg_XGB2.fit(X, y)
print(reg_XGB2.best_score_)            # best mean CV score (negated MSE)
print(np.sqrt(-reg_XGB2.best_score_))  # square root of the best mean CV MSE
print(reg_XGB2.best_params_)           # hyper-parameters of the best candidate