To install the xgboost package, run: *!conda install -c conda-forge xgboost*

In [0]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [2]:
# Check if we have a GPU: list every compute device TensorFlow can see on this
# runtime (the returned list is displayed as the cell output).
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 149372704519492368, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 16116512034728060295
 physical_device_desc: "device: XLA_CPU device", name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 13546559401204391687
 physical_device_desc: "device: XLA_GPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 11281491559
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 14841359796577988151
 physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7"]

In [3]:
import tensorflow as tf

# Stop the notebook early unless TensorFlow reports the first GPU;
# otherwise announce which device was found.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


Mount the personal Google Drive so that the notebook can load and save files.

In [4]:
# Mount Google Drive at /content/drive; the dataset and the saved models in this
# notebook are read from / written to paths under '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
%%time
source = '/content/drive/My Drive/Colab_Notebooks/TFM_Kschool/traffic_data_complete.zip'
df = pd.read_csv(source,sep=',')

CPU times: user 14.4 s, sys: 1.69 s, total: 16.1 s
Wall time: 16.1 s


In [6]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,id,intensidad,ocupacion,carga,vmed,periodo_integracion,Hora,Lat,Long,M30,URB,Mes,Dia,Minutos
0,1001,204,12,0,73,5,0,40.409729,-3.740786,1,0,1,1,0
1,1002,252,1,0,79,5,0,40.408029,-3.74376,1,0,1,1,0
2,1003,420,2,0,82,5,0,40.406824,-3.746834,1,0,1,1,0
3,1006,288,1,0,75,5,0,40.411894,-3.736324,1,0,1,1,0
4,1009,276,0,0,76,5,0,40.416233,-3.724909,1,0,1,1,0


In [7]:
# Show five random rows; the seed is fixed so the same rows are displayed on
# every Restart & Run All instead of a different sample each time.
df.sample(5, random_state=42)

Unnamed: 0,id,intensidad,ocupacion,carga,vmed,periodo_integracion,Hora,Lat,Long,M30,URB,Mes,Dia,Minutos
4846937,5993,0,0,0,0,15,2,40.417211,-3.623704,0,1,3,31,0
600194,6684,2768,10,78,88,15,13,40.387559,-3.680936,1,0,1,15,15
9084408,6846,3756,9,0,61,5,15,40.399618,-3.720037,1,0,5,15,45
4902448,1015,3588,22,0,23,5,8,40.420487,-3.721843,1,0,2,1,45
1525845,6772,1208,4,35,59,15,16,40.484712,-3.697591,1,0,7,7,15


In [8]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

id                       int64
intensidad               int64
ocupacion                int64
carga                    int64
vmed                     int64
periodo_integracion      int64
Hora                     int64
Lat                    float64
Long                   float64
M30                      int64
URB                      int64
Mes                      int64
Dia                      int64
Minutos                  int64
dtype: object

SPLITTING THE DATASET INTO FEATURES (X) AND TARGET (y)

In [0]:
# Feature matrix: every column except the regression target 'carga'.
X = df.drop(columns=['carga'])

In [10]:
# Sanity check: Python-level type of the 'Hora' value in the row labelled 0.
type(X.loc[0, 'Hora'])

numpy.int64

In [11]:
# (rows, columns) of the feature matrix.
X.shape

(9740545, 13)

In [0]:
# Regression target: the 'carga' column.
y=df['carga']

In [13]:
# Number of target values; should match the row count of X.
y.shape

(9740545,)

Training With GridSearchCV

In [0]:
%%time
#Decission Tree
reg_DeciTree=GridSearchCV(DecisionTreeRegressor(min_samples_leaf=1,max_depth=4),
                          param_grid={"min_samples_leaf":[10,20,30],"max_depth":range(5,6)},
                          scoring="neg_mean_squared_error"
                          )
reg_DeciTree.fit(X,y)
print(reg_DeciTree.best_score_)
print(np.sqrt(-reg_DeciTree.best_score_))
print(reg_DeciTree.best_params_)



-55.22309243886141
7.4312241547985485
{'max_depth': 5, 'min_samples_leaf': 10}
CPU times: user 3min 58s, sys: 3.11 s, total: 4min 1s
Wall time: 4min 1s


In [0]:
%%time
#Decission Tree v2
reg_DeciTree2=GridSearchCV(DecisionTreeRegressor(min_samples_leaf=10,max_depth=9),
                          param_grid={"min_samples_leaf":[10,20,30],"max_depth":range(9,10)},
                          scoring="neg_mean_squared_error"
                          )
reg_DeciTree2.fit(X,y)
print(reg_DeciTree2.best_score_)
print(np.sqrt(-reg_DeciTree2.best_score_))
print(reg_DeciTree2.best_params_)



-33.74100075145445
5.8087004356787455
{'max_depth': 9, 'min_samples_leaf': 30}
CPU times: user 7min 58s, sys: 3.82 s, total: 8min 2s
Wall time: 8min 2s


In [0]:
%%time
#Decission Tree v3
reg_DeciTree3=GridSearchCV(DecisionTreeRegressor(min_samples_leaf=1,max_depth=19),
                          param_grid={"min_samples_leaf":[10,20,30],"max_depth":range(19,20)},
                          scoring="neg_mean_squared_error"
                          )
reg_DeciTree3.fit(X,y)
print(np.sqrt(-reg_DeciTree3.best_score_))
print(reg_DeciTree3.best_params_)



2.5516449684326363
{'max_depth': 19, 'min_samples_leaf': 20}
CPU times: user 11min 19s, sys: 2.76 s, total: 11min 22s
Wall time: 11min 22s


In [0]:
%%time
#Random Forest
reg_RF=GridSearchCV(RandomForestRegressor(n_estimators=50,min_samples_leaf=20,max_depth=19),
                          param_grid={"min_samples_leaf":[20,30],"max_depth":range(19,20)},
                          scoring="neg_mean_squared_error"
                          )
reg_RF.fit(X,y)
print(-reg_RF.best_score_)
print(np.sqrt(-reg_RF.best_score_))
print(reg_RF.best_params_)



5.237243637879109
2.288502488064872
{'max_depth': 19, 'min_samples_leaf': 20}
CPU times: user 4h 35min 19s, sys: 12.2 s, total: 4h 35min 32s
Wall time: 4h 35min 38s


In [0]:
# Save the best random forest from the grid search to Drive.
# Uses the standalone `joblib` package imported in the notebook's import cell:
# the `sklearn.externals.joblib` shim was deprecated in scikit-learn 0.21 and
# removed in 0.23, so importing it fails on current versions.
joblib.dump(reg_RF.best_estimator_, '/content/drive/My Drive/Colab_Notebooks/TFM_Kschool/randomforest_colab.pkl', compress = 1)

['/content/drive/My Drive/Colab_Notebooks/TFM_Kschool/randomforest_colab.pkl']

In [0]:
%%time
#XGBoost
reg_XGB=GridSearchCV(XGBRegressor(n_estimators=50,min_samples_leaf=20,max_depth=19),
                          param_grid={"min_samples_leaf":[20,30],"max_depth":range(19,20)},
                          scoring="neg_mean_squared_error"
                          )
reg_XGB.fit(X,y)
print(reg_XGB.best_score_)
print(np.sqrt(-reg_XGB.best_score_))
print(reg_XGB.best_params_)



-8.799743849180878
2.966436220312326
{'max_depth': 19, 'min_samples_leaf': 20}
CPU times: user 4h 55min 19s, sys: 29.9 s, total: 4h 55min 49s
Wall time: 4h 55min 51s


In [0]:
# Save the best XGBoost model from the grid search to Drive.
# Uses the standalone `joblib` package imported in the notebook's import cell:
# the `sklearn.externals.joblib` shim was deprecated in scikit-learn 0.21 and
# removed in 0.23, so importing it fails on current versions.
joblib.dump(reg_XGB.best_estimator_, '/content/drive/My Drive/Colab_Notebooks/TFM_Kschool/xgb_colab.pkl', compress = 1)

['/content/drive/My Drive/Colab_Notebooks/TFM_Kschool/xgb_colab.pkl']

In [0]:
%%time
#Random Forest2
reg_RF2=GridSearchCV(RandomForestRegressor(n_estimators=100,min_samples_leaf=20,max_depth=28),
                          param_grid={"min_samples_leaf":[20,30],"max_depth":range(29,30)},
                          scoring="neg_mean_squared_error"
                          )
reg_RF2.fit(X,y)
print(reg_RF2.best_score_)
print(np.sqrt(-reg_RF2.best_score_))
print(reg_RF2.best_params_)

In [0]:
# Save the best model from the second random-forest search to Drive.
# Uses the standalone `joblib` package imported in the notebook's import cell:
# the `sklearn.externals.joblib` shim was deprecated in scikit-learn 0.21 and
# removed in 0.23, so importing it fails on current versions.
joblib.dump(reg_RF2.best_estimator_, '/content/drive/My Drive/Colab_Notebooks/TFM_Kschool/randomforest2_colab_compr3.pkl', compress = 3)

['/content/drive/My Drive/Colab_Notebooks/randomforest2_colab_compr3.pkl']

In [0]:
%%time
#XGBoost2
reg_XGB2=GridSearchCV(XGBRegressor(n_estimators=100,min_samples_leaf=20,max_depth=28),
                          param_grid={"min_samples_leaf":[20,30],"max_depth":range(29,30)},
                          scoring="neg_mean_squared_error"
                          )
reg_XGB2.fit(X,y)
print(reg_XGB2.best_score_)
print(np.sqrt(-reg_XGB2.best_score_))
print(reg_XGB2.best_params_)



In [0]:
# Save the best model from the second XGBoost search to Drive.
# Uses the standalone `joblib` package imported in the notebook's import cell:
# the `sklearn.externals.joblib` shim was deprecated in scikit-learn 0.21 and
# removed in 0.23, so importing it fails on current versions.
joblib.dump(reg_XGB2.best_estimator_, '/content/drive/My Drive/Colab_Notebooks/TFM_Kschool/xgb2_colab.pkl', compress = 3)