In [1]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import xgboost as xgb

%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)
pd.set_option('display.max_columns', 1000)
np.set_printoptions(threshold=np.nan)



In [2]:
################################################################
## EVENTS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
events = pd.read_csv('../../events.csv')
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (EVENTS)
events['wifi'] = events['wifi'].fillna(False).astype(bool)
events['connection_type'] = events['connection_type'].astype('category')
events['trans_id'] = events['trans_id'].astype('category')
events['date'] = pd.to_datetime(events['date'], infer_datetime_format=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
events['mes'] = events['date'].dt.month
events['dia'] = events['date'].dt.day
events['hora'] = events['date'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
events['hora_madrugada'] = 0
events.loc[((events.hora > -1) & (events.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
events['hora_maniana'] = 0
events.loc[((events.hora > 6) & (events.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
events['hora_almuerzo'] = 0
events.loc[((events.hora > 11) & (events.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
events['hora_tarde'] = 0
events.loc[((events.hora > 13) & (events.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
events['hora_noche'] = 0
events.loc[((events.hora > 18) & (events.hora < 24)), 'hora_noche'] = 1
################################################################
################################################################

In [3]:
##################### connection_type
dummies = pd.get_dummies(events['connection_type'], drop_first=False)
events = pd.concat([events, dummies], axis=1)
del events['connection_type']
##################### event_uuid
del events['event_uuid']
##################### date
del events['date']
##################### wifi
events['wifi_value'] = 2
events.loc[events.wifi == False, 'wifi_value'] = 0
events.loc[events.wifi == True, 'wifi_value'] = 1
del events['wifi']
##################### attributed
events['attributed_value'] = 2
events.loc[events.attributed == False, 'attributed_value'] = 0
events.loc[events.attributed == True, 'attributed_value'] = 1
del events['attributed']
##################### trans_id
events['trans_id_value'] = events['trans_id'].cat.codes
events.loc[events.trans_id_value == -1, 'trans_id_value'] = 0
del events['trans_id']
##################### fillna
events.fillna(0, inplace = True)

In [16]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
events_Ventana1 = events[(events['dia'] >= 18) & (events['dia'] <= 20)]
events_Ventana2 = events[(events['dia'] >= 19) & (events['dia'] <= 21)]
events_Ventana3 = events[(events['dia'] >= 20) & (events['dia'] <= 22)]
events_Ventana4 = events[(events['dia'] >= 21) & (events['dia'] <= 23)]
events_Ventana5 = events[(events['dia'] >= 22) & (events['dia'] <= 24)]
################################################################

In [34]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
events = ''
events_Ventana1 = ''
events_Ventana2 = ''
events_Ventana3 = ''
events_Ventana4 = ''
events_Ventana5 = ''
################################################################

In [129]:
################################################################
## CLICKS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
clicks = pd.read_csv('../../clicks.csv')
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (CLICKS)
clicks['created'] = pd.to_datetime(clicks['created'], infer_datetime_format=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
clicks['mes'] = clicks['created'].dt.month
clicks['dia'] = clicks['created'].dt.day
clicks['hora'] = clicks['created'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
clicks['hora_madrugada'] = 0
clicks.loc[((clicks.hora > -1) & (clicks.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
clicks['hora_maniana'] = 0
clicks.loc[((clicks.hora > 6) & (clicks.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
clicks['hora_almuerzo'] = 0
clicks.loc[((clicks.hora > 11) & (clicks.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
clicks['hora_tarde'] = 0
clicks.loc[((clicks.hora > 13) & (clicks.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
clicks['hora_noche'] = 0
clicks.loc[((clicks.hora > 18) & (clicks.hora < 24)), 'hora_noche'] = 1
################################################################
################################################################

In [130]:
##################### trans_id
clicks.fillna(0, inplace = True)
clicks['trans_id'] = clicks['trans_id'].astype('category')
##################### date
del clicks['created']
##################### wifi_connection
clicks['wifi_value'] = 2
clicks.loc[clicks.wifi_connection == False, 'wifi_value'] = 0
clicks.loc[clicks.wifi_connection == True, 'wifi_value'] = 1
del clicks['wifi_connection']
##################### trans_id
clicks['trans_id_value'] = clicks['trans_id'].cat.codes
clicks.loc[clicks.trans_id_value == -1, 'trans_id_value'] = 0
del clicks['trans_id']
##################### touchXY
clicks.loc[clicks.touchX == 'Infinity', 'touchX'] = 2
clicks.loc[clicks.touchY == 'Infinity', 'touchY'] = 2
clicks['touchX'] = clicks['touchX'].astype(float).fillna(0.0)
clicks['touchY'] = clicks['touchY'].astype(float).fillna(0.0)

In [127]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
clicks_Ventana1 = clicks[(clicks['dia'] >= 18) & (clicks['dia'] <= 20)]
clicks_Ventana2 = clicks[(clicks['dia'] >= 19) & (clicks['dia'] <= 21)]
clicks_Ventana3 = clicks[(clicks['dia'] >= 20) & (clicks['dia'] <= 22)]
clicks_Ventana4 = clicks[(clicks['dia'] >= 21) & (clicks['dia'] <= 23)]
clicks_Ventana5 = clicks[(clicks['dia'] >= 22) & (clicks['dia'] <= 24)]
################################################################

In [33]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
clicks = ''
clicks_Ventana1 = ''
clicks_Ventana2 = ''
clicks_Ventana3 = ''
clicks_Ventana4 = ''
clicks_Ventana5 = ''
################################################################

In [2]:
################################################################
## INSTALLS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
installs = pd.read_csv('../../installs.csv')
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (INSTALLS)
installs['kind'] = installs['kind'].astype('category')
installs['user_agent'] = installs['user_agent'].astype('category')
installs['session_user_agent'] = installs['session_user_agent'].astype('category')
installs['created'] = pd.to_datetime(installs['created'], infer_datetime_format=True)
installs['trans_id'] = installs['trans_id'].astype('category')
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
installs['mes'] = installs['created'].dt.month
installs['dia'] = installs['created'].dt.day
installs['hora'] = installs['created'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
installs['hora_madrugada'] = 0
installs.loc[((installs.hora > -1) & (installs.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
installs['hora_maniana'] = 0
installs.loc[((installs.hora > 6) & (installs.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
installs['hora_almuerzo'] = 0
installs.loc[((installs.hora > 11) & (installs.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
installs['hora_tarde'] = 0
installs.loc[((installs.hora > 13) & (installs.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
installs['hora_noche'] = 0
installs.loc[((installs.hora > 18) & (installs.hora < 24)), 'hora_noche'] = 1
################################################################
################################################################

In [3]:
##################### fillna
installs['click_hash'].fillna(0, inplace = True)
installs['device_brand'].fillna(0, inplace = True)
installs['device_model'].fillna(0, inplace = True)
installs['device_language'].fillna(0, inplace = True)
##################### created event
del installs['event_uuid']
#del installs['created']
##################### wifi
installs['wifi_value'] = 2
installs.loc[installs.wifi == False, 'wifi_value'] = 0
installs.loc[installs.wifi == True, 'wifi_value'] = 1
del installs['wifi']
##################### attributed
installs['attributed_value'] = 2
installs.loc[installs.attributed == False, 'attributed_value'] = 0
installs.loc[installs.attributed == True, 'attributed_value'] = 1
del installs['attributed']
##################### implicit
installs['implicit_value'] = 2
installs.loc[installs.implicit == False, 'implicit_value'] = 0
installs.loc[installs.implicit == True, 'implicit_value'] = 1
del installs['implicit']
##################### click_hash
installs['click_hash'] = installs['click_hash'].astype('category')
##################### session_user_agent
installs['session_user_agent_value'] = installs['session_user_agent'].cat.codes
del installs['session_user_agent']
installs.loc[installs.session_user_agent_value == -1, 'session_user_agent_value'] = 0
##################### click_hash
installs['click_hash_value'] = installs['click_hash'].cat.codes
installs.loc[installs.click_hash_value == -1, 'click_hash_value'] = 0
del installs['click_hash']
##################### user_agent
installs['user_agent_value'] = installs['user_agent'].cat.codes
installs.loc[installs.user_agent_value == -1, 'user_agent_value'] = 0
del installs['user_agent']
##################### kind
installs['kind_value'] = installs['kind'].cat.codes
installs.loc[installs.kind_value == -1, 'kind_value'] = 0
del installs['kind']
##################### trans_id
installs['trans_id_value'] = installs['trans_id'].cat.codes
installs.loc[installs.trans_id_value == -1, 'trans_id_value'] = 0
del installs['trans_id']

In [14]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
installs_Ventana1 = installs[(installs['dia'] >= 18) & (installs['dia'] <= 20)]
installs_Ventana2 = installs[(installs['dia'] >= 19) & (installs['dia'] <= 21)]
installs_Ventana3 = installs[(installs['dia'] >= 20) & (installs['dia'] <= 22)]
installs_Ventana4 = installs[(installs['dia'] >= 21) & (installs['dia'] <= 23)]
installs_Ventana5 = installs[(installs['dia'] >= 22) & (installs['dia'] <= 24)]
################################################################

In [38]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
installs = ''
installs_Ventana1 = ''
installs_Ventana2 = ''
installs_Ventana3 = ''
installs_Ventana4 = ''
installs_Ventana5 = ''
################################################################

In [2]:
################################################################
## AUCTIONS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
auctions = pd.read_csv('../../auctions.csv',nrows=5000000)
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (AUCTIONS)
auctions['date'] = pd.to_datetime(auctions['date'], infer_datetime_format=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
auctions['mes'] = auctions['date'].dt.month
auctions['dia'] = auctions['date'].dt.day
auctions['hora'] = auctions['date'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
auctions['hora_madrugada'] = 0
auctions.loc[((auctions.hora > -1) & (auctions.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
auctions['hora_maniana'] = 0
auctions.loc[((auctions.hora > 6) & (auctions.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
auctions['hora_almuerzo'] = 0
auctions.loc[((auctions.hora > 11) & (auctions.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
auctions['hora_tarde'] = 0
auctions.loc[((auctions.hora > 13) & (auctions.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
auctions['hora_noche'] = 0
auctions.loc[((auctions.hora > 18) & (auctions.hora < 24)), 'hora_noche'] = 1
# RENOMBRAMOS LA COLUMNA PARA LOS JOINS.
auctions=auctions.rename(columns = {'device_id':'ref_hash'})
################################################################
################################################################

In [3]:
##################### date
del auctions['date']

In [69]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
auctions_Ventana1 = auctions[(auctions['dia'] >= 18) & (auctions['dia'] <= 20)]
auctions_Ventana2 = auctions[(auctions['dia'] >= 19) & (auctions['dia'] <= 21)]
auctions_Ventana3 = auctions[(auctions['dia'] >= 20) & (auctions['dia'] <= 22)]
auctions_Ventana4 = auctions[(auctions['dia'] >= 21) & (auctions['dia'] <= 23)]
auctions_Ventana5 = auctions[(auctions['dia'] >= 22) & (auctions['dia'] <= 24)]
################################################################

In [156]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
auctions = ''
auctions_Ventana1 = ''
auctions_Ventana2 = ''
auctions_Ventana3 = ''
auctions_Ventana4 = ''
auctions_Ventana5 = ''
################################################################

In [16]:
################################################################
## TARGET.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
targets = pd.read_csv('../../target.csv')
################################################################
################################################################

## =================================================================
## JUNTAMOS LOS DATAFRAMES Y EMPEZAMOS A ENTRENAR.
## =================================================================

In [17]:
targets['ref_hash'] = targets['ref_hash'].astype(str)

In [18]:
installs['ref_hash'] = installs['ref_hash'].astype(str)
installs['ref_hash'] = installs['ref_hash'] + '_sc'

In [19]:
#installs_ex = installs[installs['ref_hash']=='906973248467925335_sc']
#installs_Ventana1_asd = installs_Ventana1[installs_Ventana1['ref_hash']=='906973248467925335_sc']
#installs_Ventana2_asd = installs_Ventana2[installs_Ventana2['ref_hash']=='906973248467925335_sc']

In [20]:
installs_Ventana1['ref_hash'] = installs_Ventana1['ref_hash'].astype(str)
installs_Ventana1['ref_hash'] = installs_Ventana1['ref_hash'] + '_sc'
asd1 = installs_Ventana1.groupby('ref_hash').max()
asd1['tiempo'] = 0

In [21]:
asd1.head(5)

Unnamed: 0_level_0,created,application_id,ref_type,device_countrycode,device_brand,device_model,ip_address,device_language,mes,dia,hora,hora_madrugada,hora_maniana,hora_almuerzo,hora_tarde,hora_noche,wifi_value,attributed_value,implicit_value,session_user_agent_value,click_hash_value,user_agent_value,kind_value,trans_id_value,tiempo
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1000001243847627208_sc,2019-04-20 01:43:37.206,309,1494519392962156891,6287817205707153877,0.0,7.538551e+18,5973746864904323892,4.06093e+18,4,20,1,1,0,0,0,0,2,0,0,4528,0,0,0,0,0
1000138003886484266_sc,2019-04-19 21:57:09.663,36,1891515180541284343,6287817205707153877,6.115026e+18,5.589506e+18,4346862682928323503,6.977049e+18,4,19,21,0,0,0,0,1,1,0,0,4530,0,693,0,0,0
1000172260833750114_sc,2019-04-18 20:06:31.763,167,1494519392962156891,6287817205707153877,0.0,7.805539e+18,4671039498250048505,3.301378e+18,4,18,20,0,0,0,0,1,2,0,0,4528,0,0,0,0,0
1000214925038058238_sc,2019-04-20 03:34:38.575,36,1891515180541284343,6287817205707153877,6.236248e+18,8.163747e+18,5469349952721087477,6.977049e+18,4,20,3,1,0,0,0,0,1,0,0,4530,0,3447,0,0,0
1000280583608032134_sc,2019-04-20 23:34:35.026,54,1494519392962156891,6287817205707153877,0.0,0.0,5806622878851581339,0.0,4,20,23,0,0,0,0,1,2,0,0,4431,0,0,0,0,0


In [22]:
installs_Ventana2['ref_hash'] = installs_Ventana2['ref_hash'].astype(str)
installs_Ventana2['ref_hash'] = installs_Ventana2['ref_hash'] + '_sc'
asd2 = installs_Ventana2.groupby('ref_hash').max()

In [23]:
asd2.head(5)

Unnamed: 0_level_0,created,application_id,ref_type,device_countrycode,device_brand,device_model,ip_address,device_language,mes,dia,hora,hora_madrugada,hora_maniana,hora_almuerzo,hora_tarde,hora_noche,wifi_value,attributed_value,implicit_value,session_user_agent_value,click_hash_value,user_agent_value,kind_value,trans_id_value
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1000001243847627208_sc,2019-04-20 01:43:37.206,309,1494519392962156891,6287817205707153877,0.0,7.538551e+18,5973746864904323892,4.06093e+18,4,20,1,1,0,0,0,0,2,0,0,4528,0,0,0,0
1000138003886484266_sc,2019-04-19 21:57:09.663,36,1891515180541284343,6287817205707153877,6.115026e+18,5.589506e+18,4346862682928323503,6.977049e+18,4,19,21,0,0,0,0,1,1,0,0,4530,0,693,0,0
1000169251625791246_sc,2019-04-21 08:14:06.178,122,1891515180541284343,6287817205707153877,0.0,1.805456e+18,5483720475053818470,3.301378e+18,4,21,8,0,1,0,0,0,2,0,0,4528,0,0,0,0
1000214925038058238_sc,2019-04-20 03:34:38.575,36,1891515180541284343,6287817205707153877,6.236248e+18,8.163747e+18,5469349952721087477,6.977049e+18,4,20,3,1,0,0,0,0,1,0,0,4530,0,3447,0,0
1000280583608032134_sc,2019-04-20 23:34:35.026,54,1494519392962156891,6287817205707153877,0.0,0.0,5806622878851581339,0.0,4,20,23,0,0,0,0,1,2,0,0,4431,0,0,0,0


In [24]:
mi_asd = asd1.merge(asd2, on=('ref_hash'), suffixes=('_lefto', '_raito'))

In [25]:
mi_asd.head()

Unnamed: 0_level_0,created_lefto,application_id_lefto,ref_type_lefto,device_countrycode_lefto,device_brand_lefto,device_model_lefto,ip_address_lefto,device_language_lefto,mes_lefto,dia_lefto,hora_lefto,hora_madrugada_lefto,hora_maniana_lefto,hora_almuerzo_lefto,hora_tarde_lefto,hora_noche_lefto,wifi_value_lefto,attributed_value_lefto,implicit_value_lefto,session_user_agent_value_lefto,click_hash_value_lefto,user_agent_value_lefto,kind_value_lefto,trans_id_value_lefto,tiempo,created_raito,application_id_raito,ref_type_raito,device_countrycode_raito,device_brand_raito,device_model_raito,ip_address_raito,device_language_raito,mes_raito,dia_raito,hora_raito,hora_madrugada_raito,hora_maniana_raito,hora_almuerzo_raito,hora_tarde_raito,hora_noche_raito,wifi_value_raito,attributed_value_raito,implicit_value_raito,session_user_agent_value_raito,click_hash_value_raito,user_agent_value_raito,kind_value_raito,trans_id_value_raito
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1
1000001243847627208_sc,2019-04-20 01:43:37.206,309,1494519392962156891,6287817205707153877,0.0,7.538551e+18,5973746864904323892,4.06093e+18,4,20,1,1,0,0,0,0,2,0,0,4528,0,0,0,0,0,2019-04-20 01:43:37.206,309,1494519392962156891,6287817205707153877,0.0,7.538551e+18,5973746864904323892,4.06093e+18,4,20,1,1,0,0,0,0,2,0,0,4528,0,0,0,0
1000138003886484266_sc,2019-04-19 21:57:09.663,36,1891515180541284343,6287817205707153877,6.115026e+18,5.589506e+18,4346862682928323503,6.977049e+18,4,19,21,0,0,0,0,1,1,0,0,4530,0,693,0,0,0,2019-04-19 21:57:09.663,36,1891515180541284343,6287817205707153877,6.115026e+18,5.589506e+18,4346862682928323503,6.977049e+18,4,19,21,0,0,0,0,1,1,0,0,4530,0,693,0,0
1000214925038058238_sc,2019-04-20 03:34:38.575,36,1891515180541284343,6287817205707153877,6.236248e+18,8.163747e+18,5469349952721087477,6.977049e+18,4,20,3,1,0,0,0,0,1,0,0,4530,0,3447,0,0,0,2019-04-20 03:34:38.575,36,1891515180541284343,6287817205707153877,6.236248e+18,8.163747e+18,5469349952721087477,6.977049e+18,4,20,3,1,0,0,0,0,1,0,0,4530,0,3447,0,0
1000280583608032134_sc,2019-04-20 23:34:35.026,54,1494519392962156891,6287817205707153877,0.0,0.0,5806622878851581339,0.0,4,20,23,0,0,0,0,1,2,0,0,4431,0,0,0,0,0,2019-04-20 23:34:35.026,54,1494519392962156891,6287817205707153877,0.0,0.0,5806622878851581339,0.0,4,20,23,0,0,0,0,1,2,0,0,4431,0,0,0,0
1000289045777700145_sc,2019-04-20 02:33:13.347,94,1891515180541284343,6287817205707153877,0.0,1.70914e+17,1178303880239917967,3.301378e+18,4,20,2,1,0,0,0,0,2,0,0,4528,0,0,0,0,0,2019-04-20 02:33:13.347,94,1891515180541284343,6287817205707153877,0.0,1.70914e+17,1178303880239917967,3.301378e+18,4,20,2,1,0,0,0,0,2,0,0,4528,0,0,0,0


In [26]:
mi_asd['tiempo'] = mi_asd['created_raito'] - mi_asd['created_lefto']

In [27]:
mi_asd = mi_asd.iloc[:, 0:25]

In [28]:
mi_asd.head()

Unnamed: 0_level_0,created_lefto,application_id_lefto,ref_type_lefto,device_countrycode_lefto,device_brand_lefto,device_model_lefto,ip_address_lefto,device_language_lefto,mes_lefto,dia_lefto,hora_lefto,hora_madrugada_lefto,hora_maniana_lefto,hora_almuerzo_lefto,hora_tarde_lefto,hora_noche_lefto,wifi_value_lefto,attributed_value_lefto,implicit_value_lefto,session_user_agent_value_lefto,click_hash_value_lefto,user_agent_value_lefto,kind_value_lefto,trans_id_value_lefto,tiempo
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1000001243847627208_sc,2019-04-20 01:43:37.206,309,1494519392962156891,6287817205707153877,0.0,7.538551e+18,5973746864904323892,4.06093e+18,4,20,1,1,0,0,0,0,2,0,0,4528,0,0,0,0,0 days
1000138003886484266_sc,2019-04-19 21:57:09.663,36,1891515180541284343,6287817205707153877,6.115026e+18,5.589506e+18,4346862682928323503,6.977049e+18,4,19,21,0,0,0,0,1,1,0,0,4530,0,693,0,0,0 days
1000214925038058238_sc,2019-04-20 03:34:38.575,36,1891515180541284343,6287817205707153877,6.236248e+18,8.163747e+18,5469349952721087477,6.977049e+18,4,20,3,1,0,0,0,0,1,0,0,4530,0,3447,0,0,0 days
1000280583608032134_sc,2019-04-20 23:34:35.026,54,1494519392962156891,6287817205707153877,0.0,0.0,5806622878851581339,0.0,4,20,23,0,0,0,0,1,2,0,0,4431,0,0,0,0,0 days
1000289045777700145_sc,2019-04-20 02:33:13.347,94,1891515180541284343,6287817205707153877,0.0,1.70914e+17,1178303880239917967,3.301378e+18,4,20,2,1,0,0,0,0,2,0,0,4528,0,0,0,0,0 days


In [29]:
mi_asd['tiempo'] = mi_asd['tiempo'].astype(int)

In [30]:
mi_asd.head(100)

Unnamed: 0_level_0,created_lefto,application_id_lefto,ref_type_lefto,device_countrycode_lefto,device_brand_lefto,device_model_lefto,ip_address_lefto,device_language_lefto,mes_lefto,dia_lefto,hora_lefto,hora_madrugada_lefto,hora_maniana_lefto,hora_almuerzo_lefto,hora_tarde_lefto,hora_noche_lefto,wifi_value_lefto,attributed_value_lefto,implicit_value_lefto,session_user_agent_value_lefto,click_hash_value_lefto,user_agent_value_lefto,kind_value_lefto,trans_id_value_lefto,tiempo
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1000001243847627208_sc,2019-04-20 01:43:37.206,309,1494519392962156891,6287817205707153877,0.000000e+00,7.538551e+18,5973746864904323892,4.060930e+18,4,20,1,1,0,0,0,0,2,0,0,4528,0,0,0,0,0
1000138003886484266_sc,2019-04-19 21:57:09.663,36,1891515180541284343,6287817205707153877,6.115026e+18,5.589506e+18,4346862682928323503,6.977049e+18,4,19,21,0,0,0,0,1,1,0,0,4530,0,693,0,0,0
1000214925038058238_sc,2019-04-20 03:34:38.575,36,1891515180541284343,6287817205707153877,6.236248e+18,8.163747e+18,5469349952721087477,6.977049e+18,4,20,3,1,0,0,0,0,1,0,0,4530,0,3447,0,0,0
1000280583608032134_sc,2019-04-20 23:34:35.026,54,1494519392962156891,6287817205707153877,0.000000e+00,0.000000e+00,5806622878851581339,0.000000e+00,4,20,23,0,0,0,0,1,2,0,0,4431,0,0,0,0,0
1000289045777700145_sc,2019-04-20 02:33:13.347,94,1891515180541284343,6287817205707153877,0.000000e+00,1.709140e+17,1178303880239917967,3.301378e+18,4,20,2,1,0,0,0,0,2,0,0,4528,0,0,0,0,0
1000289257101263364_sc,2019-04-20 14:46:46.493,302,1891515180541284343,6287817205707153877,4.567867e+18,8.325108e+18,2201573106829712001,6.977049e+18,4,20,14,0,0,0,1,0,1,0,0,4530,0,4212,0,0,0
1000303030927429183_sc,2019-04-20 07:21:05.760,210,1891515180541284343,6287817205707153877,0.000000e+00,3.803460e+18,4047999604584244448,4.060930e+18,4,20,7,0,1,0,0,0,2,0,0,4530,0,5309,0,0,0
1000395625957344683_sc,2019-04-20 05:59:26.928,121,1891515180541284343,6287817205707153877,6.115026e+18,8.058599e+17,6126933036758477711,6.977049e+18,4,20,5,1,0,0,0,0,1,0,1,4530,0,5664,55,0,0
1000400432115255220_sc,2019-04-20 03:29:18.824,121,1891515180541284343,6287817205707153877,3.083059e+17,6.093206e+18,8774611404674307414,6.977049e+18,4,20,3,1,0,0,0,0,1,0,0,4530,0,5676,0,0,0
100043173982509042_sc,2019-04-19 18:04:23.863,187,1891515180541284343,6287817205707153877,2.208835e+18,3.057402e+18,6682704907280172431,6.977049e+18,4,19,18,0,0,0,1,0,1,0,0,4530,0,5933,0,0,0


In [31]:
columnas_relevantes = list(mi_asd.select_dtypes(include=['int','float64','uint8']).columns)

In [32]:
mi_asd = mi_asd.loc[:, mi_asd.columns.isin(columnas_relevantes)]

In [33]:
mi_asd.head()

Unnamed: 0_level_0,application_id_lefto,ref_type_lefto,device_countrycode_lefto,device_brand_lefto,device_model_lefto,ip_address_lefto,device_language_lefto,mes_lefto,dia_lefto,hora_lefto,hora_madrugada_lefto,hora_maniana_lefto,hora_almuerzo_lefto,hora_tarde_lefto,hora_noche_lefto,wifi_value_lefto,attributed_value_lefto,implicit_value_lefto,tiempo
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1000001243847627208_sc,309,1494519392962156891,6287817205707153877,0.0,7.538551e+18,5973746864904323892,4.06093e+18,4,20,1,1,0,0,0,0,2,0,0,0
1000138003886484266_sc,36,1891515180541284343,6287817205707153877,6.115026e+18,5.589506e+18,4346862682928323503,6.977049e+18,4,19,21,0,0,0,0,1,1,0,0,0
1000214925038058238_sc,36,1891515180541284343,6287817205707153877,6.236248e+18,8.163747e+18,5469349952721087477,6.977049e+18,4,20,3,1,0,0,0,0,1,0,0,0
1000280583608032134_sc,54,1494519392962156891,6287817205707153877,0.0,0.0,5806622878851581339,0.0,4,20,23,0,0,0,0,1,2,0,0,0
1000289045777700145_sc,94,1891515180541284343,6287817205707153877,0.0,1.70914e+17,1178303880239917967,3.301378e+18,4,20,2,1,0,0,0,0,2,0,0,0


In [34]:
mi_asd = mi_asd.iloc[:, 7:19]

In [35]:
mi_asd.head()

Unnamed: 0_level_0,mes_lefto,dia_lefto,hora_lefto,hora_madrugada_lefto,hora_maniana_lefto,hora_almuerzo_lefto,hora_tarde_lefto,hora_noche_lefto,wifi_value_lefto,attributed_value_lefto,implicit_value_lefto,tiempo
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1000001243847627208_sc,4,20,1,1,0,0,0,0,2,0,0,0
1000138003886484266_sc,4,19,21,0,0,0,0,1,1,0,0,0
1000214925038058238_sc,4,20,3,1,0,0,0,0,1,0,0,0
1000280583608032134_sc,4,20,23,0,0,0,0,1,2,0,0,0
1000289045777700145_sc,4,20,2,1,0,0,0,0,2,0,0,0


In [36]:
X, y = mi_asd.iloc[:,:-1],mi_asd.iloc[:,-1]
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [38]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [39]:
xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [40]:
preds = xg_reg.predict(X_test)

In [41]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 23320397813996.765625


In [42]:
preds

array([1.4948178e+12, 1.6359107e+12, 1.7131886e+12, 1.7871942e+12,
       1.6092852e+12, 1.7238511e+12, 1.7910818e+12, 1.6911067e+12,
       2.3174620e+12, 1.4709283e+12, 1.4709283e+12, 1.5430690e+12,
       1.4788712e+12, 1.6279093e+12, 1.8102126e+12, 1.6491985e+12,
       1.7703685e+12, 1.5132137e+12, 1.6524444e+12, 2.2909448e+12,
       1.3590806e+12, 1.6279093e+12, 1.5471684e+12, 1.4316268e+12,
       2.5153799e+12, 1.6279093e+12, 1.4788712e+12, 1.4469814e+12,
       1.5974812e+12, 1.6953967e+12, 1.6524444e+12, 1.3930351e+12,
       1.6911067e+12, 2.3174620e+12, 1.9957117e+12, 1.7910818e+12,
       1.7323194e+12, 1.5471684e+12, 2.3174620e+12, 2.0767097e+12,
       2.0767097e+12, 1.6755445e+12, 1.9988165e+12, 1.4316268e+12,
       1.8102126e+12, 1.6924753e+12, 2.2909448e+12, 1.6778189e+12,
       1.6279093e+12, 1.3020425e+12, 1.7809979e+12, 1.7323194e+12,
       1.4369735e+12, 2.2130519e+12, 1.5430690e+12, 1.3930351e+12,
       1.3930351e+12, 1.5373202e+12, 1.7871942e+12, 1.8102126e

In [67]:
##installs['ref_hash'].value_counts()
## 906973248467925335_sc     14
## 5446085605337844584_sc    14
## 5230262481930094112_sc    11

### ======================================================================
### ======================================================================
### ======================================================================

In [86]:
asd = installs.groupby('ref_hash').max()

In [87]:
asd.reset_index(inplace=True)

In [88]:
asd.head()

Unnamed: 0,ref_hash,application_id,ref_type,device_countrycode,device_brand,device_model,ip_address,device_language,mes,dia,hora,hora_madrugada,hora_maniana,hora_almuerzo,hora_tarde,hora_noche,wifi_value,attributed_value,implicit_value,session_user_agent_value,click_hash_value,user_agent_value,kind_value,trans_id_value
0,1000001243847627208_sc,309,1494519392962156891,6287817205707153877,0.0,7.538551e+18,5973746864904323892,4.06093e+18,4,20,1,1,0,0,0,0,2,0,0,4528,0,0,0,0
1,1000061425870948777_sc,122,1891515180541284343,6287817205707153877,0.0,3.706668e+18,4356981474261276461,3.301378e+18,4,26,2,1,0,0,0,0,2,0,0,4528,0,0,0,0
2,1000080701377762047_sc,36,1891515180541284343,6287817205707153877,3.246999e+16,4.07641e+18,572533549112567373,6.977049e+18,4,23,3,1,0,0,0,0,1,0,0,4530,0,4426,0,0
3,1000085014918096773_sc,86,1891515180541284343,6287817205707153877,0.0,4.825011e+17,7863998232088445596,3.301378e+18,4,24,1,1,0,0,0,0,2,0,0,4528,0,0,0,0
4,1000095322020146100_sc,121,1891515180541284343,6287817205707153877,1.174671e+18,1.343148e+18,5213201516763348413,6.977049e+18,4,22,2,1,0,0,0,0,1,0,0,4530,0,3111,0,0


In [None]:
clicks['ref_hash'] = clicks['ref_hash'].astype(str)
clicks['ref_hash'] = clicks['ref_hash'] + '_sc'

In [None]:
auctions['ref_hash'] = auctions['ref_hash'].astype(str)
auctions['ref_hash'] = auctions['ref_hash'] + '_sc'

In [11]:
events['ref_hash'] = events['ref_hash'].astype(str)
events['ref_hash'] = events['ref_hash'] + '_sc'

In [30]:
targets.count()

ref_hash    5930
obj         5930
dtype: int64

In [31]:
resultado_installs = installs.merge(targets, on=('ref_hash'), suffixes=('_l', '_r'))

In [32]:
resultado_installs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27 entries, 0 to 26
Data columns (total 25 columns):
application_id              27 non-null int64
ref_type                    27 non-null int64
ref_hash                    27 non-null object
device_countrycode          27 non-null int64
device_brand                27 non-null float64
device_model                27 non-null float64
ip_address                  27 non-null int64
device_language             27 non-null float64
mes                         27 non-null int64
dia                         27 non-null int64
hora                        27 non-null int64
hora_madrugada              27 non-null int64
hora_maniana                27 non-null int64
hora_almuerzo               27 non-null int64
hora_tarde                  27 non-null int64
hora_noche                  27 non-null int64
wifi_value                  27 non-null int64
attributed_value            27 non-null int64
implicit_value              27 non-null int64
session_user_age

In [138]:
resultado_clicks = clicks.merge(targets, on=('ref_hash'), suffixes=('_l', '_r'))

In [139]:
resultado_clicks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2 entries, 0 to 1
Data columns (total 28 columns):
advertiser_id     2 non-null int64
action_id         2 non-null float64
source_id         2 non-null int64
country_code      2 non-null int64
latitude          2 non-null float64
longitude         2 non-null float64
carrier_id        2 non-null float64
os_minor          2 non-null float64
agent_device      2 non-null float64
os_major          2 non-null float64
specs_brand       2 non-null int64
brand             2 non-null float64
timeToClick       2 non-null float64
touchX            2 non-null float64
touchY            2 non-null float64
ref_type          2 non-null int64
ref_hash          2 non-null object
mes               2 non-null int64
dia               2 non-null int64
hora              2 non-null int64
hora_madrugada    2 non-null int64
hora_maniana      2 non-null int64
hora_almuerzo     2 non-null int64
hora_tarde        2 non-null int64
hora_noche        2 non-null int64
w

In [151]:
resultado_auctions = auctions.merge(targets, on=('ref_hash'), suffixes=('_l', '_r'))

In [154]:
resultado_auctions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 242 entries, 0 to 241
Data columns (total 12 columns):
ref_hash          242 non-null object
ref_type_id       242 non-null int64
source_id         242 non-null int64
mes               242 non-null int64
dia               242 non-null int64
hora              242 non-null int64
hora_madrugada    242 non-null int64
hora_maniana      242 non-null int64
hora_almuerzo     242 non-null int64
hora_tarde        242 non-null int64
hora_noche        242 non-null int64
obj               242 non-null int64
dtypes: int64(11), object(1)
memory usage: 24.6+ KB


In [13]:
resultado_events = events.merge(targets, on=('ref_hash'), suffixes=('_l', '_r'))

In [14]:
resultado_events.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268 entries, 0 to 267
Data columns (total 33 columns):
index                 268 non-null int64
event_id              268 non-null int64
ref_type              268 non-null int64
ref_hash              268 non-null object
application_id        268 non-null int64
device_countrycode    268 non-null int64
device_os_version     268 non-null float64
device_brand          268 non-null float64
device_model          268 non-null float64
device_city           268 non-null float64
session_user_agent    268 non-null float64
user_agent            268 non-null float64
carrier               268 non-null float64
kind                  268 non-null float64
device_os             268 non-null float64
ip_address            268 non-null int64
device_language       268 non-null float64
mes                   268 non-null int64
dia                   268 non-null int64
hora                  268 non-null int64
hora_madrugada        268 non-null int64
hora_maniana 

In [11]:
resultado_auctions.head()

NameError: name 'resultado_auctions' is not defined

In [15]:
auctions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 11 columns):
ref_hash          int64
ref_type_id       int64
source_id         int64
mes               int64
hora              int64
hora_madrugada    int64
hora_maniana      int64
hora_almuerzo     int64
hora_tarde        int64
hora_noche        int64
dia               int64
dtypes: int64(11)
memory usage: 419.6 MB


### Pasamos la columna a predecir a la última posición para facilitar el trabajo.

In [14]:
columna_dia = auctions.pop('dia')
auctions['dia'] = columna_dia

### Separamos la variable a predecir

In [17]:
X, y = auctions.iloc[:,:-1],auctions.iloc[:,-1]

### Convertimos los datos a DMatrix

In [18]:
data_dmatrix = xgb.DMatrix(data=X,label=y)


### Hiper-parámetros

    learning_rate: tasa de aprendizaje
    max_depth: máxima profundidad de cada árbol
    subsample: porcentaje de muestras usadas para cada árbol (valor muy bajo, posible underfitting)
    colsample_bytree: porcentaje de features usadas para cada árbol (valores muy alto, posible overfitting)
    n_estimators: cantidad de árboles a construir.
    objective: función de error a utilizar (algunas: reg:linear para regresión, reg:logistic o binary:logistic para clasificación)

### Parámetros de regularización:

    gamma: umbral para hacer split basado en la reducción de error de hacer el nuevo split.
    alpha: regularización para los pesos de las hojas. Un valor más alto genera una mayor regularización.
    lambda: similar alpha pero para la sintonia fina.

### Creamos set de entrenamiento y test

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

### Instanciamos el regresor de XGBoost

In [20]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

### Entrenamos

In [21]:
xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

### Predecimos

In [22]:
preds = xg_reg.predict(X_test)

### Calculamos el error en las predicciones

In [25]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 7.808578
