In [1]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import xgboost as xgb

%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)
pd.set_option('display.max_columns', 1000)
np.set_printoptions(threshold=np.nan)



In [2]:
################################################################
## EVENTS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
events = pd.read_csv('../../events.csv')
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (EVENTS)
events['wifi'] = events['wifi'].fillna(False).astype(bool)
events['connection_type'] = events['connection_type'].astype('category')
events['trans_id'] = events['trans_id'].astype('category')
events['date'] = pd.to_datetime(events['date'], infer_datetime_format=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
events['mes'] = events['date'].dt.month
events['dia'] = events['date'].dt.day
events['hora'] = events['date'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
events['hora_madrugada'] = 0
events.loc[((events.hora > -1) & (events.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
events['hora_maniana'] = 0
events.loc[((events.hora > 6) & (events.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
events['hora_almuerzo'] = 0
events.loc[((events.hora > 11) & (events.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
events['hora_tarde'] = 0
events.loc[((events.hora > 13) & (events.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
events['hora_noche'] = 0
events.loc[((events.hora > 18) & (events.hora < 24)), 'hora_noche'] = 1
################################################################
################################################################

In [3]:
##################### connection_type
dummies = pd.get_dummies(events['connection_type'], drop_first=False)
events = pd.concat([events, dummies], axis=1)
del events['connection_type']

In [4]:
##################### event_uuid
del events['event_uuid']

In [5]:
##################### date
del events['date']

In [6]:
##################### wifi
events['wifi_value'] = 2
events.loc[events.wifi == False, 'wifi_value'] = 0
events.loc[events.wifi == True, 'wifi_value'] = 1
del events['wifi']

In [7]:
##################### attributed
events['attributed_value'] = 2
events.loc[events.attributed == False, 'attributed_value'] = 0
events.loc[events.attributed == True, 'attributed_value'] = 1
del events['attributed']

In [8]:
##################### trans_id
events['trans_id_value'] = events['trans_id'].cat.codes
events.loc[events.trans_id_value == -1, 'trans_id_value'] = 0
del events['trans_id']

In [9]:
##################### fillna
events.fillna(0, inplace = True)

In [16]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
events_Ventana1 = events[(events['dia'] >= 18) & (events['dia'] <= 20)]
events_Ventana2 = events[(events['dia'] >= 19) & (events['dia'] <= 21)]
events_Ventana3 = events[(events['dia'] >= 20) & (events['dia'] <= 22)]
events_Ventana4 = events[(events['dia'] >= 21) & (events['dia'] <= 23)]
events_Ventana5 = events[(events['dia'] >= 22) & (events['dia'] <= 24)]
################################################################

In [34]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
events = ''
events_Ventana1 = ''
events_Ventana2 = ''
events_Ventana3 = ''
events_Ventana4 = ''
events_Ventana5 = ''
################################################################

In [25]:
events.count()

index                 2261451
event_id              2261451
ref_type              2261451
ref_hash              2261451
application_id        2261451
device_countrycode    2261451
device_os_version      622552
device_brand           782220
device_model          1635481
device_city            503668
session_user_agent    2249034
user_agent            1014075
carrier                514371
kind                  2253036
device_os              477424
ip_address            2261451
device_language       1636247
mes                   2261451
dia                   2261451
hora                  2261451
hora_madrugada        2261451
hora_maniana          2261451
hora_almuerzo         2261451
hora_tarde            2261451
hora_noche            2261451
Cable/DSL             2261451
Cellular              2261451
Corporate             2261451
Dialup                2261451
wifi_value            2261451
attributed_value      2261451
trans_id_value        2261451
dtype: int64

In [129]:
################################################################
## CLICKS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
clicks = pd.read_csv('../../clicks.csv')
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (CLICKS)
clicks['created'] = pd.to_datetime(clicks['created'], infer_datetime_format=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
clicks['mes'] = clicks['created'].dt.month
clicks['dia'] = clicks['created'].dt.day
clicks['hora'] = clicks['created'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
clicks['hora_madrugada'] = 0
clicks.loc[((clicks.hora > -1) & (clicks.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
clicks['hora_maniana'] = 0
clicks.loc[((clicks.hora > 6) & (clicks.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
clicks['hora_almuerzo'] = 0
clicks.loc[((clicks.hora > 11) & (clicks.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
clicks['hora_tarde'] = 0
clicks.loc[((clicks.hora > 13) & (clicks.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
clicks['hora_noche'] = 0
clicks.loc[((clicks.hora > 18) & (clicks.hora < 24)), 'hora_noche'] = 1
################################################################
################################################################

In [130]:
##################### trans_id
clicks.fillna(0, inplace = True)
clicks['trans_id'] = clicks['trans_id'].astype('category')

In [131]:
##################### date
del clicks['created']

In [132]:
##################### wifi_connection
clicks['wifi_value'] = 2
clicks.loc[clicks.wifi_connection == False, 'wifi_value'] = 0
clicks.loc[clicks.wifi_connection == True, 'wifi_value'] = 1
del clicks['wifi_connection']

In [133]:
##################### trans_id
clicks['trans_id_value'] = clicks['trans_id'].cat.codes
clicks.loc[clicks.trans_id_value == -1, 'trans_id_value'] = 0
del clicks['trans_id']

In [134]:
##################### touchXY
clicks.loc[clicks.touchX == 'Infinity', 'touchX'] = 2
clicks.loc[clicks.touchY == 'Infinity', 'touchY'] = 2
clicks['touchX'] = clicks['touchX'].astype(float).fillna(0.0)
clicks['touchY'] = clicks['touchY'].astype(float).fillna(0.0)

In [127]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
clicks_Ventana1 = clicks[(clicks['dia'] >= 18) & (clicks['dia'] <= 20)]
clicks_Ventana2 = clicks[(clicks['dia'] >= 19) & (clicks['dia'] <= 21)]
clicks_Ventana3 = clicks[(clicks['dia'] >= 20) & (clicks['dia'] <= 22)]
clicks_Ventana4 = clicks[(clicks['dia'] >= 21) & (clicks['dia'] <= 23)]
clicks_Ventana5 = clicks[(clicks['dia'] >= 22) & (clicks['dia'] <= 24)]
################################################################

In [33]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
clicks = ''
clicks_Ventana1 = ''
clicks_Ventana2 = ''
clicks_Ventana3 = ''
clicks_Ventana4 = ''
clicks_Ventana5 = ''
################################################################

In [135]:
clicks.count()

advertiser_id     64296
action_id         64296
source_id         64296
country_code      64296
latitude          64296
longitude         64296
carrier_id        64296
os_minor          64296
agent_device      64296
os_major          64296
specs_brand       64296
brand             64296
timeToClick       64296
touchX            64296
touchY            64296
ref_type          64296
ref_hash          64296
mes               64296
dia               64296
hora              64296
hora_madrugada    64296
hora_maniana      64296
hora_almuerzo     64296
hora_tarde        64296
hora_noche        64296
wifi_value        64296
trans_id_value    64296
dtype: int64

In [15]:
################################################################
## INSTALLS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
installs = pd.read_csv('../../installs.csv')
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (INSTALLS)
installs['kind'] = installs['kind'].astype('category')
installs['user_agent'] = installs['user_agent'].astype('category')
installs['session_user_agent'] = installs['session_user_agent'].astype('category')
installs['created'] = pd.to_datetime(installs['created'], infer_datetime_format=True)
installs['trans_id'] = installs['trans_id'].astype('category')
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
installs['mes'] = installs['created'].dt.month
installs['dia'] = installs['created'].dt.day
installs['hora'] = installs['created'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
installs['hora_madrugada'] = 0
installs.loc[((installs.hora > -1) & (installs.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
installs['hora_maniana'] = 0
installs.loc[((installs.hora > 6) & (installs.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
installs['hora_almuerzo'] = 0
installs.loc[((installs.hora > 11) & (installs.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
installs['hora_tarde'] = 0
installs.loc[((installs.hora > 13) & (installs.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
installs['hora_noche'] = 0
installs.loc[((installs.hora > 18) & (installs.hora < 24)), 'hora_noche'] = 1
################################################################
################################################################

In [16]:
##################### fillna
installs['click_hash'].fillna(0, inplace = True)
installs['device_brand'].fillna(0, inplace = True)
installs['device_model'].fillna(0, inplace = True)
installs['device_language'].fillna(0, inplace = True)

In [17]:
##################### created event
del installs['event_uuid']
del installs['created']

In [18]:
##################### wifi
installs['wifi_value'] = 2
installs.loc[installs.wifi == False, 'wifi_value'] = 0
installs.loc[installs.wifi == True, 'wifi_value'] = 1
del installs['wifi']

In [19]:
##################### attributed
installs['attributed_value'] = 2
installs.loc[installs.attributed == False, 'attributed_value'] = 0
installs.loc[installs.attributed == True, 'attributed_value'] = 1
del installs['attributed']

In [20]:
##################### implicit
installs['implicit_value'] = 2
installs.loc[installs.implicit == False, 'implicit_value'] = 0
installs.loc[installs.implicit == True, 'implicit_value'] = 1
del installs['implicit']

In [21]:
##################### click_hash
installs['click_hash'] = installs['click_hash'].astype('category')

In [22]:
##################### session_user_agent
installs['session_user_agent_value'] = installs['session_user_agent'].cat.codes
del installs['session_user_agent']
installs.loc[installs.session_user_agent_value == -1, 'session_user_agent_value'] = 0

In [23]:
##################### click_hash
installs['click_hash_value'] = installs['click_hash'].cat.codes
installs.loc[installs.click_hash_value == -1, 'click_hash_value'] = 0
del installs['click_hash']

In [24]:
##################### user_agent
installs['user_agent_value'] = installs['user_agent'].cat.codes
installs.loc[installs.user_agent_value == -1, 'user_agent_value'] = 0
del installs['user_agent']

In [25]:
##################### kind
installs['kind_value'] = installs['kind'].cat.codes
installs.loc[installs.kind_value == -1, 'kind_value'] = 0
del installs['kind']

In [26]:
##################### trans_id
installs['trans_id_value'] = installs['trans_id'].cat.codes
installs.loc[installs.trans_id_value == -1, 'trans_id_value'] = 0
del installs['trans_id']

In [93]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
installs_Ventana1 = installs[(installs['dia'] >= 18) & (installs['dia'] <= 20)]
installs_Ventana2 = installs[(installs['dia'] >= 19) & (installs['dia'] <= 21)]
installs_Ventana3 = installs[(installs['dia'] >= 20) & (installs['dia'] <= 22)]
installs_Ventana4 = installs[(installs['dia'] >= 21) & (installs['dia'] <= 23)]
installs_Ventana5 = installs[(installs['dia'] >= 22) & (installs['dia'] <= 24)]
################################################################

In [35]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
installs = ''
installs_Ventana1 = ''
installs_Ventana2 = ''
installs_Ventana3 = ''
installs_Ventana4 = ''
installs_Ventana5 = ''
################################################################

In [27]:
installs.count()

application_id              481511
ref_type                    481511
ref_hash                    481511
device_countrycode          481511
device_brand                481511
device_model                481511
ip_address                  481511
device_language             481511
mes                         481511
dia                         481511
hora                        481511
hora_madrugada              481511
hora_maniana                481511
hora_almuerzo               481511
hora_tarde                  481511
hora_noche                  481511
wifi_value                  481511
attributed_value            481511
implicit_value              481511
session_user_agent_value    481511
click_hash_value            481511
user_agent_value            481511
kind_value                  481511
trans_id_value              481511
dtype: int64

In [2]:
################################################################
## AUCTIONS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
auctions = pd.read_csv('../../auctions.csv',nrows=5000000)
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (AUCTIONS)
auctions['date'] = pd.to_datetime(auctions['date'], infer_datetime_format=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
auctions['mes'] = auctions['date'].dt.month
auctions['dia'] = auctions['date'].dt.day
auctions['hora'] = auctions['date'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
auctions['hora_madrugada'] = 0
auctions.loc[((auctions.hora > -1) & (auctions.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
auctions['hora_maniana'] = 0
auctions.loc[((auctions.hora > 6) & (auctions.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
auctions['hora_almuerzo'] = 0
auctions.loc[((auctions.hora > 11) & (auctions.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
auctions['hora_tarde'] = 0
auctions.loc[((auctions.hora > 13) & (auctions.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
auctions['hora_noche'] = 0
auctions.loc[((auctions.hora > 18) & (auctions.hora < 24)), 'hora_noche'] = 1
# RENOMBRAMOS LA COLUMNA PARA LOS JOINS.
auctions=auctions.rename(columns = {'device_id':'ref_hash'})
################################################################
################################################################

In [3]:
##################### date
del auctions['date']

In [69]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
auctions_Ventana1 = auctions[(auctions['dia'] >= 18) & (auctions['dia'] <= 20)]
auctions_Ventana2 = auctions[(auctions['dia'] >= 19) & (auctions['dia'] <= 21)]
auctions_Ventana3 = auctions[(auctions['dia'] >= 20) & (auctions['dia'] <= 22)]
auctions_Ventana4 = auctions[(auctions['dia'] >= 21) & (auctions['dia'] <= 23)]
auctions_Ventana5 = auctions[(auctions['dia'] >= 22) & (auctions['dia'] <= 24)]
################################################################

In [156]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
auctions = ''
auctions_Ventana1 = ''
auctions_Ventana2 = ''
auctions_Ventana3 = ''
auctions_Ventana4 = ''
auctions_Ventana5 = ''
################################################################

In [None]:
auctions.count()

In [4]:
################################################################
## TARGET.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
targets = pd.read_csv('../../target.csv')
################################################################
################################################################

In [5]:
targets.count()

ref_hash    5930
obj         5930
dtype: int64

In [6]:
targets.head()

Unnamed: 0,ref_hash,obj
0,1000169251625791232_sc,0
1,1000169251625791232_st,0
2,1000395625957344640_sc,0
3,1000395625957344640_st,0
4,1006670001679961600_sc,0


## =================================================================
## JUNTAMOS LOS DATAFRAMES Y EMPEZAMOS A ENTRENAR.
## =================================================================

In [6]:
targets['ref_hash'] = targets['ref_hash'].astype(str)

In [28]:
installs['ref_hash'] = installs['ref_hash'].astype(str)
installs['ref_hash'] = installs['ref_hash'] + '_sc'

In [None]:
clicks['ref_hash'] = clicks['ref_hash'].astype(str)
clicks['ref_hash'] = clicks['ref_hash'] + '_sc'

In [None]:
auctions['ref_hash'] = auctions['ref_hash'].astype(str)
auctions['ref_hash'] = auctions['ref_hash'] + '_sc'

In [11]:
events['ref_hash'] = events['ref_hash'].astype(str)
events['ref_hash'] = events['ref_hash'] + '_sc'

In [30]:
targets.count()

ref_hash    5930
obj         5930
dtype: int64

In [31]:
resultado_installs = installs.merge(targets, on=('ref_hash'), suffixes=('_l', '_r'))

In [32]:
resultado_installs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27 entries, 0 to 26
Data columns (total 25 columns):
application_id              27 non-null int64
ref_type                    27 non-null int64
ref_hash                    27 non-null object
device_countrycode          27 non-null int64
device_brand                27 non-null float64
device_model                27 non-null float64
ip_address                  27 non-null int64
device_language             27 non-null float64
mes                         27 non-null int64
dia                         27 non-null int64
hora                        27 non-null int64
hora_madrugada              27 non-null int64
hora_maniana                27 non-null int64
hora_almuerzo               27 non-null int64
hora_tarde                  27 non-null int64
hora_noche                  27 non-null int64
wifi_value                  27 non-null int64
attributed_value            27 non-null int64
implicit_value              27 non-null int64
session_user_age

In [138]:
resultado_clicks = clicks.merge(targets, on=('ref_hash'), suffixes=('_l', '_r'))

In [139]:
resultado_clicks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2 entries, 0 to 1
Data columns (total 28 columns):
advertiser_id     2 non-null int64
action_id         2 non-null float64
source_id         2 non-null int64
country_code      2 non-null int64
latitude          2 non-null float64
longitude         2 non-null float64
carrier_id        2 non-null float64
os_minor          2 non-null float64
agent_device      2 non-null float64
os_major          2 non-null float64
specs_brand       2 non-null int64
brand             2 non-null float64
timeToClick       2 non-null float64
touchX            2 non-null float64
touchY            2 non-null float64
ref_type          2 non-null int64
ref_hash          2 non-null object
mes               2 non-null int64
dia               2 non-null int64
hora              2 non-null int64
hora_madrugada    2 non-null int64
hora_maniana      2 non-null int64
hora_almuerzo     2 non-null int64
hora_tarde        2 non-null int64
hora_noche        2 non-null int64
w

In [151]:
resultado_auctions = auctions.merge(targets, on=('ref_hash'), suffixes=('_l', '_r'))

In [154]:
resultado_auctions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 242 entries, 0 to 241
Data columns (total 12 columns):
ref_hash          242 non-null object
ref_type_id       242 non-null int64
source_id         242 non-null int64
mes               242 non-null int64
dia               242 non-null int64
hora              242 non-null int64
hora_madrugada    242 non-null int64
hora_maniana      242 non-null int64
hora_almuerzo     242 non-null int64
hora_tarde        242 non-null int64
hora_noche        242 non-null int64
obj               242 non-null int64
dtypes: int64(11), object(1)
memory usage: 24.6+ KB


In [13]:
resultado_events = events.merge(targets, on=('ref_hash'), suffixes=('_l', '_r'))

In [14]:
resultado_events.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268 entries, 0 to 267
Data columns (total 33 columns):
index                 268 non-null int64
event_id              268 non-null int64
ref_type              268 non-null int64
ref_hash              268 non-null object
application_id        268 non-null int64
device_countrycode    268 non-null int64
device_os_version     268 non-null float64
device_brand          268 non-null float64
device_model          268 non-null float64
device_city           268 non-null float64
session_user_agent    268 non-null float64
user_agent            268 non-null float64
carrier               268 non-null float64
kind                  268 non-null float64
device_os             268 non-null float64
ip_address            268 non-null int64
device_language       268 non-null float64
mes                   268 non-null int64
dia                   268 non-null int64
hora                  268 non-null int64
hora_madrugada        268 non-null int64
hora_maniana 

In [11]:
resultado_auctions.head()

NameError: name 'resultado_auctions' is not defined

In [15]:
auctions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 11 columns):
ref_hash          int64
ref_type_id       int64
source_id         int64
mes               int64
hora              int64
hora_madrugada    int64
hora_maniana      int64
hora_almuerzo     int64
hora_tarde        int64
hora_noche        int64
dia               int64
dtypes: int64(11)
memory usage: 419.6 MB


### Pasamos la columna a predecir a la última posición para facilitar el trabajo.

In [14]:
columna_dia = auctions.pop('dia')
auctions['dia'] = columna_dia

### Separamos la variable a predecir

In [17]:
X, y = auctions.iloc[:,:-1],auctions.iloc[:,-1]

### Convertimos los datos a DMatrix

In [18]:
data_dmatrix = xgb.DMatrix(data=X,label=y)


### Hiper-parámetros

    learning_rate: tasa de aprendizaje
    max_depth: máxima profundidad de cada árbol
    subsample: porcentaje de muestras usadas para cada árbol (valor muy bajo, posible underfitting)
    colsample_bytree: porcentaje de features usadas para cada árbol (valores muy alto, posible overfitting)
    n_estimators: cantidad de árboles a construir.
    objective: función de error a utilizar (algunas: reg:linear para regresión, reg:logistic o binary:logistic para clasificación)

### Parámetros de regularización:

    gamma: umbral para hacer split basado en la reducción de error de hacer el nuevo split.
    alpha: regularización para los pesos de las hojas. Un valor más alto genera una mayor regularización.
    lambda: similar alpha pero para la sintonia fina.

### Creamos set de entrenamiento y test

In [28]:
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

### Instanciamos el regresor de XGBoost

In [20]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

### Entrenamos

In [21]:
xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

### Predecimos

In [22]:
preds = xg_reg.predict(X_test)

### Calculamos el error en las predicciones

In [24]:
from sklearn.metrics import mean_squared_error

In [25]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 7.808578
