In [1]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as DT
import warnings as wr
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
## from sklearn.model_selection import train_test_split

%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)
pd.set_option('display.max_columns', 1000)
np.set_printoptions(threshold=np.nan)



In [2]:
################################################################
## EVENTS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
events = pd.read_csv('../../events.csv')
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (EVENTS)
events['wifi'] = events['wifi'].fillna(False).astype(bool)
events['connection_type'] = events['connection_type'].astype('category')
events['trans_id'] = events['trans_id'].astype('category')
events['date'] = pd.to_datetime(events['date'], infer_datetime_format=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
events['mes'] = events['date'].dt.month
events['dia'] = events['date'].dt.day
events['hora'] = events['date'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
events['hora_madrugada'] = 0
events.loc[((events.hora > -1) & (events.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
events['hora_maniana'] = 0
events.loc[((events.hora > 6) & (events.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
events['hora_almuerzo'] = 0
events.loc[((events.hora > 11) & (events.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
events['hora_tarde'] = 0
events.loc[((events.hora > 13) & (events.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
events['hora_noche'] = 0
events.loc[((events.hora > 18) & (events.hora < 24)), 'hora_noche'] = 1
################################################################
################################################################

In [3]:
##################### connection_type
dummies = pd.get_dummies(events['connection_type'], drop_first=False)
events = pd.concat([events, dummies], axis=1)
del events['connection_type']
##################### event_uuid
del events['event_uuid']
##################### date
del events['date']
##################### wifi
events['wifi_value'] = 2
events.loc[events.wifi == False, 'wifi_value'] = 0
events.loc[events.wifi == True, 'wifi_value'] = 1
del events['wifi']
##################### attributed
events['attributed_value'] = 2
events.loc[events.attributed == False, 'attributed_value'] = 0
events.loc[events.attributed == True, 'attributed_value'] = 1
del events['attributed']
##################### trans_id
events['trans_id_value'] = events['trans_id'].cat.codes
events.loc[events.trans_id_value == -1, 'trans_id_value'] = 0
del events['trans_id']
##################### fillna
events.fillna(0, inplace = True)

In [16]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
events_Ventana1 = events[(events['dia'] >= 21) & (events['dia'] <= 23)]
events_Ventana2 = events[(events['dia'] >= 24) & (events['dia'] <= 26)]
#events_Ventana3 = events[(events['dia'] >= 20) & (events['dia'] <= 22)]
#events_Ventana4 = events[(events['dia'] >= 21) & (events['dia'] <= 23)]
#events_Ventana5 = events[(events['dia'] >= 22) & (events['dia'] <= 24)]
################################################################

In [34]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
## events = ''
## events_Ventana1 = ''
## events_Ventana2 = ''
## events_Ventana3 = ''
## events_Ventana4 = ''
## events_Ventana5 = ''
################################################################

In [129]:
################################################################
## CLICKS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
clicks = pd.read_csv('../../clicks.csv')
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (CLICKS)
clicks['created'] = pd.to_datetime(clicks['created'], infer_datetime_format=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
clicks['mes'] = clicks['created'].dt.month
clicks['dia'] = clicks['created'].dt.day
clicks['hora'] = clicks['created'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
clicks['hora_madrugada'] = 0
clicks.loc[((clicks.hora > -1) & (clicks.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
clicks['hora_maniana'] = 0
clicks.loc[((clicks.hora > 6) & (clicks.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
clicks['hora_almuerzo'] = 0
clicks.loc[((clicks.hora > 11) & (clicks.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
clicks['hora_tarde'] = 0
clicks.loc[((clicks.hora > 13) & (clicks.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
clicks['hora_noche'] = 0
clicks.loc[((clicks.hora > 18) & (clicks.hora < 24)), 'hora_noche'] = 1
################################################################
################################################################

In [130]:
##################### trans_id
clicks.fillna(0, inplace = True)
clicks['trans_id'] = clicks['trans_id'].astype('category')
##################### date
del clicks['created']
##################### wifi_connection
clicks['wifi_value'] = 2
clicks.loc[clicks.wifi_connection == False, 'wifi_value'] = 0
clicks.loc[clicks.wifi_connection == True, 'wifi_value'] = 1
del clicks['wifi_connection']
##################### trans_id
clicks['trans_id_value'] = clicks['trans_id'].cat.codes
clicks.loc[clicks.trans_id_value == -1, 'trans_id_value'] = 0
del clicks['trans_id']
##################### touchXY
clicks.loc[clicks.touchX == 'Infinity', 'touchX'] = 2
clicks.loc[clicks.touchY == 'Infinity', 'touchY'] = 2
clicks['touchX'] = clicks['touchX'].astype(float).fillna(0.0)
clicks['touchY'] = clicks['touchY'].astype(float).fillna(0.0)

In [127]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
clicks_Ventana1 = clicks[(clicks['dia'] >= 21) & (clicks['dia'] <= 23)]
clicks_Ventana2 = clicks[(clicks['dia'] >= 24) & (clicks['dia'] <= 26)]
#clicks_Ventana3 = clicks[(clicks['dia'] >= 20) & (clicks['dia'] <= 22)]
#clicks_Ventana4 = clicks[(clicks['dia'] >= 21) & (clicks['dia'] <= 23)]
#clicks_Ventana5 = clicks[(clicks['dia'] >= 22) & (clicks['dia'] <= 24)]
################################################################

In [33]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
## clicks = ''
## clicks_Ventana1 = ''
## clicks_Ventana2 = ''
## clicks_Ventana3 = ''
## clicks_Ventana4 = ''
## clicks_Ventana5 = ''
################################################################

In [2]:
################################################################
## INSTALLS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
installs = pd.read_csv('../../installs.csv')
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (INSTALLS)
installs['kind'] = installs['kind'].astype('category')
installs['user_agent'] = installs['user_agent'].astype('category')
installs['session_user_agent'] = installs['session_user_agent'].astype('category')
installs['created'] = pd.to_datetime(installs['created'], infer_datetime_format=True)
installs['trans_id'] = installs['trans_id'].astype('category')
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
installs['mes'] = installs['created'].dt.month
installs['dia'] = installs['created'].dt.day
installs['hora'] = installs['created'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
installs['hora_madrugada'] = 0
installs.loc[((installs.hora > -1) & (installs.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
installs['hora_maniana'] = 0
installs.loc[((installs.hora > 6) & (installs.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
installs['hora_almuerzo'] = 0
installs.loc[((installs.hora > 11) & (installs.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
installs['hora_tarde'] = 0
installs.loc[((installs.hora > 13) & (installs.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
installs['hora_noche'] = 0
installs.loc[((installs.hora > 18) & (installs.hora < 24)), 'hora_noche'] = 1
#installs['cantidad'] = 1
################################################################
################################################################

In [3]:
##################### fillna
installs['click_hash'].fillna(0, inplace = True)
installs['device_brand'].fillna(0, inplace = True)
installs['device_model'].fillna(0, inplace = True)
installs['device_language'].fillna(0, inplace = True)
##################### created event
del installs['event_uuid']
#del installs['created']
##################### wifi
installs['wifi_value'] = 2
installs.loc[installs.wifi == False, 'wifi_value'] = 0
installs.loc[installs.wifi == True, 'wifi_value'] = 1
del installs['wifi']
##################### attributed
installs['attributed_value'] = 2
installs.loc[installs.attributed == False, 'attributed_value'] = 0
installs.loc[installs.attributed == True, 'attributed_value'] = 1
del installs['attributed']
##################### implicit
installs['implicit_value'] = 2
installs.loc[installs.implicit == False, 'implicit_value'] = 0
installs.loc[installs.implicit == True, 'implicit_value'] = 1
del installs['implicit']
##################### click_hash
installs['click_hash'] = installs['click_hash'].astype('category')
##################### session_user_agent
installs['session_user_agent_value'] = installs['session_user_agent'].cat.codes
del installs['session_user_agent']
installs.loc[installs.session_user_agent_value == -1, 'session_user_agent_value'] = 0
##################### click_hash
installs['click_hash_value'] = installs['click_hash'].cat.codes
installs.loc[installs.click_hash_value == -1, 'click_hash_value'] = 0
del installs['click_hash']
##################### user_agent
installs['user_agent_value'] = installs['user_agent'].cat.codes
installs.loc[installs.user_agent_value == -1, 'user_agent_value'] = 0
del installs['user_agent']
##################### kind
installs['kind_value'] = installs['kind'].cat.codes
installs.loc[installs.kind_value == -1, 'kind_value'] = 0
del installs['kind']
##################### trans_id
installs['trans_id_value'] = installs['trans_id'].cat.codes
installs.loc[installs.trans_id_value == -1, 'trans_id_value'] = 0
del installs['trans_id']

In [4]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
installs_Ventana1 = installs[(installs['dia'] >= 21) & (installs['dia'] <= 23)]
installs_Ventana2 = installs[(installs['dia'] >= 24) & (installs['dia'] <= 26)]
#installs_Ventana3 = installs[(installs['dia'] >= 20) & (installs['dia'] <= 22)]
#installs_Ventana4 = installs[(installs['dia'] >= 21) & (installs['dia'] <= 23)]
#installs_Ventana5 = installs[(installs['dia'] >= 22) & (installs['dia'] <= 24)]
################################################################

In [5]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
## installs = ''
## installs_Ventana1 = ''
## installs_Ventana2 = ''
## installs_Ventana3 = ''
## installs_Ventana4 = ''
## installs_Ventana5 = ''
################################################################

In [6]:
################################################################
## AUCTIONS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
auctions = pd.read_csv('../../auctions.csv',nrows=5000000)
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (AUCTIONS)
auctions['date'] = pd.to_datetime(auctions['date'], infer_datetime_format=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
auctions['mes'] = auctions['date'].dt.month
auctions['dia'] = auctions['date'].dt.day
auctions['hora'] = auctions['date'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
auctions['hora_madrugada'] = 0
auctions.loc[((auctions.hora > -1) & (auctions.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
auctions['hora_maniana'] = 0
auctions.loc[((auctions.hora > 6) & (auctions.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
auctions['hora_almuerzo'] = 0
auctions.loc[((auctions.hora > 11) & (auctions.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
auctions['hora_tarde'] = 0
auctions.loc[((auctions.hora > 13) & (auctions.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
auctions['hora_noche'] = 0
auctions.loc[((auctions.hora > 18) & (auctions.hora < 24)), 'hora_noche'] = 1
# RENOMBRAMOS LA COLUMNA PARA LOS JOINS.
auctions=auctions.rename(columns = {'device_id':'ref_hash'})
auctions=auctions.rename(columns = {'date':'created'})
#auctions['cantidad'] = 1
################################################################
################################################################

In [7]:
##################### date
#del auctions['date']

In [8]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
auctions_Ventana1 = auctions[(auctions['dia'] >= 21) & (auctions['dia'] <= 23)]
auctions_Ventana2 = auctions[(auctions['dia'] >= 24) & (auctions['dia'] <= 26)]
#auctions_Ventana3 = auctions[(auctions['dia'] >= 20) & (auctions['dia'] <= 22)]
#auctions_Ventana4 = auctions[(auctions['dia'] >= 20) & (auctions['dia'] <= 23)]
#auctions_Ventana5 = auctions[(auctions['dia'] >= 23) & (auctions['dia'] <= 27)]
################################################################

In [9]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
## auctions = ''
## auctions_Ventana1 = ''
## auctions_Ventana2 = ''
## auctions_Ventana3 = ''
## auctions_Ventana4 = ''
## auctions_Ventana5 = ''
################################################################

In [10]:
################################################################
## TARGET.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
targets = pd.read_csv('../../target.csv')
targets_sc = targets[targets['ref_hash'].str.contains('_sc')]
targets_st = targets[targets['ref_hash'].str.contains('_st')]
################################################################
################################################################

## =================================================================
## JUNTAMOS LOS DATAFRAMES Y EMPEZAMOS A ENTRENAR (SC = INSTALLS).
## =================================================================

In [11]:
targets['ref_hash'] = targets['ref_hash'].astype(str)
targets_st['ref_hash'] = targets_st['ref_hash'].astype(str)
targets_sc['ref_hash'] = targets_sc['ref_hash'].astype(str)
installs['ref_hash'] = installs['ref_hash'].astype(str)
installs['ref_hash'] = installs['ref_hash'] + '_sc'

In [12]:
installs_Ventana1['ref_hash'] = installs_Ventana1['ref_hash'].astype(str)
installs_Ventana1['ref_hash'] = installs_Ventana1['ref_hash'] + '_sc'
inst_1 = installs_Ventana1.groupby('ref_hash').min()#.agg(k:np.sum if k == 'cantidad' else k:np.sum for k in installs_Ventana1.columns)
installs_Ventana2['ref_hash'] = installs_Ventana2['ref_hash'].astype(str)
installs_Ventana2['ref_hash'] = installs_Ventana2['ref_hash'] + '_sc'
inst_2 = installs_Ventana2.groupby('ref_hash').min()#.agg(k:np.sum if k == 'cantidad' else k:np.sum for k in installs_Ventana2.columns)
inst_1['obj'] = 0

In [13]:
inst_comb = inst_1.merge(inst_2, on=('ref_hash'), suffixes=('_lefto', '_raito'))
inst_comb['obj'] = (inst_comb['created_raito'] - inst_comb['created_lefto']).dt.total_seconds()

In [14]:
inst_comb = inst_comb.iloc[:, 0:25]
inst_comb['obj'] = inst_comb['obj'].astype(int)

In [15]:
columnas_relevantes_inst = list(inst_comb.select_dtypes(include=['int','float64','uint8']).columns)
inst_comb = inst_comb.loc[:, inst_comb.columns.isin(columnas_relevantes_inst)]

In [16]:
inst_comb = inst_comb.iloc[:, 7:19]
columnas_relevantes_inst = list(inst_2.select_dtypes(include=['int','float64','uint8']).columns)
inst_2 = inst_2.loc[:, inst_2.columns.isin(columnas_relevantes_inst)]
inst_ventana2 = inst_2.iloc[:, 7:19]
target_result_sc = pd.merge(targets_sc, inst_ventana2, how='left', left_on='ref_hash', right_on='ref_hash')

In [17]:
X_inst, y_inst = inst_comb.iloc[:,:-1],inst_comb.iloc[:,-1]
data_dmatrix_inst = xgb.DMatrix(data=X_inst,label=y_inst)
X_train_inst, X_test_inst, y_train_inst, y_test_inst = train_test_split(X_inst, y_inst, test_size=0.2, random_state=123)

In [18]:
xg_reg_inst = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, 
              learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)

In [19]:
xg_reg_inst.fit(X_train_inst,y_train_inst)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [20]:
inst_ventana2.columns = [str(col) + '_lefto' for col in inst_ventana2.columns]
inst_ventana2['obj'] = 0
X_target_inst, y_target_inst = inst_ventana2.iloc[:,:-1],inst_ventana2.iloc[:,-1]
preds_sc = xg_reg_inst.predict(X_target_inst)

In [21]:
## =================================================================================================
## ARMAMOS EN BASE A LA PREDICCIÓN QUE TENEMOS UN CSV PARA SUBIR A KAGGLE CON EL FORMATO INDICADO.
## =================================================================================================
inst_ventana2 = inst_ventana2.reset_index()
submission_sc = pd.DataFrame({ 'resultado': preds_sc, 'ref_hash': inst_ventana2['ref_hash'] })
final_sc = pd.merge(targets_sc, submission_sc, how='left', left_on='ref_hash', right_on='ref_hash')
final_sc['obj'] = final_sc['resultado']
del final_sc['resultado']
final_sc.fillna(0, inplace = True)

In [22]:
final_sc.head(10)

Unnamed: 0,ref_hash,obj
0,1000169251625791246_sc,0.0
1,1000395625957344683_sc,0.0
2,1003027494996471685_sc,0.0
3,1006670001679961544_sc,0.0
4,1007573308966476713_sc,0.0
5,1010070503877148763_sc,145693.5
6,1010265377387765028_sc,0.0
7,1010531372912327058_sc,0.0
8,1011610998357271358_sc,0.0
9,1013543838965040946_sc,141122.375


## =================================================================
## JUNTAMOS LOS DATAFRAMES Y EMPEZAMOS A ENTRENAR (ST = AUCTIONS).
## =================================================================

In [23]:
targets['ref_hash'] = targets['ref_hash'].astype(str)
targets_st['ref_hash'] = targets_st['ref_hash'].astype(str)
targets_sc['ref_hash'] = targets_sc['ref_hash'].astype(str)
auctions['ref_hash'] = auctions['ref_hash'].astype(str)
auctions['ref_hash'] = auctions['ref_hash'] + '_st'

In [24]:
auctions_Ventana1['ref_hash'] = auctions_Ventana1['ref_hash'].astype(str)
auctions_Ventana1['ref_hash'] = auctions_Ventana1['ref_hash'] + '_st'
auct_1 = auctions_Ventana1.groupby('ref_hash').min()
auctions_Ventana2['ref_hash'] = auctions_Ventana2['ref_hash'].astype(str)
auctions_Ventana2['ref_hash'] = auctions_Ventana2['ref_hash'] + '_st'
auct_2 = auctions_Ventana2.groupby('ref_hash').min()
auct_1['obj'] = 0

In [25]:
auct_comb = auct_1.merge(auct_2, on=('ref_hash'), suffixes=('_lefto', '_raito'))
auct_comb['obj'] = (auct_comb['created_raito'] - auct_comb['created_lefto']).dt.total_seconds()

In [26]:
auct_comb = auct_comb.iloc[:, 0:12]
auct_comb['obj'] = auct_comb['obj'].astype(int)
columnas_relevantes_auct = list(auct_comb.select_dtypes(include=['int','float64','uint8']).columns)

In [27]:
auct_comb = auct_comb.loc[:, auct_comb.columns.isin(columnas_relevantes_auct)]
auct_comb = auct_comb.iloc[:, 2:12]
columnas_relevantes_auct = list(auct_2.select_dtypes(include=['int','float64','uint8']).columns)
auct_2 = auct_2.loc[:, auct_2.columns.isin(columnas_relevantes_auct)]
auct_ventana2 = auct_2.iloc[:, 2:12]
target_result_st = pd.merge(targets_st, auct_ventana2, how='left', left_on='ref_hash', right_on='ref_hash')

In [28]:
X_auct, y_auct = auct_comb.iloc[:,:-1],auct_comb.iloc[:,-1]
data_dmatrix_auct = xgb.DMatrix(data=X_auct,label=y_auct)
X_train_auct, X_test_auct, y_train_auct, y_test_auct = train_test_split(X_auct, y_auct, test_size=0.2, random_state=123)

In [29]:
xg_reg_auct = xgb.XGBRegressor(objective = 'reg:linear', colsample_bytree = 0.3, 
         learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)

In [30]:
xg_reg_auct.fit(X_train_auct,y_train_auct)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [31]:
auct_ventana2.columns = [str(col) + '_lefto' for col in auct_ventana2.columns]
auct_ventana2['obj'] = 0
X_target_auct, y_target_auct = auct_ventana2.iloc[:,:-1],auct_ventana2.iloc[:,-1]
preds_st = xg_reg_auct.predict(X_target_auct)

In [32]:
## =================================================================================================
## ARMAMOS EN BASE A LA PREDICCIÓN QUE TENEMOS UN CSV PARA SUBIR A KAGGLE CON EL FORMATO INDICADO.
## =================================================================================================
auct_ventana2 = auct_ventana2.reset_index()
submission_st = pd.DataFrame({ 'resultado': preds_st, 'ref_hash': auct_ventana2['ref_hash'] })
final_st = pd.merge(targets_st, submission_st, how='left', left_on='ref_hash', right_on='ref_hash')
final_st['obj'] = final_st['resultado']
del final_st['resultado']
final_st.fillna(0, inplace = True)

In [33]:
final_st.head(10)

Unnamed: 0,ref_hash,obj
0,1000169251625791246_st,0.0
1,1000395625957344683_st,101929.5
2,1003027494996471685_st,140999.015625
3,1006670001679961544_st,105994.859375
4,1007573308966476713_st,147762.1875
5,1010070503877148763_st,0.0
6,1010265377387765028_st,0.0
7,1010531372912327058_st,0.0
8,1011610998357271358_st,150649.78125
9,1013543838965040946_st,148993.546875


In [34]:
frames = [final_sc, final_st]
final = pd.concat(frames)
final.to_csv("submission_grupo34_003.csv", index=False)

### ======================================================================
### ======================================================================

In [41]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 23320397813996.765625


### ======================================================================
### ======================================================================

In [86]:
asd = installs.groupby('ref_hash').max()

In [87]:
asd.reset_index(inplace=True)

In [None]:
clicks['ref_hash'] = clicks['ref_hash'].astype(str)
clicks['ref_hash'] = clicks['ref_hash'] + '_sc'
auctions['ref_hash'] = auctions['ref_hash'].astype(str)
auctions['ref_hash'] = auctions['ref_hash'] + '_sc'
events['ref_hash'] = events['ref_hash'].astype(str)
events['ref_hash'] = events['ref_hash'] + '_sc'

In [138]:
resultado_clicks = clicks.merge(targets, on=('ref_hash'), suffixes=('_l', '_r'))

In [151]:
resultado_auctions = auctions.merge(targets, on=('ref_hash'), suffixes=('_l', '_r'))

In [13]:
resultado_events = events.merge(targets, on=('ref_hash'), suffixes=('_l', '_r'))

### ======================================================================
### ======================================================================
### ======================================================================

### Pasamos la columna a predecir a la última posición para facilitar el trabajo.

In [14]:
columna_dia = auctions.pop('dia')
auctions['dia'] = columna_dia

### Separamos la variable a predecir

In [17]:
X, y = auctions.iloc[:,:-1],auctions.iloc[:,-1]

### Convertimos los datos a DMatrix

In [18]:
data_dmatrix = xgb.DMatrix(data=X,label=y)


### Hiper-parámetros

    learning_rate: tasa de aprendizaje
    max_depth: máxima profundidad de cada árbol
    subsample: porcentaje de muestras usadas para cada árbol (valor muy bajo, posible underfitting)
    colsample_bytree: porcentaje de features usadas para cada árbol (valores muy alto, posible overfitting)
    n_estimators: cantidad de árboles a construir.
    objective: función de error a utilizar (algunas: reg:linear para regresión, reg:logistic o binary:logistic para clasificación)

### Parámetros de regularización:

    gamma: umbral para hacer split basado en la reducción de error de hacer el nuevo split.
    alpha: regularización para los pesos de las hojas. Un valor más alto genera una mayor regularización.
    lambda: similar alpha pero para la sintonia fina.

### Creamos set de entrenamiento y test

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

### Instanciamos el regresor de XGBoost

In [20]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

### Entrenamos

In [21]:
xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

### Predecimos

In [22]:
preds = xg_reg.predict(X_test)

### Calculamos el error en las predicciones

In [25]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 7.808578
