In [1]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as DT
import warnings as wr
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
## from sklearn.model_selection import train_test_split

%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)
pd.set_option('display.max_columns', 1000)
np.set_printoptions(threshold=np.nan)



In [2]:
################################################################
## EVENTS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
events = pd.read_csv('../../events.csv')
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (EVENTS)
events['wifi'] = events['wifi'].fillna(False).astype(bool)
events['connection_type'] = events['connection_type'].astype('category')
events['trans_id'] = events['trans_id'].astype('category')
events['date'] = pd.to_datetime(events['date'], infer_datetime_format=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
events['mes'] = events['date'].dt.month
events['dia'] = events['date'].dt.day
events['hora'] = events['date'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
events['hora_madrugada'] = 0
events.loc[((events.hora > -1) & (events.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
events['hora_maniana'] = 0
events.loc[((events.hora > 6) & (events.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
events['hora_almuerzo'] = 0
events.loc[((events.hora > 11) & (events.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
events['hora_tarde'] = 0
events.loc[((events.hora > 13) & (events.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
events['hora_noche'] = 0
events.loc[((events.hora > 18) & (events.hora < 24)), 'hora_noche'] = 1
################################################################
################################################################

In [3]:
##################### connection_type
dummies = pd.get_dummies(events['connection_type'], drop_first=False)
events = pd.concat([events, dummies], axis=1)
del events['connection_type']
##################### event_uuid
del events['event_uuid']
##################### date
del events['date']
##################### wifi
events['wifi_value'] = 2
events.loc[events.wifi == False, 'wifi_value'] = 0
events.loc[events.wifi == True, 'wifi_value'] = 1
del events['wifi']
##################### attributed
events['attributed_value'] = 2
events.loc[events.attributed == False, 'attributed_value'] = 0
events.loc[events.attributed == True, 'attributed_value'] = 1
del events['attributed']
##################### trans_id
events['trans_id_value'] = events['trans_id'].cat.codes
events.loc[events.trans_id_value == -1, 'trans_id_value'] = 0
del events['trans_id']
##################### fillna
events.fillna(0, inplace = True)

In [16]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
events_Ventana1 = events[(events['dia'] >= 21) & (events['dia'] <= 23)]
events_Ventana2 = events[(events['dia'] >= 24) & (events['dia'] <= 26)]
#events_Ventana3 = events[(events['dia'] >= 20) & (events['dia'] <= 22)]
#events_Ventana4 = events[(events['dia'] >= 21) & (events['dia'] <= 23)]
#events_Ventana5 = events[(events['dia'] >= 22) & (events['dia'] <= 24)]
################################################################

In [34]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
## events = ''
## events_Ventana1 = ''
## events_Ventana2 = ''
## events_Ventana3 = ''
## events_Ventana4 = ''
## events_Ventana5 = ''
################################################################

In [129]:
################################################################
## CLICKS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
clicks = pd.read_csv('../../clicks.csv')
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (CLICKS)
clicks['created'] = pd.to_datetime(clicks['created'], infer_datetime_format=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
clicks['mes'] = clicks['created'].dt.month
clicks['dia'] = clicks['created'].dt.day
clicks['hora'] = clicks['created'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
clicks['hora_madrugada'] = 0
clicks.loc[((clicks.hora > -1) & (clicks.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
clicks['hora_maniana'] = 0
clicks.loc[((clicks.hora > 6) & (clicks.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
clicks['hora_almuerzo'] = 0
clicks.loc[((clicks.hora > 11) & (clicks.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
clicks['hora_tarde'] = 0
clicks.loc[((clicks.hora > 13) & (clicks.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
clicks['hora_noche'] = 0
clicks.loc[((clicks.hora > 18) & (clicks.hora < 24)), 'hora_noche'] = 1
################################################################
################################################################

In [130]:
##################### trans_id
clicks.fillna(0, inplace = True)
clicks['trans_id'] = clicks['trans_id'].astype('category')
##################### date
del clicks['created']
##################### wifi_connection
clicks['wifi_value'] = 2
clicks.loc[clicks.wifi_connection == False, 'wifi_value'] = 0
clicks.loc[clicks.wifi_connection == True, 'wifi_value'] = 1
del clicks['wifi_connection']
##################### trans_id
clicks['trans_id_value'] = clicks['trans_id'].cat.codes
clicks.loc[clicks.trans_id_value == -1, 'trans_id_value'] = 0
del clicks['trans_id']
##################### touchXY
clicks.loc[clicks.touchX == 'Infinity', 'touchX'] = 2
clicks.loc[clicks.touchY == 'Infinity', 'touchY'] = 2
clicks['touchX'] = clicks['touchX'].astype(float).fillna(0.0)
clicks['touchY'] = clicks['touchY'].astype(float).fillna(0.0)

In [127]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
clicks_Ventana1 = clicks[(clicks['dia'] >= 21) & (clicks['dia'] <= 23)]
clicks_Ventana2 = clicks[(clicks['dia'] >= 24) & (clicks['dia'] <= 26)]
#clicks_Ventana3 = clicks[(clicks['dia'] >= 20) & (clicks['dia'] <= 22)]
#clicks_Ventana4 = clicks[(clicks['dia'] >= 21) & (clicks['dia'] <= 23)]
#clicks_Ventana5 = clicks[(clicks['dia'] >= 22) & (clicks['dia'] <= 24)]
################################################################

In [33]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
## clicks = ''
## clicks_Ventana1 = ''
## clicks_Ventana2 = ''
## clicks_Ventana3 = ''
## clicks_Ventana4 = ''
## clicks_Ventana5 = ''
################################################################

In [2]:
################################################################
## INSTALLS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
installs = pd.read_csv('../../installs.csv')
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (INSTALLS)
installs['kind'] = installs['kind'].astype('category')
installs['user_agent'] = installs['user_agent'].astype('category')
installs['session_user_agent'] = installs['session_user_agent'].astype('category')
installs['created'] = pd.to_datetime(installs['created'], infer_datetime_format=True)
installs['trans_id'] = installs['trans_id'].astype('category')
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
installs['mes'] = installs['created'].dt.month
installs['dia'] = installs['created'].dt.day
installs['hora'] = installs['created'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
installs['hora_madrugada'] = 0
installs.loc[((installs.hora > -1) & (installs.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
installs['hora_maniana'] = 0
installs.loc[((installs.hora > 6) & (installs.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
installs['hora_almuerzo'] = 0
installs.loc[((installs.hora > 11) & (installs.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
installs['hora_tarde'] = 0
installs.loc[((installs.hora > 13) & (installs.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
installs['hora_noche'] = 0
installs.loc[((installs.hora > 18) & (installs.hora < 24)), 'hora_noche'] = 1
installs['cantidad'] = 1
################################################################
################################################################

In [3]:
##################### fillna
installs['click_hash'].fillna(0, inplace = True)
installs['device_brand'].fillna(0, inplace = True)
installs['device_model'].fillna(0, inplace = True)
installs['device_language'].fillna(0, inplace = True)
##################### created event
del installs['event_uuid']
#del installs['created']
##################### wifi
installs['wifi_value'] = 2
installs.loc[installs.wifi == False, 'wifi_value'] = 0
installs.loc[installs.wifi == True, 'wifi_value'] = 1
del installs['wifi']
##################### attributed
installs['attributed_value'] = 2
installs.loc[installs.attributed == False, 'attributed_value'] = 0
installs.loc[installs.attributed == True, 'attributed_value'] = 1
del installs['attributed']
##################### implicit
installs['implicit_value'] = 2
installs.loc[installs.implicit == False, 'implicit_value'] = 0
installs.loc[installs.implicit == True, 'implicit_value'] = 1
del installs['implicit']
##################### click_hash
installs['click_hash'] = installs['click_hash'].astype('category')
##################### session_user_agent
installs['session_user_agent_value'] = installs['session_user_agent'].cat.codes
del installs['session_user_agent']
installs.loc[installs.session_user_agent_value == -1, 'session_user_agent_value'] = 0
##################### click_hash
installs['click_hash_value'] = installs['click_hash'].cat.codes
installs.loc[installs.click_hash_value == -1, 'click_hash_value'] = 0
del installs['click_hash']
##################### user_agent
installs['user_agent_value'] = installs['user_agent'].cat.codes
installs.loc[installs.user_agent_value == -1, 'user_agent_value'] = 0
del installs['user_agent']
##################### kind
installs['kind_value'] = installs['kind'].cat.codes
installs.loc[installs.kind_value == -1, 'kind_value'] = 0
del installs['kind']
##################### trans_id
installs['trans_id_value'] = installs['trans_id'].cat.codes
installs.loc[installs.trans_id_value == -1, 'trans_id_value'] = 0
del installs['trans_id']
##################### pasamos columnas a string.
installs['ref_type'] = installs['ref_type'].astype(str)
installs['wifi_value'] = installs['wifi_value'].astype(str)
installs['implicit_value'] = installs['implicit_value'].astype(str)

installs.loc[installs.ref_type == '1891515180541284343', 'ref_type'] = 'ref_type_1'
installs.loc[installs.ref_type == '1494519392962156891', 'ref_type'] = 'ref_type_2'

installs.loc[installs.wifi_value == '0', 'wifi_value'] = 'wifi_value_0'
installs.loc[installs.wifi_value == '1', 'wifi_value'] = 'wifi_value_1'
installs.loc[installs.wifi_value == '2', 'wifi_value'] = 'wifi_value_2'

installs.loc[installs.implicit_value == '0', 'implicit_value'] = 'implicit_value_0'
installs.loc[installs.implicit_value == '1', 'implicit_value'] = 'implicit_value_1'

dummies = pd.get_dummies(installs['ref_type'], drop_first=False)
installs = pd.concat([installs, dummies], axis=1)
dummies = pd.get_dummies(installs['wifi_value'], drop_first=False)
installs = pd.concat([installs, dummies], axis=1)
dummies = pd.get_dummies(installs['implicit_value'], drop_first=False)
installs = pd.concat([installs, dummies], axis=1)

del installs['ref_type']
del installs['wifi_value']
del installs['implicit_value']

In [4]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
installs_Ventana1 = installs[(installs['dia'] >= 21) & (installs['dia'] <= 23)]
installs_Ventana2 = installs[(installs['dia'] >= 24) & (installs['dia'] <= 26)]
#installs_Ventana3 = installs[(installs['dia'] >= 20) & (installs['dia'] <= 22)]
#installs_Ventana4 = installs[(installs['dia'] >= 21) & (installs['dia'] <= 23)]
#installs_Ventana5 = installs[(installs['dia'] >= 22) & (installs['dia'] <= 24)]
################################################################

In [5]:
################################################################
installs['ref_hash'] = installs['ref_hash'].astype(str)
installs['ref_hash'] = installs['ref_hash'] + '_sc'
installs_Ventana1['ref_hash'] = installs_Ventana1['ref_hash'].astype(str)
installs_Ventana1['ref_hash'] = installs_Ventana1['ref_hash'] + '_sc'
installs_Ventana2['ref_hash'] = installs_Ventana2['ref_hash'].astype(str)
installs_Ventana2['ref_hash'] = installs_Ventana2['ref_hash'] + '_sc'
################################################################
installs_cantidad_0 = installs[['ref_hash', 'cantidad', 'ref_type_1', 'ref_type_2', 'wifi_value_0', 'wifi_value_1', 'wifi_value_2', 'implicit_value_0', 'implicit_value_1']].copy()
installs_cantidad_1 = installs_Ventana1[['ref_hash', 'cantidad', 'ref_type_1', 'ref_type_2', 'wifi_value_0', 'wifi_value_1', 'wifi_value_2', 'implicit_value_0', 'implicit_value_1']].copy()
installs_cantidad_2 = installs_Ventana2[['ref_hash', 'cantidad', 'ref_type_1', 'ref_type_2', 'wifi_value_0', 'wifi_value_1', 'wifi_value_2', 'implicit_value_0', 'implicit_value_1']].copy()
inst_cant_0 = installs_cantidad_0.groupby('ref_hash').sum()
inst_cant_1 = installs_cantidad_1.groupby('ref_hash').sum()
inst_cant_2 = installs_cantidad_2.groupby('ref_hash').sum()
################################################################
installs.drop(['cantidad', 'ref_type_1', 'ref_type_2', 'wifi_value_0', 'wifi_value_1', 'wifi_value_2', 'implicit_value_0', 'implicit_value_1'], axis=1, inplace=True)
installs_Ventana1.drop(['cantidad', 'ref_type_1', 'ref_type_2', 'wifi_value_0', 'wifi_value_1', 'wifi_value_2', 'implicit_value_0', 'implicit_value_1'], axis=1, inplace=True)
installs_Ventana2.drop(['cantidad', 'ref_type_1', 'ref_type_2', 'wifi_value_0', 'wifi_value_1', 'wifi_value_2', 'implicit_value_0', 'implicit_value_1'], axis=1, inplace=True)
inst_1 = installs_Ventana1.groupby('ref_hash').min()
inst_2 = installs_Ventana2.groupby('ref_hash').min()
################################################################

In [6]:
inst_comb_1 = inst_1.merge(inst_cant_0, on=('ref_hash'), suffixes=('', '_r01'))
inst_comb_2 = inst_2.merge(inst_cant_0, on=('ref_hash'), suffixes=('', '_r02'))
inst_comb_01 = inst_comb_1.merge(inst_cant_1, on=('ref_hash'), suffixes=('', '_r11'))
inst_comb_02 = inst_comb_2.merge(inst_cant_2, on=('ref_hash'), suffixes=('', '_r22'))

In [7]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
## installs = ''
## installs_Ventana1 = ''
## installs_Ventana2 = ''
## installs_Ventana3 = ''
## installs_Ventana4 = ''
## installs_Ventana5 = ''
################################################################

In [None]:
################################################################
## AUCTIONS.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
auctions = pd.read_csv('../../auctions.csv',nrows=5000000)
## TRANSFORMACIÓN DE TIPOS PARA EL CSV (AUCTIONS)
auctions['date'] = pd.to_datetime(auctions['date'], infer_datetime_format=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
auctions['mes'] = auctions['date'].dt.month
auctions['dia'] = auctions['date'].dt.day
auctions['hora'] = auctions['date'].dt.hour
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
auctions['hora_madrugada'] = 0
auctions.loc[((auctions.hora > -1) & (auctions.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
auctions['hora_maniana'] = 0
auctions.loc[((auctions.hora > 6) & (auctions.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
auctions['hora_almuerzo'] = 0
auctions.loc[((auctions.hora > 11) & (auctions.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
auctions['hora_tarde'] = 0
auctions.loc[((auctions.hora > 13) & (auctions.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
auctions['hora_noche'] = 0
auctions.loc[((auctions.hora > 18) & (auctions.hora < 24)), 'hora_noche'] = 1
# RENOMBRAMOS LA COLUMNA PARA LOS JOINS.
auctions=auctions.rename(columns = {'device_id':'ref_hash'})
auctions=auctions.rename(columns = {'date':'created'})
#auctions['cantidad'] = 1
################################################################
################################################################

In [None]:
##################### date
#del auctions['date']

In [None]:
################################################################
## Ventana 1: Del 18 al 20
## Ventana 2: Del 19 al 21
## Ventana 3: Del 20 al 22
## Ventana 4: Del 21 al 23
## Ventana 5: Del 22 al 24
################################################################
auctions_Ventana1 = auctions[(auctions['dia'] >= 21) & (auctions['dia'] <= 23)]
auctions_Ventana2 = auctions[(auctions['dia'] >= 24) & (auctions['dia'] <= 26)]
#auctions_Ventana3 = auctions[(auctions['dia'] >= 20) & (auctions['dia'] <= 22)]
#auctions_Ventana4 = auctions[(auctions['dia'] >= 20) & (auctions['dia'] <= 23)]
#auctions_Ventana5 = auctions[(auctions['dia'] >= 23) & (auctions['dia'] <= 27)]
################################################################

In [None]:
auctions['ref_hash'] = auctions['ref_hash'].astype(str)
auctions['ref_hash'] = auctions['ref_hash'] + '_st'
auctions_Ventana1['ref_hash'] = auctions_Ventana1['ref_hash'].astype(str)
auctions_Ventana1['ref_hash'] = auctions_Ventana1['ref_hash'] + '_st'
auctions_Ventana2['ref_hash'] = auctions_Ventana2['ref_hash'].astype(str)
auctions_Ventana2['ref_hash'] = auctions_Ventana2['ref_hash'] + '_st'

In [None]:
################################################################
## Limpiamos el CSV cargado para que no ocupe memoria.
## auctions = ''
## auctions_Ventana1 = ''
## auctions_Ventana2 = ''
## auctions_Ventana3 = ''
## auctions_Ventana4 = ''
## auctions_Ventana5 = ''
################################################################

In [None]:
################################################################
## TARGET.
################################################################
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES CSV.
targets = pd.read_csv('../../target.csv')
targets_sc = targets[targets['ref_hash'].str.contains('_sc')]
targets_st = targets[targets['ref_hash'].str.contains('_st')]
################################################################
################################################################

## =================================================================
## JUNTAMOS LOS DATAFRAMES Y EMPEZAMOS A ENTRENAR (SC = INSTALLS).
## =================================================================

In [14]:
targets['ref_hash'] = targets['ref_hash'].astype(str)
targets_st['ref_hash'] = targets_st['ref_hash'].astype(str)
targets_sc['ref_hash'] = targets_sc['ref_hash'].astype(str)

In [15]:
#inst_1 = installs_Ventana1.groupby('ref_hash').min()
#inst_2 = installs_Ventana2.groupby('ref_hash').min()
#########################################################################
# Ahora esto lo tenemos hecho más arriba en inst_comb_01 e inst_comb_02
#########################################################################
inst_comb_01['obj'] = 0

In [16]:
inst_comb = inst_comb_01.merge(inst_comb_02, on=('ref_hash'), suffixes=('_lefto', '_raito'))
inst_comb['obj'] = (inst_comb['created_raito'] - inst_comb['created_lefto']).dt.total_seconds()

In [17]:
inst_comb = inst_comb.iloc[:, 0:38]
inst_comb['obj'] = inst_comb['obj'].astype(int)

In [18]:
columnas_relevantes_inst = list(inst_comb.select_dtypes(include=['int','float64','uint8']).columns)
inst_comb = inst_comb.loc[:, inst_comb.columns.isin(columnas_relevantes_inst)]

In [19]:
inst_comb = inst_comb.iloc[:, 6:32]
columnas_relevantes_inst = list(inst_comb_02.select_dtypes(include=['int','float64','uint8']).columns)
inst_comb_02 = inst_comb_02.loc[:, inst_comb_02.columns.isin(columnas_relevantes_inst)]

In [20]:
inst_ventana2 = inst_comb_02.iloc[:, 6:32]

In [21]:
target_result_sc = pd.merge(targets_sc, inst_ventana2, how='left', left_on='ref_hash', right_on='ref_hash')

In [22]:
X_inst, y_inst = inst_ventana2.iloc[:,:-1],inst_ventana2.iloc[:,-1]

## =================================================================
## JUNTAMOS LOS DATAFRAMES Y EMPEZAMOS A ENTRENAR (ST = AUCTIONS).
## =================================================================

In [23]:
targets['ref_hash'] = targets['ref_hash'].astype(str)
targets_st['ref_hash'] = targets_st['ref_hash'].astype(str)
targets_sc['ref_hash'] = targets_sc['ref_hash'].astype(str)

In [24]:
auct_1 = auctions_Ventana1.groupby('ref_hash').min()
auct_2 = auctions_Ventana2.groupby('ref_hash').min()
auct_1['obj'] = 0

In [25]:
auct_comb = auct_1.merge(auct_2, on=('ref_hash'), suffixes=('_lefto', '_raito'))
auct_comb['obj'] = (auct_comb['created_raito'] - auct_comb['created_lefto']).dt.total_seconds()

In [26]:
auct_comb = auct_comb.iloc[:, 0:12]
auct_comb['obj'] = auct_comb['obj'].astype(int)
columnas_relevantes_auct = list(auct_comb.select_dtypes(include=['int','float64','uint8']).columns)

In [27]:
auct_comb = auct_comb.loc[:, auct_comb.columns.isin(columnas_relevantes_auct)]
auct_comb = auct_comb.iloc[:, 2:12]
columnas_relevantes_auct = list(auct_2.select_dtypes(include=['int','float64','uint8']).columns)
auct_2 = auct_2.loc[:, auct_2.columns.isin(columnas_relevantes_auct)]
auct_ventana2 = auct_2.iloc[:, 2:12]
target_result_st = pd.merge(targets_st, auct_ventana2, how='left', left_on='ref_hash', right_on='ref_hash')

In [28]:
X_auct, y_auct = auct_ventana2.iloc[:,:-1],auct_ventana2.iloc[:,-1]

In [29]:
X_auct.head()

Unnamed: 0_level_0,mes,dia,hora,hora_madrugada,hora_maniana,hora_almuerzo,hora_tarde
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1000061425870948777_st,4,24,4,1,0,0,0
1000067364236969361_st,4,26,4,0,0,0,0
1000095322020146100_st,4,26,18,0,0,0,1
1000193663803871158_st,4,26,3,1,0,0,0
1000214925038058238_st,4,25,18,0,0,0,1


In [30]:
X_inst.head()

Unnamed: 0_level_0,mes,dia,hora,hora_madrugada,hora_maniana,hora_almuerzo,hora_tarde,hora_noche,attributed_value,cantidad,ref_type_1,ref_type_2,wifi_value_0,wifi_value_1,wifi_value_2,implicit_value_0,implicit_value_1,cantidad_r22,ref_type_1_r22,ref_type_2_r22,wifi_value_0_r22,wifi_value_1_r22,wifi_value_2_r22,implicit_value_0_r22
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1000061425870948777_sc,4,26,2,1,0,0,0,0,0,1,1,0,0,0,1,1,0,1,1,0,0,0,1,1
1000085014918096773_sc,4,24,1,1,0,0,0,0,0,1,1,0,0,0,1,1,0,1,1,0,0,0,1,1
1000193663803871158_sc,4,26,18,0,0,0,1,0,0,1,1,0,0,1,0,1,0,1,1,0,0,1,0,1
1000252911429490816_sc,4,26,19,0,0,0,0,1,0,1,1,0,0,1,0,1,0,1,1,0,0,1,0,1
1000298337185304784_sc,4,26,4,1,0,0,0,0,0,1,1,0,0,0,1,1,0,1,1,0,0,0,1,1


### ===========================================================================
### UNIFICAMOS LOS DATOS PARA HACER LOS ENTRENAMIENTOS Y LA PREDICCIÓN.
### ===========================================================================

In [30]:
data_dmatrix_auct = xgb.DMatrix(data=X_auct,label=y_auct)
X_train_auct, X_test_auct, y_train_auct, y_test_auct = train_test_split(X_auct, y_auct, test_size=0.2, random_state=123)
xg_reg_auct = xgb.XGBRegressor(objective = 'reg:linear', colsample_bytree = 0.3, 
         learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)

In [None]:
data_dmatrix_inst = xgb.DMatrix(data=X_inst,label=y_inst)
X_train_inst, X_test_inst, y_train_inst, y_test_inst = train_test_split(X_inst, y_inst, test_size=0.2, random_state=123)
xg_reg_inst = xgb.XGBRegressor(objective = 'reg:linear', colsample_bytree = 0.3, 
              learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)

In [31]:
inst_ventana2.columns = [str(col) + '_lefto' for col in inst_ventana2.columns]
inst_ventana2.columns = [str(col).replace('_r22_lefto','_r11') for col in inst_ventana2.columns]
inst_ventana2['obj'] = 0
X_target_inst, y_target_inst = inst_ventana2.iloc[:,:-1],inst_ventana2.iloc[:,-1]

In [32]:
X_train_auct = X_train_auct.reset_index()
X_train_auct['ref_hash'] = X_train_auct['ref_hash'].str.replace('_sc','_st')
X_train_auct.set_index('ref_hash', inplace=True)
X_train_auct.head()

Unnamed: 0_level_0,mes_lefto,dia_lefto,hora_lefto,hora_madrugada_lefto,hora_maniana_lefto,hora_almuerzo_lefto,hora_tarde_lefto,hora_noche_lefto
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5325377530382703329_st,4,21,8,0,0,0,0,0
121444358661639585_st,4,22,1,0,0,0,0,0
3725585911637994872_st,4,21,1,1,0,0,0,0
638193427561846283_st,4,21,3,0,0,0,0,0
978331204120535460_st,4,21,15,0,0,0,0,0


In [33]:
X_train_inst = X_train_inst.reset_index()
X_train_inst['ref_hash'] = X_train_inst['ref_hash'].str.replace('_sc','_st')
X_train_inst.set_index('ref_hash', inplace=True)
X_train_inst.head()

Unnamed: 0_level_0,mes_lefto,dia_lefto,hora_lefto,hora_madrugada_lefto,hora_maniana_lefto,hora_almuerzo_lefto,hora_tarde_lefto,hora_noche_lefto,attributed_value_lefto,cantidad_lefto,ref_type_1_lefto,ref_type_2_lefto,wifi_value_0_lefto,wifi_value_1_lefto,wifi_value_2_lefto,implicit_value_0_lefto,implicit_value_1_lefto,cantidad_r11,ref_type_1_r11,ref_type_2_r11,wifi_value_0_r11,wifi_value_1_r11,wifi_value_2_r11,implicit_value_0_r11,implicit_value_1_r11
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
5205183633280682022_st,4,21,20,0,0,0,0,1,0,2,2,0,0,0,2,1,1,1,1,0,0,0,1,1,0
1844243464983998520_st,4,23,23,0,0,0,0,1,0,2,0,2,1,0,1,2,0,1,0,1,1,0,0,1,0
1560011100777836377_st,4,23,15,0,0,0,1,0,0,2,2,0,0,1,1,1,1,1,1,0,0,0,1,1,0
3999251345709802744_st,4,22,12,0,0,1,0,0,0,2,2,0,0,1,1,2,0,1,1,0,0,0,1,1,0
8138185005763472071_st,4,22,2,1,0,0,0,0,0,2,2,0,1,0,1,2,0,1,1,0,1,0,0,1,0


In [34]:
X_train_auct = pd.merge(X_train_auct, X_train_inst, how='left', left_on='ref_hash', right_on='ref_hash', suffixes=('_l', '_r'))

In [35]:
X_train_auct = X_train_auct.reset_index()
X_train_auct['ref_hash'] = X_train_auct['ref_hash'].str.replace('_st','_sc')
X_train_auct.set_index('ref_hash', inplace=True)
xg_reg_inst.fit(X_train_auct,y_train_inst)

XGBoostError: [20:05:16] /workspace/src/objective/regression_obj.cu:66: Check failed: preds.Size() == info.labels_.Size() (80343 vs. 2966) labels are not correctly providedpreds.size=80343, label.size=2966

Stack trace returned 10 entries:
[bt] (0) /usr/local/lib/python2.7/dist-packages/xgboost/./lib/libxgboost.so(dmlc::StackTrace()+0x3d) [0x7fc41ceb35cd]
[bt] (1) /usr/local/lib/python2.7/dist-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x18) [0x7fc41ceb39c8]
[bt] (2) /usr/local/lib/python2.7/dist-packages/xgboost/./lib/libxgboost.so(xgboost::obj::RegLossObj<xgboost::obj::LinearSquareLoss>::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*)+0x1c3) [0x7fc41d0aae03]
[bt] (3) /usr/local/lib/python2.7/dist-packages/xgboost/./lib/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x362) [0x7fc41cf2a1e2]
[bt] (4) /usr/local/lib/python2.7/dist-packages/xgboost/./lib/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7fc41ceabab5]
[bt] (5) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7fc461d6ce18]
[bt] (6) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x32a) [0x7fc461d6c87a]
[bt] (7) /usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so(_ctypes_callproc+0x2a4) [0x7fc461f7f844]
[bt] (8) /usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so(+0x10245) [0x7fc461f7f245]
[bt] (9) /usr/bin/python(PyEval_EvalFrameEx+0x54c0) [0x55f9d36d8650]



In [36]:
X_train_auct = X_train_auct.reset_index()
X_train_auct['ref_hash'] = X_train_auct['ref_hash'].str.replace('_sc','_st')
X_train_auct.set_index('ref_hash', inplace=True)
xg_reg_auct.fit(X_train_auct,y_train_auct)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [37]:
auct_ventana2.columns = [str(col) + '_lefto' for col in auct_ventana2.columns]
auct_ventana2['obj'] = 0
X_target_auct, y_target_auct = auct_ventana2.iloc[:,:-1],auct_ventana2.iloc[:,-1]

In [38]:
X_target_auct = X_target_auct.reset_index()
X_target_auct['ref_hash'] = X_target_auct['ref_hash'].str.replace('_sc','_st')
X_target_auct.set_index('ref_hash', inplace=True)
X_target_auct.head()

Unnamed: 0_level_0,mes_lefto,dia_lefto,hora_lefto,hora_madrugada_lefto,hora_maniana_lefto,hora_almuerzo_lefto,hora_tarde_lefto,hora_noche_lefto
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1000061425870948777_st,4,24,4,1,0,0,0,0
1000067364236969361_st,4,26,4,0,0,0,0,0
1000095322020146100_st,4,26,18,0,0,0,1,0
1000193663803871158_st,4,26,3,1,0,0,0,0
1000214925038058238_st,4,25,18,0,0,0,1,0


In [41]:
X_target_inst = X_target_inst.reset_index()
X_target_inst['ref_hash'] = X_target_inst['ref_hash'].str.replace('_sc','_st')
X_target_inst.set_index('ref_hash', inplace=True)
X_target_inst.head()

Unnamed: 0_level_0,mes_lefto,dia_lefto,hora_lefto,hora_madrugada_lefto,hora_maniana_lefto,hora_almuerzo_lefto,hora_tarde_lefto,hora_noche_lefto,attributed_value_lefto,cantidad_lefto,ref_type_1_lefto,ref_type_2_lefto,wifi_value_0_lefto,wifi_value_1_lefto,wifi_value_2_lefto,implicit_value_0_lefto,implicit_value_1_lefto,cantidad_r11,ref_type_1_r11,ref_type_2_r11,wifi_value_0_r11,wifi_value_1_r11,wifi_value_2_r11,implicit_value_0_r11,implicit_value_1_r11
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1000061425870948777_st,4,26,2,1,0,0,0,0,0,1,1,0,0,0,1,1,0,1,1,0,0,0,1,1,0
1000085014918096773_st,4,24,1,1,0,0,0,0,0,1,1,0,0,0,1,1,0,1,1,0,0,0,1,1,0
1000193663803871158_st,4,26,18,0,0,0,1,0,0,1,1,0,0,1,0,1,0,1,1,0,0,1,0,1,0
1000252911429490816_st,4,26,19,0,0,0,0,1,0,1,1,0,0,1,0,1,0,1,1,0,0,1,0,1,0
1000298337185304784_st,4,26,4,1,0,0,0,0,0,1,1,0,0,0,1,1,0,1,1,0,0,0,1,1,0


In [42]:
X_target_auct = pd.merge(X_target_auct, X_target_inst, how='left', left_on='ref_hash', right_on='ref_hash', suffixes=('_l', '_r'))

In [None]:
X_target_auct = X_target_auct.reset_index()
X_target_auct['ref_hash'] = X_target_auct['ref_hash'].str.replace('_st','_sc')
X_target_auct.set_index('ref_hash', inplace=True)

In [43]:
preds_sc = xg_reg_inst.predict(X_target_auct)
## =================================================================================================
## ARMAMOS EN BASE A LA PREDICCIÓN QUE TENEMOS UN CSV PARA SUBIR A KAGGLE CON EL FORMATO INDICADO.
## =================================================================================================
inst_ventana2 = inst_ventana2.reset_index()
submission_sc = pd.DataFrame({ 'resultado': preds_sc, 'ref_hash': inst_ventana2['ref_hash'] })
final_sc = pd.merge(targets_sc, submission_sc, how='left', left_on='ref_hash', right_on='ref_hash')
final_sc['obj'] = final_sc['resultado']
del final_sc['resultado']
final_sc.fillna(0, inplace = True)

In [None]:
X_target_auct = X_target_auct.reset_index()
X_target_auct['ref_hash'] = X_target_auct['ref_hash'].str.replace('_sc','_st')
X_target_auct.set_index('ref_hash', inplace=True)

In [44]:
preds_st = xg_reg_auct.predict(X_target_auct)
## =================================================================================================
## ARMAMOS EN BASE A LA PREDICCIÓN QUE TENEMOS UN CSV PARA SUBIR A KAGGLE CON EL FORMATO INDICADO.
## =================================================================================================
auct_ventana2 = auct_ventana2.reset_index()
submission_st = pd.DataFrame({ 'resultado': preds_st, 'ref_hash': auct_ventana2['ref_hash'] })
final_st = pd.merge(targets_st, submission_st, how='left', left_on='ref_hash', right_on='ref_hash')
final_st['obj'] = final_st['resultado']
del final_st['resultado']
final_st.fillna(0, inplace = True)

### ======================================================================
### GENERAMOS EL CSV PARA SUBIR A LA COMPETENCIA.
### ======================================================================

In [46]:
frames = [final_sc, final_st]
final = pd.concat(frames)
final.to_csv("submission_grupo34_008.csv", index=False)

### ======================================================================
### ======================================================================
### ======================================================================