### =======================================================================
### IMPORTACIÓN DE DATOS.
### =======================================================================

In [1]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import xgboost as xgb

%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)
pd.set_option('display.max_columns', 1000)
np.set_printoptions(threshold=np.nan)




In [2]:

## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES EVENTOS.
eventos = pd.read_csv('events_up_to_01062018.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN DEL SET DE ENTRENAMIENTO.
#y_train = pd.read_csv('labels_training_set.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN A TESTEAR.
#test = pd.read_csv('trocafone_kaggle_test.csv', encoding = 'utf-8')
## OBTENEMOS EJEMPLO DE SUBMIT
#example = pd.read_csv('trocafone_kaggle_submit_sample_all_0.csv', encoding = 'utf-8')

In [3]:
eventos.head(50)

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,,,,,,
5,2018-05-18 00:44:27,searched products,4c8a8b93,,,,,,,"10240,9987,10322,10085,9944,9931,13404,10154,1...",iPhone se,,,,,,,,,,,,
6,2018-05-18 00:44:14,viewed product,1b9f7cf6,,2831.0,iPhone 6,Bom,16GB,Dourado,,,,,,,,,,,,,,
7,2018-05-18 00:44:02,viewed product,29ebb414,,2845.0,iPhone 6 Plus,Bom,128GB,Cinza espacial,,,,,,,,,,,,,,
8,2018-05-18 00:43:59,viewed product,de8fe91b,,12548.0,Motorola Moto G5 Plus,Bom,32GB,Platinum,,,,,,,,,,,,,,
9,2018-05-18 00:43:40,ad campaign hit,45baf068,/,,,,,,,,,google,,,,,,,,,,


### =======================================================================
### ARMADO DE FEATURES.
### =======================================================================\

In [4]:
## PASAMOS LAS COLUMNAS QUE TIENEN UNA CANTIDAD DE VALORES LIMITADA A UN TIPO CATEGORY
eventos['event'] = eventos['event'].astype('category')
eventos['condition'] = eventos['condition'].astype('category')
eventos['storage'] = eventos['storage'].astype('category')
eventos['search_engine'] = eventos['search_engine'].astype('category')
eventos['channel'] = eventos['channel'].astype('category')
eventos['new_vs_returning'] = eventos['new_vs_returning'].astype('category')
eventos['device_type'] = eventos['device_type'].astype('category')
eventos['color'] = eventos['color'].astype('category')
eventos['region'] = eventos['region'].astype('category')
eventos['country'] = eventos['country'].astype('category')
eventos['operating_system_version'] = eventos['operating_system_version'].astype('category')
eventos['city'] = eventos['city'].astype('category')
eventos['browser_version'] = eventos['browser_version'].astype('category')
eventos['screen_resolution'] = eventos['screen_resolution'].astype('category')
eventos['timestamp'] = pd.to_datetime(eventos['timestamp'], infer_datetime_format=True)

In [5]:
## ORDENAMOS LOS DATOS ṔOR PERSONAS EN PRIMER LUGAR Y TIEMPO EN SEGUNDO.
eventos.sort_values(['person', 'timestamp'], ascending=[True, True], inplace=True)

In [6]:
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
eventos['mes'] = eventos['timestamp'].dt.month
eventos['dia'] = eventos['timestamp'].dt.day
eventos['hora'] = eventos['timestamp'].dt.hour

In [7]:
## ARMAMOS UNA COLUMNA PARA EL DÍA DE LA SEMANA COMO NOMBRE.
eventos['diasemana'] = eventos['timestamp'].dt.weekday_name
## PONEMOS LOS NOMBRES DE MANERA MÁS PROLIJA PARA LOS GRÁFICOS.
eventos.loc[eventos.diasemana.str.contains('Monday', na=False), 'diasemana'] = 'lunes'
eventos.loc[eventos.diasemana.str.contains('Tuesday', na=False), 'diasemana'] = 'martes'
eventos.loc[eventos.diasemana.str.contains('Wednesday', na=False), 'diasemana'] = 'miercoles'
eventos.loc[eventos.diasemana.str.contains('Thursday', na=False), 'diasemana'] = 'jueves'
eventos.loc[eventos.diasemana.str.contains('Friday', na=False), 'diasemana'] = 'viernes'
eventos.loc[eventos.diasemana.str.contains('Saturday', na=False), 'diasemana'] = 'sabado'
eventos.loc[eventos.diasemana.str.contains('Sunday', na=False), 'diasemana'] = 'domingo'

In [8]:
# AGREGAMOS UNA COLUMNA PARA INDICAR SI EL EVENTO OCURRIO UN FIN DE SEMANA
eventos['evento_en_finde'] = 0
eventos.loc[(eventos.diasemana.str.contains('DOM', na=False) | eventos.diasemana.str.contains('SAB', na=False)), 'evento_en_finde'] = 1

In [9]:
# DEFINIMOS EL MES COMO NOMBRE PARA FACILITAR LAS COLUMNAS
eventos['mesMayus'] = ''
eventos.loc[eventos.mes == 1, 'mesMayus'] = 'enero'
eventos.loc[eventos.mes == 2, 'mesMayus'] = 'febrero'
eventos.loc[eventos.mes == 3, 'mesMayus'] = 'marzo'
eventos.loc[eventos.mes == 4, 'mesMayus'] = 'abril'
eventos.loc[eventos.mes == 5, 'mesMayus'] = 'mayo'
eventos.loc[eventos.mes == 6, 'mesMayus'] = 'junio'
eventos.loc[eventos.mes == 7, 'mesMayus'] = 'julio'
eventos.loc[eventos.mes == 8, 'mesMayus'] = 'agosto'
eventos.loc[eventos.mes == 9, 'mesMayus'] = 'septiembre'
eventos.loc[eventos.mes == 10, 'mesMayus'] = 'octubre'
eventos.loc[eventos.mes == 11, 'mesMayus'] = 'noviembre'
eventos.loc[eventos.mes == 12, 'mesMayus'] = 'diciembre'

In [10]:
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
eventos['hora_madrugada'] = 0
eventos.loc[((eventos.hora > -1) & (eventos.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
eventos['hora_mañana'] = 0
eventos.loc[((eventos.hora > 6) & (eventos.hora < 12)), 'hora_mañana'] = 1
# ALMUERZO de 12 a 13
eventos['hora_almuerzo'] = 0
eventos.loc[((eventos.hora > 11) & (eventos.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
eventos['hora_tarde'] = 0
eventos.loc[((eventos.hora > 13) & (eventos.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
eventos['hora_noche'] = 0
eventos.loc[((eventos.hora > 18) & (eventos.hora < 24)), 'hora_noche'] = 1
# TRANSFORMAMOS EN CATEGÓRICAS EL DÍA DE LA SEMANA Y EL MES.
eventos['diasemana'] = eventos['diasemana'].astype('category')
eventos['mesMayus'] = eventos['mesMayus'].astype('category')

In [11]:
# COMO TENEMOS UN EVENTO CON EL MISMO NOMBRE SE GENERA CONFLICTOS, 
# ERGO LE MODIFICAMOS EL NOMBRE PARA NO TENER DOS COLUMNAS CON = NOMBRE Y DISTINTO TIPO.
eventos.rename(columns={'staticpage': 'Genstatpage'}, inplace=True)

In [12]:
# COLUMNAS DONDE POR CADA REGISTRO SABEMOS QUE TENEMOS UN VALOR (SIEMPRE PRESENTES)
dummies = pd.get_dummies(eventos['diasemana'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['mesMayus'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['event'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
# lE SACO ESTO
#dummies = pd.get_dummies(eventos['storage'], drop_first=False)
#eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['condition'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
# LE AGREGO ESTO
dummies = pd.get_dummies(eventos['new_vs_returning'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)


In [13]:
eventos.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,Genstatpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,mes,dia,hora,diasemana,evento_en_finde,mesMayus,hora_madrugada,hora_mañana,hora_almuerzo,hora_tarde,hora_noche,domingo,jueves,lunes,martes,miercoles,sabado,viernes,abril,enero,febrero,marzo,mayo,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,viewed product,visited site,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo,New,Returning
1507286,2018-05-17 12:27:47,checkout,0008ed71,,3372.0,Samsung Galaxy S6 Flat,Muito Bom,32GB,Dourado,,,,,,,,,,,,,,,5,17,12,jueves,0,mayo,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2336760,2018-05-17 13:44:59,visited site,0008ed71,,,,,,,,,,,,Referral,New,Unknown,Unknown,Brazil,Computer,1920x1080,Windows 10,Chrome 66.0,5,17,13,jueves,0,mayo,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1507716,2018-05-17 13:45:00,checkout,0008ed71,,8247.0,iPhone SE,Bom,64GB,Cinza espacial,,,,,,,,,,,,,,,5,17,13,jueves,0,mayo,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2336761,2018-05-17 16:21:54,visited site,0008ed71,,,,,,,,,,,,Referral,Returning,Unknown,Unknown,Brazil,Computer,1920x1080,Windows 10,Chrome 66.0,5,17,16,jueves,0,mayo,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
2122051,2018-05-17 16:22:06,generic listing,0008ed71,,,,,,,"6594,6651,6664,7253,2820,6706,6721,12606,480,1...",,,,,,,,,,,,,,5,17,16,jueves,0,mayo,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
eventos.rename(columns={'generic listing': 'geneList', 'staticpage': 'statpage', 'staticpage': 'SP'}, inplace=True)

In [15]:
eventos.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'Genstatpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'mes', 'dia', 'hora',
       'diasemana', 'evento_en_finde', 'mesMayus', 'hora_madrugada',
       'hora_mañana', 'hora_almuerzo', 'hora_tarde', 'hora_noche', 'domingo',
       'jueves', 'lunes', 'martes', 'miercoles', 'sabado', 'viernes', 'abril',
       'enero', 'febrero', 'marzo', 'mayo', 'ad campaign hit', 'brand listing',
       'checkout', 'conversion', 'geneList', 'lead', 'search engine hit',
       'searched products', 'SP', 'viewed product', 'visited site', 'Bom',
       'Bom - Sem Touch ID', 'Excelente', 'Muito Bom', 'Novo', 'New',
       'Returning'],
      dtype='object')

In [43]:
columnas_relevantes = [
        'evento_en_finde', 'hora_madrugada',
       'hora_mañana', 'hora_almuerzo', 'hora_tarde', 'hora_noche', 
        #'domingo','jueves', 'lunes', 'martes', 'miercoles', 'sabado', 'viernes', 'abril',
       'enero', 'febrero', 'marzo', 'mayo', 'brand listing',
       'checkout', 'conversion', 'geneList', 'lead', 'search engine hit',
       'searched products', 'SP', 'viewed product', 'visited site', 'Bom',
       'Bom - Sem Touch ID', 'Excelente', 'Muito Bom', 'Novo','New', 'Returning']

columnas_relevantes.append('person')
eventos_filtrados = eventos.loc[:, eventos.columns.isin(columnas_relevantes)]
columnas_relevantes.remove('person')

eventos_por_usuario = eventos_filtrados.groupby('person')[columnas_relevantes].sum().astype(np.float16).reset_index()


### =======================================================================

In [44]:
eventos_por_usuario.head()

Unnamed: 0,person,evento_en_finde,hora_madrugada,hora_mañana,hora_almuerzo,hora_tarde,hora_noche,enero,febrero,marzo,mayo,brand listing,checkout,conversion,geneList,lead,search engine hit,searched products,SP,viewed product,visited site,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo,New,Returning
0,0008ed71,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,6.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0
1,00091926,0.0,313.0,0.0,0.0,50.0,85.0,0.0,0.0,0.0,448.0,25.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,372.0,34.0,102.0,0.0,108.0,163.0,1.0,1.0,33.0
2,00091a7a,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,10.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0
3,000ba417,0.0,0.0,57.0,68.0,81.0,0.0,0.0,0.0,0.0,206.0,24.0,6.0,1.0,14.0,0.0,1.0,0.0,0.0,153.0,6.0,115.0,0.0,11.0,34.0,0.0,1.0,5.0
4,000c79fe,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,1.0,0.0,1.0,0.0,1.0,9.0,0.0,3.0,1.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0



### ENTRENAMIENTO Y PREDICCIÓN.
### =======================================================================

In [45]:
## OBTENEMOS TODA LA INFORMACIÓN DEL SET DE ENTRENAMIENTO.
y_train = pd.read_csv('labels_training_set.csv', encoding = 'utf-8')
test_users = pd.read_csv('trocafone_kaggle_test.csv', encoding = 'utf-8')

In [48]:
## Filtramos los eventos para los usuarios que se encuentran en el set de entrenamiento
train = pd.merge(eventos_por_usuario, y_train, on='person', how='inner')
test = pd.merge(eventos_por_usuario, test_users)

In [49]:
train[train['lead'] > 0].sort_values('lead', ascending=False).head(10)

Unnamed: 0,person,evento_en_finde,hora_madrugada,hora_mañana,hora_almuerzo,hora_tarde,hora_noche,enero,febrero,marzo,mayo,brand listing,checkout,conversion,geneList,lead,search engine hit,searched products,SP,viewed product,visited site,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo,New,Returning,label
7198,5f10bcb4,0.0,544.0,131.0,55.0,138.0,539.0,0.0,311.0,483.0,335.0,56.0,8.0,0.0,168.0,22.0,10.0,0.0,18.0,981.0,137.0,394.0,1.0,320.0,269.0,5.0,1.0,136.0,0
2073,1b373281,0.0,105.0,214.0,42.0,182.0,482.0,452.0,0.0,267.0,49.0,21.0,19.0,0.0,74.0,15.0,25.0,14.0,10.0,640.0,124.0,381.0,0.0,126.0,149.0,3.0,1.0,123.0,0
3397,2d99ad11,0.0,24.0,204.0,26.0,96.0,115.0,9.0,0.0,0.0,87.0,6.0,15.0,5.0,159.0,11.0,22.0,3.0,6.0,116.0,83.0,79.0,0.0,18.0,39.0,0.0,1.0,82.0,1
10344,8965245f,0.0,18.0,100.0,19.0,140.0,229.0,0.0,0.0,0.0,429.0,23.0,5.0,0.0,47.0,8.0,43.0,101.0,0.0,187.0,32.0,67.0,0.0,57.0,68.0,0.0,1.0,31.0,0
3205,2aeeab07,0.0,131.0,13.0,18.0,233.0,136.0,0.0,0.0,0.0,482.0,62.0,9.0,1.0,49.0,7.0,13.0,3.0,6.0,313.0,51.0,39.0,1.0,153.0,122.0,8.0,1.0,50.0,0
4914,4115546b,0.0,258.0,54.0,33.0,154.0,113.0,7.0,593.0,0.0,12.0,130.0,5.0,2.0,96.0,7.0,107.0,0.0,4.0,103.0,54.0,60.0,0.0,10.0,40.0,0.0,1.0,53.0,0
5017,4276d5de,0.0,110.0,16.0,45.0,115.0,142.0,0.0,0.0,225.0,136.0,153.0,0.0,0.0,65.0,7.0,11.0,3.0,1.0,129.0,56.0,50.0,0.0,57.0,22.0,0.0,1.0,55.0,1
2185,1cd70e8c,0.0,0.0,35.0,50.0,241.0,111.0,36.0,0.0,87.0,192.0,8.0,7.0,0.0,6.0,6.0,39.0,0.0,1.0,206.0,73.0,179.0,0.0,20.0,14.0,0.0,1.0,72.0,0
14747,c36bf9e5,0.0,454.0,123.0,64.0,166.0,561.0,0.0,0.0,0.0,246.0,74.0,21.0,0.0,72.0,6.0,12.0,5.0,4.0,1077.0,86.0,165.0,5.0,742.0,185.0,1.0,1.0,85.0,0
11401,97b0c0d1,0.0,994.0,9.0,66.0,584.0,280.0,0.0,0.0,33.0,572.0,168.0,1.0,0.0,109.0,5.0,22.0,15.0,12.0,1392.0,153.0,716.0,53.0,163.0,457.0,4.0,1.0,152.0,0


In [50]:
features = list(columnas_relevantes)


1149.8542582417583

### =======================================================================
### Entrenamiento y Metricas
### =======================================================================

In [26]:
x = train[features]
y = train['label']

In [27]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y)

In [28]:
print(xtrain.shape)
print(ytrain.shape)
print(xtest.shape)
print(ytest.shape)

xtest.head()

(14560, 36)
(14560,)
(4854, 36)
(4854,)


Unnamed: 0,evento_en_finde,hora_madrugada,hora_mañana,hora_almuerzo,hora_tarde,hora_noche,domingo,jueves,lunes,martes,miercoles,sabado,viernes,abril,enero,febrero,marzo,mayo,ad campaign hit,brand listing,checkout,conversion,geneList,lead,search engine hit,searched products,SP,viewed product,visited site,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo,New,Returning
16718,0.0,17.0,2.0,8.0,19.0,12.0,3.0,0.0,29.0,21.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,58.0,5.0,1.0,5.0,0.0,0.0,0.0,0.0,10.0,0.0,27.0,10.0,22.0,0.0,1.0,9.0,0.0,1.0,9.0
19056,0.0,16.0,0.0,6.0,25.0,2.0,0.0,8.0,19.0,6.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,38.0,5.0,11.0,0.0,20.0,7.0,0.0,1.0,4.0
8064,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,2.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,3.0,1.0,2.0,0.0,2.0,0.0,0.0,1.0,0.0
18980,0.0,0.0,0.0,1.0,6.0,4.0,0.0,3.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0
6910,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0


In [61]:
for estimators in range(95,105):
    #for depth in range(1,5):
    xgboost = xgb.XGBClassifier(n_estimators=estimators, max_depth=depth, learning_rate=0.05)
    xgboost.fit(xtrain, ytrain)
    y_pred_rf = xgboost.predict_proba(xtest)[:,1]
    print('estimators:' + str(estimators) + ' depth:' + str(depth) + '-' + str(np.sqrt(metrics.mean_squared_error(y_pred_rf, ytest))))

estimators:95 depth:1-0.20991517042421287
estimators:95 depth:2-0.20696428248763213
estimators:95 depth:3-0.20695858871766182
estimators:95 depth:4-0.2080723956790849
estimators:96 depth:1-0.2098508155067193
estimators:96 depth:2-0.2069748768993133
estimators:96 depth:3-0.20697137921031505
estimators:96 depth:4-0.20811810070700598
estimators:97 depth:1-0.2097926939343502
estimators:97 depth:2-0.20696253552745927
estimators:97 depth:3-0.20699851411193704
estimators:97 depth:4-0.20813473683471392
estimators:98 depth:1-0.20978330006658602
estimators:98 depth:2-0.20695491753268141
estimators:98 depth:3-0.2069981077312177
estimators:98 depth:4-0.20813426727456522
estimators:99 depth:1-0.2097507499764183
estimators:99 depth:2-0.2069474332764661
estimators:99 depth:3-0.20694180332195258
estimators:99 depth:4-0.20812760275596365
estimators:100 depth:1-0.20972702895023007
estimators:100 depth:2-0.20689675919443204
estimators:100 depth:3-0.20694138862629813
estimators:100 depth:4-0.2081003162691

22.787101424803716

In [62]:
xgboost = xgb.XGBClassifier(learning_rate=0.05)
xgboost.fit(x, y)
y_pred_rf = xgboost.predict_proba(test[features])[:,1]

In [63]:
## =================================================================================================
## ARMAMOS EN BASE A LA PREDICCIÓN QUE TENEMOS UN CSV PARA SUBIR A KAGGLE CON EL FORMATO INDICADO.
## =================================================================================================
submission = pd.DataFrame({ 'label': y_pred_rf, 'person': test['person'] })
submission.to_csv("submission_grupo17_XGB.csv", index=False)