### =======================================================================
### IMPORTACIÓN DE DATOS.
### =======================================================================

In [136]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import xgboost as xgb

%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)
pd.set_option('display.max_columns', 1000)
np.set_printoptions(threshold=np.nan)


In [137]:

## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES EVENTOS.
eventos = pd.read_csv('events_up_to_01062018.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN DEL SET DE ENTRENAMIENTO.
#y_train = pd.read_csv('labels_training_set.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN A TESTEAR.
#test = pd.read_csv('trocafone_kaggle_test.csv', encoding = 'utf-8')
## OBTENEMOS EJEMPLO DE SUBMIT
#example = pd.read_csv('trocafone_kaggle_submit_sample_all_0.csv', encoding = 'utf-8')

In [138]:
eventos.head(50)

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,,,,,,
5,2018-05-18 00:44:27,searched products,4c8a8b93,,,,,,,"10240,9987,10322,10085,9944,9931,13404,10154,1...",iPhone se,,,,,,,,,,,,
6,2018-05-18 00:44:14,viewed product,1b9f7cf6,,2831.0,iPhone 6,Bom,16GB,Dourado,,,,,,,,,,,,,,
7,2018-05-18 00:44:02,viewed product,29ebb414,,2845.0,iPhone 6 Plus,Bom,128GB,Cinza espacial,,,,,,,,,,,,,,
8,2018-05-18 00:43:59,viewed product,de8fe91b,,12548.0,Motorola Moto G5 Plus,Bom,32GB,Platinum,,,,,,,,,,,,,,
9,2018-05-18 00:43:40,ad campaign hit,45baf068,/,,,,,,,,,google,,,,,,,,,,


### =======================================================================
### ARMADO DE FEATURES.
### =======================================================================\

In [139]:
## PASAMOS LAS COLUMNAS QUE TIENEN UNA CANTIDAD DE VALORES LIMITADA A UN TIPO CATEGORY
eventos['event'] = eventos['event'].astype('category')
eventos['condition'] = eventos['condition'].astype('category')
eventos['storage'] = eventos['storage'].astype('category')
eventos['search_engine'] = eventos['search_engine'].astype('category')
eventos['channel'] = eventos['channel'].astype('category')
eventos['new_vs_returning'] = eventos['new_vs_returning'].astype('category')
eventos['device_type'] = eventos['device_type'].astype('category')
eventos['color'] = eventos['color'].astype('category')
eventos['region'] = eventos['region'].astype('category')
eventos['country'] = eventos['country'].astype('category')
eventos['operating_system_version'] = eventos['operating_system_version'].astype('category')
eventos['city'] = eventos['city'].astype('category')
eventos['browser_version'] = eventos['browser_version'].astype('category')
eventos['screen_resolution'] = eventos['screen_resolution'].astype('category')
eventos['timestamp'] = pd.to_datetime(eventos['timestamp'], infer_datetime_format=True)

In [140]:
## ORDENAMOS LOS DATOS ṔOR PERSONAS EN PRIMER LUGAR Y TIEMPO EN SEGUNDO.
eventos.sort_values(['person', 'timestamp'], ascending=[True, True], inplace=True)

In [151]:
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
eventos['mes'] = eventos['timestamp'].dt.month
eventos['dia'] = eventos['timestamp'].dt.day
eventos['hora'] = eventos['timestamp'].dt.hour

In [152]:
## ARMAMOS UNA COLUMNA PARA EL DÍA DE LA SEMANA COMO NOMBRE.
eventos['diasemana'] = eventos['timestamp'].dt.weekday_name
## PONEMOS LOS NOMBRES DE MANERA MÁS PROLIJA PARA LOS GRÁFICOS.
eventos.loc[eventos.diasemana.str.contains('Monday', na=False), 'diasemana'] = 'lunes'
eventos.loc[eventos.diasemana.str.contains('Tuesday', na=False), 'diasemana'] = 'martes'
eventos.loc[eventos.diasemana.str.contains('Wednesday', na=False), 'diasemana'] = 'miercoles'
eventos.loc[eventos.diasemana.str.contains('Thursday', na=False), 'diasemana'] = 'jueves'
eventos.loc[eventos.diasemana.str.contains('Friday', na=False), 'diasemana'] = 'viernes'
eventos.loc[eventos.diasemana.str.contains('Saturday', na=False), 'diasemana'] = 'sabado'
eventos.loc[eventos.diasemana.str.contains('Sunday', na=False), 'diasemana'] = 'domingo'

In [153]:
# AGREGAMOS UNA COLUMNA PARA INDICAR SI EL EVENTO OCURRIO UN FIN DE SEMANA
eventos['evento_en_finde'] = 0
eventos.loc[(eventos.diasemana.str.contains('DOM', na=False) | eventos.diasemana.str.contains('SAB', na=False)), 'evento_en_finde'] = 1

In [154]:
# DEFINIMOS EL MES COMO NOMBRE PARA FACILITAR LAS COLUMNAS
eventos['mesMayus'] = ''
eventos.loc[eventos.mes == 1, 'mesMayus'] = 'enero'
eventos.loc[eventos.mes == 2, 'mesMayus'] = 'febrero'
eventos.loc[eventos.mes == 3, 'mesMayus'] = 'marzo'
eventos.loc[eventos.mes == 4, 'mesMayus'] = 'abril'
eventos.loc[eventos.mes == 5, 'mesMayus'] = 'mayo'
eventos.loc[eventos.mes == 6, 'mesMayus'] = 'junio'
eventos.loc[eventos.mes == 7, 'mesMayus'] = 'julio'
eventos.loc[eventos.mes == 8, 'mesMayus'] = 'agosto'
eventos.loc[eventos.mes == 9, 'mesMayus'] = 'septiembre'
eventos.loc[eventos.mes == 10, 'mesMayus'] = 'octubre'
eventos.loc[eventos.mes == 11, 'mesMayus'] = 'noviembre'
eventos.loc[eventos.mes == 12, 'mesMayus'] = 'diciembre'

In [155]:
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
eventos['hora_madrugada'] = 0
eventos.loc[((eventos.hora > -1) & (eventos.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
eventos['hora_mañana'] = 0
eventos.loc[((eventos.hora > 6) & (eventos.hora < 12)), 'hora_mañana'] = 1
# ALMUERZO de 12 a 13
eventos['hora_almuerzo'] = 0
eventos.loc[((eventos.hora > 11) & (eventos.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
eventos['hora_tarde'] = 0
eventos.loc[((eventos.hora > 13) & (eventos.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
eventos['hora_noche'] = 0
eventos.loc[((eventos.hora > 18) & (eventos.hora < 24)), 'hora_noche'] = 1
# TRANSFORMAMOS EN CATEGÓRICAS EL DÍA DE LA SEMANA Y EL MES.
eventos['diasemana'] = eventos['diasemana'].astype('category')
eventos['mesMayus'] = eventos['mesMayus'].astype('category')

In [156]:
# COMO TENEMOS UN EVENTO CON EL MISMO NOMBRE SE GENERA CONFLICTOS, 
# ERGO LE MODIFICAMOS EL NOMBRE PARA NO TENER DOS COLUMNAS CON = NOMBRE Y DISTINTO TIPO.
eventos.rename(columns={'staticpage': 'Genstatpage'}, inplace=True)

In [157]:
# COLUMNAS DONDE POR CADA REGISTRO SABEMOS QUE TENEMOS UN VALOR (SIEMPRE PRESENTES)
dummies = pd.get_dummies(eventos['diasemana'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['mesMayus'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['event'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
# lE SACO ESTO
#dummies = pd.get_dummies(eventos['storage'], drop_first=False)
#eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['condition'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
# LE AGREGO ESTO
dummies = pd.get_dummies(eventos['new_vs_returning'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)


In [158]:
eventos.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,Genstatpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,diasemana,evento_en_finde,mesMayus,mes,hora_madrugada,dia,hora,hora_mañana,hora_almuerzo,hora_tarde,hora_noche,domingo,jueves,lunes,martes,miercoles,sabado,viernes,abril,enero,febrero,marzo,mayo,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,viewed product,visited site,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo,New,Returning
1507286,2018-05-17 12:27:47,checkout,0008ed71,,3372.0,Samsung Galaxy S6 Flat,Muito Bom,32GB,Dourado,,,,,,,,,,,,,,,jueves,0,mayo,5,0,17,12,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2336760,2018-05-17 13:44:59,visited site,0008ed71,,,,,,,,,,,,Referral,New,Unknown,Unknown,Brazil,Computer,1920x1080,Windows 10,Chrome 66.0,jueves,0,mayo,5,0,17,13,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1507716,2018-05-17 13:45:00,checkout,0008ed71,,8247.0,iPhone SE,Bom,64GB,Cinza espacial,,,,,,,,,,,,,,,jueves,0,mayo,5,0,17,13,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2336761,2018-05-17 16:21:54,visited site,0008ed71,,,,,,,,,,,,Referral,Returning,Unknown,Unknown,Brazil,Computer,1920x1080,Windows 10,Chrome 66.0,jueves,0,mayo,5,0,17,16,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
2122051,2018-05-17 16:22:06,generic listing,0008ed71,,,,,,,"6594,6651,6664,7253,2820,6706,6721,12606,480,1...",,,,,,,,,,,,,,jueves,0,mayo,5,0,17,16,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [159]:
eventos.rename(columns={'generic listing': 'geneList', 'staticpage': 'statpage', 'staticpage': 'SP'}, inplace=True)

In [160]:
eventos.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'Genstatpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'diasemana',
       'evento_en_finde', 'mesMayus', 'mes', 'hora_madrugada', 'dia', 'hora',
       'hora_mañana', 'hora_almuerzo', 'hora_tarde', 'hora_noche', 'domingo',
       'jueves', 'lunes', 'martes', 'miercoles', 'sabado', 'viernes', 'abril',
       'enero', 'febrero', 'marzo', 'mayo', 'ad campaign hit', 'brand listing',
       'checkout', 'conversion', 'geneList', 'lead', 'search engine hit',
       'searched products', 'SP', 'viewed product', 'visited site', 'Bom',
       'Bom - Sem Touch ID', 'Excelente', 'Muito Bom', 'Novo', 'New',
       'Returning'],
      dtype='object')

In [169]:
columnas_relevantes = [
        'evento_en_finde', 'hora_madrugada',
       'hora_mañana', 'hora_almuerzo', 'hora_tarde', 'hora_noche', 'domingo',
       'jueves', 'lunes', 'martes', 'miercoles', 'sabado', 'viernes', 'abril',
       'enero', 'febrero', 'marzo', 'mayo', 'ad campaign hit', 'brand listing',
       'checkout', 'conversion', 'geneList', 'lead', 'search engine hit',
       'searched products', 'SP', 'viewed product', 'visited site', 'Bom',
       'Bom - Sem Touch ID', 'Excelente', 'Muito Bom', 'Novo','New', 'Returning']

columnas_relevantes.append('person')
eventos_filtrados = eventos.loc[:, eventos.columns.isin(columnas_relevantes)]
columnas_relevantes.remove('person')

eventos_por_usuario = eventos_filtrados.groupby('person')[columnas_relevantes].sum().astype(np.float16).reset_index()



### PRUEBAS
### =======================================================================

### =======================================================================

In [170]:
eventos_por_usuario.head()

Unnamed: 0,person,evento_en_finde,hora_madrugada,hora_mañana,hora_almuerzo,hora_tarde,hora_noche,domingo,jueves,lunes,martes,miercoles,sabado,viernes,abril,enero,febrero,marzo,mayo,ad campaign hit,brand listing,checkout,conversion,geneList,lead,search engine hit,searched products,SP,viewed product,visited site,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo,New,Returning
0,0008ed71,0.0,0.0,0.0,3.0,3.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0
1,00091926,0.0,313.0,0.0,0.0,50.0,85.0,131.0,84.0,32.0,43.0,40.0,32.0,86.0,0.0,0.0,0.0,0.0,448.0,15.0,25.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,372.0,34.0,102.0,0.0,108.0,163.0,1.0,1.0,33.0
2,00091a7a,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0
3,000ba417,0.0,0.0,57.0,68.0,81.0,0.0,0.0,147.0,0.0,0.0,0.0,59.0,0.0,0.0,0.0,0.0,0.0,206.0,1.0,24.0,6.0,1.0,14.0,0.0,1.0,0.0,0.0,153.0,6.0,115.0,0.0,11.0,34.0,0.0,1.0,5.0
4,000c79fe,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,9.0,0.0,3.0,1.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0



### ENTRENAMIENTO Y PREDICCIÓN.
### =======================================================================

In [171]:
## OBTENEMOS TODA LA INFORMACIÓN DEL SET DE ENTRENAMIENTO.
y_train = pd.read_csv('labels_training_set.csv', encoding = 'utf-8')
test_users = pd.read_csv('trocafone_kaggle_test.csv', encoding = 'utf-8')

In [172]:
## Filtramos los eventos para los usuarios que se encuentran en el set de entrenamiento
train = pd.merge(eventos_por_usuario, y_train, on='person', how='inner')
test = pd.merge(eventos_por_usuario, test_users)

In [173]:
train.shape

(19414, 38)

In [174]:
features = list(columnas_relevantes)
# Quitamos la columna person porque no aporta nada
#features.remove('person')

In [175]:
# Creamos un clasificador con Random Forest..
clf = RandomForestClassifier(n_estimators=26)
# Entrenamos.
clf.fit(train[features], train['label'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=26, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [176]:
# Comparamos con el set de testing para ver que haya salido todo bien en un principio
y_pred_rf = clf.predict_proba(test[features])[:,1]

In [177]:
submission = pd.DataFrame({ 'label': y_pred_rf, 'person': test['person'] })
submission.to_csv("submission_grupo17_RF.csv", index=False)

In [178]:
y_pred_rf.sum()

1123.9081501831502

### =======================================================================
### Metricas
### =======================================================================

In [179]:
x = train[features]
y = train['label']

In [180]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y)

In [181]:
print(xtrain.shape)
print(ytrain.shape)
print(xtest.shape)
print(ytest.shape)

xtest.head()

(14560, 36)
(14560,)
(4854, 36)
(4854,)


Unnamed: 0,evento_en_finde,hora_madrugada,hora_mañana,hora_almuerzo,hora_tarde,hora_noche,domingo,jueves,lunes,martes,miercoles,sabado,viernes,abril,enero,febrero,marzo,mayo,ad campaign hit,brand listing,checkout,conversion,geneList,lead,search engine hit,searched products,SP,viewed product,visited site,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo,New,Returning
3109,0.0,117.0,0.0,0.0,95.0,0.0,0.0,98.0,39.0,0.0,0.0,0.0,75.0,0.0,0.0,0.0,0.0,212.0,1.0,10.0,3.0,0.0,2.0,0.0,4.0,0.0,0.0,187.0,5.0,94.0,2.0,34.0,59.0,1.0,1.0,4.0
1223,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,10.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,1.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0
13955,0.0,0.0,0.0,10.0,13.0,0.0,13.0,0.0,0.0,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,23.0,7.0,2.0,2.0,0.0,2.0,0.0,1.0,0.0,0.0,6.0,3.0,4.0,0.0,2.0,2.0,0.0,1.0,2.0
7811,0.0,0.0,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,3.0,0.0,1.0,0.0,1.0,0.0,4.0,0.0,0.0,13.0,1.0,4.0,0.0,2.0,8.0,0.0,1.0,0.0
8344,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0


In [185]:
for estimators in range(90,110):
    for depth in range(1,5):
        xgboost = xgb.XGBClassifier(n_estimators=estimators, max_depth=depth)
        xgboost.fit(xtrain, ytrain)
        y_pred_rf = xgboost.predict_proba(xtest)[:,1]
        print('estimators:' + str(estimators) + ' depth:' + str(depth) + '-' + str(np.sqrt(metrics.mean_squared_error(y_pred_rf, ytest))))

estimators:90 depth:1-0.1992138886822439
estimators:90 depth:2-0.19828722438448165
estimators:90 depth:3-0.19865507713095726
estimators:90 depth:4-0.19811285398235431
estimators:91 depth:1-0.19922503852565354
estimators:91 depth:2-0.19828063528592782
estimators:91 depth:3-0.1986608114583835
estimators:91 depth:4-0.19816342216126243
estimators:92 depth:1-0.19919017231950362
estimators:92 depth:2-0.19827606446516577
estimators:92 depth:3-0.19866251150246184
estimators:92 depth:4-0.1982033542811128
estimators:93 depth:1-0.19915982884094652
estimators:93 depth:2-0.1983124513770103
estimators:93 depth:3-0.1986668785328677
estimators:93 depth:4-0.19822134548800147
estimators:94 depth:1-0.19917169949249366
estimators:94 depth:2-0.19833066138135394
estimators:94 depth:3-0.19867946691155505
estimators:94 depth:4-0.19822483136291985
estimators:95 depth:1-0.19916206113213297
estimators:95 depth:2-0.19838676782926964
estimators:95 depth:3-0.1986885274681593
estimators:95 depth:4-0.1982335990156932

In [183]:
9-0.2216941625305168
23-0.21289857519628275

22.787101424803716

In [None]:
x = train[features]
y = train['label']

Unnamed: 0,evento_en_finde,hora_madrugada,hora_mañana,hora_almuerzo,hora_tarde,hora_noche,domingo,jueves,lunes,martes,miercoles,sabado,viernes,abril,enero,febrero,marzo,mayo,ad campaign hit,brand listing,checkout,conversion,geneList,lead,search engine hit,searched products,SP,viewed product,visited site,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo,New,Returning
0,0.0,313.0,0.0,0.0,50.0,85.0,131.0,84.0,32.0,43.0,40.0,32.0,86.0,0.0,0.0,0.0,0.0,448.0,15.0,25.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,372.0,34.0,102.0,0.0,108.0,163.0,1.0,1.0,33.0
1,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0
2,0.0,0.0,57.0,68.0,81.0,0.0,0.0,147.0,0.0,0.0,0.0,59.0,0.0,0.0,0.0,0.0,0.0,206.0,1.0,24.0,6.0,1.0,14.0,0.0,1.0,0.0,0.0,153.0,6.0,115.0,0.0,11.0,34.0,0.0,1.0,5.0
3,0.0,0.0,27.0,273.0,109.0,2.0,0.0,29.0,27.0,140.0,207.0,0.0,8.0,0.0,0.0,0.0,0.0,411.0,19.0,17.0,1.0,0.0,17.0,0.0,5.0,0.0,0.0,339.0,13.0,124.0,0.0,53.0,163.0,0.0,1.0,12.0
4,0.0,0.0,0.0,19.0,49.0,0.0,0.0,27.0,0.0,10.0,9.0,0.0,22.0,0.0,0.0,0.0,0.0,68.0,6.0,11.0,1.0,0.0,8.0,0.0,3.0,6.0,0.0,28.0,5.0,14.0,0.0,7.0,8.0,0.0,1.0,4.0


In [189]:
xgboost = xgb.XGBClassifier()
xgboost.fit(x, y)
y_pred_rf = xgboost.predict_proba(test[features])[:,1]

In [190]:
## =================================================================================================
## ARMAMOS EN BASE A LA PREDICCIÓN QUE TENEMOS UN CSV PARA SUBIR A KAGGLE CON EL FORMATO INDICADO.
## =================================================================================================
submission = pd.DataFrame({ 'label': y_pred_rf, 'person': test['person'] })
submission.to_csv("submission_grupo17_XGB.csv", index=False)