### =======================================================================
### IMPORTACIÓN DE DATOS.
### =======================================================================

In [506]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import xgboost as xgb

%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)
pd.set_option('display.max_columns', 1000)
np.set_printoptions(threshold=np.nan)

In [507]:
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES EVENTOS.
eventos = pd.read_csv('events_up_to_01062018.csv', encoding = 'utf-8')

### =======================================================================
### ARMADO DE FEATURES.
### =======================================================================\

In [508]:
## PASAMOS LAS COLUMNAS QUE TIENEN UNA CANTIDAD DE VALORES LIMITADA A UN TIPO CATEGORY
eventos['event'] = eventos['event'].astype('category')
eventos['search_engine'] = eventos['search_engine'].astype('category')
eventos['channel'] = eventos['channel'].astype('category')
eventos['new_vs_returning'] = eventos['new_vs_returning'].astype('category')
eventos['device_type'] = eventos['device_type'].astype('category')
eventos['color'] = eventos['color'].astype('category')
eventos['region'] = eventos['region'].astype('category')
#eventos['city'] = eventos['city'].astype('category')
eventos['browser_version'] = eventos['browser_version'].astype('category')
eventos['screen_resolution'] = eventos['screen_resolution'].astype('category')
eventos['timestamp'] = pd.to_datetime(eventos['timestamp'], infer_datetime_format=True)

In [509]:
## ORDENAMOS LOS DATOS ṔOR PERSONAS EN PRIMER LUGAR Y TIEMPO EN SEGUNDO.
eventos.sort_values(['person', 'timestamp'], ascending=[True, True], inplace=True)

In [510]:
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
eventos['mes'] = eventos['timestamp'].dt.month
eventos['dia'] = eventos['timestamp'].dt.day
eventos['hora'] = eventos['timestamp'].dt.hour

In [511]:
## ARMAMOS UNA COLUMNA PARA EL DÍA DE LA SEMANA COMO NOMBRE.
eventos['diasemana'] = eventos['timestamp'].dt.weekday_name
## PONEMOS LOS NOMBRES DE MANERA MÁS PROLIJA PARA LOS GRÁFICOS.
eventos.loc[eventos.diasemana.str.contains('Monday', na=False), 'diasemana'] = 'lunes'
eventos.loc[eventos.diasemana.str.contains('Tuesday', na=False), 'diasemana'] = 'martes'
eventos.loc[eventos.diasemana.str.contains('Wednesday', na=False), 'diasemana'] = 'miercoles'
eventos.loc[eventos.diasemana.str.contains('Thursday', na=False), 'diasemana'] = 'jueves'
eventos.loc[eventos.diasemana.str.contains('Friday', na=False), 'diasemana'] = 'viernes'
eventos.loc[eventos.diasemana.str.contains('Saturday', na=False), 'diasemana'] = 'sabado'
eventos.loc[eventos.diasemana.str.contains('Sunday', na=False), 'diasemana'] = 'domingo'

In [512]:
# AGREGAMOS UNA COLUMNA PARA INDICAR SI EL EVENTO OCURRIO UN FIN DE SEMANA
eventos['evento_en_finde'] = 0
eventos.loc[(eventos.diasemana.str.contains('DOM', na=False) | eventos.diasemana.str.contains('SAB', na=False)), 'evento_en_finde'] = 1

In [513]:
# DEFINIMOS EL MES COMO NOMBRE PARA FACILITAR LAS COLUMNAS
eventos['mesMayus'] = ''
eventos.loc[eventos.mes == 1, 'mesMayus'] = 'enero'
eventos.loc[eventos.mes == 2, 'mesMayus'] = 'febrero'
eventos.loc[eventos.mes == 3, 'mesMayus'] = 'marzo'
eventos.loc[eventos.mes == 4, 'mesMayus'] = 'abril'
eventos.loc[eventos.mes == 5, 'mesMayus'] = 'mayo'
eventos.loc[eventos.mes == 6, 'mesMayus'] = 'junio'
eventos.loc[eventos.mes == 7, 'mesMayus'] = 'julio'
eventos.loc[eventos.mes == 8, 'mesMayus'] = 'agosto'
eventos.loc[eventos.mes == 9, 'mesMayus'] = 'septiembre'
eventos.loc[eventos.mes == 10, 'mesMayus'] = 'octubre'
eventos.loc[eventos.mes == 11, 'mesMayus'] = 'noviembre'
eventos.loc[eventos.mes == 12, 'mesMayus'] = 'diciembre'

In [514]:
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
eventos['hora_madrugada'] = 0
eventos.loc[((eventos.hora > -1) & (eventos.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
eventos['hora_maniana'] = 0
eventos.loc[((eventos.hora > 6) & (eventos.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
eventos['hora_almuerzo'] = 0
eventos.loc[((eventos.hora > 11) & (eventos.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
eventos['hora_tarde'] = 0
eventos.loc[((eventos.hora > 13) & (eventos.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
eventos['hora_noche'] = 0
eventos.loc[((eventos.hora > 18) & (eventos.hora < 24)), 'hora_noche'] = 1
# TRANSFORMAMOS EN CATEGÓRICAS EL DÍA DE LA SEMANA Y EL MES.
eventos['diasemana'] = eventos['diasemana'].astype('category')
eventos['mesMayus'] = eventos['mesMayus'].astype('category')

In [515]:
## CREAMOS UNA COLUMNA CON CONTENIDO VACIO.
eventos['sistema'] = 'OtrosSistemas'
## SEGÚN QUE FAMILIA DE SO POSEA ACTUALIZAMOS NUESTRA NUEVA COLUMNA.
eventos.loc[eventos.operating_system_version.str.contains('Mac', na=False), 'sistema'] = 'MacOS'
eventos.loc[eventos.operating_system_version.str.contains('iOS', na=False), 'sistema'] = 'iOS'
eventos.loc[eventos.operating_system_version.str.contains('Chrome', na=False), 'sistema'] = 'Chrome'
eventos.loc[eventos.operating_system_version.str.contains('Tizen', na=False), 'sistema'] = 'Tizen'
eventos.loc[eventos.operating_system_version.str.contains('Android', na=False), 'sistema'] = 'Android'
eventos.loc[eventos.operating_system_version.str.contains('Windows Phone', na=False), 'sistema'] = 'Windows Phone'
eventos.loc[eventos.operating_system_version.str.contains('Ubuntu', na=False), 'sistema'] = 'Ubuntu'
eventos.loc[eventos.operating_system_version.str.contains('Linux', na=False), 'sistema'] = 'Linux'
## CASO PARTICULAR, WINDOWS Y WINDOWS PHONE COMPARTEN LA PALABRA.
eventos.loc[(eventos.operating_system_version.str.contains('Windows', na=False) & ~eventos.operating_system_version.str.contains('Phone', na=False)), 'sistema'] = 'Windows'
## AHORA SI PASAMOS LA COLUMNA DE SISTEMAS OPERATIVOS A UN ENUMERATIVO.
eventos['operating_system_version'] = eventos['operating_system_version'].astype('category')
eventos['sistema'] = eventos['sistema'].astype('category')

In [516]:
## AGREGAMOS UNA COLUMNA PARA DIFERENCIAR LAS MARCAS.
eventos['marca'] = 'OtrasMarcas'
## VAMOS OBTENIENDO LAS MARCAS SEGÚN LAS PALABRAS CLAVES QUE LAS CONFORMAN.
eventos.loc[eventos.model.str.contains('Samsung', na=False), 'marca'] = 'Samsung'
eventos.loc[eventos.model.str.contains('Motorola', na=False), 'marca'] = 'Motorola'
eventos.loc[eventos.model.str.contains('Sony', na=False), 'marca'] = 'Sony'
eventos.loc[eventos.model.str.contains('LG ', na=False), 'marca'] = 'LG'
eventos.loc[eventos.model.str.contains('iPad', na=False), 'marca'] = 'iPad'
eventos.loc[eventos.model.str.contains('Asus', na=False), 'marca'] = 'Asus'
eventos.loc[eventos.model.str.contains('iPhone', na=False), 'marca'] = 'iPhone'
eventos.loc[eventos.model.str.contains('Quantum', na=False), 'marca'] = 'Quantum'
eventos.loc[eventos.model.str.contains('Lenovo', na=False), 'marca'] = 'Lenovo'
## AHORA SI PASAMOS LA COLUMNA DE MODELOS A UN ENUMERATIVO.
eventos['model'] = eventos['model'].astype('category')
eventos['marca'] = eventos['marca'].astype('category')

In [517]:
## AGREGAMOS UNA COLUMNA PARA DIFERENCIAR BRASIL DEL RESTO.
# eventos['pais'] = 'OtrosPaises'
# eventos.loc[eventos.country.str.contains('Brazil', na=False), 'pais'] = 'Brasil'
# ## AHORA SI PASAMOS LA COLUMNA DE PAISES A UN ENUMERATIVO.
# eventos['pais'] = eventos['pais'].astype('category')
# eventos['country'] = eventos['country'].astype('category')
eventos['pais'] = 'OtrosPaises'
paisesMasPopulares = eventos['country'].value_counts().to_frame().reset_index().head()['index']
eventos.loc[eventos['country'].isin(paisesMasPopulares), 'pais'] = eventos['country']
eventos['pais'] = eventos['pais'].astype('category')
eventos['pais'].value_counts()

OtrosPaises      2137877
Brazil            197699
Unknown             5273
United States        634
Argentina            123
Canada                75
Name: pais, dtype: int64

In [518]:
## COLUMNA DE CIUDAD
eventos['ciudad'] = 'OtraCiudad'
ciudadesMasPopulares = eventos['city'].value_counts().to_frame().reset_index().head(200)['index']
eventos.loc[eventos['city'].isin(ciudadesMasPopulares), 'ciudad'] = eventos['city']
eventos['ciudad'] = eventos['ciudad'].astype('category')

In [519]:
## COLUMNA DE NAVEGADOR
eventos['navegador'] = 'OtroNavegador'
navegadoresMasPopulares = eventos['browser_version'].value_counts().to_frame().reset_index().head(50)['index']
eventos.loc[eventos['browser_version'].isin(navegadoresMasPopulares), 'navegador'] = eventos['browser_version']
eventos['navegador'] = eventos['navegador'].astype('category')

In [520]:
eventos['Capacidad_en_GB'] = 0
eventos.loc[eventos.storage.str.contains('16GB', na=False), 'Capacidad_en_GB'] = 16
eventos.loc[eventos.storage.str.contains('32GB', na=False), 'Capacidad_en_GB'] = 32
eventos.loc[eventos.storage.str.contains('64GB', na=False), 'Capacidad_en_GB'] = 64
eventos.loc[eventos.storage.str.contains('128GB', na=False), 'Capacidad_en_GB'] = 128
eventos.loc[eventos.storage.str.contains('8GB', na=False), 'Capacidad_en_GB'] = 8
eventos.loc[eventos.storage.str.contains('256GB', na=False), 'Capacidad_en_GB'] = 256
eventos.loc[eventos.storage.str.contains('4GB', na=False), 'Capacidad_en_GB'] = 4
eventos.loc[eventos.storage.str.contains('512MB', na=False), 'Capacidad_en_GB'] = 0.512
eventos['storage'] = eventos['storage'].astype('category')

In [521]:
## CREAMOS UNA COLUMNA PARA DIFERENCIAR LA CONDICION.
eventos['Puntaje'] = 0
## ABSTRAEMOS SOLO AQUELLOS QUE TIENEN UNA CONDICIÓN ASOCIADA Y LO PASAMOS A NUESTRO IDIOMA.
eventos.loc[eventos.condition.str.contains('Excelente', na=False), 'Puntaje'] = 4
eventos.loc[eventos.condition.str.contains('Muito Bom', na=False), 'Puntaje'] = 3
eventos.loc[eventos.condition.str.contains('Novo', na=False), 'Puntaje'] = 5
## SOLO NOS QUEDAN DOS TIPOS DE EVENTOS QUE SON REFERENCIA A LOS 'BUENOS'
eventos.loc[eventos.Puntaje == 0, 'Puntaje'] = 2
eventos['condition'] = eventos['condition'].astype('category')

In [522]:
# COMO TENEMOS UN EVENTO CON EL MISMO NOMBRE SE GENERA CONFLICTOS, 
# ERGO LE MODIFICAMOS EL NOMBRE PARA NO TENER DOS COLUMNAS CON = NOMBRE Y DISTINTO TIPO.
eventos.rename(columns={'staticpage': 'Genstatpage'}, inplace=True)
aux = eventos


In [523]:
eventos = aux

In [524]:
# COLUMNAS DONDE POR CADA REGISTRO SABEMOS QUE TENEMOS UN VALOR (SIEMPRE PRESENTES)
##################### DIA DE SEMANA
#dummies = pd.get_dummies(eventos['diasemana'], drop_first=False)
#eventos = pd.concat([eventos, dummies], axis=1)
##################### MES
dummies = pd.get_dummies(eventos['mesMayus'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
##################### TIPOS DE EVENTOS
dummies = pd.get_dummies(eventos['event'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
##################### ALMACENAMIENTO
dummies = pd.get_dummies(eventos['storage'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
##################### CONDICION DEL DISPOSITIVO
dummies = pd.get_dummies(eventos['condition'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
##################### USUARIO NUEVO O NO
dummies = pd.get_dummies(eventos['new_vs_returning'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
##################### SISTEMA OPERATIVO
dummies = pd.get_dummies(eventos['sistema'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
##################### MARCA
dummies = pd.get_dummies(eventos['marca'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
##################### PAIS DE EVENTO
dummies = pd.get_dummies(eventos['pais'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
eventos.rename(columns={'Unknown': 'Unknown_country'}, inplace=True)
##################### TIPO DE DISPOSITIVO
dummies = pd.get_dummies(eventos['device_type'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
eventos.rename(columns={'Unknown': 'Unknown_device_type'}, inplace=True)
##################### CIUDAD
#dummies = pd.get_dummies(eventos['ciudad'], drop_first=False)
#eventos = pd.concat([eventos, dummies], axis=1)
#eventos.drop(['Unknown'], axis=1)
##################### NAVEGADOR
dummies = pd.get_dummies(eventos['navegador'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
#####################


In [525]:
eventos.rename(columns={'generic listing': 'geneList', 'staticpage': 'statpage', 'Unknown': 'Unknown_city', 'staticpage': 'SP'}, inplace=True)

In [526]:
columnas_relevantes = list(eventos.select_dtypes(include=['int','float64','uint8']).columns)

columnas_relevantes.remove('sku')
columnas_relevantes.append('person')
eventos_filtrados = eventos.loc[:, eventos.columns.isin(columnas_relevantes)]
columnas_relevantes.remove('person')

eventos_por_usuario = eventos_filtrados.groupby('person')[columnas_relevantes].sum().astype(np.float16).reset_index()


In [527]:
eventos_por_usuario.head()

Unnamed: 0,person,Capacidad_en_GB,abril,enero,febrero,marzo,mayo,ad campaign hit,brand listing,checkout,conversion,geneList,lead,search engine hit,searched products,SP,viewed product,visited site,128GB,16GB,256GB,32GB,4GB,512MB,64GB,8GB,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo,New,Returning,Android,Chrome,Linux,MacOS,OtrosSistemas,Tizen,Ubuntu,Windows,Windows Phone,iOS,Asus,LG,Lenovo,Motorola,OtrasMarcas,Quantum,Samsung,Sony,iPad,iPhone,Argentina,Brazil,Canada,OtrosPaises,United States,Unknown_country,Computer,Smartphone,Tablet,Unknown_device_type,Chrome 49.0,Chrome 58.0,Chrome 61.0,Chrome 62.0,Chrome 63.0,Chrome 64.0,Chrome 65.0,Chrome 66.0,Chrome 67.0,Chrome Mobile 28.0,Chrome Mobile 34.0,Chrome Mobile 39,Chrome Mobile 40.0,Chrome Mobile 43.0,Chrome Mobile 45.0,Chrome Mobile 50.0,Chrome Mobile 53.0,Chrome Mobile 54.0,Chrome Mobile 55.0,Chrome Mobile 56.0,Chrome Mobile 57.0,Chrome Mobile 58.0,Chrome Mobile 59.0,Chrome Mobile 60.0,Chrome Mobile 61.0,Chrome Mobile 62.0,Chrome Mobile 63.0,Chrome Mobile 64.0,Chrome Mobile 65.0,Chrome Mobile 66.0,Chrome Mobile iOS 66.0,Edge 15.15063,Edge 16.16299,Facebook 170,Facebook 171,Facebook 172,Facebook 173,Firefox 52,Firefox 58,Firefox 59,Firefox 60,IE 11,IE Mobile 11,Mobile Safari 10,Mobile Safari 11,Mobile Safari 9,Opera 52.0,Opera 53.0,OtroNavegador,Samsung Internet 3.3,Samsung Internet 6.4
0,0008ed71,68.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,4.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
1,00091926,9152.0,0.0,0.0,0.0,0.0,448.0,15.0,25.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,372.0,34.0,48.0,104.0,10.0,132.0,0.0,0.0,80.0,0.0,102.0,0.0,108.0,163.0,1.0,1.0,33.0,0.0,0.0,0.0,0.0,414.0,0.0,0.0,34.0,0.0,0.0,0.0,2.0,1.0,55.0,74.0,0.0,61.0,1.0,1.0,253.0,0.0,34.0,0.0,414.0,0.0,0.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,414.0,0.0,0.0
2,00091a7a,28.0,0.0,0.0,0.0,10.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,9.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0
3,000ba417,2920.0,0.0,0.0,0.0,0.0,206.0,1.0,24.0,6.0,1.0,14.0,0.0,1.0,0.0,0.0,153.0,6.0,0.0,115.0,1.0,20.0,1.0,0.0,1.0,22.0,115.0,0.0,11.0,34.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,200.0,0.0,0.0,6.0,0.0,0.0,0.0,4.0,0.0,37.0,46.0,0.0,110.0,1.0,0.0,8.0,0.0,6.0,0.0,200.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,0.0,0.0
4,000c79fe,32.0,0.0,0.0,0.0,0.0,17.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,9.0,0.0,3.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,16.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0


In [528]:
columnas_relevantes

['Capacidad_en_GB',
 'abril',
 'enero',
 'febrero',
 'marzo',
 'mayo',
 'ad campaign hit',
 'brand listing',
 'checkout',
 'conversion',
 'geneList',
 'lead',
 'search engine hit',
 'searched products',
 'SP',
 'viewed product',
 'visited site',
 '128GB',
 '16GB',
 '256GB',
 '32GB',
 '4GB',
 '512MB',
 '64GB',
 '8GB',
 'Bom',
 'Bom - Sem Touch ID',
 'Excelente',
 'Muito Bom',
 'Novo',
 'New',
 'Returning',
 'Android',
 'Chrome',
 'Linux',
 'MacOS',
 'OtrosSistemas',
 'Tizen',
 'Ubuntu',
 'Windows',
 'Windows Phone',
 'iOS',
 'Asus',
 'LG',
 'Lenovo',
 'Motorola',
 'OtrasMarcas',
 'Quantum',
 'Samsung',
 'Sony',
 'iPad',
 'iPhone',
 'Argentina',
 'Brazil',
 'Canada',
 'OtrosPaises',
 'United States',
 'Unknown_country',
 'Computer',
 'Smartphone',
 'Tablet',
 'Unknown_device_type',
 'Chrome 49.0',
 'Chrome 58.0',
 'Chrome 61.0',
 'Chrome 62.0',
 'Chrome 63.0',
 'Chrome 64.0',
 'Chrome 65.0',
 'Chrome 66.0',
 'Chrome 67.0',
 'Chrome Mobile 28.0',
 'Chrome Mobile 34.0',
 'Chrome Mobile 39'

### =======================================================================
### ENTRENAMIENTO Y PREDICCIÓN.
### =======================================================================

In [529]:
## OBTENEMOS TODA LA INFORMACIÓN DEL SET DE ENTRENAMIENTO.
y_train = pd.read_csv('labels_training_set.csv', encoding = 'utf-8')
test_users = pd.read_csv('trocafone_kaggle_test.csv', encoding = 'utf-8')

In [530]:
## Filtramos los eventos para los usuarios que se encuentran en el set de entrenamiento
train = pd.merge(eventos_por_usuario, y_train, on='person', how='inner')
test = pd.merge(eventos_por_usuario, test_users)

In [531]:
features = list(columnas_relevantes)
x = train[features]
y = train['label']

In [532]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state = 1)

In [533]:
for estimators in range(197, 203):
    #for depth in range(1,5):
    xgboost = xgb.XGBClassifier(n_estimators=estimators, max_depth=3, learning_rate=0.05)
    xgboost.fit(xtrain, ytrain)
    y_pred_rf = xgboost.predict_proba(xtest)[:,1]
    print('estimators:' + str(estimators) + ' - ' + str(metrics.roc_auc_score(ytest, y_pred_rf)))

estimators:197 - 0.8751576343205059
estimators:198 - 0.875212075614495
estimators:199 - 0.8752210004167883
estimators:200 - 0.875209398173807
estimators:201 - 0.8751585268007351
estimators:202 - 0.8751612042414231


In [534]:
estimators:197 - 0.8751576343205059
estimators:198 - 0.875212075614495
estimators:199 - 0.8752210004167883
estimators:200 - 0.875209398173807
estimators:201 - 0.8751585268007351
estimators:202 - 0.8751612042414231

In [535]:
xgboost = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05)
xgboost.fit(x, y)
y_pred_rf = xgboost.predict_proba(test[features])[:,1]

In [536]:
## =================================================================================================
## ARMAMOS EN BASE A LA PREDICCIÓN QUE TENEMOS UN CSV PARA SUBIR A KAGGLE CON EL FORMATO INDICADO.
## =================================================================================================
submission = pd.DataFrame({ 'label': y_pred_rf, 'person': test['person'] })
submission.to_csv("submission_grupo17_XGB.csv", index=False)

In [537]:
## SE AGREGÓ EL FEATURE QUE ENGLOBA LOS SISTEMAS OPERATIVOS MÁS COMUNES UTILIZADOS O BUSCADOS POR LOS USUARIOS:
## PREVIAMENTE TENÍAMOS: 0.84077
## RESULTADO DEL SUBMIT: 0.85216 >> VEMOS UNA MEJORA.

In [538]:
## SE AGREGÓ EL FEATURE QUE ENGLOBA LOS MODELOS MÁS COMUNES UTILIZADOS O BUSCADOS POR LOS USUARIOS:
## PREVIAMENTE TENÍAMOS: 0.85216
## RESULTADO DEL SUBMIT:0.85294 >> VEMOS UNA MEJORA.

In [539]:
## SE AGREGÓ UN PAR DE FEATURES (pais, device_type, Capacidad_en_GB, Puntaje):
## PREVIAMENTE TENÍAMOS: 0.85294
## RESULTADO DEL SUBMIT: 0.85524 >> VEMOS UNA MEJORA.

In [540]:
## AHORA LO QUE SE BUSCA ES HACER EL PROMEDIO EN LUGAR DE LA SUMA:
## PREVIAMENTE TENÍAMOS: 0.85524
## RESULTADO DEL SUBMIT: 0.85519 >> CON LO CUAL EMPEORA.

## Tocando los hiperparametros >> 0.85546

In [541]:
## AGREGAMOS LAS CIUDADES CON MAS EVENTOS
## PREVIAMENTE TENÍAMOS: 0.85546
## RESULTADO DEL SUBMIT: 0.85438 >> CON LO CUAL EMPEORA.