### =======================================================================
### IMPORTACIÓN DE DATOS.
### =======================================================================

In [1]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
training = ''
eventos = ''
dummies = ''
test = ''

# QUEREMOS VER TODAS LAS COLUMNAS/FEATURES.
pd.options.display.max_columns = 350

In [3]:
## OBTENEMOS TODA LA INFORMACIÓN DEL SET DE ENTRENAMIENTO.
training = pd.read_csv('labels_training_set.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES EVENTOS.
eventos = pd.read_csv('events_up_to_01062018.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN A TESTEAR.
test = pd.read_csv('trocafone_kaggle_test.csv', encoding = 'utf-8')

In [4]:
## PASAMOS LAS COLUMNAS QUE TIENEN UNA CANTIDAD DE VALORES LIMITADA A UN TIPO CATEGORY
eventos['person'] = eventos['person'].astype('category')
eventos['event'] = eventos['event'].astype('category')
eventos['condition'] = eventos['condition'].astype('category')
eventos['storage'] = eventos['storage'].astype('category')
eventos['search_engine'] = eventos['search_engine'].astype('category')
eventos['channel'] = eventos['channel'].astype('category')
eventos['new_vs_returning'] = eventos['new_vs_returning'].astype('category')
eventos['device_type'] = eventos['device_type'].astype('category')
eventos['color'] = eventos['color'].astype('category')
eventos['region'] = eventos['region'].astype('category')
eventos['country'] = eventos['country'].astype('category')
eventos['operating_system_version'] = eventos['operating_system_version'].astype('category')
eventos['city'] = eventos['city'].astype('category')
eventos['browser_version'] = eventos['browser_version'].astype('category')
eventos['screen_resolution'] = eventos['screen_resolution'].astype('category')
eventos['model'] = eventos['model'].astype('category')
eventos['timestamp'] = pd.to_datetime(eventos['timestamp'], infer_datetime_format=True)

### =======================================================================
### ARMADO DE FEATURES.
### =======================================================================

In [5]:
## ORDENAMOS LOS DATOS ṔOR PERSONAS EN PRIMER LUGAR Y TIEMPO EN SEGUNDO.
eventos.sort_values(['person', 'timestamp'], ascending=[True, True], inplace=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
eventos['mes'] = eventos['timestamp'].dt.month
eventos['dia'] = eventos['timestamp'].dt.day
eventos['hora'] = eventos['timestamp'].dt.hour
## ARMAMOS UNA COLUMNA PARA EL DÍA DE LA SEMANA COMO NOMBRE.
eventos['diasemana'] = eventos['timestamp'].dt.weekday_name
## PONEMOS LOS NOMBRES DE MANERA MÁS PROLIJA PARA LOS GRÁFICOS.
eventos.loc[eventos.diasemana.str.contains('Monday', na=False), 'diasemana'] = 'lunes'
eventos.loc[eventos.diasemana.str.contains('Tuesday', na=False), 'diasemana'] = 'martes'
eventos.loc[eventos.diasemana.str.contains('Wednesday', na=False), 'diasemana'] = 'miercoles'
eventos.loc[eventos.diasemana.str.contains('Thursday', na=False), 'diasemana'] = 'jueves'
eventos.loc[eventos.diasemana.str.contains('Friday', na=False), 'diasemana'] = 'viernes'
eventos.loc[eventos.diasemana.str.contains('Saturday', na=False), 'diasemana'] = 'sabado'
eventos.loc[eventos.diasemana.str.contains('Sunday', na=False), 'diasemana'] = 'domingo'
# DEFINIMOS UNA LÓGICA PARA INDICAR SI EL DÍA EN QUE SE EJECUTA EL EVENTO ES FIN DE SEMANA.
eventos['finde'] = 0
eventos.loc[(eventos.diasemana.str.contains('DOM', na=False) | eventos.diasemana.str.contains('SAB', na=False)), 'finde'] = 1
# DEFINIMOS EL MES COMO NOMBRE PARA FACILITAR LAS COLUMNAS
eventos['mesMayus'] = ''
eventos.loc[eventos.mes == 1, 'mesMayus'] = 'enero'
eventos.loc[eventos.mes == 2, 'mesMayus'] = 'febrero'
eventos.loc[eventos.mes == 3, 'mesMayus'] = 'marzo'
eventos.loc[eventos.mes == 4, 'mesMayus'] = 'abril'
eventos.loc[eventos.mes == 5, 'mesMayus'] = 'mayo'
eventos.loc[eventos.mes == 6, 'mesMayus'] = 'junio'
eventos.loc[eventos.mes == 7, 'mesMayus'] = 'julio'
eventos.loc[eventos.mes == 8, 'mesMayus'] = 'agosto'
eventos.loc[eventos.mes == 9, 'mesMayus'] = 'septiembre'
eventos.loc[eventos.mes == 10, 'mesMayus'] = 'octubre'
eventos.loc[eventos.mes == 11, 'mesMayus'] = 'noviembre'
eventos.loc[eventos.mes == 12, 'mesMayus'] = 'diciembre'
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
eventos['hora_madrugada'] = 0
eventos.loc[((eventos.hora > -1) & (eventos.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
eventos['hora_maniana'] = 0
eventos.loc[((eventos.hora > 6) & (eventos.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
eventos['hora_almuerzo'] = 0
eventos.loc[((eventos.hora > 11) & (eventos.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
eventos['hora_tarde'] = 0
eventos.loc[((eventos.hora > 13) & (eventos.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
eventos['hora_noche'] = 0
eventos.loc[((eventos.hora > 18) & (eventos.hora < 24)), 'hora_noche'] = 1
# TRANSFORMAMOS EN CATEGÓRICAS EL DÍA DE LA SEMANA Y EL MES.
eventos['diasemana'] = eventos['diasemana'].astype('category')
eventos['mesMayus'] = eventos['mesMayus'].astype('category')

In [6]:
# COMO TENEMOS UN EVENTO CON EL MISMO NOMBRE SE GENERA CONFLICTOS, 
# ERGO LE MODIFICAMOS EL NOMBRE PARA NO TENER DOS COLUMNAS CON = NOMBRE Y DISTINTO TIPO.
eventos.rename(columns={'staticpage': 'Genstatpage'}, inplace=True)
eventos['event'].value_counts()

viewed product       1248124
brand listing         216312
visited site          204069
ad campaign hit       191388
generic listing       160176
searched products     130616
search engine hit     106406
checkout               65315
staticpage             11201
conversion              7091
lead                     983
Name: event, dtype: int64

In [7]:
# COLUMNAS DONDE POR CADA REGISTRO SABEMOS QUE TENEMOS UN VALOR (SIEMPRE PRESENTES)
dummies = pd.get_dummies(eventos['diasemana'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = ''
dummies = pd.get_dummies(eventos['mesMayus'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = ''
dummies = pd.get_dummies(eventos['event'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = ''

In [8]:
# MEJORA UN POCO
dummies = pd.get_dummies(eventos['storage'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = ''
# MEJORA UN POCO.
dummies = pd.get_dummies(eventos['condition'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = ''
# EMPEORA UN POCO.
#dummies = pd.get_dummies(eventos['color'], drop_first=False)
#eventos = pd.concat([eventos, dummies], axis=1)
#dummies = ''
# EMPEORA UN POCO.
#dummies = pd.get_dummies(eventos['model'], drop_first=False)
#eventos = pd.concat([eventos, dummies], axis=1)
#dummies = ''

In [9]:
eventos.rename(columns={'generic listing': 'geneList', 'staticpage': 'statpage', 'staticpage': 'SP'}, inplace=True)

In [10]:
eventos_filtrados = ''
eventos_agrupados = ''
agregar = ''

columnas_filtrar = list(eventos.select_dtypes(include=['int','float64','uint8']).columns)

In [11]:
eventos_agrupados = ''
eventos_filtrados = ''
agrupar = ''

columnas_filtrar.remove('sku')
columnas_filtrar.append('person')
agrupar = eventos.loc[:, eventos.columns.isin(columnas_filtrar)]
columnas_filtrar.remove('person')

eventos_agrupados = agrupar.groupby('person')[columnas_filtrar].mean().astype(np.float16).reset_index()
eventos_filtrados = ''
agrupar = ''

In [12]:
eventos_agrupados.head()

Unnamed: 0,person,mes,dia,hora,finde,hora_madrugada,hora_maniana,hora_almuerzo,hora_tarde,hora_noche,domingo,jueves,lunes,martes,miercoles,sabado,viernes,abril,enero,febrero,marzo,mayo,ad campaign hit,brand listing,checkout,conversion,geneList,lead,search engine hit,searched products,SP,viewed product,visited site,128GB,16GB,256GB,32GB,4GB,512MB,64GB,8GB,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo,Asus Live,Asus Zenfone 2,Asus Zenfone 2 Deluxe,Asus Zenfone 2 Laser,"Asus Zenfone 2 Laser 6""",Asus Zenfone 3 Max 32 GB,Asus Zenfone 3 Max 16 GB,Asus Zenfone 5,Asus Zenfone 6,Asus Zenfone Go,Asus Zenfone Selfie,LG X Screen,LG G2 Mini D618,LG G3 Beat D724,LG G3 D855,LG G3 Stylus D690,LG G4 Beat H736,LG G4 H815P,LG G4 H818P,LG G4 Stylus H630,LG G4 Stylus HDTV H540T,LG G5 SE,LG K10,LG K10 Novo,LG K10 TV,LG K4,LG K8,LG L Prime D337,LG L80 Dual,LG Nexus 4,LG Nexus 5 D821,LG Prime Plus H522,LG X Power,Lenovo Vibe A7010 Dual Chip,Lenovo Vibe K5,Motorola Moto E1,Motorola Moto E2 3G Dual,Motorola Moto E2 4G Dual,Motorola Moto E4 Plus,Motorola Moto G1 3G,Motorola Moto G1 4G,Motorola Moto G2 3G Dual,Motorola Moto G2 4G Dual,Motorola Moto G3 4G,Motorola Moto G3 HDTV,Motorola Moto G4 DTV,Motorola Moto G4 Play,Motorola Moto G4 Play DTV,Motorola Moto G4 Plus,Motorola Moto G5,Motorola Moto G5 Plus,Motorola Moto G5S,Motorola Moto G5S Plus,Motorola Moto MAXX,Motorola Moto X Force,Motorola Moto X Play 4G Dual,Motorola Moto X Style,Motorola Moto X2,Motorola Moto Z,Motorola Moto Z Play,Motorola Moto Z Power Edition,Motorola Moto Z2 Force,Motorola Moto Z2 Play,Outros TV LED 15,Quantum GO 3G,Quantum GO 4G,Quantum Muv,Quantum Muv Pro,Quantum Muv Up,Quantum YOU,Samsung Galaxy A3 2016,Samsung Galaxy A3 Duos,Samsung Galaxy A5,Samsung Galaxy A5 2016,Samsung Galaxy A5 2017,Samsung Galaxy A7,Samsung Galaxy A7 2016,Samsung Galaxy A7 2017,Samsung Galaxy A9 Pro 2016,Samsung Galaxy Core 2 Duos,Samsung Galaxy Core Plus Duos TV,Samsung Galaxy E5 4G Duos,Samsung Galaxy E7,Samsung Galaxy Gran 2 Duos TV,Samsung Galaxy Gran Neo Duos,Samsung Galaxy Gran Neo Plus Duos,Samsung Galaxy Gran Prime 3G Duos,Samsung Galaxy Gran Prime Duos,Samsung Galaxy Gran Prime Duos TV,Samsung Galaxy Grand Duos i9082,Samsung Galaxy J1 2016,Samsung Galaxy J1 Mini,Samsung Galaxy J2 4G Duos,Samsung Galaxy J2 4G Duos TV,Samsung Galaxy J2 Prime TV,Samsung Galaxy J3,Samsung Galaxy J5,Samsung Galaxy J5 2016 Metal,Samsung Galaxy J5 PRO,Samsung Galaxy J5 Prime,Samsung Galaxy J7,Samsung Galaxy J7 2016 Metal,Samsung Galaxy J7 Neo,Samsung Galaxy J7 PRO,Samsung Galaxy J7 Prime,Samsung Galaxy Mega Duos,Samsung Galaxy Note 2 N7100,Samsung Galaxy Note 3,Samsung Galaxy Note 3 Neo Duos,Samsung Galaxy Note 4,Samsung Galaxy Note 5,Samsung Galaxy Note 8,Samsung Galaxy Note Edge,Samsung Galaxy On 7,Samsung Galaxy Pocket 2 Duos,Samsung Galaxy S Duos 2,Samsung Galaxy S3 Duos,Samsung Galaxy S3 Mini,Samsung Galaxy S3 Neo Duos i9300i,Samsung Galaxy S3 Slim Duos,Samsung Galaxy S3 i9300,Samsung Galaxy S4 Mini,Samsung Galaxy S4 Mini Duos,Samsung Galaxy S4 i9500,Samsung Galaxy S4 i9505,Samsung Galaxy S4 i9515,Samsung Galaxy S5,Samsung Galaxy S5 Duos,Samsung Galaxy S5 Mini,Samsung Galaxy S5 Mini Duos,Samsung Galaxy S5 New Edition,Samsung Galaxy S5 New Edition Duos,Samsung Galaxy S6 Edge,Samsung Galaxy S6 Edge Plus,Samsung Galaxy S6 Flat,Samsung Galaxy S7,Samsung Galaxy S7 Edge,Samsung Galaxy S8,Samsung Galaxy S8 Plus,Samsung Galaxy Tab 3 10.1 Wi-Fi + 3G,Samsung Galaxy Tab 4 10.1 Wi-Fi + 3G,Samsung Galaxy Tab 4 10.1 Wi-Fi,Samsung Galaxy Tab A 2016 10.1 W-Fi + 4G,Samsung Galaxy Tab A com S Pen 8 Wi-Fi + 4G,Samsung Galaxy Tab E 7 Wi-Fi,Samsung Galaxy Tab E 7 Wi-Fi + 3G,Samsung Galaxy Tab E 9.6 Wi-Fi,Samsung Galaxy Tab E 9.6 Wi-Fi + 3G,Samsung Galaxy Tab Pro 10.1 Wi-Fi,Samsung Galaxy Tab S 10.5 Wi-Fi,Samsung Galaxy Tab S 10.5 Wi-Fi + 4G,Samsung Galaxy Tab S 8.4 Wi-Fi + 4G,Samsung Galaxy Tab S2 8 Wi-Fi + 4G,Samsung Galaxy Tab S2 9.7 Wi-Fi + 4G,Samsung Galaxy Win 2 Duos TV,Samsung Galaxy Win Duos,Samsung Galaxy Y Duos,Samsung Galaxy Young 2 Duos TV,Samsung Gear Fit 2 Grande,Samsung Gear Fit 2 Pequeno,Samsung Gear S2,Samsung Gear S3 Classic,Samsung Gear S3 Frontier,Sony Xperia M4 Aqua,Sony Xperia M4 Aqua Dual,Sony Xperia Z ULTRA,Sony Xperia Z2,Sony Xperia Z3,Sony Xperia Z3 Compact,Sony Xperia Z3 Dual,Sony Xperia Z3 Plus,Sony Xperia Z3 TV,Sony Xperia Z5,Sony Xperia Z5 Premium,Xiaomi Redmi 2,iPad 2 Wi-Fi,iPad 2 Wi-Fi + 3G,iPad 3 Wi-Fi,iPad 3 Wi-Fi + 4G,iPad 4 Wi-Fi,iPad 4 Wi-Fi + 4G,iPad Air 2 Wi-Fi,iPad Air 2 Wi-Fi + 4G,iPad Air Wi-Fi,iPad Air Wi-Fi + 4G,iPad Mini 2 Wi-Fi,iPad Mini 2 Wi-Fi + 4G,iPad Mini 3 Wi-Fi,iPad Mini 3 Wi-Fi + 4G,iPad Mini 4 Wi-Fi,iPad Mini 4 Wi-Fi + 4G,iPad Mini Wi-Fi,iPad Mini Wi-Fi + 4G,iPhone 4G,iPhone 4S,iPhone 5,iPhone 5c,iPhone 5s,iPhone 6,iPhone 6 Plus,iPhone 6S,iPhone 6S Plus,iPhone 7,iPhone 7 Plus,iPhone 8,iPhone 8 Plus,iPhone SE,iPhone X
0,0008ed71,5.0,17.0,14.335938,0.0,0.0,0.0,0.5,0.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.5,0.0,0.166626,0.0,0.0,0.0,0.0,0.0,0.333252,0.0,0.0,0.0,0.333252,0.0,0.0,0.166626,0.0,0.333252,0.0,0.0,0.166626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166626,0.0
1,00091926,5.0,16.734375,7.007812,0.0,0.69873,0.0,0.0,0.111633,0.189697,0.29248,0.1875,0.071411,0.096008,0.089294,0.071411,0.192017,0.0,0.0,0.0,0.0,1.0,0.033478,0.055817,0.004463,0.0,0.0,0.0,0.0,0.0,0.0,0.830566,0.075867,0.107117,0.232178,0.022324,0.294678,0.0,0.0,0.178589,0.0,0.227661,0.0,0.241089,0.36377,0.002232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002232,0.0,0.0,0.0,0.002232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011162,0.0,0.004463,0.0,0.0,0.0,0.0,0.002232,0.004463,0.078125,0.022324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004463,0.0,0.0,0.011162,0.004463,0.0,0.0,0.0,0.004463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002232,0.002232,0.0,0.0,0.006695,0.0,0.002232,0.0,0.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006695,0.004463,0.033478,0.002232,0.020096,0.011162,0.004463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011162,0.091492,0.214233,0.113831,0.100464,0.020096,0.0,0.004463,0.006695,0.002232
2,00091a7a,3.0,26.0,14.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.099976,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.300049,0.099976,0.099976,0.099976,0.0,0.0,0.0,0.0,0.099976,0.0,0.099976,0.0,0.199951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099976,0.0,0.099976,0.0,0.0,0.0,0.0,0.0,0.099976,0.0
3,000ba417,5.0,22.265625,13.523438,0.0,0.0,0.276611,0.330078,0.393311,0.0,0.0,0.713379,0.0,0.0,0.0,0.286377,0.0,0.0,0.0,0.0,0.0,1.0,0.004856,0.116516,0.029129,0.004856,0.067932,0.0,0.004856,0.0,0.0,0.742676,0.029129,0.0,0.558105,0.004856,0.097107,0.004856,0.0,0.004856,0.106812,0.558105,0.0,0.053406,0.165039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004856,0.135864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067932,0.155396,0.067932,0.004856,0.033966,0.0,0.024277,0.0,0.0,0.004856,0.0,0.0,0.0,0.0,0.0,0.0,0.014565,0.0,0.029129,0.009712,0.0,0.0,0.0,0.0,0.0,0.0,0.053406,0.0,0.0,0.0,0.0,0.009712,0.0,0.0,0.0,0.0,0.019424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004856,0.0,0.0,0.019424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004856,0.0,0.004856,0.0,0.0,0.0,0.0,0.004856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029129,0.004856,0.0,0.0,0.0,0.0,0.004856,0.0,0.0,0.0,0.0,0.0
4,000c79fe,5.0,29.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.058838,0.0,0.058838,0.0,0.058838,0.0,0.058838,0.529297,0.0,0.176514,0.058838,0.235352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235352,0.0,0.0,0.0,0.0,0.0


In [13]:
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19414 entries, 0 to 19413
Data columns (total 2 columns):
person    19414 non-null object
label     19414 non-null int64
dtypes: int64(1), object(1)
memory usage: 303.4+ KB


In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19415 entries, 0 to 19414
Data columns (total 1 columns):
person    19415 non-null object
dtypes: object(1)
memory usage: 151.8+ KB


In [15]:
eventos_agrupados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38829 entries, 0 to 38828
Columns: 254 entries, person to iPhone X
dtypes: category(1), float16(253)
memory usage: 20.4 MB


In [16]:
## ====================================================================================================================
## LO QUE NOS INDICA LA SIGUIENTE INFORMACIÓN ES QUE DE TODO EL SET DE DATOS TENEMOS LA SIGUIENTE CANTIDAD DE PERSONAS:
## 38829 ... ESTAS SON TODAS LAS PERSONAS QUE HAY REGISTRADAS EN ESTE SET DE DATOS.
## POR OTRO LADO PARA EL SET DE DATOS CON EL QUE VAMOS A HACER EL ENTRENAMIENTO TENEMOS: 19414
## Y PARA EL SET DE DATOS CON EL QUE VAMOS A HACER LA PRUEBA TENEMOS: 19415
## QUE SUMANDO AMBOS NOS DA EL TOTAL DE PERSONAS REGISTRADAS.
## ====================================================================================================================

In [17]:
training_completo = pd.merge(eventos_agrupados, training, on='person', how='inner')
training_completo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19414 entries, 0 to 19413
Columns: 255 entries, person to label
dtypes: float16(253), int64(1), object(1)
memory usage: 9.8+ MB


In [18]:
test_completo = pd.merge(eventos_agrupados, test, on='person', how='inner')
test_completo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19415 entries, 0 to 19414
Columns: 254 entries, person to iPhone X
dtypes: float16(253), object(1)
memory usage: 9.7+ MB


### =======================================================================
### ENTRENAMIENTO Y PREDICCIÓN.
### =======================================================================

In [19]:
feature_columns_to_use = columnas_filtrar
nonnumeric_columns = ['person']

In [20]:
columnas_filtrar

['mes',
 'dia',
 'hora',
 'finde',
 'hora_madrugada',
 'hora_maniana',
 'hora_almuerzo',
 'hora_tarde',
 'hora_noche',
 'domingo',
 'jueves',
 'lunes',
 'martes',
 'miercoles',
 'sabado',
 'viernes',
 'abril',
 'enero',
 'febrero',
 'marzo',
 'mayo',
 u'ad campaign hit',
 u'brand listing',
 u'checkout',
 u'conversion',
 'geneList',
 u'lead',
 u'search engine hit',
 u'searched products',
 'SP',
 u'viewed product',
 u'visited site',
 u'128GB',
 u'16GB',
 u'256GB',
 u'32GB',
 u'4GB',
 u'512MB',
 u'64GB',
 u'8GB',
 u'Bom',
 u'Bom - Sem Touch ID',
 u'Excelente',
 u'Muito Bom',
 u'Novo',
 u'Asus Live',
 u'Asus Zenfone 2',
 u'Asus Zenfone 2 Deluxe',
 u'Asus Zenfone 2 Laser',
 u'Asus Zenfone 2 Laser 6"',
 u'Asus Zenfone 3 Max  32 GB',
 u'Asus Zenfone 3 Max 16 GB',
 u'Asus Zenfone 5',
 u'Asus Zenfone 6',
 u'Asus Zenfone Go',
 u'Asus Zenfone Selfie',
 u'LG  X Screen',
 u'LG G2 Mini D618',
 u'LG G3 Beat D724',
 u'LG G3 D855',
 u'LG G3 Stylus D690',
 u'LG G4 Beat H736',
 u'LG G4 H815P',
 u'LG G4 H

In [21]:
y = pd.factorize(training_completo['label'])[0]
training_completo = training_completo.loc[:, training_completo.columns.isin(columnas_filtrar)]
features = training_completo.columns
clf = RandomForestClassifier(n_jobs=2, random_state=0)
clf.fit(training_completo[features], y)

In [24]:
## Predecimos.
test_completo['label'] = 0
#prediccion = clf.predict(test_completo[features])
# REVISAR. todos dan iguales.
prediccion = clf.predict_proba(test_completo[features])[:,1]
#y = np.unique(prueba)
prediccion

In [28]:
## =================================================================================================
## ARMAMOS EN BASE A LA PREDICCIÓN QUE TENEMOS UN CSV PARA SUBIR A KAGGLE CON EL FORMATO INDICADO.
## =================================================================================================
submission = pd.DataFrame({ 'label': prediccion, 'person': test_completo['person'] })
submission.to_csv("submission_grupo17_RF.csv", index=False)

In [29]:
## RESULTADO DE KAGGLE: 0.71355

## HABIENDO AGREGADO LOS FEATURES DEL 'STORAGE' >> MEJORA POCO: 0.71856

## HABIENDO AGREGADO LOS FEATURES DEL 'CONDITION' >> MEJORA POCO: 0.72024

## HABIENDO AGREGADO LOS FEATURES DEL 'COLOR' >> EMPEORA: 0.70725

## HABIENDO AGREGADO LOS FEATURES DEL 'MODEL' >> EMPEORA: 0.68584