### =======================================================================
### IMPORTACIÓN DE DATOS.
### =======================================================================

In [52]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn import metrics

In [53]:
training = ''
eventos = ''
dummies = ''
test = ''

In [54]:
## OBTENEMOS TODA LA INFORMACIÓN DEL SET DE ENTRENAMIENTO.
training = pd.read_csv('labels_training_set.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES EVENTOS.
eventos = pd.read_csv('events_up_to_01062018.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN A TESTEAR.
test = pd.read_csv('trocafone_kaggle_test.csv', encoding = 'utf-8')

In [55]:
## PASAMOS LAS COLUMNAS QUE TIENEN UNA CANTIDAD DE VALORES LIMITADA A UN TIPO CATEGORY
eventos['person'] = eventos['person'].astype('category')
eventos['event'] = eventos['event'].astype('category')
eventos['condition'] = eventos['condition'].astype('category')
eventos['storage'] = eventos['storage'].astype('category')
eventos['search_engine'] = eventos['search_engine'].astype('category')
eventos['channel'] = eventos['channel'].astype('category')
eventos['new_vs_returning'] = eventos['new_vs_returning'].astype('category')
eventos['device_type'] = eventos['device_type'].astype('category')
eventos['color'] = eventos['color'].astype('category')
eventos['region'] = eventos['region'].astype('category')
eventos['country'] = eventos['country'].astype('category')
eventos['operating_system_version'] = eventos['operating_system_version'].astype('category')
eventos['city'] = eventos['city'].astype('category')
eventos['browser_version'] = eventos['browser_version'].astype('category')
eventos['screen_resolution'] = eventos['screen_resolution'].astype('category')
eventos['model'] = eventos['model'].astype('category')
eventos['timestamp'] = pd.to_datetime(eventos['timestamp'], infer_datetime_format=True)

### =======================================================================
### ARMADO DE FEATURES.
### =======================================================================

In [56]:
## ORDENAMOS LOS DATOS ṔOR PERSONAS EN PRIMER LUGAR Y TIEMPO EN SEGUNDO.
eventos.sort_values(['person', 'timestamp'], ascending=[True, True], inplace=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
eventos['mes'] = eventos['timestamp'].dt.month
eventos['dia'] = eventos['timestamp'].dt.day
eventos['hora'] = eventos['timestamp'].dt.hour
## ARMAMOS UNA COLUMNA PARA EL DÍA DE LA SEMANA COMO NOMBRE.
eventos['diasemana'] = eventos['timestamp'].dt.weekday_name
## PONEMOS LOS NOMBRES DE MANERA MÁS PROLIJA PARA LOS GRÁFICOS.
eventos.loc[eventos.diasemana.str.contains('Monday', na=False), 'diasemana'] = 'lunes'
eventos.loc[eventos.diasemana.str.contains('Tuesday', na=False), 'diasemana'] = 'martes'
eventos.loc[eventos.diasemana.str.contains('Wednesday', na=False), 'diasemana'] = 'miercoles'
eventos.loc[eventos.diasemana.str.contains('Thursday', na=False), 'diasemana'] = 'jueves'
eventos.loc[eventos.diasemana.str.contains('Friday', na=False), 'diasemana'] = 'viernes'
eventos.loc[eventos.diasemana.str.contains('Saturday', na=False), 'diasemana'] = 'sabado'
eventos.loc[eventos.diasemana.str.contains('Sunday', na=False), 'diasemana'] = 'domingo'
# DEFINIMOS UNA LÓGICA PARA INDICAR SI EL DÍA EN QUE SE EJECUTA EL EVENTO ES FIN DE SEMANA.
eventos['finde'] = 0
eventos.loc[(eventos.diasemana.str.contains('DOM', na=False) | eventos.diasemana.str.contains('SAB', na=False)), 'finde'] = 1
# DEFINIMOS EL MES COMO NOMBRE PARA FACILITAR LAS COLUMNAS
eventos['mesMayus'] = ''
eventos.loc[eventos.mes == 1, 'mesMayus'] = 'enero'
eventos.loc[eventos.mes == 2, 'mesMayus'] = 'febrero'
eventos.loc[eventos.mes == 3, 'mesMayus'] = 'marzo'
eventos.loc[eventos.mes == 4, 'mesMayus'] = 'abril'
eventos.loc[eventos.mes == 5, 'mesMayus'] = 'mayo'
eventos.loc[eventos.mes == 6, 'mesMayus'] = 'junio'
eventos.loc[eventos.mes == 7, 'mesMayus'] = 'julio'
eventos.loc[eventos.mes == 8, 'mesMayus'] = 'agosto'
eventos.loc[eventos.mes == 9, 'mesMayus'] = 'septiembre'
eventos.loc[eventos.mes == 10, 'mesMayus'] = 'octubre'
eventos.loc[eventos.mes == 11, 'mesMayus'] = 'noviembre'
eventos.loc[eventos.mes == 12, 'mesMayus'] = 'diciembre'
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
eventos['hora_madrugada'] = 0
eventos.loc[((eventos.hora > -1) & (eventos.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
eventos['hora_maniana'] = 0
eventos.loc[((eventos.hora > 6) & (eventos.hora < 12)), 'hora_maniana'] = 1
# ALMUERZO de 12 a 13
eventos['hora_almuerzo'] = 0
eventos.loc[((eventos.hora > 11) & (eventos.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
eventos['hora_tarde'] = 0
eventos.loc[((eventos.hora > 13) & (eventos.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
eventos['hora_noche'] = 0
eventos.loc[((eventos.hora > 18) & (eventos.hora < 24)), 'hora_noche'] = 1
# TRANSFORMAMOS EN CATEGÓRICAS EL DÍA DE LA SEMANA Y EL MES.
eventos['diasemana'] = eventos['diasemana'].astype('category')
eventos['mesMayus'] = eventos['mesMayus'].astype('category')

In [57]:
# COMO TENEMOS UN EVENTO CON EL MISMO NOMBRE SE GENERA CONFLICTOS, 
# ERGO LE MODIFICAMOS EL NOMBRE PARA NO TENER DOS COLUMNAS CON = NOMBRE Y DISTINTO TIPO.
eventos.rename(columns={'staticpage': 'Genstatpage'}, inplace=True)

In [58]:
# COLUMNAS DONDE POR CADA REGISTRO SABEMOS QUE TENEMOS UN VALOR (SIEMPRE PRESENTES)
dummies = pd.get_dummies(eventos['diasemana'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = ''
dummies = pd.get_dummies(eventos['mesMayus'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = ''
dummies = pd.get_dummies(eventos['event'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = ''

In [59]:
dummies = pd.get_dummies(eventos['storage'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = ''
dummies = pd.get_dummies(eventos['condition'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = ''

In [60]:
dummies = ''
eventos.rename(columns={'generic listing': 'geneList', 'staticpage': 'statpage', 'staticpage': 'SP'}, inplace=True)

In [61]:
eventos_filtrados = ''
eventos_agrupados = ''
agregar = ''

# VOY A TRABAJAR CON UN CASO  TESTIGO.
pd.options.display.max_columns = 350

In [62]:
columnas_filtrar = list(eventos.select_dtypes(include=['int','float64','uint8']).columns)

In [63]:
eventos_agrupados = ''
eventos_filtrados = ''
agrupar = ''

columnas_filtrar.remove('sku')
columnas_filtrar.append('person')
agrupar = eventos.loc[:, eventos.columns.isin(columnas_filtrar)]
columnas_filtrar.remove('person')

eventos_agrupados = agrupar.groupby('person')[columnas_filtrar].mean().astype(np.float16).reset_index()
eventos_filtrados = ''
agrupar = ''

In [64]:
eventos_agrupados.head()

Unnamed: 0,person,domingo,jueves,lunes,martes,miercoles,sabado,viernes,abril,enero,febrero,marzo,mayo,ad campaign hit,brand listing,checkout,conversion,geneList,lead,search engine hit,searched products,SP,viewed product,visited site,128GB,16GB,256GB,32GB,4GB,512MB,64GB,8GB,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo
0,0008ed71,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.5,0.0,0.166626,0.0,0.0,0.0,0.0,0.0,0.333252,0.0,0.0,0.0,0.333252,0.0,0.0,0.166626,0.0,0.333252,0.0,0.0,0.166626,0.0
1,00091926,0.29248,0.1875,0.071411,0.096008,0.089294,0.071411,0.192017,0.0,0.0,0.0,0.0,1.0,0.033478,0.055817,0.004463,0.0,0.0,0.0,0.0,0.0,0.0,0.830566,0.075867,0.107117,0.232178,0.022324,0.294678,0.0,0.0,0.178589,0.0,0.227661,0.0,0.241089,0.36377,0.002232
2,00091a7a,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.099976,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.300049,0.099976,0.099976,0.099976,0.0,0.0,0.0,0.0,0.099976,0.0,0.099976,0.0,0.199951,0.0,0.0
3,000ba417,0.0,0.713379,0.0,0.0,0.0,0.286377,0.0,0.0,0.0,0.0,0.0,1.0,0.004856,0.116516,0.029129,0.004856,0.067932,0.0,0.004856,0.0,0.0,0.742676,0.029129,0.0,0.558105,0.004856,0.097107,0.004856,0.0,0.004856,0.106812,0.558105,0.0,0.053406,0.165039,0.0
4,000c79fe,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.058838,0.0,0.058838,0.0,0.058838,0.0,0.058838,0.529297,0.0,0.176514,0.058838,0.235352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235352,0.0,0.0,0.0,0.0


In [65]:
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19414 entries, 0 to 19413
Data columns (total 2 columns):
person    19414 non-null object
label     19414 non-null int64
dtypes: int64(1), object(1)
memory usage: 303.4+ KB


In [66]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19415 entries, 0 to 19414
Data columns (total 1 columns):
person    19415 non-null object
dtypes: object(1)
memory usage: 151.8+ KB


In [67]:
eventos_agrupados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38829 entries, 0 to 38828
Data columns (total 37 columns):
person                38829 non-null category
domingo               38829 non-null float16
jueves                38829 non-null float16
lunes                 38829 non-null float16
martes                38829 non-null float16
miercoles             38829 non-null float16
sabado                38829 non-null float16
viernes               38829 non-null float16
abril                 38829 non-null float16
enero                 38829 non-null float16
febrero               38829 non-null float16
marzo                 38829 non-null float16
mayo                  38829 non-null float16
ad campaign hit       38829 non-null float16
brand listing         38829 non-null float16
checkout              38829 non-null float16
conversion            38829 non-null float16
geneList              38829 non-null float16
lead                  38829 non-null float16
search engine hit     38829 non-nul

In [68]:
## ====================================================================================================================
## LO QUE NOS INDICA LA SIGUIENTE INFORMACIÓN ES QUE DE TODO EL SET DE DATOS TENEMOS LA SIGUIENTE CANTIDAD DE PERSONAS:
## 38829 ... ESTAS SON TODAS LAS PERSONAS QUE HAY REGISTRADAS EN ESTE SET DE DATOS.
## POR OTRO LADO PARA EL SET DE DATOS CON EL QUE VAMOS A HACER EL ENTRENAMIENTO TENEMOS: 19414
## Y PARA EL SET DE DATOS CON EL QUE VAMOS A HACER LA PRUEBA TENEMOS: 19415
## QUE SUMANDO AMBOS NOS DA EL TOTAL DE PERSONAS REGISTRADAS.
## ====================================================================================================================

In [69]:
training_completo = pd.merge(eventos_agrupados, training, on='person', how='inner')
training_completo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19414 entries, 0 to 19413
Data columns (total 38 columns):
person                19414 non-null object
domingo               19414 non-null float16
jueves                19414 non-null float16
lunes                 19414 non-null float16
martes                19414 non-null float16
miercoles             19414 non-null float16
sabado                19414 non-null float16
viernes               19414 non-null float16
abril                 19414 non-null float16
enero                 19414 non-null float16
febrero               19414 non-null float16
marzo                 19414 non-null float16
mayo                  19414 non-null float16
ad campaign hit       19414 non-null float16
brand listing         19414 non-null float16
checkout              19414 non-null float16
conversion            19414 non-null float16
geneList              19414 non-null float16
lead                  19414 non-null float16
search engine hit     19414 non-null 

In [70]:
test_completo = pd.merge(eventos_agrupados, test, on='person', how='inner')
test_completo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19415 entries, 0 to 19414
Data columns (total 37 columns):
person                19415 non-null object
domingo               19415 non-null float16
jueves                19415 non-null float16
lunes                 19415 non-null float16
martes                19415 non-null float16
miercoles             19415 non-null float16
sabado                19415 non-null float16
viernes               19415 non-null float16
abril                 19415 non-null float16
enero                 19415 non-null float16
febrero               19415 non-null float16
marzo                 19415 non-null float16
mayo                  19415 non-null float16
ad campaign hit       19415 non-null float16
brand listing         19415 non-null float16
checkout              19415 non-null float16
conversion            19415 non-null float16
geneList              19415 non-null float16
lead                  19415 non-null float16
search engine hit     19415 non-null 

### =======================================================================
### ENTRENAMIENTO Y PREDICCIÓN.
### =======================================================================

In [71]:
feature_columns_to_use = columnas_filtrar
nonnumeric_columns = ['person']

In [72]:
columnas_filtrar

['domingo',
 'jueves',
 'lunes',
 'martes',
 'miercoles',
 'sabado',
 'viernes',
 'abril',
 'enero',
 'febrero',
 'marzo',
 'mayo',
 'ad campaign hit',
 'brand listing',
 'checkout',
 'conversion',
 'geneList',
 'lead',
 'search engine hit',
 'searched products',
 'SP',
 'viewed product',
 'visited site',
 '128GB',
 '16GB',
 '256GB',
 '32GB',
 '4GB',
 '512MB',
 '64GB',
 '8GB',
 'Bom',
 'Bom - Sem Touch ID',
 'Excelente',
 'Muito Bom',
 'Novo']

In [73]:
# PREPARAMOS LOS INPUTS PARA EL ENTRENAMIENTO.
train_X = training_completo[columnas_filtrar].as_matrix()
test_X = test_completo[columnas_filtrar].as_matrix()
train_y = training_completo['label']

In [103]:
# ENTRENAMOS.
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(train_X, train_y)

In [104]:
# OBTENEMOS UNA PREDICCIÓN.
predictions = gbm.predict(test_X)

In [105]:
## =================================================================================================
## ARMAMOS EN BASE A LA PREDICCIÓN QUE TENEMOS UN CSV PARA SUBIR A KAGGLE CON EL FORMATO INDICADO.
## =================================================================================================
submission = pd.DataFrame({ 'person': test_completo['person'], 'label': predictions })
submission.to_csv("submission_grupo27_XGB.csv", index=False)

### =======================================================================
### METRICAS
### =======================================================================

In [77]:
xtrain, xtest, ytrain, ytest = train_test_split(training_completo[columnas_filtrar], training_completo['label'])

In [78]:
print(xtrain.shape)
print(ytrain.shape)
print(xtest.shape)
print(ytest.shape)

(14560, 36)
(14560,)
(4854, 36)
(4854,)


In [97]:
def decimal_range(start, stop, increment):
    while start < stop: # and not math.isclose(start, stop): Py>3.5
        yield start
        start += increment

for i in range(5,15):
    for j in range(1, 5):
        # Creamos un clasificador y entrenamosytest
        gbm = xgb.XGBClassifier(n_estimators=i,max_depth=j, learning_rate=0.1).fit(xtrain, ytrain)
        prediccion = gbm.predict(xtest)
        print('estimators: ' + str(i) + ' depth: ' + str(j) + '-' + str(np.sqrt(metrics.mean_squared_error(ytest, prediccion))))

estimators: 5 depth: 1-0.23321263580292098
estimators: 5 depth: 2-0.23321263580292098
estimators: 5 depth: 3-0.23188377805826785
estimators: 5 depth: 4-0.23409435023623648
estimators: 6 depth: 1-0.23321263580292098
estimators: 6 depth: 2-0.23321263580292098
estimators: 6 depth: 3-0.23188377805826785
estimators: 6 depth: 4-0.23541072996806153
estimators: 7 depth: 1-0.23321263580292098
estimators: 7 depth: 2-0.23321263580292098
estimators: 7 depth: 3-0.23188377805826785
estimators: 7 depth: 4-0.23365390892372379
estimators: 8 depth: 1-0.23321263580292098
estimators: 8 depth: 2-0.23321263580292098
estimators: 8 depth: 3-0.23277052614314364
estimators: 8 depth: 4-0.23409435023623648
estimators: 9 depth: 1-0.23321263580292098
estimators: 9 depth: 2-0.23321263580292098
estimators: 9 depth: 3-0.23277052614314364
estimators: 9 depth: 4-0.23365390892372379
estimators: 10 depth: 1-0.23321263580292098
estimators: 10 depth: 2-0.23321263580292098
estimators: 10 depth: 3-0.23277052614314364
estimato