### =======================================================================
### IMPORTACIÓN DE DATOS.
### =======================================================================

In [61]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

In [62]:
training = ''
eventos = ''
dummies = ''
test = ''

In [63]:
## OBTENEMOS TODA LA INFORMACIÓN DEL SET DE ENTRENAMIENTO.
training = pd.read_csv('labels_training_set.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES EVENTOS.
eventos = pd.read_csv('events_up_to_01062018.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN A TESTEAR.
test = pd.read_csv('trocafone_kaggle_test.csv', encoding = 'utf-8')

In [64]:
## PASAMOS LAS COLUMNAS QUE TIENEN UNA CANTIDAD DE VALORES LIMITADA A UN TIPO CATEGORY
eventos['person'] = eventos['person'].astype('category')
eventos['event'] = eventos['event'].astype('category')
eventos['condition'] = eventos['condition'].astype('category')
eventos['storage'] = eventos['storage'].astype('category')
eventos['search_engine'] = eventos['search_engine'].astype('category')
eventos['channel'] = eventos['channel'].astype('category')
eventos['new_vs_returning'] = eventos['new_vs_returning'].astype('category')
eventos['device_type'] = eventos['device_type'].astype('category')
eventos['color'] = eventos['color'].astype('category')
eventos['region'] = eventos['region'].astype('category')
eventos['country'] = eventos['country'].astype('category')
eventos['operating_system_version'] = eventos['operating_system_version'].astype('category')
eventos['city'] = eventos['city'].astype('category')
eventos['browser_version'] = eventos['browser_version'].astype('category')
eventos['screen_resolution'] = eventos['screen_resolution'].astype('category')
eventos['model'] = eventos['model'].astype('category')
eventos['timestamp'] = pd.to_datetime(eventos['timestamp'], infer_datetime_format=True)

### =======================================================================
### ARMADO DE FEATURES.
### =======================================================================

In [65]:
## ORDENAMOS LOS DATOS ṔOR PERSONAS EN PRIMER LUGAR Y TIEMPO EN SEGUNDO.
eventos.sort_values(['person', 'timestamp'], ascending=[True, True], inplace=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
eventos['mes'] = eventos['timestamp'].dt.month
eventos['dia'] = eventos['timestamp'].dt.day
eventos['hora'] = eventos['timestamp'].dt.hour
## ARMAMOS UNA COLUMNA PARA EL DÍA DE LA SEMANA COMO NOMBRE.
eventos['diasemana'] = eventos['timestamp'].dt.weekday_name
## PONEMOS LOS NOMBRES DE MANERA MÁS PROLIJA PARA LOS GRÁFICOS.
eventos.loc[eventos.diasemana.str.contains('Monday', na=False), 'diasemana'] = 'lunes'
eventos.loc[eventos.diasemana.str.contains('Tuesday', na=False), 'diasemana'] = 'martes'
eventos.loc[eventos.diasemana.str.contains('Wednesday', na=False), 'diasemana'] = 'miercoles'
eventos.loc[eventos.diasemana.str.contains('Thursday', na=False), 'diasemana'] = 'jueves'
eventos.loc[eventos.diasemana.str.contains('Friday', na=False), 'diasemana'] = 'viernes'
eventos.loc[eventos.diasemana.str.contains('Saturday', na=False), 'diasemana'] = 'sabado'
eventos.loc[eventos.diasemana.str.contains('Sunday', na=False), 'diasemana'] = 'domingo'
# DEFINIMOS UNA LÓGICA PARA INDICAR SI EL DÍA EN QUE SE EJECUTA EL EVENTO ES FIN DE SEMANA.
eventos['finde'] = 0
eventos.loc[(eventos.diasemana.str.contains('DOM', na=False) | eventos.diasemana.str.contains('SAB', na=False)), 'finde'] = 1
# DEFINIMOS EL MES COMO NOMBRE PARA FACILITAR LAS COLUMNAS
eventos['mesMayus'] = ''
eventos.loc[eventos.mes == 1, 'mesMayus'] = 'enero'
eventos.loc[eventos.mes == 2, 'mesMayus'] = 'febrero'
eventos.loc[eventos.mes == 3, 'mesMayus'] = 'marzo'
eventos.loc[eventos.mes == 4, 'mesMayus'] = 'abril'
eventos.loc[eventos.mes == 5, 'mesMayus'] = 'mayo'
eventos.loc[eventos.mes == 6, 'mesMayus'] = 'junio'
eventos.loc[eventos.mes == 7, 'mesMayus'] = 'julio'
eventos.loc[eventos.mes == 8, 'mesMayus'] = 'agosto'
eventos.loc[eventos.mes == 9, 'mesMayus'] = 'septiembre'
eventos.loc[eventos.mes == 10, 'mesMayus'] = 'octubre'
eventos.loc[eventos.mes == 11, 'mesMayus'] = 'noviembre'
eventos.loc[eventos.mes == 12, 'mesMayus'] = 'diciembre'
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
eventos['hora_madrugada'] = 0
eventos.loc[((eventos.hora > -1) & (eventos.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
eventos['hora_mañana'] = 0
eventos.loc[((eventos.hora > 6) & (eventos.hora < 12)), 'hora_mañana'] = 1
# ALMUERZO de 12 a 13
eventos['hora_almuerzo'] = 0
eventos.loc[((eventos.hora > 11) & (eventos.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
eventos['hora_tarde'] = 0
eventos.loc[((eventos.hora > 13) & (eventos.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
eventos['hora_noche'] = 0
eventos.loc[((eventos.hora > 18) & (eventos.hora < 24)), 'hora_noche'] = 1
# TRANSFORMAMOS EN CATEGÓRICAS EL DÍA DE LA SEMANA Y EL MES.
eventos['diasemana'] = eventos['diasemana'].astype('category')
eventos['mesMayus'] = eventos['mesMayus'].astype('category')

In [66]:
eventos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2341681 entries, 1507286 to 1504503
Data columns (total 34 columns):
timestamp                   datetime64[ns]
event                       category
person                      category
url                         object
sku                         float64
model                       category
condition                   category
storage                     category
color                       category
skus                        object
search_term                 object
staticpage                  object
campaign_source             object
search_engine               category
channel                     category
new_vs_returning            category
city                        category
region                      category
country                     category
device_type                 category
screen_resolution           category
operating_system_version    category
browser_version             category
mes                         int64
d

In [67]:
# COLUMNAS DONDE POR CADA REGISTRO SABEMOS QUE TENEMOS UN VALOR (SIEMPRE PRESENTES)
dummies = pd.get_dummies(eventos['diasemana'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['mesMayus'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['event'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)

In [68]:
## ESTAS COLUMNAS NO APARECEN SIEMPRE, PUESTO QUE SOLO APARECEN SEGÚN EL TIPO DE EVENTO.
dummies = pd.get_dummies(eventos['color'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['model'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['condition'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['storage'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)

In [89]:
dummies = ''

In [111]:
eventos_filtrados = ''

#eventos = eventos[(eventos.person == '0008ed71')]
eventos_filtrados = eventos.iloc[:, 0:45] 
eventos_filtrados.head(50)

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,mes,dia,hora,diasemana,finde,mesMayus,hora_madrugada,hora_mañana,hora_almuerzo,hora_tarde,hora_noche,domingo,jueves,lunes,martes,miercoles,sabado,viernes,abril,enero,febrero,marzo
1507286,2018-05-17 12:27:47,checkout,0008ed71,,3372.0,Samsung Galaxy S6 Flat,Muito Bom,32GB,Dourado,,,,,,,,,,,,,,,5,17,12,jueves,0,mayo,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
2336760,2018-05-17 13:44:59,visited site,0008ed71,,,,,,,,,,,,Referral,New,Unknown,Unknown,Brazil,Computer,1920x1080,Windows 10,Chrome 66.0,5,17,13,jueves,0,mayo,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
1507716,2018-05-17 13:45:00,checkout,0008ed71,,8247.0,iPhone SE,Bom,64GB,Cinza espacial,,,,,,,,,,,,,,,5,17,13,jueves,0,mayo,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
2336761,2018-05-17 16:21:54,visited site,0008ed71,,,,,,,,,,,,Referral,Returning,Unknown,Unknown,Brazil,Computer,1920x1080,Windows 10,Chrome 66.0,5,17,16,jueves,0,mayo,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
2122051,2018-05-17 16:22:06,generic listing,0008ed71,,,,,,,"6594,6651,6664,7253,2820,6706,6721,12606,480,1...",,,,,,,,,,,,,,5,17,16,jueves,0,mayo,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
1505383,2018-05-17 16:28:37,checkout,0008ed71,,7505.0,LG G4 H818P,Bom,32GB,Preto,,,,,,,,,,,,,,,5,17,16,jueves,0,mayo,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
2146920,2018-05-03 22:08:29,visited site,00091926,,,,,,,,,,,,Direct,New,Carlos Barbosa,Rio Grande do Sul,Brazil,Computer,1024x768,Windows 7,Chrome 66.0,5,3,22,jueves,0,mayo,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
121981,2018-05-03 22:08:35,viewed product,00091926,,8568.0,Motorola Moto X Style,Muito Bom,32GB,Preto,,,,,,,,,,,,,,,5,3,22,jueves,0,mayo,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
120337,2018-05-03 22:08:51,viewed product,00091926,,14734.0,Samsung Galaxy A7 2017,Novo,32GB,Preto,,,,,,,,,,,,,,,5,3,22,jueves,0,mayo,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
120458,2018-05-03 22:09:25,viewed product,00091926,,8568.0,Motorola Moto X Style,Muito Bom,32GB,Preto,,,,,,,,,,,,,,,5,3,22,jueves,0,mayo,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0


In [112]:
columnas_filtrar = list(eventos_filtrados.select_dtypes(include=['int','float64','uint8']).columns)

columnas_filtrar.remove('sku')
#columnas_filtrar.remove('generic listing')
#columnas_filtrar.remove('staticpage')
#columnas_filtrar.remove('Azul Topázio')
columnas_filtrar.append('person')
asd = eventos_filtrados.loc[:, eventos_filtrados.columns.isin(columnas_filtrar)]
columnas_filtrar.remove('person')

# VOY A TRABAJAR CON UN CASO  TESTIGO.
pd.options.display.max_columns = 350

#asd.info()
asd.head(25)
eventos_agrupados = ''
eventos_agrupados = asd.groupby('person')[columnas_filtrar].sum().reset_index()
eventos_agrupados = eventos_agrupados[(eventos_agrupados.dia > 0)]
eventos_agrupados.head(15)

#asd.head()
#grouped_df = eventos.groupby('person').mean().reset_index()
#entrenar = pd.](grouped_df)

Unnamed: 0,person,mes,dia,hora,finde,hora_madrugada,hora_mañana,hora_almuerzo,hora_tarde,hora_noche,domingo,jueves,lunes,martes,miercoles,sabado,viernes,abril,enero,febrero,marzo
0,0008ed71,30,102,86,0,0,0,3,3,0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00091926,2240,7496,3139,0,313,0,0,50,85,131.0,84.0,32.0,43.0,40.0,32.0,86.0,0.0,0.0,0.0,0.0
2,00091a7a,30,260,140,0,0,0,0,10,0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
3,000ba417,1030,4586,2786,0,0,57,68,81,0,0.0,147.0,0.0,0.0,0.0,59.0,0.0,0.0,0.0,0.0,0.0
4,000c79fe,85,493,0,0,17,0,0,0,0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,000e4d9e,2055,8464,5510,0,0,27,273,109,2,0.0,29.0,27.0,140.0,207.0,0.0,8.0,0.0,0.0,0.0,0.0
6,000e619d,340,1149,1077,0,0,0,19,49,0,0.0,27.0,0.0,10.0,9.0,0.0,22.0,0.0,0.0,0.0,0.0
7,001001be,340,1097,1233,0,0,0,1,49,18,0.0,0.0,0.0,1.0,66.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0010e89a,20,120,72,0,0,0,0,4,0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0016c4b5,20,116,72,0,0,0,0,4,0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### =======================================================================
### ENTRENAMIENTO Y PREDICCIÓN.
### =======================================================================

In [129]:
## OBTENEMOS TODA LA INFORMACIÓN DEL SET DE ENTRENAMIENTO.
training = ''
training = pd.read_csv('labels_training_set.csv', encoding = 'utf-8')

train_completo = pd.merge(training, eventos_agrupados, on='person', how='left')
train_completo.dropna(subset=['person']) 
train_completo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19414 entries, 0 to 19413
Data columns (total 23 columns):
person            19414 non-null object
label_x           19414 non-null int64
mes               19414 non-null int64
dia               19414 non-null int64
hora              19414 non-null int64
finde             19414 non-null int64
hora_madrugada    19414 non-null int64
hora_mañana       19414 non-null int64
hora_almuerzo     19414 non-null int64
hora_tarde        19414 non-null int64
hora_noche        19414 non-null int64
domingo           19414 non-null float64
jueves            19414 non-null float64
lunes             19414 non-null float64
martes            19414 non-null float64
miercoles         19414 non-null float64
sabado            19414 non-null float64
viernes           19414 non-null float64
abril             19414 non-null float64
enero             19414 non-null float64
febrero           19414 non-null float64
marzo             19414 non-null float64
label_y   

In [130]:
training = ''

In [132]:
y = pd.factorize(train_completo['label'])[0]
train_completo = train_completo[['mes','dia','hora','finde','hora_madrugada','hora_mañana','hora_almuerzo','hora_tarde','hora_noche','domingo','jueves','lunes','martes','miercoles','sabado','viernes','abril','enero','febrero','marzo','mayo']]
# VER COMO OBTENER LAS COLUMNAS 
# ESPECÍFICAS QUE QUEREMOS TRABAJAR.
features = train_completo.columns

In [133]:
y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [134]:
# Creamos un clasificador con Random Forest..
clf = RandomForestClassifier(n_jobs=2, random_state=0)
# Entrenamos.
clf.fit(train_completo[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [135]:
# Predecimos.
eventos['label'] = 0
clf.predict(eventos[features])

ValueError: Expected 2D array, got 1D array instead:
array=[0. 0. 0. ... 0. 0. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [123]:
# REVISAR. todos dan iguales.
prueba = clf.predict_proba(eventos_agrupados[features])[0:10000]

In [124]:
y = np.unique(prueba)

In [125]:
y

array([0. , 0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9, 1. ])

In [126]:
prueba

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])