In [25]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)
pd.set_option('display.max_columns', 100)
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

In [12]:
## OBTENEMOS TODA LA INFORMACIÓN DEL SET DE ENTRENAMIENTO.
training = pd.read_csv('labels_training_set.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES EVENTOS.
eventos = pd.read_csv('events_up_to_01062018.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN A TESTEAR.
test = pd.read_csv('trocafone_kaggle_test.csv', encoding = 'utf-8')

In [20]:
training.head()

Unnamed: 0,person,label
0,0566e9c1,0
1,6ec7ee77,0
2,abe7a2fb,0
3,34728364,0
4,87ed62de,0


In [13]:
## PASAMOS LAS COLUMNAS QUE TIENEN UNA CANTIDAD DE VALORES LIMITADA A UN TIPO CATEGORY
eventos['event'] = eventos['event'].astype('category')
eventos['condition'] = eventos['condition'].astype('category')
eventos['storage'] = eventos['storage'].astype('category')
eventos['search_engine'] = eventos['search_engine'].astype('category')
eventos['channel'] = eventos['channel'].astype('category')
eventos['new_vs_returning'] = eventos['new_vs_returning'].astype('category')
eventos['device_type'] = eventos['device_type'].astype('category')
eventos['color'] = eventos['color'].astype('category')
eventos['region'] = eventos['region'].astype('category')
eventos['country'] = eventos['country'].astype('category')
eventos['operating_system_version'] = eventos['operating_system_version'].astype('category')
eventos['city'] = eventos['city'].astype('category')
eventos['browser_version'] = eventos['browser_version'].astype('category')
eventos['screen_resolution'] = eventos['screen_resolution'].astype('category')
eventos['timestamp'] = pd.to_datetime(eventos['timestamp'], infer_datetime_format=True)

In [14]:
## ORDENAMOS LOS DATOS ṔOR PERSONAS EN PRIMER LUGAR Y TIEMPO EN SEGUNDO.
eventos.sort_values(['person', 'timestamp'], ascending=[True, True], inplace=True)

In [15]:
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
eventos['mes'] = eventos['timestamp'].dt.month
eventos['dia'] = eventos['timestamp'].dt.day
eventos['hora'] = eventos['timestamp'].dt.hour

In [16]:
## ARMAMOS UNA COLUMNA PARA EL DÍA DE LA SEMANA COMO NOMBRE.
eventos['diasemana'] = eventos['timestamp'].dt.weekday_name
## PONEMOS LOS NOMBRES DE MANERA MÁS PROLIJA PARA LOS GRÁFICOS.
eventos.loc[eventos.diasemana.str.contains('Monday', na=False), 'diasemana'] = 'lunes'
eventos.loc[eventos.diasemana.str.contains('Tuesday', na=False), 'diasemana'] = 'martes'
eventos.loc[eventos.diasemana.str.contains('Wednesday', na=False), 'diasemana'] = 'miercoles'
eventos.loc[eventos.diasemana.str.contains('Thursday', na=False), 'diasemana'] = 'jueves'
eventos.loc[eventos.diasemana.str.contains('Friday', na=False), 'diasemana'] = 'viernes'
eventos.loc[eventos.diasemana.str.contains('Saturday', na=False), 'diasemana'] = 'sabado'
eventos.loc[eventos.diasemana.str.contains('Sunday', na=False), 'diasemana'] = 'domingo'

In [17]:
# AGREGAMOS UNA COLUMNA PARA INDICAR SI EL EVENTO OCURRIO UN FIN DE SEMANA
eventos['evento_en_finde'] = 0
eventos.loc[(eventos.diasemana.str.contains('DOM', na=False) | eventos.diasemana.str.contains('SAB', na=False)), 'evento_en_finde'] = 1

In [18]:
# DEFINIMOS EL MES COMO NOMBRE PARA FACILITAR LAS COLUMNAS
eventos['mesMayus'] = ''
eventos.loc[eventos.mes == 1, 'mesMayus'] = 'enero'
eventos.loc[eventos.mes == 2, 'mesMayus'] = 'febrero'
eventos.loc[eventos.mes == 3, 'mesMayus'] = 'marzo'
eventos.loc[eventos.mes == 4, 'mesMayus'] = 'abril'
eventos.loc[eventos.mes == 5, 'mesMayus'] = 'mayo'
eventos.loc[eventos.mes == 6, 'mesMayus'] = 'junio'
eventos.loc[eventos.mes == 7, 'mesMayus'] = 'julio'
eventos.loc[eventos.mes == 8, 'mesMayus'] = 'agosto'
eventos.loc[eventos.mes == 9, 'mesMayus'] = 'septiembre'
eventos.loc[eventos.mes == 10, 'mesMayus'] = 'octubre'
eventos.loc[eventos.mes == 11, 'mesMayus'] = 'noviembre'
eventos.loc[eventos.mes == 12, 'mesMayus'] = 'diciembre'

In [19]:
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
eventos['hora_madrugada'] = 0
eventos.loc[((eventos.hora > -1) & (eventos.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
eventos['hora_mañana'] = 0
eventos.loc[((eventos.hora > 6) & (eventos.hora < 12)), 'hora_mañana'] = 1
# ALMUERZO de 12 a 13
eventos['hora_almuerzo'] = 0
eventos.loc[((eventos.hora > 11) & (eventos.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
eventos['hora_tarde'] = 0
eventos.loc[((eventos.hora > 13) & (eventos.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
eventos['hora_noche'] = 0
eventos.loc[((eventos.hora > 18) & (eventos.hora < 24)), 'hora_noche'] = 1
# TRANSFORMAMOS EN CATEGÓRICAS EL DÍA DE LA SEMANA Y EL MES.
eventos['diasemana'] = eventos['diasemana'].astype('category')
eventos['mesMayus'] = eventos['mesMayus'].astype('category')

In [21]:
dummies_diasemana = pd.get_dummies(eventos['diasemana'], drop_first=False)
eventos = pd.concat([eventos, dummies_diasemana], axis=1)
dummies_mesMayus = pd.get_dummies(eventos['mesMayus'], drop_first=False)
eventos = pd.concat([eventos, dummies_mesMayus], axis=1)

In [27]:
train_completo = pd.merge(training, eventos, on='person', how='left')
train_completo.dropna(subset=['person']) 
#train_completo.info()

Unnamed: 0,person,label,timestamp,event,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,mes,dia,hora,diasemana,evento_en_finde,mesMayus,hora_madrugada,hora_mañana,hora_almuerzo,hora_tarde,hora_noche,domingo,jueves,lunes,martes,miercoles,sabado,viernes,abril,enero,febrero,marzo,mayo
0,0566e9c1,0,2018-05-22 17:54:26,visited site,,,,,,,,,,,,Organic,New,Fazenda Rio Grande,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11,5,22,17,martes,0,mayo,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
1,0566e9c1,0,2018-05-22 17:54:27,search engine hit,,,,,,,,,,,Google,,,,,,,,,,5,22,17,martes,0,mayo,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
2,0566e9c1,0,2018-05-22 17:54:27,generic listing,,,,,,,"6594,6636,6649,2820,6707,2750,12618,11346,725,...",,,,,,,,,,,,,,5,22,17,martes,0,mayo,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
3,0566e9c1,0,2018-05-22 17:54:43,viewed product,,6023.0,iPhone 5s,Bom - Sem Touch ID,16GB,Dourado,,,,,,,,,,,,,,,5,22,17,martes,0,mayo,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
4,0566e9c1,0,2018-05-22 17:54:54,viewed product,,6011.0,iPhone 5s,Bom - Sem Touch ID,16GB,Prateado,,,,,,,,,,,,,,,5,22,17,martes,0,mayo,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
5,0566e9c1,0,2018-05-22 17:55:05,viewed product,,2691.0,iPhone 5s,Bom,16GB,Prateado,,,,,,,,,,,,,,,5,22,17,martes,0,mayo,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
6,0566e9c1,0,2018-05-22 17:55:13,viewed product,,2693.0,iPhone 5s,Bom,32GB,Prateado,,,,,,,,,,,,,,,5,22,17,martes,0,mayo,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
7,0566e9c1,0,2018-05-22 17:56:47,brand listing,,,,,,,"2694,6001,6023,2711,6930,5904,10294,2833,6011,...",,,,,,,,,,,,,,5,22,17,martes,0,mayo,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
8,0566e9c1,0,2018-05-22 17:57:43,viewed product,,2679.0,iPhone 4S,Bom,8GB,Preto,"3371,6357,6371,2777,2718,10896,3191,2694,6791,...",,,,,,,,,,,,,,5,22,17,martes,0,mayo,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
9,0566e9c1,0,2018-05-22 17:57:54,viewed product,,2680.0,iPhone 4S,Bom,8GB,Branco,,,,,,,,,,,,,,,5,22,17,martes,0,mayo,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1


In [29]:
y = pd.factorize(train_completo['label'])[0]
train_completo = train_completo[['mes','dia','hora','evento_en_finde','hora_madrugada','hora_mañana','hora_almuerzo','hora_tarde','hora_noche','domingo','jueves','lunes','martes','miercoles','sabado','viernes','abril','enero','febrero','marzo','mayo']]
# VER COMO OBTENER LAS COLUMNAS 
# ESPECÍFICAS QUE QUEREMOS TRABAJAR.
features = train_completo.columns

In [31]:
features

Index(['mes', 'dia', 'hora', 'evento_en_finde', 'hora_madrugada',
       'hora_mañana', 'hora_almuerzo', 'hora_tarde', 'hora_noche', 'domingo',
       'jueves', 'lunes', 'martes', 'miercoles', 'sabado', 'viernes', 'abril',
       'enero', 'febrero', 'marzo', 'mayo'],
      dtype='object')

In [32]:
# Creamos un clasificador con Random Forest..
clf = RandomForestClassifier(n_jobs=2, random_state=0)
# Entrenamos.
clf.fit(train_completo[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [33]:
# Predecimos.
eventos['label'] = 0
clf.predict(eventos[features])

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [34]:
# REVISAR. todos dan iguales.
prueba = clf.predict_proba(eventos[features])[0:10000]

In [35]:
y = np.unique(prueba)

(1298,)