In [39]:
## GENERAL LIBRARY IMPORTS AND DATA-VISUALIZATION SETUP (matplotlib and seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): this silences every warning category for the whole session,
# including deprecation warnings from pandas / scikit-learn. Consider narrowing
# the filter to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')
# Start from matplotlib's default style, then apply seaborn's whitegrid theme.
plt.style.use('default') 
sns.set(style="whitegrid") 
# Default figure size (width, height) in inches for the plots in this notebook.
plt.rcParams['figure.figsize'] = (15, 10)
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

In [40]:
def _read_utf8_csv(path):
    """Read the CSV file at `path` into a DataFrame, decoding it as UTF-8."""
    return pd.read_csv(path, encoding='utf-8')


## Training set (loaded from the labels file).
training = _read_utf8_csv('labels_training_set.csv')
## Full log of the different events.
eventos = _read_utf8_csv('events_up_to_01062018.csv')
## Data to be tested / scored.
test = _read_utf8_csv('trocafone_kaggle_test.csv')

In [41]:
## Columns with a limited set of distinct values are stored as 'category' dtype.
category_columns = [
    'event',
    'condition',
    'storage',
    'search_engine',
    'channel',
    'new_vs_returning',
    'device_type',
    'color',
    'region',
    'country',
    'operating_system_version',
    'city',
    'browser_version',
    'screen_resolution',
]
for col_name in category_columns:
    eventos[col_name] = eventos[col_name].astype('category')

# Convert the timestamp column to datetime so the .dt accessor can be used on it.
eventos['timestamp'] = pd.to_datetime(eventos['timestamp'], infer_datetime_format=True)

In [48]:
# Left join on 'person': every row of `training` is kept and paired with each
# matching row of `eventos`, so the label is repeated once per event.
train_completo = training.merge(eventos, how='left', on='person')
# Summarise columns, dtypes and non-null counts of the merged frame.
train_completo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1171886 entries, 0 to 1171885
Data columns (total 24 columns):
person                      1171886 non-null object
label                       1171886 non-null int64
timestamp                   1171886 non-null datetime64[ns]
event                       1171886 non-null category
url                         94875 non-null object
sku                         665336 non-null float64
model                       665767 non-null object
condition                   665336 non-null category
storage                     665336 non-null category
color                       665336 non-null category
skus                        249587 non-null object
search_term                 55774 non-null object
staticpage                  5660 non-null object
campaign_source             94940 non-null object
search_engine               52829 non-null category
channel                     102299 non-null category
new_vs_returning            102299 non-null category


In [49]:
## Derive calendar features from each event's timestamp.
# NOTE(review): despite its name ("day of week"), 'dia_semana' stores the day of
# the month (.dt.day). If the weekday is what is wanted, use .dt.dayofweek --
# and apply the same change to the matching derivation on `eventos` that feeds
# the prediction step, so training and scoring features stay consistent.
event_time = train_completo['timestamp'].dt
train_completo['dia_semana'] = event_time.day
train_completo['mes'] = event_time.month
train_completo['anio'] = event_time.year


In [50]:
# Re-inspect the merged frame after the derived date columns were added.
train_completo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1171886 entries, 0 to 1171885
Data columns (total 27 columns):
person                      1171886 non-null object
label                       1171886 non-null int64
timestamp                   1171886 non-null datetime64[ns]
event                       1171886 non-null category
url                         94875 non-null object
sku                         665336 non-null float64
model                       665767 non-null object
condition                   665336 non-null category
storage                     665336 non-null category
color                       665336 non-null category
skus                        249587 non-null object
search_term                 55774 non-null object
staticpage                  5660 non-null object
campaign_source             94940 non-null object
search_engine               52829 non-null category
channel                     102299 non-null category
new_vs_returning            102299 non-null category


In [52]:
# Target vector. `label` is already an integer column, so its values are used
# as-is. pd.factorize numbers classes by order of first appearance, which can
# silently renumber them (e.g. label 1 becomes code 0 if it happens to appear
# first), making predictions and predict_proba columns misleading.
y = train_completo['label'].values
# Keep only the calendar features as model inputs.
# NOTE(review): this rebinding drops every other column (including 'label'),
# so this cell cannot be re-run without rebuilding train_completo first.
train_completo = train_completo[['dia_semana', 'mes', 'anio']]
# TODO: work out how to select the specific columns we want to work with.
features = train_completo.columns

In [53]:
# Peek at the target vector (long arrays are displayed with the middle elided).
y

array([0, 0, 0, ..., 0, 0, 0])

In [54]:
# Build a random forest classifier with a fixed seed, using two parallel jobs.
clf = RandomForestClassifier(
    random_state=0,
    n_jobs=2,
)

# Fit the classifier on the selected feature columns against the target
# vector `y`. Left as the last expression so the fitted estimator is displayed.
clf.fit(train_completo[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [55]:
# Add the same calendar features to the raw event log so it exposes the
# columns the classifier was trained on.
# NOTE(review): despite its name ("day of week"), 'dia_semana' stores the day of
# the month (.dt.day). It must stay identical to the derivation used to build
# the training features; change both together if the weekday is intended.
event_time = eventos['timestamp'].dt
eventos['dia_semana'] = event_time.day
eventos['mes'] = event_time.month
eventos['anio'] = event_time.year

In [56]:
# Predict a class for every row of the event log using the trained feature columns.
# NOTE(review): this scores all rows of `eventos` (one prediction per event);
# the `test` frame loaded earlier is not used here -- confirm this is intended.
clf.predict(eventos[features])

array([0, 0, 0, ..., 0, 0, 0])

In [58]:
# TODO: review these probabilities.
# Class probabilities for the first 150 events. The rows are sliced *before*
# calling predict_proba: each row is scored independently, so the output is the
# same as scoring the whole event log and slicing afterwards, without the cost
# of predicting every row just to display a sample.
clf.predict_proba(eventos.head(150)[features])

array([[0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.95189959, 0.04810041],
       [0.