In [1]:
import numpy as np  # Librería para aplicar álgebra lineal
import pandas as pd # Para manejar los datos (datasets)

import matplotlib.pyplot as plt # Para las visualizaciones
import seaborn as sns # Visualizaciones más fancy

from sklearn import preprocessing # Para el procesador de los datos
from sklearn.preprocessing import Imputer # Para adoptar una estrategia para los missing values
from sklearn.preprocessing import LabelEncoder as Codificar # Para codificar variables categóricas
from sklearn.preprocessing import OneHotEncoder # Para pasar el LaberEncoder vector a OneHot matriz
from sklearn.preprocessing import MinMaxScaler # Para realizar el escalado en escala (0-1)
from sklearn.model_selection import train_test_split as Separar # Para dividir en los 2 conjuntos
#from statsmodels.tools.eval_measures import rmse # Para calcular el error
from sklearn.metrics import confusion_matrix as CM # Para construir la matriz de confusión
from matplotlib.colors import ListedColormap as Colors # Para pintar las regiones en Clasificación

In [2]:
#Se especifican los tipo de datos para mejorar la performance
eventos = pd.read_csv('/home/nicolas/Descargas/fiuba-trocafone-tp2-final-set/events_up_to_01062018.csv', 
                      dtype={"timestamp": object,#intenté hacerla datetime y no anduvo 
                             "event": 'category',
                             "person":object,
                             "url":object,
                             "sku":object,
                             "model":object,
                             "condition":'category',
                             "storage":object,
                             "color":'category',
                             "skus":object,
                             "search_term":object,
                             "staticpage":object,
                             "campaign_source":object,
                             "search_engine":object,
                             "channel":object,
                             "new_vs_returning":'category',
                             "city":object,
                             "region":object,
                             "country":object,
                             "device_type":object,
                             "screen_resolution":object,
                             "operating_system_version":object,
                             "browser_version":object})
etiquetas = pd.read_csv('/home/nicolas/Descargas/fiuba-trocafone-tp2-final-set/labels_training_set.csv',
                        dtype={"person":object, "label":bool})

In [3]:
etiquetas['label'].value_counts()

False    18434
True       980
Name: label, dtype: int64

In [4]:
eventos.head(3)

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,


In [5]:
len(eventos['person'].unique()) - len(etiquetas)

19415

In [6]:
print(etiquetas['label'].value_counts())
print(etiquetas['label'].value_counts(normalize=True))

False    18434
True       980
Name: label, dtype: int64
False    0.949521
True     0.050479
Name: label, dtype: float64


In [7]:
eventos['timestamp'] =  pd.to_datetime(eventos['timestamp'])
eventos[['marca','modelo']] = eventos['model'].dropna().str.split(' ',n=1,expand=True)
eventos['weekday']=eventos['timestamp'].dt.day_name()
eventos['hour']=eventos['timestamp'].dt.hour
eventos['month']=eventos['timestamp'].dt.month
eventos['day']=eventos['timestamp'].dt.day

In [8]:
print('Cantidad de meses distintos:',len(eventos['month'].value_counts()))
eventos['month'].value_counts()

('Cantidad de meses distintos:', 5)


5    1713920
4     309849
3     193790
2      73541
1      50581
Name: month, dtype: int64

In [9]:
list(eventos)

['timestamp',
 'event',
 'person',
 'url',
 'sku',
 'model',
 'condition',
 'storage',
 'color',
 'skus',
 'search_term',
 'staticpage',
 'campaign_source',
 'search_engine',
 'channel',
 'new_vs_returning',
 'city',
 'region',
 'country',
 'device_type',
 'screen_resolution',
 'operating_system_version',
 'browser_version',
 'marca',
 'modelo',
 'weekday',
 'hour',
 'month',
 'day']

In [57]:
device=eventos.groupby('person')['device_type'].value_counts(normalize=True).unstack().reset_index()
device.head()

device_type,person,Computer,Smartphone,Tablet,Unknown
0,0008ed71,1.0,,,
1,00091926,1.0,,,
2,00091a7a,,1.0,,
3,000ba417,1.0,,,
4,000c79fe,,1.0,,


In [94]:
device2=eventos.groupby('person')['device_type'].value_counts().unstack().reset_index()
device2.head()

device_type,person,Computer,Smartphone,Tablet,Unknown
0,0008ed71,2.0,,,
1,00091926,34.0,,,
2,00091a7a,,1.0,,
3,000ba417,6.0,,,
4,000c79fe,,1.0,,


In [168]:
color=eventos.groupby('person')['color'].value_counts(normalize=True).unstack().reset_index()
color.head()

color,person,Amarelo,Ametista,Azul,Azul Escuro,Azul Safira,Azul Topázio,Bambu,Black Piano,Branco,...,Rouge,Roxo,Silver,Titânio,Turquesa,Verde,Verde Petroleo,Verde Água,Vermelho,Ônix
0,0008ed71,,,,,,,,,,...,,,,,,,,,,
1,00091926,,0.005348,0.008021,0.005348,,,0.002674,0.002674,0.008021,...,,,,,,0.002674,,0.002674,0.005348,
2,00091a7a,,,,,,,,,,...,,,,,,,,,,
3,000ba417,,,0.025,,,,0.01875,,0.09375,...,0.00625,,,,,0.0125,,,0.00625,
4,000c79fe,,,,,,,,,,...,,,,,,,,,,


In [169]:
color2=eventos.groupby('person')['color'].value_counts().unstack().reset_index()
color2.head()

color,person,Amarelo,Ametista,Azul,Azul Escuro,Azul Safira,Azul Topázio,Bambu,Black Piano,Branco,...,Rouge,Roxo,Silver,Titânio,Turquesa,Verde,Verde Petroleo,Verde Água,Vermelho,Ônix
0,0008ed71,,,,,,,,,,...,,,,,,,,,,
1,00091926,,2.0,3.0,2.0,,,1.0,1.0,3.0,...,,,,,,1.0,,1.0,2.0,
2,00091a7a,,,,,,,,,,...,,,,,,,,,,
3,000ba417,,,4.0,,,,3.0,,15.0,...,1.0,,,,,2.0,,,1.0,
4,000c79fe,,,,,,,,,,...,,,,,,,,,,


In [173]:
condition=eventos.groupby('person')['condition'].value_counts(normalize=True).unstack().reset_index()
condition.head()

condition,person,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo
0,0008ed71,0.666667,,,0.333333,
1,00091926,0.272727,,0.28877,0.435829,0.002674
2,00091a7a,0.333333,,0.666667,,
3,000ba417,0.71875,,0.06875,0.2125,
4,000c79fe,1.0,,,,


In [174]:
condition2=eventos.groupby('person')['condition'].value_counts().unstack().reset_index()
condition2.head()

condition,person,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo
0,0008ed71,2.0,,,1.0,
1,00091926,102.0,,108.0,163.0,1.0
2,00091a7a,1.0,,2.0,,
3,000ba417,115.0,,11.0,34.0,
4,000c79fe,4.0,,,,


In [58]:
weekd=eventos.groupby('person')['weekday'].value_counts(normalize=True).unstack().reset_index().fillna(0)
weekd.head()

weekday,person,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,0008ed71,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,00091926,0.191964,0.071429,0.071429,0.292411,0.1875,0.095982,0.089286
2,00091a7a,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,000ba417,0.0,0.0,0.286408,0.0,0.713592,0.0,0.0
4,000c79fe,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [95]:
weekd2=eventos.groupby('person')['weekday'].value_counts().unstack().reset_index().fillna(0)
weekd2.head()

weekday,person,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,0008ed71,0.0,0.0,0.0,0.0,6.0,0.0,0.0
1,00091926,86.0,32.0,32.0,131.0,84.0,43.0,40.0
2,00091a7a,0.0,10.0,0.0,0.0,0.0,0.0,0.0
3,000ba417,0.0,0.0,59.0,0.0,147.0,0.0,0.0
4,000c79fe,0.0,0.0,0.0,0.0,0.0,17.0,0.0


In [59]:
dias=eventos.groupby('person')['day'].value_counts(normalize=True).unstack().reset_index()
dias.head()

day,person,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0008ed71,,,,,,,,,,...,,,,,,,,,,
1,00091926,,,0.015625,,0.044643,0.013393,0.024554,0.013393,0.006696,...,0.026786,0.080357,,,0.022321,0.040179,,0.006696,,0.035714
2,00091a7a,,,,,,,,,,...,,,,,1.0,,,,,
3,000ba417,,,,,,,,,,...,,,0.383495,,0.286408,,,,,
4,000c79fe,,,,,,,,,,...,,,,,,,,1.0,,


In [96]:
dias2=eventos.groupby('person')['day'].value_counts().unstack().reset_index()
dias2.head()

day,person,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0008ed71,,,,,,,,,,...,,,,,,,,,,
1,00091926,,,7.0,,20.0,6.0,11.0,6.0,3.0,...,12.0,36.0,,,10.0,18.0,,3.0,,16.0
2,00091a7a,,,,,,,,,,...,,,,,10.0,,,,,
3,000ba417,,,,,,,,,,...,,,79.0,,59.0,,,,,
4,000c79fe,,,,,,,,,,...,,,,,,,,17.0,,


In [374]:
suma=weekd.merge(device,on='person',how='left')
#suma=suma.merge(weekd2,on='person',how='left')
suma=suma.merge(dias,on='person',how='left')
#suma=suma.merge(device2,on='person',how='left')
#suma=suma.merge(dias2,on='person',how='left')
suma=suma.merge(color,on='person',how='left')
#suma=suma.merge(color2,on='person',how='left')
suma=suma.merge(condition,on='person',how='left')
#suma=suma.merge(condition2,on='person',how='left')
suma=suma.fillna(0)
suma.head()

Unnamed: 0,person,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Computer,Smartphone,...,Verde,Verde Petroleo,Verde Água,Vermelho,Ônix,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo
0,0008ed71,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.333333,0.0
1,00091926,0.191964,0.071429,0.071429,0.292411,0.1875,0.095982,0.089286,1.0,0.0,...,0.002674,0.0,0.002674,0.005348,0.0,0.272727,0.0,0.28877,0.435829,0.002674
2,00091a7a,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.666667,0.0,0.0
3,000ba417,0.0,0.0,0.286408,0.0,0.713592,0.0,0.0,1.0,0.0,...,0.0125,0.0,0.0,0.00625,0.0,0.71875,0.0,0.06875,0.2125,0.0
4,000c79fe,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [375]:
eventos.groupby('person')['month'].value_counts().unstack().reset_index().fillna(0).head()

month,person,1,2,3,4,5
0,0008ed71,0.0,0.0,0.0,0.0,6.0
1,00091926,0.0,0.0,0.0,0.0,448.0
2,00091a7a,0.0,0.0,10.0,0.0,0.0
3,000ba417,0.0,0.0,0.0,0.0,206.0
4,000c79fe,0.0,0.0,0.0,0.0,17.0


In [376]:
mesesGrp = eventos.groupby('person')['month'].value_counts().unstack().reset_index()
mesesGrp.rename({1:'total_enero',2:'total_febrero',3:'total_marzo',4:'total_abril',5:'total_mayo'},inplace=True, axis=1)
print('Cantidad de elementos de mesesGrp:',len(mesesGrp))
mesesGrp.head(3)

('Cantidad de elementos de mesesGrp:', 38829)


month,person,total_enero,total_febrero,total_marzo,total_abril,total_mayo
0,0008ed71,,,,,6.0
1,00091926,,,,,448.0
2,00091a7a,,,10.0,,


In [377]:
eventosGrp  = eventos.groupby('person')['event'].value_counts().unstack().reset_index()
eventosGrp.columns = eventosGrp.columns.map(lambda x: x if x == 'person' else 'total ' + x)
print('Cantidad de elementos de eventosGrp:',len(eventosGrp))
eventosGrp.head(3)

('Cantidad de elementos de eventosGrp:', 38829)


event,person,total ad campaign hit,total brand listing,total checkout,total conversion,total generic listing,total lead,total search engine hit,total searched products,total staticpage,total viewed product,total visited site
0,0008ed71,,,3.0,,1.0,,,,,,2.0
1,00091926,15.0,25.0,2.0,,,,,,,372.0,34.0
2,00091a7a,1.0,5.0,,,,,,,,3.0,1.0


In [378]:
eventosNormalizados  = eventos.groupby('person')['event'].value_counts(normalize=True).unstack().reset_index()
eventosNormalizados.columns = eventosNormalizados.columns.map(lambda x: x if x == 'person' else 'promedio ' + x)
eventosNormalizados.head(3)

event,person,promedio ad campaign hit,promedio brand listing,promedio checkout,promedio conversion,promedio generic listing,promedio lead,promedio search engine hit,promedio searched products,promedio staticpage,promedio viewed product,promedio visited site
0,0008ed71,,,0.5,,0.166667,,,,,,0.333333
1,00091926,0.033482,0.055804,0.004464,,,,,,,0.830357,0.075893
2,00091a7a,0.1,0.5,,,,,,,,0.3,0.1


In [379]:
#Agrupamos los primeros features, de mes y evento
clientesGrp = pd.merge(eventosGrp,mesesGrp,on='person')
clientesGrp = clientesGrp.merge(eventosNormalizados,on='person',how='left')
clientesGrp = clientesGrp.merge(suma,on='person',how='left')
print('Cantidad de elementos de clientesGrp:',len(clientesGrp))
clientesGrp.head(3)

('Cantidad de elementos de clientesGrp:', 38829)


Unnamed: 0,person,total ad campaign hit,total brand listing,total checkout,total conversion,total generic listing,total lead,total search engine hit,total searched products,total staticpage,...,Verde,Verde Petroleo,Verde Água,Vermelho,Ônix,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo
0,0008ed71,,,3.0,,1.0,,,,,...,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.333333,0.0
1,00091926,15.0,25.0,2.0,,,,,,,...,0.002674,0.0,0.002674,0.005348,0.0,0.272727,0.0,0.28877,0.435829,0.002674
2,00091a7a,1.0,5.0,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.666667,0.0,0.0


In [380]:
marcasGrp = eventos.groupby('person')['marca'].value_counts(normalize=True).unstack().reset_index()
marcasGrp.columns = marcasGrp.columns.map(lambda x: x if x == 'person' else 'total marca ' + x)
marcasGrp.head(3)

marca,person,total marca Asus,total marca LG,total marca Lenovo,total marca Motorola,total marca Outros,total marca Quantum,total marca Samsung,total marca Sony,total marca Xiaomi,total marca iPad,total marca iPhone
0,0008ed71,,0.333333,,,,,0.333333,,,,0.333333
1,00091926,,0.005348,0.002674,0.147059,,,0.163102,0.002674,,0.002674,0.676471
2,00091a7a,,,,,,,,,,,1.0


In [381]:
#A los features ya obtenidos le agregamos las marcas de los modelos consultados. Importante el left join!
#clientesGrp = clientesGrp.merge(marcasGrp,on='person',how='left')
clientesGrp.head(5)

Unnamed: 0,person,total ad campaign hit,total brand listing,total checkout,total conversion,total generic listing,total lead,total search engine hit,total searched products,total staticpage,...,Verde,Verde Petroleo,Verde Água,Vermelho,Ônix,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo
0,0008ed71,,,3.0,,1.0,,,,,...,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.333333,0.0
1,00091926,15.0,25.0,2.0,,,,,,,...,0.002674,0.0,0.002674,0.005348,0.0,0.272727,0.0,0.28877,0.435829,0.002674
2,00091a7a,1.0,5.0,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.666667,0.0,0.0
3,000ba417,1.0,24.0,6.0,1.0,14.0,,1.0,,,...,0.0125,0.0,0.0,0.00625,0.0,0.71875,0.0,0.06875,0.2125,0.0
4,000c79fe,1.0,,1.0,,1.0,,1.0,9.0,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [382]:
eventos['storage'].value_counts()

16GB     442096
32GB     430283
64GB     228413
128GB     99345
8GB       96019
256GB     17623
4GB        5409
512MB      1342
Name: storage, dtype: int64

In [383]:
#Creamos los features del almacenamiento interno de los dispositivos consultados
almacenamientoGrp = eventos.groupby('person')['storage'].value_counts(normalize=True).unstack().reset_index()
almacenamientoGrp.columns = almacenamientoGrp.columns.map(lambda x: x if x == 'person' else 'total ' + x)
#Agregamos estos features
clientesGrp = clientesGrp.merge(almacenamientoGrp, how='left', on='person')

In [384]:
browser_versions = pd.Series(eventos.browser_version.fillna('').unique())
print(browser_versions)
print(len(browser_versions))

0                                     
1                   Chrome Mobile 66.0
2                           Firefox 57
3                          Chrome 66.0
4                   Chrome Mobile 34.0
5                   Chrome Mobile 65.0
6                         Facebook 172
7                         Facebook 173
8                     Mobile Safari 11
9                          Chrome 65.0
10                         Chrome 67.0
11                         Safari 11.1
12                          Firefox 60
13                          Firefox 59
14                               IE 11
15                    Chrome Mobile 39
16                  Chrome Mobile 56.0
17                         Chrome 63.0
18                         Chrome 64.0
19                  Chrome Mobile 64.0
20                     Mobile Safari 9
21                         Chrome 62.0
22                        Facebook 166
23                        Facebook 163
24                  Chrome Mobile 63.0
25                       

In [385]:
print (browser_versions[1])
print (browser_versions[1].rsplit(' ', 1)[0])
#eliminarUltimaPalabra(browser_versions[1])
browser_versions[1].rsplit(' ', 1)[0]

Chrome Mobile 66.0
Chrome Mobile


'Chrome Mobile'

In [386]:
#Son muchos navegadores, pero mayormente cambia la versión utilizada. 
#Vamos a ver cuantos quedan sacando el número de versión, a ver si tiene sentido para usarlo luego como feature

In [387]:
browsers_sin_version = browser_versions.map(lambda x: x.rsplit(' ', 1)[0])
print(len(browsers_sin_version.unique()))
browsers_sin_version.value_counts()

31


Chrome                        69
Facebook                      55
Chrome Mobile                 45
Firefox                       38
Opera Mini                    17
Mobile Safari                 17
Samsung Internet              16
Safari                        12
Opera                         12
Android                        9
UC Browser                     9
Chrome Mobile iOS              8
Opera Mobile                   8
Edge                           7
Chromium                       7
Firefox Mobile                 6
Mobile Safari UI/WKWebView     6
Edge Mobile                    4
Puffin                         3
Yandex Browser                 3
IE                             3
Vivaldi                        2
IE Mobile                      2
Maxthon                        2
BlackBerry WebKit              1
BingPreview                    1
Other                          1
K-Meleon                       1
WebKit Nightly                 1
Pinterest                      1
          

#### Seguir con feature engeneering, agregando features. Algunas ideas: 
*    Modelo más consultado
*    dividir el mes en 3 o 4 y ver cuantos eventos se generaron en determinadas epocas del mes
*    Modelo mas consultado
*    color ?
*    navegador más usado (capaz ver los 3 o 5 más frecuentes y una columna otros)
*    etc

## Machine learning
#### En principio, usaremos random forest, después vamos viendo

In [388]:
#Cargamos las bibliotecas para usar y validar Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [389]:
#Reemplazo NaNs por ceros
clientesGrp = clientesGrp.fillna(0)

#Cargamos X e y para trabajar con Random Forest.
X = pd.merge(clientesGrp,etiquetas,on='person')
X.drop("person", axis=1, inplace=True)
X = X.fillna(0)
y = X.pop("label")
X.head(1)

Unnamed: 0,total ad campaign hit,total brand listing,total checkout,total conversion,total generic listing,total lead,total search engine hit,total searched products,total staticpage,total viewed product,...,Muito Bom,Novo,total 128GB,total 16GB,total 256GB,total 32GB,total 4GB,total 512MB,total 64GB,total 8GB
0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.333333,0.0


In [390]:
list(X)

['total ad campaign hit',
 'total brand listing',
 'total checkout',
 'total conversion',
 'total generic listing',
 'total lead',
 'total search engine hit',
 'total searched products',
 'total staticpage',
 'total viewed product',
 'total visited site',
 'total_enero',
 'total_febrero',
 'total_marzo',
 'total_abril',
 'total_mayo',
 'promedio ad campaign hit',
 'promedio brand listing',
 'promedio checkout',
 'promedio conversion',
 'promedio generic listing',
 'promedio lead',
 'promedio search engine hit',
 'promedio searched products',
 'promedio staticpage',
 'promedio viewed product',
 'promedio visited site',
 'Friday',
 'Monday',
 'Saturday',
 'Sunday',
 'Thursday',
 'Tuesday',
 'Wednesday',
 'Computer',
 'Smartphone',
 'Tablet',
 'Unknown',
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 'Amarelo',
 'Ametista',
 'Azul',
 'Azul Escuro',
 'Azul Safira',
 'Azul Top\xc3\xa1zio',
 '

In [391]:
'''Tomamos "prestada" una función para presentar prolijamente los n features más importantes.
La función (entre otras cosas) fue tomada casi literal de:
https://github.com/Featuretools/predict-next-purchase/blob/master/utils.py
'''
def feature_importances(model, features, n=10):
    importances = model.feature_importances_
    zipped = sorted(zip(features, importances), key=lambda x: -x[1])
    for i, f in enumerate(zipped[:n]):
        print("%d: Feature: %s, %.3f" % (i+1, f[0], f[1]))

    return [f[0] for f in zipped[:n]]

In [392]:
clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
#Otra version:
#clf = RandomForestClassifier(n_estimators=400, n_jobs=-1)

In [393]:
#Probamos qué tan bien viene clasificando nuestro modelo

scores = cross_val_score(estimator=clf,X=X, y=y, cv=3,
                         scoring="roc_auc", verbose=True)

"AUC %.2f +/- %.2f" % (scores.mean(), scores.std())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   26.1s finished


'AUC 0.85 +/- 0.00'

In [394]:
#Vemos cuales fueron los 15 features más relevantes
clf.fit(X,y)
top_features = feature_importances(clf, X.columns, n=15)

1: Feature: promedio checkout, 0.038
2: Feature: total checkout, 0.035
3: Feature: total_mayo, 0.031
4: Feature: promedio viewed product, 0.023
5: Feature: promedio visited site, 0.022
6: Feature: total viewed product, 0.021
7: Feature: promedio generic listing, 0.020
8: Feature: promedio ad campaign hit, 0.020
9: Feature: promedio search engine hit, 0.019
10: Feature: promedio brand listing, 0.018
11: Feature: Bom, 0.018
12: Feature: Dourado, 0.018
13: Feature: total 16GB, 0.018
14: Feature: Preto, 0.018
15: Feature: Muito Bom, 0.017


In [395]:
#vemos cómo quedaría una pequeña muestra de la matriz de features con los más relevantes
X.loc[:,top_features].head(2)

Unnamed: 0,promedio checkout,total checkout,total_mayo,promedio viewed product,promedio visited site,total viewed product,promedio generic listing,promedio ad campaign hit,promedio search engine hit,promedio brand listing,Bom,Dourado,total 16GB,Preto,Muito Bom
0,0.5,3.0,6.0,0.0,0.333333,0.0,0.166667,0.0,0.0,0.0,0.666667,0.333333,0.0,0.333333,0.333333
1,0.058824,1.0,17.0,0.176471,0.058824,3.0,0.058824,0.058824,0.058824,0.0,1.0,0.0,0.0,0.0,0.0


In [396]:
#Pruebo de nuevo los random forest con los resultados obtenidos de mejores clasificadores
X_reducido = X.loc[:,top_features]
scores = cross_val_score(estimator=clf,X=X_reducido, y=y, cv=3,
                         scoring="roc_auc", verbose=True)

"AUC %.2f +/- %.2f" % (scores.mean(), scores.std())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   14.2s finished


'AUC 0.81 +/- 0.02'

#### Con los features más relevantes aproxima mas o menos igual que con todos juntos

In [397]:
#Prueba con los mismos datos de entrenamientos, a ver cuantos predice que convirtieron en los siguientes 15 días
clf.fit(X,y)
prediccionTest = clf.predict(X=clientesGrp.drop('person', axis=1))
print('total del set de entrenamiento que convirtieron:', etiquetas['label'].sum())
print('total del set de entrenamiento que predice que convertirán:',pd.Series(prediccionTest).sum())

('total del set de entrenamiento que convirtieron:', 980)
('total del set de entrenamiento que predice que convertir\xc3\xa1n:', 984)


## Traemos los id de clientes que necesitamos clasificar

In [398]:
clientesAEvaluar = pd.read_csv('/home/nicolas/Descargas/all/trocafone_kaggle_test.csv')
print(len(clientesAEvaluar))
clientesAEvaluar.head(2)

19415


Unnamed: 0,person
0,4886f805
1,0297fc1e


In [399]:
datosTest = clientesAEvaluar.merge(clientesGrp, on='person',how='inner')
print('cantidad de datos:',len(datosTest))
datosTest.head(2)

('cantidad de datos:', 19415)


Unnamed: 0,person,total ad campaign hit,total brand listing,total checkout,total conversion,total generic listing,total lead,total search engine hit,total searched products,total staticpage,...,Muito Bom,Novo,total 128GB,total 16GB,total 256GB,total 32GB,total 4GB,total 512MB,total 64GB,total 8GB
0,4886f805,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0297fc1e,29.0,4.0,7.0,0.0,21.0,1.0,0.0,6.0,0.0,...,0.289538,0.0,0.121655,0.311436,0.014599,0.082725,0.0,0.0,0.469586,0.0


In [400]:
#Verifico que no se hayan "perdido" filas en el camino (un merge mal definido capaz?)
print('cantidad de personas en total:',len(eventos['person'].unique()))
print('cantidad de personas para training:',len(etiquetas))
print('cantidad de personas para evaluar:',len(clientesAEvaluar))
print('cantidad de personas en clientes agrupados:',len(clientesGrp))

('cantidad de personas en total:', 38829)
('cantidad de personas para training:', 19414)
('cantidad de personas para evaluar:', 19415)
('cantidad de personas en clientes agrupados:', 38829)


In [401]:
'''Se toma una configuración de random forest encontrada en:
https://www.datacamp.com/community/tutorials/random-forests-classifier-python
clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)'''

clf.fit(X,y)
prediccionProbabilistica = clf.predict_proba(X=datosTest.drop('person', axis=1).fillna(0))

In [402]:
for i in range(1,6):
    print('%d\tclientes con p > %.2f de convertir' % ((prediccionProbabilistica[:,[1]] > i/10).sum(),i/10))

16135	clientes con p > 0.00 de convertir
16135	clientes con p > 0.00 de convertir
16135	clientes con p > 0.00 de convertir
16135	clientes con p > 0.00 de convertir
16135	clientes con p > 0.00 de convertir


In [403]:
clientesAEvaluar['label'] = prediccionProbabilistica[:,[1]]
clientesAEvaluar.head()

Unnamed: 0,person,label
0,4886f805,0.0025
1,0297fc1e,0.0625
2,2d681dd8,0.01
3,cccea85e,0.19
4,4c8a8b93,0.0175


In [404]:
#Exportamos el archivo a subir. De esta manera tiene el formato correcto. 
#Notar que así lo deja en el directorio de la notebook
clientesAEvaluar.to_csv('entrega_3.csv',index=False)

### Con Random Forest obtuvimos unos resultados modestos.
#### Usando la suma de eventos, los eventos normalizados, las marcas, los meses y los almacenamientos, obtuvimos un 0.80894, habrá que probar de nuevo cambiando cosas. Ahora probar otra técnica, a ver si da algo más razonable

In [405]:
#Probaremos con SVM. Vamos importando las bibliotecas

#Para preparar sets de entrenamiento y prueba
from sklearn.model_selection import train_test_split  

#Traemos la bibliotecas de SVC. SVC es por SVM para clasificación
from sklearn.svm import SVC 

#Nos traemos una biblioteca para medir la performance
from sklearn.metrics import classification_report, confusion_matrix  

In [406]:
#Probaremos SVM
'''Ya X e y completas las tenemos cargadas de cuando probamos RandomForests.
Pero como SVM tarda una eternidad por ser de orden  O = (n features * n2 samples)
tomamos los features que Random Forest nos marcó como más importantes
'''

#Preparamos los sets de entrenamiento y prueba
#(https://stackabuse.com/implementing-svm-and-kernel-svm-with-pythons-scikit-learn/)
X_train, X_test, y_train, y_test = train_test_split(X_reducido, y, test_size = 0.20)  

In [407]:
#Empieza el entrenamiento 
#svclassifier = SVC(kernel='linear')  
#%timeit svclassifier.fit(X_train, y_train)  

In [408]:
#Ahora hacemos una predicción y vemos cómo viene andando
y_pred = svclassifier.predict(X_test)  
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  

NameError: name 'svclassifier' is not defined

### Algunos análisis que hicimos por problemas de performance, comprobando lo que tanto nos comentaron del apply

In [None]:
%timeit eventos['timestamp_dt'] =  pd.to_datetime(eventos['timestamp'])

In [None]:
%timeit eventos['timestamp'] =  pd.to_datetime(eventos['timestamp'])

In [None]:
%timeit eventos[['marca','modelo']] = eventos['model'].dropna().str.split(' ',n=1,expand=True)

In [None]:
%timeit eventos['weekday']=eventos['timestamp_dt'].apply(lambda x: x.day_name())

In [None]:
%timeit eventos['weekday_2']=eventos['timestamp_dt'].dt.day_name()

In [None]:
#vemos si de una y otra forma obtenemos los mismos resultados
print(eventos['weekday'].head())
print(eventos['weekday_2'].head())