In [49]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import datetime
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Cargo los datos

In [50]:
# Load the three input tables used by the notebook:
#   events          - raw event log; the 'sku' column is forced to object dtype
#   training_labels - merged on 'person' below and provides the 'label' column
#   labels_predict  - persons to score for the submission (merged on 'person')
events = pd.read_csv(
    "events_up_to_01062018.csv",
    dtype={'sku' : 'object'},
    low_memory=False,
)
training_labels = pd.read_csv("labels_training_set.csv", low_memory=False)
labels_predict = pd.read_csv("trocafone_kaggle_test.csv", low_memory=False)

### Categorizamos 

In [51]:
# Parse the event timestamps; values that cannot be parsed become NaT.
# No explicit `format` is passed on purpose: the parsed values carry a
# time-of-day component, which the previous date-only format '%Y-%m-%d' does
# not describe. pandas versions that enforce `format` strictly would coerce
# every such value to NaT, and the per-person event counts computed later
# (a 'count' over 'timestamp') would silently drop to zero.
events['timestamp'] = pd.to_datetime(events['timestamp'], errors='coerce')

# Columns converted to pandas categoricals.
categorical_columns = [
    'event', 'url', 'sku', 'model', 'condition', 'storage', 'color',
    'staticpage', 'campaign_source', 'search_engine', 'channel',
    'new_vs_returning', 'city', 'region', 'country', 'device_type',
    'screen_resolution', 'operating_system_version', 'browser_version',
]
for column in categorical_columns:
    events[column] = pd.Categorical(events[column])

### Agregamos algunos features

In [52]:
# Derive calendar features (day of month and month number) from the parsed timestamp.
# NOTE(review): neither column feeds the event-count features built further
# down in the visible notebook - confirm whether they are still needed.
event_time = events['timestamp'].dt
events['day'] = event_time.day
events['month'] = event_time.month

In [53]:
# Preview the first rows to check the parsed timestamp and the new day/month columns.
events.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,day,month
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,18,5
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,18,5
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,18,5
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,18,5
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,18,5


### Contamos los eventos para cada usuario

In [54]:
# One row per person, one column per event type; each cell holds the number of
# events of that type with a non-null timestamp (0 when the person has none).
events_data = events.pivot_table(
    values='timestamp',
    index='person',
    columns='event',
    aggfunc='count',
    fill_value=0,
)
# 'event' is categorical, so the pivoted column labels are converted to plain
# objects before 'person' is moved out of the index.
# NOTE(review): presumably needed so reset_index can add the 'person' label - confirm.
events_data.columns = events_data.columns.astype('object')
events_data = events_data.reset_index()
# Alias of the same frame (not a copy).
final_data = events_data
events_data.head()

event,person,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,viewed product,visited site
0,0008ed71,0,0,3,0,1,0,0,0,0,0,2
1,00091926,15,25,2,0,0,0,0,0,0,372,34
2,00091a7a,1,5,0,0,0,0,0,0,0,3,1
3,000ba417,1,24,6,1,14,0,1,0,0,153,6
4,000c79fe,1,0,1,0,1,0,1,9,0,3,1


### Comenzamos con el set de entrenamiento

In [55]:
# Attach the training label to each person's event counts. The inner join
# keeps only persons present in both tables.
set_entrenamiento = pd.merge(
    left=events_data,
    right=training_labels,
    how='inner',
    on='person',
)

In [56]:
# The person id is only a join key, not a feature: remove it before modelling.
set_entrenamiento = set_entrenamiento.drop('person', axis=1)
set_entrenamiento.head()

Unnamed: 0,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,viewed product,visited site,label
0,0,0,3,0,1,0,0,0,0,0,2,0
1,1,0,1,0,1,0,1,9,0,3,1,0
2,5,0,1,0,4,0,0,4,0,4,1,0
3,29,165,15,2,28,0,13,11,0,189,19,0
4,0,1,2,1,1,0,0,0,0,2,0,0


In [57]:
# Split the training table into the target column and the feature matrix
# (every remaining column, i.e. the per-event counts).
y = set_entrenamiento.loc[:, "label"]
X = set_entrenamiento.drop('label', axis=1)

In [58]:
# Hold out 40% of the labelled persons for evaluation; the fixed random_state
# makes the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=123,
)

In [59]:
# Wrap the full labelled feature matrix and its labels in an xgboost DMatrix.
# NOTE(review): data_dmatrix is not referenced by any later cell in the visible
# notebook; the model below is fit on x_train / y_train instead.
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [60]:
# Gradient-boosted regressor whose raw scores are thresholded into 0/1 labels
# in later cells.
# NOTE(review): the label previews show 0/1 values, yet a regression objective
# is used - confirm this is intentional.
xg_reg = xgb.XGBRegressor(
    objective='reg:linear',
    n_estimators=10,
    max_depth=5,
    learning_rate=0.1,
    colsample_bytree=0.3,
    # NOTE(review): the recorded model repr lists alpha=5 next to reg_alpha=0;
    # confirm which of the two the booster actually applies.
    alpha=5,
)

In [61]:
# Train on the 60% training split only; the hold-out split is used for evaluation below.
xg_reg.fit(X=x_train, y=y_train)

XGBRegressor(alpha=5, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [62]:
# Score the hold-out split and keep the raw scores as a plain Python list.
preds = xg_reg.predict(x_test).tolist()
# Binarise at 0.5: scores strictly above the cut-off become 1, everything else 0.
# The result is a Series (named 0) of int labels in the same row order as x_test.
df = pd.DataFrame(preds)[0].map(lambda score: 1 if score > 0.5 else 0)
df.head()

0    0
1    0
2    0
3    0
4    0
Name: 0, dtype: int64

In [63]:
# Class balance of the thresholded hold-out predictions.
# NOTE: in the recorded output every prediction is 0, i.e. no score exceeded
# the 0.5 cut-off.
df.value_counts()

0    7766
Name: 0, dtype: int64

In [64]:
# Fraction of hold-out rows whose thresholded prediction equals the true label.
# accuracy_score is declared as accuracy_score(y_true, y_pred), so the ground
# truth goes first (the value itself is the same either way).
# NOTE: in the recorded run all predictions are 0, so this number is simply the
# share of 0 labels in y_test and says little about how well positives are found.
accuracy_score(y_test, df)

0.9475920679886686

### Comenzamos con el set de test

In [65]:
# Build the feature table for the persons to be scored. The right join keeps
# every row of labels_predict, even if a person has no row in events_data
# (the missing counts would then be NaN).
set_test = pd.merge(
    left=events_data,
    right=labels_predict,
    how="right",
    on="person",
)
set_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19415 entries, 0 to 19414
Data columns (total 12 columns):
person               19415 non-null object
ad campaign hit      19415 non-null int64
brand listing        19415 non-null int64
checkout             19415 non-null int64
conversion           19415 non-null int64
generic listing      19415 non-null int64
lead                 19415 non-null int64
search engine hit    19415 non-null int64
searched products    19415 non-null int64
staticpage           19415 non-null int64
viewed product       19415 non-null int64
visited site         19415 non-null int64
dtypes: int64(11), object(1)
memory usage: 1.9+ MB


In [66]:
# Keep the person ids aside for the submission file, then leave only the
# feature columns, with any missing count replaced by 0.
personas = set_test["person"]
set_test = set_test.drop("person", axis=1).fillna(0)

In [67]:
# Score the Kaggle persons with the fitted regressor. The result holds one raw
# score per row of set_test, not a 0/1 label.
predicts_kaggel = xg_reg.predict(set_test)
# Show the last feature rows as a quick sanity check of the scored table.
set_test.tail()

Unnamed: 0,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,viewed product,visited site
19410,0,0,0,0,0,0,1,0,0,19,1
19411,1,0,1,0,0,0,0,0,0,1,1
19412,0,0,0,0,0,0,1,0,0,0,1
19413,7,1,1,1,12,0,12,8,3,42,10
19414,1,2,1,0,0,0,1,0,0,4,1


In [68]:
# Convert the scores to a list of plain Python floats.
# A comprehension is used instead of `.tolist()` so the cell can be re-run:
# it accepts both the array returned by predict() and the list left behind by
# a previous execution of this cell (a list has no `.tolist()` method).
predicts_kaggel = [float(score) for score in predicts_kaggel]
# Single-column frame (column 0) holding the raw scores.
df_kaggel = pd.DataFrame(predicts_kaggel)
df_kaggel.head()

Unnamed: 0,0
0,0.232497
1,0.22341
2,0.22593
3,0.225981
4,0.204086


In [69]:
# Summary statistics of the raw scores predicted for the Kaggle set
# (the next cell thresholds these scores into 0/1 labels).
df_kaggel[0].describe()

count    19415.000000
mean         0.206992
std          0.022966
min          0.183683
25%          0.192304
50%          0.198056
75%          0.219104
max          0.540970
Name: 0, dtype: float64

In [70]:
# Cut-off used to turn the raw scores into 0/1 labels for the submission.
# NOTE(review): the value sits just above the mean score reported by describe()
# in the recorded run (0.206992), so it was presumably the mean of an earlier
# run - confirm, and consider deriving it from the data instead.
KAGGLE_THRESHOLD = 0.207173

# Build the submission frame from the raw scores instead of from df_kaggel
# itself, so the cell gives the same result when it is re-run (the previous
# version consumed column 0 of df_kaggel and then replaced df_kaggel).
df_kaggel = pd.DataFrame({
    # 1 when the score is strictly above the cut-off, otherwise 0.
    'label': [1 if score > KAGGLE_THRESHOLD else 0 for score in predicts_kaggel],
    # .values attaches the ids by position, so the pairing does not depend on
    # the two objects happening to share the same index.
    'person': personas.values,
})

In [71]:
# Put the columns in the order written to the submission file: id first, then label.
df_kaggel = df_kaggel.loc[:, ['person', 'label']]

### Vemos resultados y pasamos a un csv

In [74]:
# How many persons end up with each label in the submission.
df_kaggel['label'].value_counts()

0    12801
1     6614
Name: label, dtype: int64

In [72]:
# Write the submission (person, label) without the DataFrame index column.
df_kaggel.to_csv(path_or_buf='Resultados.csv', index=False)

In [75]:
# Note: score obtained with this algorithm and the hyper-parameters currently set above: 0.74850
# NOTE(review): presumably the Kaggle leaderboard score; the metric is not stated here - confirm.