In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [20]:
plt.rc('figure',dpi=100);

In [21]:
training_labels = pd.read_csv("set_entrenamiento.csv", low_memory=False)
labels_predict = pd.read_csv("trocafone_kaggle_test.csv", low_memory=False)
data = pd.read_csv("data_set.csv", low_memory = False)

In [22]:
training_labels = training_labels[['person','label']]
data = data.drop(columns = {'Email','Unknown','Social','days_since_last_session'})

In [23]:
data.shape

(38829, 26)

In [24]:
training_labels['label'].value_counts()

1    99684
0    54281
Name: label, dtype: int64

In [25]:
features = pd.merge(training_labels, data, on='person', how='inner')
features = features.fillna(0)
features['label'].value_counts()

1    99684
0    54281
Name: label, dtype: int64

In [26]:
features.head()

Unnamed: 0.1,person,label,Organic,Paid,Referral,ad campaign hit,brand listing,checkout,conversion,generic listing,...,promedio hora,retornos,Unnamed: 0,sessions,total_time,mean_time_by_session,max_session_time,mean_events_by_session,max_events_by_session,promedio de TFIdf
0,43790d8f,1,0,1,0,2,0,2,0,6,...,0.0,2,10219,2,1431.0,715.5,1417.0,38.0,73,0.0
1,43790d8f,1,0,1,0,2,0,2,0,6,...,0.0,2,10219,2,1431.0,715.5,1417.0,38.0,73,0.0
2,43790d8f,1,0,1,0,2,0,2,0,6,...,0.0,2,10219,2,1431.0,715.5,1417.0,38.0,73,0.0
3,43790d8f,1,0,1,0,2,0,2,0,6,...,0.0,2,10219,2,1431.0,715.5,1417.0,38.0,73,0.0
4,43790d8f,1,0,1,0,2,0,2,0,6,...,0.0,2,10219,2,1431.0,715.5,1417.0,38.0,73,0.0


In [27]:
train, test = train_test_split(features,test_size=0.10)
print("Train: ",len(train),"Test: ",len(test))
features=list(features.columns)
features.remove('person')
features.remove('label')

x_train=train[features]
y_train=train['label']

x_test=test[features]
y_test=test['label']

Train:  138568 Test:  15397


In [28]:
c=RandomForestClassifier(n_estimators=100,min_samples_split=100,n_jobs=-1,random_state=0)
dt=c.fit(x_train,y_train)

In [29]:
score=c.score(x_test,y_test)*100
print(score)

99.70124050139637


In [30]:
prediccion=pd.merge(labels_predict,data,on='person',how='inner')
prediccion.head()

Unnamed: 0.1,person,Organic,Paid,Referral,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,...,promedio hora,retornos,Unnamed: 0,sessions,total_time,mean_time_by_session,max_session_time,mean_events_by_session,max_events_by_session,promedio de TFIdf
0,4886f805,1,0,0,0,0,1,0,1,0,...,0.0,1,10956,1,1388.0,1388.0,1388.0,9.0,9,1.0
1,0297fc1e,1,21,17,29,4,7,0,21,1,...,12.793651,95,365,96,11646.0,121.3125,1640.0,5.90625,18,1.146942
2,2d681dd8,1,1,0,1,5,1,0,1,0,...,11.692308,2,6812,2,553.0,276.5,377.0,13.0,16,1.331171
3,cccea85e,14,6,1,15,7,1,0,20,0,...,16.199761,22,30986,20,20397.0,1019.85,4350.0,41.8,104,1.375124
4,4c8a8b93,3,6,9,14,8,2,0,14,0,...,10.540856,20,11596,20,6526.0,326.3,1420.0,12.85,51,1.303777


In [31]:
labels_predict.shape

(19415, 1)

In [32]:
x_final=prediccion[features]
y_final=c.predict(x_final)
y_final

array([0, 1, 0, ..., 1, 1, 1])

In [33]:
prediccion['label']=y_final
prediccion.head()

Unnamed: 0.1,person,Organic,Paid,Referral,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,...,retornos,Unnamed: 0,sessions,total_time,mean_time_by_session,max_session_time,mean_events_by_session,max_events_by_session,promedio de TFIdf,label
0,4886f805,1,0,0,0,0,1,0,1,0,...,1,10956,1,1388.0,1388.0,1388.0,9.0,9,1.0,0
1,0297fc1e,1,21,17,29,4,7,0,21,1,...,95,365,96,11646.0,121.3125,1640.0,5.90625,18,1.146942,1
2,2d681dd8,1,1,0,1,5,1,0,1,0,...,2,6812,2,553.0,276.5,377.0,13.0,16,1.331171,0
3,cccea85e,14,6,1,15,7,1,0,20,0,...,22,30986,20,20397.0,1019.85,4350.0,41.8,104,1.375124,0
4,4c8a8b93,3,6,9,14,8,2,0,14,0,...,20,11596,20,6526.0,326.3,1420.0,12.85,51,1.303777,0


In [34]:
prediccion['label'].value_counts()

0    13297
1     6118
Name: label, dtype: int64

In [35]:
prediccion[['person', 'label']].to_csv('../modelos/RandomForestResults.csv',index=False)

In [36]:
# Get numerical feature importances
importances = list(c.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(features, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: checkout             Importance: 0.09
Variable: viewed product       Importance: 0.07
Variable: total_time           Importance: 0.06
Variable: promedio dias        Importance: 0.05
Variable: max_events_by_session Importance: 0.05
Variable: ad campaign hit      Importance: 0.04
Variable: brand listing        Importance: 0.04
Variable: generic listing      Importance: 0.04
Variable: visited site         Importance: 0.04
Variable: promedio hora        Importance: 0.04
Variable: retornos             Importance: 0.04
Variable: Unnamed: 0           Importance: 0.04
Variable: sessions             Importance: 0.04
Variable: mean_time_by_session Importance: 0.04
Variable: max_session_time     Importance: 0.04
Variable: mean_events_by_session Importance: 0.04
Variable: Paid                 Importance: 0.03
Variable: Referral             Importance: 0.03
Variable: search engine hit    Importance: 0.03
Variable: promedio de TFIdf    Importance: 0.03
Variable: Organic              Import