In [180]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [181]:
plt.rc('figure',dpi=100);

In [182]:
training_labels = pd.read_csv("labels_training_set.csv", low_memory=False)
labels_predict = pd.read_csv("trocafone_kaggle_test.csv", low_memory=False)
data = pd.read_csv("data_set.csv", low_memory = False)

In [183]:
training_labels = training_labels[['person','label']]
#data = data.drop(columns = {'Email','Social'})

In [204]:
training_labels['label'].value_counts()

0    18434
1      980
Name: label, dtype: int64

In [245]:
features = pd.merge(training_labels, data, on='person', how='inner')
features = features.fillna(0)
features['label'].value_counts()
features = pd.concat([(features[features['label'] == 0].sort_values(by='days_since_last_session', ascending=True).head(5000)),(features[features['label'] == 1])])
features['label'].value_counts()

0    5000
1     980
Name: label, dtype: int64

In [246]:
features.head()

Unnamed: 0,person,label,Direct,Email,Organic,Paid,Referral,Social,ad campaign hit,brand listing,...,retornos,coincide,sessions,total_time,mean_time_by_session,max_session_time,mean_events_by_session,max_events_by_session,days_since_last_session,promedio de TFIdf
5393,9ba912d3,0,20.0,0.0,12.0,0.0,0.0,0.0,4,1,...,34,0.0,34,28162.0,828.294118,4801.0,11.058824,59,172,1.0
2978,5123e3e1,0,1.0,0.0,3.0,0.0,0.0,0.0,0,17,...,4,0.0,4,4122.0,1030.5,2081.0,19.0,46,172,0.0
10181,a2f74923,0,0.0,0.0,0.0,0.0,0.0,0.0,3,0,...,2,1.0,2,263.0,131.5,263.0,6.0,9,172,0.0
15513,a73eabbf,0,3.0,0.0,1.0,2.0,0.0,0.0,15,2,...,11,1.0,11,10849.0,986.272727,5327.0,23.545455,135,172,1.133111
1803,789b9f69,0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,...,1,0.0,1,729.0,729.0,729.0,25.0,25,172,1.410205


In [247]:
train, test = train_test_split(features,test_size=0.10)
print("Train: ",len(train),"Test: ",len(test))
features=list(features.columns)
features.remove('person')
features.remove('label')

x_train=train[features]
y_train=train['label']

x_test=test[features]
y_test=test['label']

Train:  5382 Test:  598


In [248]:
def grid_search():
    n_estimators = [10,20,30,40,50,60,70,80,90,100]
    min_samples_split = [10,20,30,40,50,60,70,80,90,100]
    n_jobs = [-10,-5,5,10,15,20]
    maximos = []
    max_score = 0
    for n_est in n_estimators:
        for min_sam in min_samples_split:
            for n_j in n_jobs:
                c=RandomForestClassifier(n_estimators=n_est,min_samples_split=min_sam,n_jobs=n_j,random_state=0)
                dt=c.fit(x_train,y_train)
                score=c.score(x_test,y_test)*100
                if max_score < score:
                    maximos = [n_est,min_sam,n_j]
                    max_score = score
                print("n_estimators : "+ str(n_est)+"; min_samples_split: "+str(min_sam)+"; n_job: "+str(n_j)+"; SCORE: "+ str(score))
    print("El mejor score fue: "+str(max_score))
    return maximos

In [249]:
mejores_hiperparmetros = grid_search()

n_estimators : 10; min_samples_split: 10; n_job: -10; SCORE: 93.47826086956522
n_estimators : 10; min_samples_split: 10; n_job: -5; SCORE: 93.47826086956522
n_estimators : 10; min_samples_split: 10; n_job: 5; SCORE: 93.47826086956522
n_estimators : 10; min_samples_split: 10; n_job: 10; SCORE: 93.47826086956522
n_estimators : 10; min_samples_split: 10; n_job: 15; SCORE: 93.47826086956522
n_estimators : 10; min_samples_split: 10; n_job: 20; SCORE: 93.47826086956522
n_estimators : 10; min_samples_split: 20; n_job: -10; SCORE: 93.81270903010034
n_estimators : 10; min_samples_split: 20; n_job: -5; SCORE: 93.81270903010034
n_estimators : 10; min_samples_split: 20; n_job: 5; SCORE: 93.81270903010034
n_estimators : 10; min_samples_split: 20; n_job: 10; SCORE: 93.81270903010034
n_estimators : 10; min_samples_split: 20; n_job: 15; SCORE: 93.81270903010034
n_estimators : 10; min_samples_split: 20; n_job: 20; SCORE: 93.81270903010034
n_estimators : 10; min_samples_split: 30; n_job: -10; SCORE: 93.

In [263]:
mejores_hiperparmetros

[20, 20, -10]

In [265]:
c=RandomForestClassifier(n_estimators=mejores_hiperparmetros[0],
                         min_samples_split=mejores_hiperparmetros[1],
                         n_jobs=mejores_hiperparmetros[2],random_state=0)
dt=c.fit(x_train,y_train)

In [266]:
prediccion=pd.merge(labels_predict,data,on='person',how='inner')
prediccion.head()

Unnamed: 0,person,Direct,Email,Organic,Paid,Referral,Social,ad campaign hit,brand listing,checkout,...,retornos,coincide,sessions,total_time,mean_time_by_session,max_session_time,mean_events_by_session,max_events_by_session,days_since_last_session,promedio de TFIdf
0,4886f805,0.0,0.0,0.0,1.0,0.0,0.0,0,0,1,...,1,0.0,1,1388.0,1388.0,1388.0,9.0,9,186,1.0
1,0297fc1e,17.0,0.0,56.0,1.0,0.0,0.0,29,4,7,...,95,0.0,96,11646.0,121.3125,1640.0,5.90625,18,176,1.146942
2,2d681dd8,0.0,0.0,0.0,1.0,0.0,0.0,1,5,1,...,2,0.0,2,553.0,276.5,377.0,13.0,16,177,1.331171
3,cccea85e,1.0,0.0,1.0,14.0,0.0,0.0,15,7,1,...,22,0.0,20,20397.0,1019.85,4350.0,41.8,104,173,1.375124
4,4c8a8b93,9.0,0.0,2.0,3.0,0.0,0.0,14,8,2,...,20,0.0,20,6526.0,326.3,1420.0,12.85,51,182,1.303777


In [267]:
c.score(x_test,y_test)*100

93.9799331103679

In [268]:
x_final=prediccion[features]
y_final=c.predict(x_final)
y_final

array([1, 1, 1, ..., 1, 0, 0])

In [269]:
prediccion['label']=y_final
prediccion.head()

Unnamed: 0,person,Direct,Email,Organic,Paid,Referral,Social,ad campaign hit,brand listing,checkout,...,coincide,sessions,total_time,mean_time_by_session,max_session_time,mean_events_by_session,max_events_by_session,days_since_last_session,promedio de TFIdf,label
0,4886f805,0.0,0.0,0.0,1.0,0.0,0.0,0,0,1,...,0.0,1,1388.0,1388.0,1388.0,9.0,9,186,1.0,1
1,0297fc1e,17.0,0.0,56.0,1.0,0.0,0.0,29,4,7,...,0.0,96,11646.0,121.3125,1640.0,5.90625,18,176,1.146942,1
2,2d681dd8,0.0,0.0,0.0,1.0,0.0,0.0,1,5,1,...,0.0,2,553.0,276.5,377.0,13.0,16,177,1.331171,1
3,cccea85e,1.0,0.0,1.0,14.0,0.0,0.0,15,7,1,...,0.0,20,20397.0,1019.85,4350.0,41.8,104,173,1.375124,0
4,4c8a8b93,9.0,0.0,2.0,3.0,0.0,0.0,14,8,2,...,0.0,20,6526.0,326.3,1420.0,12.85,51,182,1.303777,1


In [270]:
prediccion['label'].value_counts()

1    12480
0     6935
Name: label, dtype: int64

In [271]:
prediccion[['person', 'label']].to_csv('../modelos/RandomForestResults.csv',index=False)

In [261]:
# Get numerical feature importances
importances = list(c.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(features, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: days_since_last_session Importance: 0.48
Variable: checkout             Importance: 0.11
Variable: promedio dias        Importance: 0.1
Variable: brand listing        Importance: 0.02
Variable: viewed product       Importance: 0.02
Variable: promedio hora        Importance: 0.02
Variable: total_time           Importance: 0.02
Variable: mean_time_by_session Importance: 0.02
Variable: max_session_time     Importance: 0.02
Variable: mean_events_by_session Importance: 0.02
Variable: max_events_by_session Importance: 0.02
Variable: Direct               Importance: 0.01
Variable: Organic              Importance: 0.01
Variable: Paid                 Importance: 0.01
Variable: ad campaign hit      Importance: 0.01
Variable: conversion           Importance: 0.01
Variable: generic listing      Importance: 0.01
Variable: search engine hit    Importance: 0.01
Variable: searched products    Importance: 0.01
Variable: staticpage           Importance: 0.01
Variable: visited site         Impo