In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [83]:
plt.rc('figure',dpi=100);

In [84]:
training_labels = pd.read_csv("labels_training_set.csv", low_memory=False)
labels_predict = pd.read_csv("trocafone_kaggle_test.csv", low_memory=False)
data = pd.read_csv("data_set.csv", low_memory = False)

In [85]:
training_labels['label'].value_counts()

0    18434
1      980
Name: label, dtype: int64

In [86]:
features = pd.merge(training_labels, data, on='person', how='inner')
features = features.fillna(0)
features['label'].value_counts()


0    18151
1      975
Name: label, dtype: int64

In [87]:
train, test = train_test_split(features,test_size=0.10)
print("Train: ",len(train),"Test: ",len(test))
features=list(features.columns)
features.remove('person')
features.remove('label')

x_train=train[features]
y_train=train['label']

x_test=test[features]
y_test=test['label']

Train:  17213 Test:  1913


In [88]:
c=RandomForestClassifier(n_estimators=500,min_samples_split=100,n_jobs=-1,random_state=0)
dt=c.fit(x_train,y_train)

In [89]:
score=c.score(x_test,y_test)*100
print(score)

95.45216936748562


In [90]:
prediccion=pd.merge(labels_predict,data,on='person',how='inner')
prediccion.head()

Unnamed: 0,person,Direct,Email,Organic,Paid,Referral,Social,ad campaign hit,brand listing,checkout,...,generic listing,lead,search engine hit,searched products,staticpage,viewed product,visited site,promedio dias,promedio hora,retornos
0,4886f805,0,0,0,1,0,0,0,0,1,...,1,0,1,1,0,4,1,18.0,0.0,1
1,0297fc1e,17,0,56,1,0,0,29,4,7,...,21,1,0,6,0,404,95,18.537919,12.793651,95
2,2d681dd8,0,0,0,1,0,0,1,5,1,...,1,0,2,1,0,13,2,23.538462,11.692308,2
3,cccea85e,1,0,1,14,0,0,15,7,1,...,20,0,26,1,5,739,22,18.659091,16.199761,22
4,4c8a8b93,9,0,2,3,0,0,14,8,2,...,14,0,13,9,0,177,20,18.661479,10.540856,20


In [91]:
labels_predict.shape

(19415, 1)

In [92]:
x_final=prediccion[features]
y_final=c.predict(x_final)
y_final

array([0, 0, 0, ..., 0, 0, 0])

In [93]:
prediccion['label']=y_final
prediccion.head()

Unnamed: 0,person,Direct,Email,Organic,Paid,Referral,Social,ad campaign hit,brand listing,checkout,...,lead,search engine hit,searched products,staticpage,viewed product,visited site,promedio dias,promedio hora,retornos,label
0,4886f805,0,0,0,1,0,0,0,0,1,...,0,1,1,0,4,1,18.0,0.0,1,0
1,0297fc1e,17,0,56,1,0,0,29,4,7,...,1,0,6,0,404,95,18.537919,12.793651,95,0
2,2d681dd8,0,0,0,1,0,0,1,5,1,...,0,2,1,0,13,2,23.538462,11.692308,2,0
3,cccea85e,1,0,1,14,0,0,15,7,1,...,0,26,1,5,739,22,18.659091,16.199761,22,0
4,4c8a8b93,9,0,2,3,0,0,14,8,2,...,0,13,9,0,177,20,18.661479,10.540856,20,0


In [94]:
prediccion['label'].value_counts()

0    19116
Name: label, dtype: int64

In [95]:
prediccion[['person', 'label']].to_csv('../modelos/RandomForestResults.csv',index=False)

In [96]:
# Get numerical feature importances
importances = list(c.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(features, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: checkout             Importance: 0.31
Variable: promedio dias        Importance: 0.13
Variable: promedio hora        Importance: 0.08
Variable: viewed product       Importance: 0.07
Variable: brand listing        Importance: 0.06
Variable: conversion           Importance: 0.04
Variable: generic listing      Importance: 0.04
Variable: Organic              Importance: 0.03
Variable: ad campaign hit      Importance: 0.03
Variable: search engine hit    Importance: 0.03
Variable: searched products    Importance: 0.03
Variable: visited site         Importance: 0.03
Variable: retornos             Importance: 0.03
Variable: Direct               Importance: 0.02
Variable: Paid                 Importance: 0.02
Variable: staticpage           Importance: 0.02
Variable: Referral             Importance: 0.01
Variable: Social               Importance: 0.01
Variable: lead                 Importance: 0.01
Variable: Email                Importance: 0.0
