In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [24]:
plt.rc('figure',dpi=100);

In [25]:
training_labels = pd.read_csv("set_entrenamiento.csv", low_memory=False)
labels_predict = pd.read_csv("trocafone_kaggle_test.csv", low_memory=False)
data = pd.read_csv("data_set.csv", low_memory = False)

In [26]:
training_labels = training_labels[['person','label']]
data = data.drop(columns = {'Email','Unknown','Social','days_since_last_session'})

In [27]:
data.shape

(38829, 26)

In [28]:
training_labels['label'].value_counts()

1    980
0    980
Name: label, dtype: int64

In [29]:
features = pd.merge(training_labels, data, on='person', how='inner')
features = features.fillna(0)
features['label'].value_counts()

1    980
0    980
Name: label, dtype: int64

In [30]:
features.head()

Unnamed: 0.1,person,label,Organic,Paid,Referral,ad campaign hit,brand listing,checkout,conversion,generic listing,...,promedio hora,retornos,Unnamed: 0,sessions,total_time,mean_time_by_session,max_session_time,mean_events_by_session,max_events_by_session,promedio de TFIdf
0,db2c4d27,1,57,1,32,3,31,3,1,166,...,14.193262,114,33192,115,86410.0,751.391304,13079.0,4.904348,24,1.0
1,8123457d,1,0,3,0,4,2,0,0,3,...,5.6,3,19494,3,603.0,201.0,311.0,8.333333,10,0.0
2,e4b02ea2,1,0,2,0,2,0,0,0,0,...,20.384615,2,34691,2,490.0,245.0,294.0,13.0,19,0.0
3,d8001b23,1,9,10,0,13,32,0,0,6,...,15.707865,19,32705,20,5305.0,265.25,1714.0,8.9,29,1.394975
4,7a472832,1,0,1,1,4,16,2,2,0,...,13.66474,8,18448,9,4272.0,474.666667,2428.0,19.222222,45,0.0


In [31]:
train, test = train_test_split(features,test_size=0.10)
print("Train: ",len(train),"Test: ",len(test))
features=list(features.columns)
features.remove('person')
features.remove('label')

x_train=train[features]
y_train=train['label']

x_test=test[features]
y_test=test['label']

Train:  1764 Test:  196


In [32]:
def grid_search():
    n_estimators = [10,20,30,40,50,60,70,80,90,100]
    min_samples_split = [10,20,30,40,50,60,70,80,90,100]
    n_jobs = [-10,-5,5,10,15,20]
    maximos = []
    max_score = 0
    for n_est in n_estimators:
        for min_sam in min_samples_split:
            for n_j in n_jobs:
                c=RandomForestClassifier(n_estimators=n_est,min_samples_split=min_sam,n_jobs=n_j,random_state=0)
                dt=c.fit(x_train,y_train)
                score=c.score(x_test,y_test)*100
                if max_score < score:
                    maximos = [n_est,min_sam,n_j]
                    max_score = score
                print("n_estimators : "+ str(n_est)+"; min_samples_split: "+str(min_sam)+"; n_job: "+str(n_j)+"; SCORE: "+ str(score))
    print("El mejor score fue: "+str(max_score))
    return maximos

In [None]:
mejores_hiperparmetros = grid_search()

n_estimators : 10; min_samples_split: 10; n_job: -10; SCORE: 70.40816326530613
n_estimators : 10; min_samples_split: 10; n_job: -5; SCORE: 70.40816326530613
n_estimators : 10; min_samples_split: 10; n_job: 5; SCORE: 70.40816326530613
n_estimators : 10; min_samples_split: 10; n_job: 10; SCORE: 70.40816326530613
n_estimators : 10; min_samples_split: 10; n_job: 15; SCORE: 70.40816326530613
n_estimators : 10; min_samples_split: 10; n_job: 20; SCORE: 70.40816326530613
n_estimators : 10; min_samples_split: 20; n_job: -10; SCORE: 67.85714285714286
n_estimators : 10; min_samples_split: 20; n_job: -5; SCORE: 67.85714285714286
n_estimators : 10; min_samples_split: 20; n_job: 5; SCORE: 67.85714285714286
n_estimators : 10; min_samples_split: 20; n_job: 10; SCORE: 67.85714285714286
n_estimators : 10; min_samples_split: 20; n_job: 15; SCORE: 67.85714285714286
n_estimators : 10; min_samples_split: 20; n_job: 20; SCORE: 67.85714285714286
n_estimators : 10; min_samples_split: 30; n_job: -10; SCORE: 67.

In [None]:
mejores_hiperparmetros

In [None]:
c=RandomForestClassifier(n_estimators=40,min_samples_split=10,n_jobs=-10,random_state=0)
dt=c.fit(x_train,y_train)

In [None]:
prediccion=pd.merge(labels_predict,data,on='person',how='inner')
prediccion.head()

In [None]:
labels_predict.shape

In [None]:
x_final=prediccion[features]
y_final=c.predict(x_final)
y_final

In [None]:
prediccion['label']=y_final
prediccion.head()

In [None]:
prediccion['label'].value_counts()

In [None]:
prediccion[['person', 'label']].to_csv('../modelos/RandomForestResults.csv',index=False)

In [None]:
# Get numerical feature importances
importances = list(c.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(features, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];