## Random Forest

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV

# RF
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the labelled training set and the Kaggle test set; missing values
# in either file are replaced by 0.
training = pd.read_csv('../test/training-person2.csv').fillna(value=0)
test = pd.read_csv('../test/test-person2.csv').fillna(value=0)

# Submission frame: starts with the 'person' column only; the predicted
# 'label' column is added to it further down.
sumbit = test[['person']].copy()

In [3]:
# Seed shared by the train/test split and the random forest.
RANDOM_SEED: int = 12
# Fraction (not a percentage) of the training rows held out for validation.
TEST_SIZE_PERCENT: float = 0.1

In [4]:
# Separate the target column from the feature matrix.
y = training['label']
X = training.drop(columns=['label'])

# Hold out TEST_SIZE_PERCENT of the rows for validation; the split is seeded
# so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE_PERCENT,
    random_state=RANDOM_SEED,
)

# Feature names in column order, used later to label the importances.
feature_list = X.columns.tolist()

In [5]:
# Display the feature column names (same names and order as `feature_list`).
X.columns

Index(['event', 'model', 'count_top_viewed_product', 'condition',
       'count_top_viewed_product_cond', 'conversion_model_count_Asus',
       'conversion_model_count_LG', 'conversion_model_count_Lenovo',
       'conversion_model_count_Motorola', 'conversion_model_count_Quantum',
       'conversion_model_count_Samsung', 'conversion_model_count_Sony',
       'conversion_model_count_iPad', 'conversion_model_count_iPhone',
       'returning_x', 'returning_y', 'last_event_days', 'first_event_days',
       'events_cout_last_week', 'events_mean_frec_last_week', 'searched_model',
       'count_top_searched_model', 'region', 'city', 'country'],
      dtype='object')

In [6]:
# Hand-configured forest: this is the model that the cells below fit,
# evaluate on the hold-out split and use for the Kaggle predictions.
rdf = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    min_samples_split=5,
    random_state=RANDOM_SEED,
)

In [7]:
# Randomized hyper-parameter search for the random forest, scored by ROC AUC.
# NOTE: the outcome is only displayed at the end of this cell; the cells below
# keep training the hand-configured `rdf`, not `rf_random.best_estimator_`.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=1000, num=20)]
# Number of features to consider at every split.
# 'auto' was removed from this list: for classifiers it was an alias of 'sqrt'
# (so the grid held the same setting twice), and it is deprecated / rejected
# by recent scikit-learn releases.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until the leaves are pure
# or hold fewer than min_samples_split samples)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Base model to tune. It is seeded so that the cross-validation scores (and
# therefore the selected parameters) are reproducible between runs.
rf = RandomForestClassifier(random_state=RANDOM_SEED)

# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores.
# The candidate sampling uses the notebook-wide RANDOM_SEED as well.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring='roc_auc',
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=RANDOM_SEED,
    n_jobs=-1,
)

# Fit the random search model (300 fits of up to 1000 trees each: slow)
rf_random.fit(X_train, y_train)

rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] n_estimators=410, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=70, bootstrap=False 
[CV] n_estimators=410, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=70, bootstrap=False 


KeyboardInterrupt: 

In [None]:
# Fit the forest on the training split only; the hold-out split is kept
# aside for the evaluation cells that follow.
rdf.fit(X=X_train, y=y_train)

In [None]:
# Class-probability estimates for the hold-out split.
# `ret` is reassigned later with the Kaggle-set probabilities.
ret = rdf.predict_proba(X=X_test)

In [None]:
# Hold-out ROC AUC, scored on column 1 of predict_proba
# (presumably the positive class -- confirm against rdf.classes_).
y_test_predictd = rdf.predict_proba(X=X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_test_predictd)

In [None]:
# Mean accuracy of the fitted forest on the hold-out split.
rdf.score(X=X_test, y=y_test)

In [None]:
# Predict on the Kaggle set.
# Refit on all labelled rows (training + hold-out split) before predicting.
rdf.fit(X, y)

# Build the Kaggle feature matrix: everything except the 'person' identifier.
kaggle_X = test.drop(columns=['person'])

# The forest matches features by position, so the test columns must be the
# same set as the training columns and be passed in the same order. Check the
# set explicitly, then select by name in the order used for fitting instead of
# trusting the column order of the CSV file.
assert set(kaggle_X.columns) == set(X.columns), (
    'test features differ from training features: {}'.format(
        sorted(set(kaggle_X.columns) ^ set(X.columns))
    )
)
ret = rdf.predict_proba(kaggle_X[X.columns])

In [None]:
# Keep the probability in column 1 of predict_proba as the submitted label
# (presumably the positive class -- confirm against rdf.classes_).
# The array column is assigned positionally: wrapping it in a new DataFrame
# would align on index labels and yield NaN if `sumbit` ever stopped
# having the default 0..n-1 index.
sumbit['label'] = ret[:, 1]

In [None]:
# Histogram of the predicted probabilities in the submission frame.
g = sumbit['label'].hist()

In [None]:
# Importance of each feature according to the fitted forest.
importances = list(rdf.feature_importances_)

# Pair every feature name with its importance (rounded to 7 decimals) and
# order the pairs from most to least important.
feature_importances = sorted(
    ((name, round(value, 7)) for name, value in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

# One line per feature; the name is padded to 20 characters.
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, importance))

In [None]:
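# Uncomment to write the submission file.
# NOTE(review): the inputs are read from '../test/'; confirm that 'test/' is the intended output directory.
#sumbit.to_csv('test/sumbit-2.csv', index=False)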

In [None]:
# Preview the first five rows of the training frame.
training.head(n=5)