## Import libraries

In [1]:
import pandas as pd
import numpy as numpy
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from ipynb.fs.full.preprocessing_step import get_preprocessed_dataset
from ipynb.fs.full.generic_feature_selection_extratrees import get_dataset_kimportantfeatures_ExtraTrees
from sklearn.ensemble import RandomForestClassifier

#Accuracy
from sklearn import metrics

In [2]:
shoppers_dataset = get_preprocessed_dataset()
shop_data_copy = shoppers_dataset.copy()
x_data = shoppers_dataset.drop(columns=['Revenue'])
y_data = shoppers_dataset[['Revenue']]

In [3]:
display(shoppers_dataset.count())

Administrative             12330
Administrative_Duration    12330
Informational              12330
Informational_Duration     12330
ProductRelated             12330
ProductRelated_Duration    12330
BounceRates                12330
ExitRates                  12330
PageValues                 12330
SpecialDay                 12330
Month                      12330
OperatingSystems           12330
Browser                    12330
Region                     12330
TrafficType                12330
Revenue                    12330
VisitorType_0              12330
VisitorType_1              12330
VisitorType_2              12330
Weekend_0                  12330
Weekend_1                  12330
dtype: int64

## Applying Model


In [4]:
rf = RandomForestClassifier(random_state=42)

In [None]:
cross_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracy_iris = cross_val_score(rf, x_data, y_data, cv=cross_val, scoring='f1')
display(accuracy_iris.mean())

## Grid Search using all features

In [None]:
param_grid = {'n_estimators': [10,20,30,40,50,60],  
              'criterion': ['entropy', 'gini'],
              'max_depth': [10,20,30,40],
              'min_samples_leaf': [2,3],
              'max_features': ['float', 'sqrt', 'log2'],
              'warm_start' : [True]} 

founded_parameters = {'criterion': 'gini',
 'max_depth': 20,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'n_estimators': 50,
 'warm_start': True,
 'verbose':True}

strat_10_fold_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search_estimator = GridSearchCV(rf, param_grid, scoring='f1', cv=strat_10_fold_cv, return_train_score=False)
nested_cv_score = cross_val_score(grid_search_estimator, x_data, y_data, cv=5, scoring='f1')
display(nested_cv_score.mean())
grid_search_estimator.fit(x_data, y_data)
display(grid_search_estimator.best_params_)

In [5]:
important_features = get_dataset_kimportantfeatures_ExtraTrees(x_data,y_data, 10)

In [6]:
display(important_features)

Unnamed: 0,PageValues,ExitRates,ProductRelated_Duration,ProductRelated,BounceRates,Administrative,Month,Administrative_Duration,TrafficType,Region
0,-0.317178,3.229316,-0.624348,-0.691003,3.667189,-0.696993,-1.665924,-0.457191,-0.762629,-0.894178
117,-0.317178,-0.325141,-0.322820,-0.466151,-0.082695,-0.696993,-1.665924,-0.457191,-0.762629,-0.061364
118,-0.317178,0.657012,-0.574703,-0.533607,-0.457683,-0.696993,-1.665924,-0.457191,-0.265735,-0.061364
119,-0.317178,1.400122,-0.466790,-0.578577,-0.457683,-0.696993,-1.665924,-0.457191,-0.265735,2.437081
120,-0.317178,-0.200423,-0.497361,-0.578577,-0.457683,-0.696993,-1.665924,-0.457191,-0.762629,0.355044
...,...,...,...,...,...,...,...,...,...,...
8394,-0.317178,-0.004438,-0.521400,-0.556092,-0.457683,-0.696993,1.281578,-0.457191,-0.514182,-0.061364
11001,-0.009351,-0.022213,0.338861,0.613138,-0.160982,0.206173,1.281578,0.150533,0.728052,1.604266
11000,-0.317178,-0.229821,-0.106212,0.095979,-0.163049,-0.395938,1.281578,-0.411935,-0.514182,2.020674
11013,1.560095,-0.639288,1.243069,0.343316,-0.379855,-0.094882,1.281578,-0.004631,-0.762629,-0.061364


## Using hyperparameter tuning

In [9]:
rf = RandomForestClassifier(criterion='entropy', 
                            n_estimators=50, 
                            max_depth=20, 
                            max_features='sqrt', 
                            min_samples_leaf=2,
                            warm_start=True,
                            verbose=True,
                            random_state=42)

cross_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracy_iris = cross_val_score(rf, x_data, y_data, cv=cross_val, scoring='f1')
display(accuracy_iris.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

0.6514132096953773