# Random Forest Classifier without PCA

# Random Forest Classifier with PCA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score

In [2]:
# Read the training and testing CSVs from the archive folder into DataFrames.
train_path = '../data/archive/train_pca.csv'
test_path = '../data/archive/test_pca.csv'

df_training = pd.read_csv(train_path)
df_testing = pd.read_csv(test_path)

In [3]:
# Draw five bootstrap samples (5000 rows each, sampled with replacement)
# from the training data; each one gets its own grid search later on.
all_samples = [df_training.sample(n=5000, replace=True) for _ in range(5)]

In [4]:
# Split the full training set and the test set into features and target.
# In both frames the last column is used as the target and every other
# column as a feature. These are prepared once here because they are not
# re-derived inside the per-sample grid-search loop.
x_train_full, y_train_full = df_training.iloc[:, :-1], df_training.iloc[:, -1]

x_test, y_test = df_testing.iloc[:, :-1], df_testing.iloc[:, -1]

In [5]:
# For every bootstrap sample, grid-search n_estimators with 5-fold stratified
# CV and remember which candidate ranked best under each scoring metric.
accuracy_best_params = []
f1_best_params = []
precision_best_params = []

# Candidate forest sizes: 1, 5, 9, ..., 101. The grid does not change between
# samples, so it is built once up front.
grid_values = {'n_estimators': list(range(1, 105, 4))}
scoring_metrics = ['accuracy', 'f1_micro', 'precision_micro']

for sample in all_samples:

    # Last column is the target, everything before it is a feature.
    x_train = sample.iloc[:, :-1]
    y_train = sample.iloc[:, -1]

    rf = RandomForestClassifier()

    # refit=False: several metrics are scored at once and the best candidate
    # per metric is read from cv_results_ below instead of refitting.
    rf_classifier = GridSearchCV(
        estimator=rf,
        param_grid=grid_values,
        cv=StratifiedKFold(n_splits=5),
        scoring=scoring_metrics,
        refit=False,
        verbose=0,
    )

    rf_model = rf_classifier.fit(x_train, y_train)

    cv_results = rf_model.cv_results_
    candidates = cv_results['params']

    # argmin over the rank array picks the first candidate holding the lowest
    # rank value for that metric.
    best_by_accuracy = np.argmin(cv_results['rank_test_accuracy'])
    best_by_f1 = np.argmin(cv_results['rank_test_f1_micro'])
    best_by_precision = np.argmin(cv_results['rank_test_precision_micro'])

    accuracy_best_params.append(candidates[best_by_accuracy])
    f1_best_params.append(candidates[best_by_f1])
    precision_best_params.append(candidates[best_by_precision])

In [6]:
# Show the accuracy-selected parameters: one dict per bootstrap sample, in sampling order.
accuracy_best_params

[{'n_estimators': 101},
 {'n_estimators': 45},
 {'n_estimators': 89},
 {'n_estimators': 89},
 {'n_estimators': 89}]

In [7]:
# Show the micro-F1-selected parameters: one dict per bootstrap sample, in sampling order.
f1_best_params

[{'n_estimators': 101},
 {'n_estimators': 45},
 {'n_estimators': 89},
 {'n_estimators': 89},
 {'n_estimators': 89}]

In [8]:
# Show the micro-precision-selected parameters: one dict per bootstrap sample, in sampling order.
precision_best_params

[{'n_estimators': 101},
 {'n_estimators': 45},
 {'n_estimators': 89},
 {'n_estimators': 89},
 {'n_estimators': 89}]

In [9]:
# Refit a forest on the full training set for each accuracy-selected
# n_estimators value and record its accuracy on the held-out test set.
# NOTE(review): no random_state is passed to the classifier, so repeated
# n_estimators values can still yield different scores between fits/runs.
accuracy_test_score = []

# Iterate over the parameter dicts directly; the loop variable already holds
# the current entry, so no separate index counter is needed.
for param in accuracy_best_params:

    rf_clf = RandomForestClassifier(n_estimators=param.get('n_estimators'))
    model = rf_clf.fit(x_train_full, y_train_full)

    y_pred = model.predict(x_test)

    accuracy_test_score.append(accuracy_score(y_test, y_pred))

In [10]:
# Show test-set accuracy for each refit forest, in the same order as accuracy_best_params.
accuracy_test_score

[0.8971835765184933,
 0.8934509670851714,
 0.8944689514760774,
 0.8998982015609094,
 0.8954869358669834]

In [12]:
# Refit a forest on the full training set for each F1-selected n_estimators
# value and record its micro-averaged F1 on the held-out test set.
# NOTE(review): no random_state is passed to the classifier, so scores vary
# between fits. For a single-label target, micro-averaged F1 presumably
# coincides with accuracy on the same predictions - confirm before treating
# this as an independent metric.
f1_test_score = []

# Iterate over the parameter dicts directly; the loop variable already holds
# the current entry, so no separate index counter is needed.
for param in f1_best_params:

    rf_clf = RandomForestClassifier(n_estimators=param.get('n_estimators'))
    model = rf_clf.fit(x_train_full, y_train_full)

    y_pred = model.predict(x_test)

    f1_test_score.append(f1_score(y_test, y_pred, average='micro'))

In [13]:
# Show test-set micro-F1 for each refit forest, in the same order as f1_best_params.
f1_test_score

[0.8934509670851712,
 0.8927723108245673,
 0.9012555140821175,
 0.8907363420427553,
 0.8992195453003053]

In [14]:
# Refit a forest on the full training set for each precision-selected
# n_estimators value and record its micro-averaged precision on the held-out
# test set.
# NOTE(review): no random_state is passed to the classifier, so scores vary
# between fits. For a single-label target, micro-averaged precision
# presumably coincides with accuracy on the same predictions - confirm before
# treating this as an independent metric.
precision_test_score = []

# Iterate over the parameter dicts directly; the loop variable already holds
# the current entry, so no separate index counter is needed.
for param in precision_best_params:

    rf_clf = RandomForestClassifier(n_estimators=param.get('n_estimators'))
    model = rf_clf.fit(x_train_full, y_train_full)

    y_pred = model.predict(x_test)

    precision_test_score.append(precision_score(y_test, y_pred, average='micro'))
    

In [15]:
# Show test-set micro-precision for each refit forest, in the same order as precision_best_params.
precision_test_score

[0.9022734984730234,
 0.8941296233457754,
 0.9043094672548354,
 0.8924329826942654,
 0.8927723108245673]