# Re-optimizing with GridSearchCV

In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split

from func import outlier_counter, get_all_univariate_outlier_index, cmx
from modelling_purpose import Xy, algorithm_report_accumulation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import numpy as np

import warnings
# Blanket filter: hides every warning category for the rest of the session.
# NOTE(review): this also silences library deprecation/convergence warnings —
# consider narrowing the filter to the specific category that is noisy.
warnings.filterwarnings('ignore')

In [2]:
# Load the imputed dataset from a path relative to the notebook's working
# directory; the trailing expression displays its (rows, columns) shape.
df = pd.read_csv('csv/imputed.csv')
df.shape

(1029, 31)

In [3]:
# Binary target: 'Yes' -> 1 (positive class), 'No' -> 0.
y = df['Attrition'].map({'Yes':1,'No':0})
# Predictors are every remaining column.
X = df.drop(columns='Attrition')

# One-hot encode the object-dtype columns, dropping the first level of each
# so the dummy columns are not perfectly collinear.
categorical_features = list(X.select_dtypes(include='O').columns)
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [4]:
# Stratified hold-out split (default test size) with a fixed seed so the
# class ratio is preserved and the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=11111992,
)

In [5]:
# Search space: number of principal components kept, and the weight given to
# the positive class (label 1) relative to the negative class (label 0).
params = {
    'pca__n_components': [7,8,9,10],
    'rfc__class_weight': [{0: 1, 1: 4.75}, {0: 1, 1: 6}],
}

# PCA feeds a heavily constrained random forest.
# PCA is seeded with the same value as the forest: with svd_solver='auto',
# some scikit-learn versions select the randomized SVD solver for data of this
# size, which would otherwise make the search results vary from run to run.
# NOTE(review): with min_samples_leaf=200 on a training set of 771 rows, each
# tree can only have a handful of leaves, so max_depth=30 likely never binds —
# confirm these limits are intentional.
pipeline = Pipeline([
    ('pca', PCA(random_state=11111992)),
    ('rfc', RandomForestClassifier(random_state=11111992,
                                   n_estimators=20000,
                                   max_depth=30,
                                   min_samples_split= 400,
                                   min_samples_leaf= 200))
])

# 8-fold cross-validated search; candidates are ranked by recall only, so
# precision/accuracy play no part in choosing the winner.
grid = GridSearchCV(pipeline, params,
                    cv=8, scoring='recall', n_jobs=-1, verbose=2)

# Slow cell: the recorded run needed about 15.6 minutes for the 64 fits.
grid.fit(X_train, y_train)

Fitting 8 folds for each of 8 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed: 15.6min finished


GridSearchCV(cv=8,
             estimator=Pipeline(steps=[('pca', PCA()),
                                       ('rfc',
                                        RandomForestClassifier(max_depth=30,
                                                               min_samples_leaf=200,
                                                               min_samples_split=400,
                                                               n_estimators=20000,
                                                               random_state=11111992))]),
             n_jobs=-1,
             param_grid={'pca__n_components': [7, 8, 9, 10],
                         'rfc__class_weight': [{0: 1, 1: 4.75}, {0: 1, 1: 6}]},
             scoring='recall', verbose=2)

In [6]:
# Parameter combination that achieved the best mean cross-validated recall.
grid.best_params_

{'pca__n_components': 8, 'rfc__class_weight': {0: 1, 1: 6}}

In [7]:
# Print one classification report per partition: training first, then test.
# After the loop `y_pred` holds the test-set predictions.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    y_pred = grid.predict(split_X)
    print(classification_report(split_y, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.18      0.31       639
           1       0.18      0.89      0.30       132

    accuracy                           0.30       771
   macro avg       0.54      0.54      0.30       771
weighted avg       0.77      0.30      0.31       771

              precision    recall  f1-score   support

           0       0.98      0.21      0.35       214
           1       0.20      0.98      0.34        44

    accuracy                           0.34       258
   macro avg       0.59      0.59      0.34       258
weighted avg       0.85      0.34      0.34       258



# Manual Post-Prune

In [32]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split

from func import outlier_counter, get_all_univariate_outlier_index, cmx
from modelling_purpose import Xy, algorithm_report_accumulation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import numpy as np

import warnings
# Blanket filter: hides every warning category for the rest of the session.
# NOTE(review): this also silences library deprecation/convergence warnings —
# consider narrowing the filter to the specific category that is noisy.
warnings.filterwarnings('ignore')

In [33]:
# Load the imputed dataset from a path relative to the notebook's working
# directory; the trailing expression displays its (rows, columns) shape.
df = pd.read_csv('csv/imputed.csv')
df.shape

(1029, 31)

In [34]:
# Binary target: 'Yes' -> 1 (positive class), 'No' -> 0.
y = df['Attrition'].map({'Yes':1,'No':0})
# Predictors are every remaining column.
X = df.drop(columns='Attrition')

# One-hot encode the object-dtype columns, dropping the first level of each
# so the dummy columns are not perfectly collinear.
categorical_features = list(X.select_dtypes(include='O').columns)
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [35]:
# Stratified hold-out split (default test size) with a fixed seed so the
# class ratio is preserved and the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=11111992,
)

In [36]:
# Hand-configured pipeline: 10 principal components followed by a
# class-weighted random forest with a smaller ensemble (4000 trees).
# PCA is seeded with the same value as the forest: with svd_solver='auto',
# some scikit-learn versions select the randomized SVD solver for data of this
# size, which would otherwise make the fitted model vary from run to run.
# NOTE(review): the grid search earlier in this notebook reported
# n_components=8 as best, while 10 is used here — confirm this is intentional.
pipeline = Pipeline([
    ('pca', PCA(n_components=10, random_state=11111992)),
    ('rfc', RandomForestClassifier(random_state=11111992,
                                     class_weight={0: 1, 1: 6},
                                     n_estimators=4000,
                                     max_depth=30,
                                     min_samples_split= 400,
                                     min_samples_leaf= 200))
])

# Fit on the training partition only; the cell displays the fitted pipeline.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('pca', PCA(n_components=10)),
                ('rfc',
                 RandomForestClassifier(class_weight={0: 1, 1: 6}, max_depth=30,
                                        min_samples_leaf=200,
                                        min_samples_split=400,
                                        n_estimators=4000,
                                        random_state=11111992))])

In [37]:
# Print one classification report per partition: training first, then test.
# After the loop `y_pred` holds the test-set predictions.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    y_pred = pipeline.predict(split_X)
    print(classification_report(split_y, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.28      0.43       639
           1       0.20      0.87      0.33       132

    accuracy                           0.39       771
   macro avg       0.56      0.58      0.38       771
weighted avg       0.79      0.39      0.42       771

              precision    recall  f1-score   support

           0       0.97      0.30      0.46       214
           1       0.22      0.95      0.36        44

    accuracy                           0.41       258
   macro avg       0.60      0.63      0.41       258
weighted avg       0.84      0.41      0.44       258

