# Re-optimizing with increasing the Folds

In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split

from func import outlier_counter, get_all_univariate_outlier_index, cmx
from modelling_purpose import Xy, algorithm_report_accumulation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('csv/imputed.csv')
df.shape

(1029, 31)

In [3]:
X = df.drop('Attrition',axis=1)
y = df['Attrition'].map({'Yes':1,'No':0})

categorical_features = X.select_dtypes(include='O').columns.tolist()
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,random_state=11111992)

params = {
    'pca__n_components': [10,13,15,17,20],
    'dtree__criterion': ['gini', 'entropy'],
    'dtree__splitter' : ['best', 'random'],
    'dtree__max_features' : list(range(1,X_train.shape[1])),
    'dtree__class_weight': [
        {0: 1, 1: 4.75},
        {0: 1, 1: 4},
        {0: 1, 1: 5},
        {0: 1.2, 1: 4.5},
    ],
    'dtree__max_depth' : np.linspace(4, 15, 11),
    'dtree__min_samples_split' : [200,300,400],
    'dtree__min_samples_leaf' : [100,150,200],
}

model_pipeline = Pipeline([
    ('pca', PCA()),
    ('dtree', DecisionTreeClassifier(random_state=11111992))
])

grid = GridSearchCV(model_pipeline, params,
                    cv=8, scoring='recall', n_jobs=-1, verbose=2)

grid.fit(X_train, y_train)

Fitting 8 folds for each of 340560 candidates, totalling 2724480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 604 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 2228 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 4492 tasks      | elapsed:   34.9s
[Parallel(n_jobs=-1)]: Done 7412 tasks      | elapsed:   58.0s
[Parallel(n_jobs=-1)]: Done 10972 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 15188 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 20044 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 25556 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 31708 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 38516 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 45964 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 54068 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 62812 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 72212 t

GridSearchCV(cv=8,
             estimator=Pipeline(steps=[('pca', PCA()),
                                       ('dtree',
                                        DecisionTreeClassifier(random_state=11111992))]),
             n_jobs=-1,
             param_grid={'dtree__class_weight': [{0: 1, 1: 4.75}, {0: 1, 1: 4},
                                                 {0: 1, 1: 5},
                                                 {0: 1.2, 1: 4.5}],
                         'dtree__criterion': ['gini', 'entropy'],
                         'dtree__max_depth': array([ 4. ,  5.1,  6.2,  7.3,  8.4,  9.5, 10.6, 11.7, 12.8, 13.9, 15. ]),
                         'dtree__max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                                 11, 12, 13, 14, 15, 16, 17, 18,
                                                 19, 20, 21, 22, 23, 24, 25, 26,
                                                 27, 28, 29, 30, ...],
                         'dtree__min_samples_leaf': [

In [5]:
grid.best_estimator_

Pipeline(steps=[('pca', PCA(n_components=10)),
                ('dtree',
                 DecisionTreeClassifier(class_weight={0: 1, 1: 5},
                                        max_depth=4.0, max_features=1,
                                        min_samples_leaf=200,
                                        min_samples_split=200,
                                        random_state=11111992,
                                        splitter='random'))])

In [6]:
grid.best_params_

{'dtree__class_weight': {0: 1, 1: 5},
 'dtree__criterion': 'gini',
 'dtree__max_depth': 4.0,
 'dtree__max_features': 1,
 'dtree__min_samples_leaf': 200,
 'dtree__min_samples_split': 200,
 'dtree__splitter': 'random',
 'pca__n_components': 10}

In [7]:
grid.best_score_

1.0

In [8]:
y_pred = grid.predict(X_train)
print(classification_report(y_train, y_pred))
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       639
           1       0.17      1.00      0.29       132

    accuracy                           0.17       771
   macro avg       0.09      0.50      0.15       771
weighted avg       0.03      0.17      0.05       771

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       214
           1       0.17      1.00      0.29        44

    accuracy                           0.17       258
   macro avg       0.09      0.50      0.15       258
weighted avg       0.03      0.17      0.05       258



**Temuan**: Dengan cv=8, recall pada train dan juga test menjadi 100% dan specificity 0%.

Hal seperti ini artinya adalah model akan memprediksi bahwa semua pegawai akan keluar. (lihat pada confusion matrix di bawah.)

In [9]:
grid.score(X_test, y_test)

1.0

In [10]:
grid.best_index_

170345

In [11]:
pd.DataFrame(grid.cv_results_).loc[grid.best_index_]

mean_fit_time                                                             0.0158259
std_fit_time                                                             0.00120099
mean_score_time                                                          0.00812119
std_score_time                                                           0.00276688
param_dtree__class_weight                                              {0: 1, 1: 5}
param_dtree__criterion                                                         gini
param_dtree__max_depth                                                            4
param_dtree__max_features                                                         1
param_dtree__min_samples_leaf                                                   200
param_dtree__min_samples_split                                                  200
param_dtree__splitter                                                        random
param_pca__n_components                                                     

In [12]:
grid.scorer_

make_scorer(recall_score, average=binary)

In [13]:
grid.n_splits_

8

In [14]:
grid.refit_time_

0.035249948501586914

In [15]:
y_pred = grid.predict(X_train)
cmx(y_train,y_pred)

Unnamed: 0,Pred 1,Pred 0
Act1,132,0
Act 0,639,0


In [16]:
y_pred = grid.predict(X_test)
cmx(y_test, y_pred)

Unnamed: 0,Pred 1,Pred 0
Act1,44,0
Act 0,214,0


# Post-Pruning (GridSearchCV)

In [325]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split

from func import outlier_counter, get_all_univariate_outlier_index, cmx
from modelling_purpose import Xy, algorithm_report_accumulation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [326]:
df = pd.read_csv('csv/imputed.csv')
df.shape

(1029, 31)

In [327]:
X = df.drop('Attrition',axis=1)
y = df['Attrition'].map({'Yes':1,'No':0})

categorical_features = X.select_dtypes(include='O').columns.tolist()
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [328]:
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,random_state=11111992)

In [329]:
params = {'dtree__ccp_alpha': np.linspace(0,1,1000)}

model_pipeline = Pipeline(steps=[
    ('pca', PCA(n_components=10)),
    ('dtree', DecisionTreeClassifier(class_weight={0: 1, 1: 5},
                                     max_depth=4.0, max_features=1,
                                     min_samples_leaf=200,
                                     min_samples_split=200,
                                     random_state=11111992,
                                     splitter='random'))
])

grid = GridSearchCV(model_pipeline, params,
                    cv=8, scoring='recall', n_jobs=-1, verbose=2)

In [330]:
grid.fit(X_train, y_train)

Fitting 8 folds for each of 1000 candidates, totalling 8000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 1096 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 2720 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:   30.0s
[Parallel(n_jobs=-1)]: Done 7904 tasks      | elapsed:   47.5s
[Parallel(n_jobs=-1)]: Done 8000 out of 8000 | elapsed:   48.0s finished


GridSearchCV(cv=8,
             estimator=Pipeline(steps=[('pca', PCA(n_components=10)),
                                       ('dtree',
                                        DecisionTreeClassifier(class_weight={0: 1,
                                                                             1: 5},
                                                               max_depth=4.0,
                                                               max_features=1,
                                                               min_samples_leaf=200,
                                                               min_samples_split=200,
                                                               random_state=11111992,
                                                               splitter='random'))]),
             n_jobs=-1,
             param_grid={'dtree__ccp_alpha': array([0.        , 0.001001  , 0.002002  , 0.003003  , 0.004004  ,
       0.00500501, 0.006006...
       0.96596597, 0.9669669

In [331]:
grid.best_params_

{'dtree__ccp_alpha': 0.0}

In [332]:
y_pred = grid.predict(X_train)
print(classification_report(y_train, y_pred))
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       639
           1       0.17      1.00      0.29       132

    accuracy                           0.17       771
   macro avg       0.09      0.50      0.15       771
weighted avg       0.03      0.17      0.05       771

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       214
           1       0.17      1.00      0.29        44

    accuracy                           0.17       258
   macro avg       0.09      0.50      0.15       258
weighted avg       0.03      0.17      0.05       258



**Temuan** : Post-Pruning dengan GridSearchCV menggunakan parameter ccp_alpha tidak mengubah apapun karena best_parameter_ yang digunakan adalah 0.0

# Manual Post-Pruning

In [16]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split

from func import outlier_counter, get_all_univariate_outlier_index, cmx
from modelling_purpose import Xy, algorithm_report_accumulation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [17]:
df = pd.read_csv('csv/imputed.csv')
df.shape

(1029, 31)

In [18]:
X = df.drop('Attrition',axis=1)
y = df['Attrition'].map({'Yes':1,'No':0})

categorical_features = X.select_dtypes(include='O').columns.tolist()
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,random_state=11111992)

In [20]:
pipeline = Pipeline(steps=[
    ('pca', PCA(n_components=10)),
    ('dtree', DecisionTreeClassifier(class_weight={0: 1, 1: 5},
                                     max_depth=4.0, max_features=1,
                                     min_samples_leaf=163,
                                     min_samples_split=163,
                                     random_state=11111992,
                                     splitter='random'))
])

In [21]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('pca', PCA(n_components=10)),
                ('dtree',
                 DecisionTreeClassifier(class_weight={0: 1, 1: 5},
                                        max_depth=4.0, max_features=1,
                                        min_samples_leaf=163,
                                        min_samples_split=163,
                                        random_state=11111992,
                                        splitter='random'))])

In [22]:
y_pred = pipeline.predict(X_train)
print(classification_report(y_train, y_pred))
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.23      0.37       639
           1       0.19      0.86      0.31       132

    accuracy                           0.34       771
   macro avg       0.54      0.54      0.34       771
weighted avg       0.77      0.34      0.36       771

              precision    recall  f1-score   support

           0       0.97      0.28      0.43       214
           1       0.21      0.95      0.35        44

    accuracy                           0.39       258
   macro avg       0.59      0.62      0.39       258
weighted avg       0.84      0.39      0.42       258

