In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split

from func import outlier_counter, get_all_univariate_outlier_index, cmx
from modelling_purpose import Xy, algorithm_report_accumulation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('csv/imputed.csv')
df.shape

(1029, 31)

In [3]:
X = df.drop('Attrition',axis=1)
y = df['Attrition'].map({'Yes':1,'No':0})

categorical_features = X.select_dtypes(include='O').columns.tolist()
ordinal = ['Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel',
           'JobSatisfaction', 'PerformanceRating', 'RelationshipSatisfaction',
           'StockOptionLevel', 'WorkLifeBalance']
for i in ordinal:
    X[i] = X[i].astype(str)
categorical_features += ordinal
categorical_features = categorical_features

X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,random_state=11111992)

params = {
    'pca__n_components': [10,13,15,17,20],
    'dtree__criterion': ['gini', 'entropy'],
    'dtree__splitter' : ['best', 'random'],
    'dtree__max_features' : list(range(1,X_train.shape[1])),
    'dtree__class_weight': [
        {0: 1, 1: 4.75},
        {0: 1, 1: 4},
        {0: 1, 1: 5},
        {0: 1.2, 1: 4.5},
    ],
    'dtree__max_depth' : np.linspace(4, 15, 10),
    'dtree__min_samples_split' : [200,300,400],
    'dtree__min_samples_leaf' : [100,150,200],
}

model_pipeline = Pipeline([
    ('pca', PCA()),
    ('dtree', DecisionTreeClassifier(random_state=11111992))
])

grid = GridSearchCV(model_pipeline, params,
                    cv=5, scoring='recall', n_jobs=-1, verbose=2)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 439200 candidates, totalling 2196000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 956 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 2580 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 4844 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done 7764 tasks      | elapsed:   49.2s
[Parallel(n_jobs=-1)]: Done 11324 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 15540 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 20396 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 25908 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 32060 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 38868 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 46316 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 54420 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 63164 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 72564 t

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('pca', PCA()),
                                       ('dtree',
                                        DecisionTreeClassifier(random_state=11111992))]),
             n_jobs=-1,
             param_grid={'dtree__class_weight': [{0: 1, 1: 4.75}, {0: 1, 1: 4},
                                                 {0: 1, 1: 5},
                                                 {0: 1.2, 1: 4.5}],
                         'dtree__criterion': ['gini', 'entropy'],
                         'dtree__max_depth': array([ 4.        ,  5.22222222,  6.44444444,  7.66666667,  8.88888889,
       10.11111111, 11.33333333, 12.55555556, 13.77777778, 15.        ]),
                         'dtree__max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                                 11, 12, 13, 14, 15, 16, 17, 18,
                                                 19, 20, 21, 22, 23, 24, 25, 26,
                                                 27, 

In [8]:
grid.best_estimator_

Pipeline(steps=[('pca', PCA(n_components=10)),
                ('dtree',
                 DecisionTreeClassifier(class_weight={0: 1, 1: 5},
                                        max_depth=4.0, max_features=1,
                                        min_samples_leaf=150,
                                        min_samples_split=200,
                                        random_state=11111992,
                                        splitter='random'))])

In [9]:
grid.best_params_

{'dtree__class_weight': {0: 1, 1: 5},
 'dtree__criterion': 'gini',
 'dtree__max_depth': 4.0,
 'dtree__max_features': 1,
 'dtree__min_samples_leaf': 150,
 'dtree__min_samples_split': 200,
 'dtree__splitter': 'random',
 'pca__n_components': 10}

In [10]:
grid.best_score_

1.0

In [11]:
y_pred = grid.predict(X_train)
print(classification_report(y_train, y_pred))
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.23      0.37       639
           1       0.19      0.86      0.31       132

    accuracy                           0.34       771
   macro avg       0.54      0.54      0.34       771
weighted avg       0.77      0.34      0.36       771

              precision    recall  f1-score   support

           0       0.97      0.28      0.43       214
           1       0.21      0.95      0.35        44

    accuracy                           0.39       258
   macro avg       0.59      0.62      0.39       258
weighted avg       0.84      0.39      0.42       258



In [12]:
grid.score(X_test, y_test)

0.9545454545454546

In [13]:
grid.best_index_

219635

In [14]:
pd.DataFrame(grid.cv_results_).loc[grid.best_index_]

mean_fit_time                                                             0.0133719
std_fit_time                                                             0.00133916
mean_score_time                                                          0.00570903
std_score_time                                                          0.000142563
param_dtree__class_weight                                              {0: 1, 1: 5}
param_dtree__criterion                                                         gini
param_dtree__max_depth                                                            4
param_dtree__max_features                                                         1
param_dtree__min_samples_leaf                                                   150
param_dtree__min_samples_split                                                  200
param_dtree__splitter                                                        random
param_pca__n_components                                                     

In [15]:
grid.scorer_

make_scorer(recall_score, average=binary)

In [16]:
grid.n_splits_

5

In [17]:
grid.refit_time_

0.0334014892578125

In [18]:
y_pred = grid.predict(X_train)
cmx(y_train,y_pred)

Unnamed: 0,Pred 1,Pred 0
Act1,113,19
Act 0,492,147


In [19]:
y_pred = grid.predict(X_test)
cmx(y_test, y_pred)

Unnamed: 0,Pred 1,Pred 0
Act1,42,2
Act 0,155,59
