# Import Library & Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from func import outlier_counter, get_all_univariate_outlier_index
from modelling_purpose import Xy, algorithm_report_accumulation

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('csv/imputed.csv')

# With / Without Outlier

Kita akan membuat 2 dataset dari `imputed.csv`, yaitu:
- Dengan Outlier
- Tanpa Outlier (Univariate)

In [3]:
df_without_outlier = df.copy()
outlier_columns  = ['TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole',
                    'YearsSinceLastPromotion', 'YearsWithCurrManager', 'TrainingTimesLastYear',
                    'NumCompaniesWorked', 'MonthlyIncome']
outlier_index = get_all_univariate_outlier_index(df_without_outlier, outlier_columns)
df_without_outlier.drop(df_without_outlier.index[outlier_index], inplace=True)

In [4]:
df.shape

(1029, 31)

In [5]:
df_without_outlier.shape

(650, 31)

`df` adalah dataset dengan outlier. Dan `df_without_outlier` adalah dataset tanpa outlier.

# Feature Engineering

In [6]:
def one_hot(df,column):
    df = pd.concat(
    [
        df,
        pd.get_dummies(df[column], prefix=column, drop_first=True)
    ],
    axis=1)
    df = df.drop(columns=column)

In [7]:
# Ini hanya untuk eksplanasi ---------------
X = df.drop('Attrition',axis=1)
y = df['Attrition'].map({'Yes':1,'No':0})

categorical_features = X.select_dtypes(include='O').columns.tolist()
ordinal = ['Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel',
           'JobSatisfaction', 'PerformanceRating', 'RelationshipSatisfaction',
           'StockOptionLevel', 'WorkLifeBalance']
categorical_features += ordinal
categorical_features
# Ini hanya untuk eksplanasi ---------------

['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'OverTime',
 'Education',
 'EnvironmentSatisfaction',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StockOptionLevel',
 'WorkLifeBalance']

Semua categorical features, entah nominal ataupun ordinal akan digabung dan diaplikasikan dengan One-Hot Encoder.
Alasan mengapa ordinal features juga menggunakan One-Hot Encoder adalah karena pada kasus classifier, ordinal feature yang mempunyai continuous behaviour tidak berpengaruh seperti pada kasus regressor. Melakukan pe-ranking-an pada suatu feature menjadi tidak berguna.

# Model

Pada classifier problem, kita akan memilih salah satu metrik penilaian yang akan dijadikan acuan. Ini dikarenakan False Positive dan False Negative akan selalu trade-off satu sama lain. Jadinya, umumnya kita akan dihadapkan dengan 2 pilihan berikut:
- Kasus False Negative lebih beresiko daripada kasus False Positive
- Kasus False Positive lebih beresiko daripada kasus False Negative

Di kasus ini, False Positive dan False Negative bisa diterjemahkan seperti ini:
- FP : Pegawai yang tidak keluar, terprediksi keluar.
- FN : Pegawai yang keluar, terprediksi tidak keluar.

Untuk kasus Attrition, saya menganggap kasus **False Negative adalah yang beresiko**.

Alasannya adalah, apabila ada pegawai keluar namun terprediksi tidak keluar, perusahaan beresiko kehilangan pegawai potensialnya. Dengan decision seperti ini, maka saya putuskan untuk memilih **Recall** sebagai metric yang diutamakan.

In [8]:
X, y = Xy(df)
X_wo, y_wo = Xy(df_without_outlier)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [10]:
algorithm_list = [LogisticRegression,DecisionTreeClassifier,RandomForestClassifier, GradientBoostingClassifier]

a = algorithm_report_accumulation(algorithm_list, X, y, .2, 'with Outliers')
b = algorithm_report_accumulation(algorithm_list, X_wo, y_wo, .2, 'without Outliers')
x = pd.concat([a, b],ignore_index=True)
x.set_index('Algorithm')

Unnamed: 0_level_0,Train Recall,Test Recall,Notes
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.007092,0.028571,with Outliers
<class 'sklearn.tree._classes.DecisionTreeClassifier'>,1.0,0.285714,with Outliers
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,1.0,0.142857,with Outliers
<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.765957,0.228571,with Outliers
<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.033333,0.136364,without Outliers
<class 'sklearn.tree._classes.DecisionTreeClassifier'>,1.0,0.5,without Outliers
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,1.0,0.181818,without Outliers
<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.922222,0.318182,without Outliers


**Temuan** :
- LogisticRegression dengan default parameter underfit dengan parah. Entah di dataset dengan dan tanpa outlier, recall score tidak sampai 0.15.
- Model lainnya (selain LogisticRegression) overfit.

Dengan temuan ini, saya tidak melanjutkan untuk menggunakan LogisticRegression. Dengan asumsi bahwa dengan score serendah itu, tentunya akan memerlukan effort lebih untuk menaikkan scorenya, meskipun menggunakan Hyperparameter Tuning. Karena pada umumnya lebih mudah untuk melakukan tuning pada model yang overfit daripada yang underfit.

# Dataset Cross Validation Checking

In [11]:
from sklearn.model_selection import cross_val_score

In [12]:
def find_CVS(features,target,model, partition, scoring_system):
    X_train, X_test, y_train, y_test = train_test_split(features,target,random_state=11111992)
    classification = model()
    score = cross_val_score(classification,features, target, cv=partition, scoring=scoring_system).mean()
    return score

In [13]:
def cv_score_accumulation(algorithm_list, X, y, partition, scoring_system, notes):
    cv_score = []
    notes_arr = []
    for i in algorithm_list :
        score = find_CVS(X,y,i, partition, scoring_system)
        cv_score.append(score)
        notes_arr.append(notes)

    cv_df = pd.DataFrame({
        'Algorithm': algorithm_list,
        'Notes': notes_arr,
        'CV Score': cv_score
    })
    return cv_df

In [14]:
algorithm_list = [DecisionTreeClassifier,RandomForestClassifier, GradientBoostingClassifier]
a = cv_score_accumulation(algorithm_list, X, y, 5, 'recall', 'with Outliers')
b = cv_score_accumulation(algorithm_list, X_wo, y_wo, 5, 'recall', 'without Outliers')
x = pd.concat([a, b],ignore_index=True)
x.set_index('Algorithm').sort_values(by='CV Score', ascending=False)

Unnamed: 0_level_0,Notes,CV Score
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1
<class 'sklearn.tree._classes.DecisionTreeClassifier'>,without Outliers,0.357708
<class 'sklearn.tree._classes.DecisionTreeClassifier'>,with Outliers,0.352381
<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,with Outliers,0.295397
<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,without Outliers,0.259684
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,without Outliers,0.197233
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,with Outliers,0.152857


**Temuan** : Dengan split sebanyak 5 dan recall-scoring, ditemukan bahwa CV Score dari dataset dengan Outliers lebih baik daripada CV Score dari datest tanpa Outliers.

Maka, selanjutnya kita akan menggunakan algoritma DecisionTree, RandomForest, juga GradientBoosting menggunakan dataset dengan outliers untuk Hyperparameter Tuning.

# Hyperparameter Tuning

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import numpy as np

In [16]:
algorithm_list = [DecisionTreeClassifier,RandomForestClassifier, GradientBoostingClassifier]
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,random_state=11111992)

def pipelining(algorithm):
    model_pipeline = Pipeline([
        ('pca', PCA()),
        ('algorithm', algorithm())
    ])
    return model_pipeline

In [37]:
params_for_DecisionTreeClassifier = {
    'pca__n_components': [10,15,20],
    'algorithm__random_state': [11111992],
    'algorithm__splitter' : ['best', 'random'],
    'algorithm__max_features' : list(range(1,X_train.shape[1])),
    'algorithm__class_weight': [{0: 1, 1: 4.75}],
    # 'algorithm__class_weight': [{0: x, 1: 1.0-x} for x in np.linspace(0.05, 0.95, 20)], ----> Terlalu berat
    # 'algorithm__class_weight': [{0: 0.75, 1: 0.25}, {0: 0.7, 1: 0.3}, {0: 0.8, 1: 0.2}], ---> Masih berat
    'algorithm__max_depth' : np.linspace(4, 15, 10, endpoint=True),
    'algorithm__min_samples_split' : [200,300,400],
    'algorithm__min_samples_leaf' : [100,150,200],
}

## _________________________________________________________________________________ Purpose :
# DecisionTreeClassifier()
## _________________________________________________________________________________ Report :
# Fitting 5 folds for each of 32940 candidates, totalling 164700 fits
# [Parallel(n_jobs=-1)]: Done 164700 out of 164700 | elapsed: 16.5min finished
## _________________________________________________________________________________ Verbose Output :
# GridSearchCV(cv=5,
#              estimator=Pipeline(steps=[('pca', PCA()),
#                                        ('algorithm',
#                                         DecisionTreeClassifier())]),
#              n_jobs=-1,
#              param_grid={'algorithm__class_weight': [{0: 1, 1: 4.75}],
#                          'algorithm__max_depth': array([ 4.        ,  5.22222222,  6.44444444,  7.66666667,  8.88888889,
#        10.11111111, 11.33333333, 12.55555556, 13.77777778, 15.        ]),
#                          'algorithm__max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9,
#                                                      10, 11, 12, 13, 14, 15, 16,
#                                                      17, 18, 19, 20, 21, 22, 23,
#                                                      24, 25, 26, 27, 28, 29, 30, ...],
#                          'algorithm__min_samples_leaf': [100, 150, 200],
#                          'algorithm__min_samples_split': [200, 300, 400],
#                          'algorithm__random_state': [11111992],
#                          'algorithm__splitter': ['best', 'random'],
#                          'pca__n_components': [10, 15, 20]},
#              scoring='recall', verbose=1)
## _________________________________________________________________________________ grid.best_params_
# {'algorithm__class_weight': {0: 1, 1: 4.75},
#  'algorithm__max_depth': 4.0,
#  'algorithm__max_features': 1,
#  'algorithm__min_samples_leaf': 100,
#  'algorithm__min_samples_split': 200,
#  'algorithm__random_state': 11111992,
#  'algorithm__splitter': 'random',
#  'pca__n_components': 10}

params_for_RandomForestClassifier = {
    'pca__n_components': [10,15,20],
    'algorithm__random_state': [11111992],
    'algorithm__max_features' : list(range(1,X_train.shape[1])),
    'algorithm__class_weight': [{0: 1, 1: 4.75}],
    'algorithm__max_depth' : np.linspace(4, 15, 10, endpoint=True),
    'algorithm__min_samples_split' : [200,300,400],
    'algorithm__min_samples_leaf' : [100,150,200],
}

In [18]:
grid = GridSearchCV(pipelining(DecisionTreeClassifier), params, cv=5, scoring='recall', n_jobs=-1, verbose=1)

In [19]:
%%script false --no-raise-error
grid.fit(X_train, y_train) # for DecisionTreeClassifier()

Fitting 5 folds for each of 32940 candidates, totalling 164700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 1260 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 3260 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 6060 tasks      | elapsed:   43.8s
[Parallel(n_jobs=-1)]: Done 9660 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 14060 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 19260 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 25260 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 32060 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 39660 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 48060 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 57260 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 67260 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 78060 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 89660 

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('pca', PCA()),
                                       ('algorithm',
                                        DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'algorithm__class_weight': [{0: 1, 1: 4.75}],
                         'algorithm__max_depth': array([ 4.        ,  5.22222222,  6.44444444,  7.66666667,  8.88888889,
       10.11111111, 11.33333333, 12.55555556, 13.77777778, 15.        ]),
                         'algorithm__max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                     10, 11, 12, 13, 14, 15, 16,
                                                     17, 18, 19, 20, 21, 22, 23,
                                                     24, 25, 26, 27, 28, 29, 30, ...],
                         'algorithm__min_samples_leaf': [100, 150, 200],
                         'algorithm__min_samples_split': [200, 300, 400],
                         'algorithm__r

In [20]:
grid.best_params_

{'algorithm__class_weight': {0: 1, 1: 4.75},
 'algorithm__max_depth': 4.0,
 'algorithm__max_features': 1,
 'algorithm__min_samples_leaf': 100,
 'algorithm__min_samples_split': 200,
 'algorithm__random_state': 11111992,
 'algorithm__splitter': 'random',
 'pca__n_components': 10}

In [21]:
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm__class_weight,param_algorithm__max_depth,param_algorithm__max_features,param_algorithm__min_samples_leaf,param_algorithm__min_samples_split,param_algorithm__random_state,...,param_pca__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.052028,0.023372,0.006820,0.001052,"{0: 1, 1: 4.75}",4,1,100,200,11111992,...,10,"{'algorithm__class_weight': {0: 1, 1: 4.75}, '...",0.370370,0.576923,0.500000,0.576923,0.481481,0.501140,0.076124,6157
1,0.015970,0.004384,0.006762,0.001772,"{0: 1, 1: 4.75}",4,1,100,200,11111992,...,15,"{'algorithm__class_weight': {0: 1, 1: 4.75}, '...",0.592593,0.769231,0.500000,0.653846,0.407407,0.584615,0.124500,3667
2,0.021178,0.002208,0.007619,0.000248,"{0: 1, 1: 4.75}",4,1,100,200,11111992,...,20,"{'algorithm__class_weight': {0: 1, 1: 4.75}, '...",0.555556,0.769231,0.500000,0.653846,0.629630,0.621652,0.091703,2071
3,0.016053,0.001422,0.007029,0.001068,"{0: 1, 1: 4.75}",4,1,100,200,11111992,...,10,"{'algorithm__class_weight': {0: 1, 1: 4.75}, '...",0.777778,0.923077,0.961538,0.884615,0.777778,0.864957,0.075223,1
4,0.018503,0.001096,0.013315,0.011076,"{0: 1, 1: 4.75}",4,1,100,200,11111992,...,15,"{'algorithm__class_weight': {0: 1, 1: 4.75}, '...",0.000000,0.000000,0.461538,0.346154,0.296296,0.220798,0.188082,7381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32935,0.017221,0.001922,0.000000,0.000000,"{0: 1, 1: 4.75}",15,61,200,400,11111992,...,15,"{'algorithm__class_weight': {0: 1, 1: 4.75}, '...",,,,,,,,14212
32936,0.018153,0.001917,0.000000,0.000000,"{0: 1, 1: 4.75}",15,61,200,400,11111992,...,20,"{'algorithm__class_weight': {0: 1, 1: 4.75}, '...",,,,,,,,14211
32937,0.015956,0.003959,0.000000,0.000000,"{0: 1, 1: 4.75}",15,61,200,400,11111992,...,10,"{'algorithm__class_weight': {0: 1, 1: 4.75}, '...",,,,,,,,14210
32938,0.026303,0.008191,0.000000,0.000000,"{0: 1, 1: 4.75}",15,61,200,400,11111992,...,15,"{'algorithm__class_weight': {0: 1, 1: 4.75}, '...",,,,,,,,28961


In [22]:
grid.best_estimator_

Pipeline(steps=[('pca', PCA(n_components=10)),
                ('algorithm',
                 DecisionTreeClassifier(class_weight={0: 1, 1: 4.75},
                                        max_depth=4.0, max_features=1,
                                        min_samples_leaf=100,
                                        min_samples_split=200,
                                        random_state=11111992,
                                        splitter='random'))])

In [23]:
grid.best_score_

0.8649572649572651

In [33]:
grid.score(X_train, y_train)

0.8560606060606061

In [24]:
grid.score(X_test, y_test)

0.9545454545454546

In [35]:
from sklearn.metrics import classification_report
y_pred = grid.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.23      0.37       639
           1       0.19      0.86      0.31       132

    accuracy                           0.34       771
   macro avg       0.54      0.54      0.34       771
weighted avg       0.77      0.34      0.36       771



In [36]:
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.28      0.43       214
           1       0.21      0.95      0.35        44

    accuracy                           0.39       258
   macro avg       0.59      0.62      0.39       258
weighted avg       0.84      0.39      0.42       258



In [40]:
print('######################################################################################################### Random Forest')

######################################################################################################### Random Forest


In [41]:
grid = GridSearchCV(pipelining(RandomForestClassifier), params_for_RandomForestClassifier, cv=5, scoring='recall', n_jobs=-1, verbose=1)

In [42]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 16470 candidates, totalling 82350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 9792 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 11242 tasks      |

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('pca', PCA()),
                                       ('algorithm',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'algorithm__class_weight': [{0: 1, 1: 4.75}],
                         'algorithm__max_depth': array([ 4.        ,  5.22222222,  6.44444444,  7.66666667,  8.88888889,
       10.11111111, 11.33333333, 12.55555556, 13.77777778, 15.        ]),
                         'algorithm__max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                     10, 11, 12, 13, 14, 15, 16,
                                                     17, 18, 19, 20, 21, 22, 23,
                                                     24, 25, 26, 27, 28, 29, 30, ...],
                         'algorithm__min_samples_leaf': [100, 150, 200],
                         'algorithm__min_samples_split': [200, 300, 400],
                         'algorithm__r

In [43]:
grid.best_params_

{'algorithm__class_weight': {0: 1, 1: 4.75},
 'algorithm__max_depth': 5.222222222222222,
 'algorithm__max_features': 18,
 'algorithm__min_samples_leaf': 150,
 'algorithm__min_samples_split': 300,
 'algorithm__random_state': 11111992,
 'pca__n_components': 20}

In [44]:
grid.best_score_

0.6290598290598289

In [45]:
grid.score(X_train, y_train)

0.6363636363636364

In [46]:
grid.score(X_test, y_test)

0.45454545454545453

In [47]:
y_pred = grid.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.72      0.80       639
           1       0.32      0.64      0.42       132

    accuracy                           0.70       771
   macro avg       0.61      0.68      0.61       771
weighted avg       0.80      0.70      0.74       771



In [48]:
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.74      0.80       214
           1       0.26      0.45      0.33        44

    accuracy                           0.69       258
   macro avg       0.57      0.60      0.57       258
weighted avg       0.76      0.69      0.72       258



In [28]:
np.linspace(4, 15, 10, endpoint=True)

array([ 4.        ,  5.22222222,  6.44444444,  7.66666667,  8.88888889,
       10.11111111, 11.33333333, 12.55555556, 13.77777778, 15.        ])

In [29]:

np.linspace(0.05, 0.95, 20)

array([0.05      , 0.09736842, 0.14473684, 0.19210526, 0.23947368,
       0.28684211, 0.33421053, 0.38157895, 0.42894737, 0.47631579,
       0.52368421, 0.57105263, 0.61842105, 0.66578947, 0.71315789,
       0.76052632, 0.80789474, 0.85526316, 0.90263158, 0.95      ])

In [30]:
def x(m1, m2, m3, l):
    m_total = (m1+m2+m3)/3
    m_total = m_total*40/100
    l = l / 10
    return m_total + l

In [31]:
[int(x) for x in np.linspace(4, 20, num = 2)]

[4, 20]

In [32]:
np.linspace(4, 20, 2)

array([ 4., 20.])