# Import Library & Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder

from func import outlier_counter, get_all_univariate_outlier_index
from modelling_purpose import Xy, algorithm_report_accumulation

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('csv/imputed.csv')

# With / Without Outlier

Kita akan membuat 2 dataset dari `imputed.csv`, yaitu:
- Dengan Outlier
- Tanpa Outlier (Univariate)

In [3]:
df_without_outlier = df.copy()
outlier_columns  = ['TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole',
                    'YearsSinceLastPromotion', 'YearsWithCurrManager', 'TrainingTimesLastYear',
                    'NumCompaniesWorked', 'MonthlyIncome']
outlier_index = get_all_univariate_outlier_index(df_without_outlier, outlier_columns)
df_without_outlier.drop(df_without_outlier.index[outlier_index], inplace=True)

In [4]:
df.shape

(1029, 31)

In [5]:
df_without_outlier.shape

(650, 31)

`df` adalah dataset dengan outlier. Dan `df_without_outlier` adalah dataset tanpa outlier.

# Feature Engineering

In [6]:
# def one_hot(df,column):
#     df = pd.concat(
#     [
#         df,
#         pd.get_dummies(df[column], prefix=column, drop_first=True)
#     ],
#     axis=1)
#     final = df.drop(columns=column)
#     return final

In [7]:
X = df.drop('Attrition',axis=1)
y = df['Attrition'].map({'Yes':1,'No':0})
X_wo = df_without_outlier.drop('Attrition',axis=1)
y_wo = df_without_outlier['Attrition'].map({'Yes':1,'No':0})

categorical_features = X.select_dtypes(include='O').columns.tolist()
ordinal = ['Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel',
           'JobSatisfaction', 'PerformanceRating', 'RelationshipSatisfaction',
           'StockOptionLevel', 'WorkLifeBalance']
for i in ordinal:
    X[i] = X[i].astype(str)
categorical_features += ordinal
categorical_features = categorical_features
categorical_features

['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'OverTime',
 'Education',
 'EnvironmentSatisfaction',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StockOptionLevel',
 'WorkLifeBalance']

Semua categorical features, entah nominal ataupun ordinal akan digabung dan diaplikasikan dengan One-Hot Encoder.
Alasan mengapa ordinal features juga menggunakan One-Hot Encoder adalah karena pada kasus classifier, ordinal feature yang mempunyai continuous behaviour tidak berpengaruh seperti pada kasus regressor. Melakukan pe-ranking-an pada suatu feature menjadi tidak berguna.

In [8]:
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)
X_wo = pd.get_dummies(X_wo, columns=categorical_features, drop_first=True)

# Model

Pada classifier problem, kita akan memilih salah satu metrik penilaian yang akan dijadikan acuan. Ini dikarenakan False Positive dan False Negative akan selalu trade-off satu sama lain. Jadinya, umumnya kita akan dihadapkan dengan 2 pilihan berikut:
- Kasus False Negative lebih beresiko daripada kasus False Positive
- Kasus False Positive lebih beresiko daripada kasus False Negative

Di kasus ini, False Positive dan False Negative bisa diterjemahkan seperti ini:
- FP : Pegawai yang tidak keluar, terprediksi keluar.
- FN : Pegawai yang keluar, terprediksi tidak keluar.

Untuk kasus Attrition, saya menganggap kasus **False Negative adalah yang beresiko**.

Alasannya adalah, apabila ada pegawai keluar namun terprediksi tidak keluar, perusahaan beresiko kehilangan pegawai potensialnya. Dengan decision seperti ini, maka saya putuskan untuk memilih **Recall** sebagai metric yang diutamakan.

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [10]:
algorithm_list = [LogisticRegression,DecisionTreeClassifier,RandomForestClassifier, GradientBoostingClassifier]

a = algorithm_report_accumulation(algorithm_list, X, y, .2, 'with Outliers')
b = algorithm_report_accumulation(algorithm_list, X_wo, y_wo, .2, 'without Outliers')
x = pd.concat([a, b],ignore_index=True)
x.set_index('Algorithm')

Unnamed: 0_level_0,Train Recall,Test Recall,Notes
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.007092,0.028571,with Outliers
<class 'sklearn.tree._classes.DecisionTreeClassifier'>,1.0,0.257143,with Outliers
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,1.0,0.142857,with Outliers
<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.765957,0.228571,with Outliers
<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.033333,0.136364,without Outliers
<class 'sklearn.tree._classes.DecisionTreeClassifier'>,1.0,0.5,without Outliers
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,1.0,0.181818,without Outliers
<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.922222,0.318182,without Outliers


**Temuan** :
- LogisticRegression dengan default parameter underfit dengan parah. Entah di dataset dengan dan tanpa outlier, recall score tidak sampai 0.15.
- Model lainnya (selain LogisticRegression) overfit.

Dengan temuan ini, saya tidak melanjutkan untuk menggunakan LogisticRegression. Dengan asumsi bahwa dengan score serendah itu, tentunya akan memerlukan effort lebih untuk menaikkan scorenya, meskipun menggunakan Hyperparameter Tuning. Karena pada umumnya lebih mudah untuk melakukan tuning pada model yang overfit daripada yang underfit.

# Dataset Cross Validation Checking

In [11]:
from sklearn.model_selection import cross_val_score

In [12]:
def find_CVS(features,target,model, partition, scoring_system):
    X_train, X_test, y_train, y_test = train_test_split(features,target,random_state=11111992)
    classification = model()
    score = cross_val_score(classification,features, target, cv=partition, scoring=scoring_system).mean()
    return score

In [13]:
def cv_score_accumulation(algorithm_list, X, y, partition, scoring_system, notes):
    cv_score = []
    notes_arr = []
    for i in algorithm_list :
        score = find_CVS(X,y,i, partition, scoring_system)
        cv_score.append(score)
        notes_arr.append(notes)

    cv_df = pd.DataFrame({
        'Algorithm': algorithm_list,
        'Notes': notes_arr,
        'CV Score': cv_score
    })
    return cv_df

In [14]:
algorithm_list = [DecisionTreeClassifier,RandomForestClassifier, GradientBoostingClassifier]
a = cv_score_accumulation(algorithm_list, X, y, 5, 'recall', 'with Outliers')
b = cv_score_accumulation(algorithm_list, X_wo, y_wo, 5, 'recall', 'without Outliers')
# x = pd.concat([a, b],ignore_index=True)
# x.set_index('Algorithm').sort_values(by='CV Score', ascending=False)
a

Unnamed: 0,Algorithm,Notes,CV Score
0,<class 'sklearn.tree._classes.DecisionTreeClas...,with Outliers,0.346508
1,<class 'sklearn.ensemble._forest.RandomForestC...,with Outliers,0.153175
2,<class 'sklearn.ensemble._gb.GradientBoostingC...,with Outliers,0.295397


In [15]:
b

Unnamed: 0,Algorithm,Notes,CV Score
0,<class 'sklearn.tree._classes.DecisionTreeClas...,without Outliers,0.358498
1,<class 'sklearn.ensemble._forest.RandomForestC...,without Outliers,0.152569
2,<class 'sklearn.ensemble._gb.GradientBoostingC...,without Outliers,0.259684


**Temuan** : Dengan split sebanyak 5 dan recall-scoring, ditemukan bahwa urutan algoritma tidak berubah dengan ada atau tidaknya outlier.

Adapun saya memilih dataset dengan outliers karena pertimbangan berikut:
- Dengan memilih dataset tanpa outlier, maka jumlah rows berkurang sangat banyak. Dan ini berpengaruh dengan karakteristik data.
- Pada EDA, meskipun dianggap univariate outlier, namun secara keseluruhan ia bukan multivariate outliers (based on Mahalanobis Distance)
- According to the rank, dengan adanya outliers ataupun tidak, tidak membuat urutan algoritma tersebut berubah. 

Maka, selanjutnya kita akan menggunakan algoritma DecisionTree, RandomForest, juga GradientBoosting menggunakan dataset dengan outliers untuk Hyperparameter Tuning.

# Hyperparameter Tuning

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import numpy as np

In [17]:
algorithm_list = [
    DecisionTreeClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier
]
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,random_state=11111992)

def pipelining(algorithm):
    model_pipeline = Pipeline([
        ('pca', PCA()),
        ('algorithm', algorithm())
    ])
    return model_pipeline

In [18]:
params_for_DecisionTreeClassifier = {
    'pca__n_components': [10,15,20],
    'algorithm__random_state': [11111992],
    'algorithm__splitter' : ['best', 'random'],
    'algorithm__max_features' : list(range(1,X_train.shape[1])),
#     'algorithm__class_weight': [{0: 1, 1: 4.75}], ----> Bisa dicoba di laptop
#     'algorithm__class_weight': [{0: x, 1: 1.0-x} for x in np.linspace(0.05, 0.95, 20)], ----> Terlalu berat
#     'algorithm__class_weight': [{0: 0.75, 1: 0.25}, {0: 0.7, 1: 0.3}, {0: 0.8, 1: 0.2}], ---> Masih berat
    'algorithm__class_weight': [{0: 1, 1: 4.75}],
    'algorithm__max_depth' : np.linspace(4, 15, 10),
    'algorithm__min_samples_split' : [200,300,400],
    'algorithm__min_samples_leaf' : [100,150,200],
}

params_for_RandomForestClassifier = {
    'pca__n_components': [10,15,20],
    'algorithm__n_estimators': [100,300,500,700],
    'algorithm__random_state': [11111992],
#     'algorithm__max_features' : list(range(1,X_train.shape[1])),
    'algorithm__class_weight': [{0: 1, 1: 4.75}],
    'algorithm__max_depth' : np.linspace(4, 15, 10),
    'algorithm__min_samples_split' : [200,300,400],
    'algorithm__min_samples_leaf' : [100,150,200],
}

params_for_GradientBoostingClassifier = {
    'pca__n_components': [10,15,20],
#     'algorithm__criterion': ["friedman_mse",  "mae"],
#     'algorithm__loss':["deviance","exponential"],
    'algorithm__random_state': [11111992],
    'algorithm__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],
    'algorithm__n_estimators': [100,400,800],
#     'algorithm__min_samples_split' : [200,300,400],
#     'algorithm__min_samples_leaf' : [100,150,200],
    'algorithm__max_depth' : np.linspace(4, 20, 4),
    'algorithm__subsample': [0.7,0.8],
}

In [19]:
params_list = [
    params_for_DecisionTreeClassifier,
    params_for_RandomForestClassifier,
    params_for_GradientBoostingClassifier
]

grid_list = []

for i in range(3):
    grid = GridSearchCV(pipelining(algorithm_list[i]),
                        params_list[i],
                        cv=5, scoring='recall', n_jobs=-1, verbose=2)
    print(algorithm_list[i])
    grid.fit(X_train, y_train)
    grid_list.append(grid)

<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Fitting 5 folds for each of 32940 candidates, totalling 164700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 956 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 2580 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 4844 tasks      | elapsed:   32.7s
[Parallel(n_jobs=-1)]: Done 7764 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done 11324 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 15540 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 20396 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 25908 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 32060 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 38868 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 46316 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 54420 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 63164 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 72564 t

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 3273 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 20.3min
[Parallel(n_jobs=-1)]: Done 4893 tasks      | elapsed: 24.6min
[Parallel(n_jobs=-1)]: Done 5400 out of 5400 | elapsed: 27.1min finished


<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 19.0min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 30.2min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 44.1min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 55.6min
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed: 61.3min finished


In [20]:
%%script false --no-raise-error
grid.fit(X_train, y_train) # for DecisionTreeClassifier()

In [21]:
print('################################################################################################### Report')

################################################################################################### Report


In [22]:
for i in range(3):
    print('-----------------------------')
    print(algorithm_list[i])
    print(grid_list[i].best_estimator_)

-----------------------------
<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Pipeline(steps=[('pca', PCA(n_components=10)),
                ('algorithm',
                 DecisionTreeClassifier(class_weight={0: 1, 1: 4.75},
                                        max_depth=4.0, max_features=1,
                                        min_samples_leaf=100,
                                        min_samples_split=200,
                                        random_state=11111992,
                                        splitter='random'))])
-----------------------------
<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Pipeline(steps=[('pca', PCA(n_components=20)),
                ('algorithm',
                 RandomForestClassifier(class_weight={0: 1, 1: 4.75},
                                        max_depth=4.0, min_samples_leaf=150,
                                        min_samples_split=200, n_estimators=300,
                                        random_state=1

In [23]:
grid_list[0].best_params_

{'algorithm__class_weight': {0: 1, 1: 4.75},
 'algorithm__max_depth': 4.0,
 'algorithm__max_features': 1,
 'algorithm__min_samples_leaf': 100,
 'algorithm__min_samples_split': 200,
 'algorithm__random_state': 11111992,
 'algorithm__splitter': 'random',
 'pca__n_components': 10}

In [24]:
grid_list[1].best_params_

{'algorithm__class_weight': {0: 1, 1: 4.75},
 'algorithm__max_depth': 4.0,
 'algorithm__min_samples_leaf': 150,
 'algorithm__min_samples_split': 200,
 'algorithm__n_estimators': 300,
 'algorithm__random_state': 11111992,
 'pca__n_components': 20}

In [25]:
grid_list[2].best_params_

{'algorithm__learning_rate': 0.5,
 'algorithm__max_depth': 4.0,
 'algorithm__n_estimators': 100,
 'algorithm__random_state': 11111992,
 'algorithm__subsample': 0.7,
 'pca__n_components': 20}

In [26]:
for i in range(3):
    print('-----------------------------')
    print(algorithm_list[i])
    print(grid_list[i].best_score_)

-----------------------------
<class 'sklearn.tree._classes.DecisionTreeClassifier'>
0.8649572649572651
-----------------------------
<class 'sklearn.ensemble._forest.RandomForestClassifier'>
0.592022792022792
-----------------------------
<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
0.22763532763532765


In [27]:
print(algorithm_list[0])
y_pred = grid_list[0].predict(X_train)
print(classification_report(y_train, y_pred))
y_pred = grid_list[0].predict(X_test)
print(classification_report(y_test, y_pred))

<class 'sklearn.tree._classes.DecisionTreeClassifier'>
              precision    recall  f1-score   support

           0       0.89      0.23      0.37       639
           1       0.19      0.86      0.31       132

    accuracy                           0.34       771
   macro avg       0.54      0.54      0.34       771
weighted avg       0.77      0.34      0.36       771

              precision    recall  f1-score   support

           0       0.97      0.28      0.43       214
           1       0.21      0.95      0.35        44

    accuracy                           0.39       258
   macro avg       0.59      0.62      0.39       258
weighted avg       0.84      0.39      0.42       258



In [28]:
print(algorithm_list[1])
y_pred = grid_list[1].predict(X_train)
print(classification_report(y_train, y_pred))
y_pred = grid_list[1].predict(X_test)
print(classification_report(y_test, y_pred))

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
              precision    recall  f1-score   support

           0       0.91      0.77      0.83       639
           1       0.36      0.64      0.46       132

    accuracy                           0.74       771
   macro avg       0.64      0.70      0.65       771
weighted avg       0.82      0.74      0.77       771

              precision    recall  f1-score   support

           0       0.91      0.81      0.86       214
           1       0.40      0.61      0.48        44

    accuracy                           0.78       258
   macro avg       0.65      0.71      0.67       258
weighted avg       0.82      0.78      0.79       258



In [29]:
print(algorithm_list[2])
y_pred = grid_list[2].predict(X_train)
print(classification_report(y_train, y_pred))
y_pred = grid_list[2].predict(X_test)
print(classification_report(y_test, y_pred))

<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       639
           1       0.99      0.96      0.98       132

    accuracy                           0.99       771
   macro avg       0.99      0.98      0.99       771
weighted avg       0.99      0.99      0.99       771

              precision    recall  f1-score   support

           0       0.86      0.93      0.89       214
           1       0.42      0.25      0.31        44

    accuracy                           0.81       258
   macro avg       0.64      0.59      0.60       258
weighted avg       0.78      0.81      0.79       258



In [30]:
print('###################################################################################################### End of Report')

###################################################################################################### End of Report


**Temuan** :
- DecisionTreeClassifier memerlukan waktu yang lebih singkat (15 menit), namun mendapatkan score recall yang lebih baik daripada RandomForestClassifier (27 menit) dan juga GradientBoosting (61 menit)

Oleh karenanya, selanjutnya saya akan menggunakan DecisionTreeClassifier pada dataset dengan outlier. Akan tetapi, di notebook selanjutnya akan saya lakukan GridSearchCV sekali lagi pada model ini dengan parameter yang mungkin sedikit lebih banyak daripada pada notebook ini.