# Attrition

In [2]:
#Importing the required modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#importing the required sklearn methods
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score

%matplotlib inline

In [3]:
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

dec_cols = ['Age', 'Attrition', 'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'TotalWorkingYears', 'WorkLifeBalance', 'YearsInCurrentRole']
df = df[dec_cols]

categorical_col = []
for column in df.columns:
    if df[column].dtype == object and len(df[column].unique()) <= 50:
        categorical_col.append(column)
        
df['Attrition'] = df.Attrition.astype("category").cat.codes

categorical_col.remove('Attrition')


label = LabelEncoder()
for column in categorical_col:
    df[column] = label.fit_transform(df[column])

norm_feature = ['Age', 'MonthlyIncome','PercentSalaryHike', 'TotalWorkingYears', 'YearsInCurrentRole']
for feature_name in norm_feature:
    max_value = df[feature_name].max()
    min_value = df[feature_name].min()
    df[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)

df = pd.get_dummies(df, columns=categorical_col)

In [4]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [5]:
X = df.drop('Attrition', axis=1)
y = df.Attrition

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
y_test_list = y_test.tolist()


In [6]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train, y_train)


print("Details of the Random Forest classifier using Imbalance dataset\n ________________")
#print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

y_pred_rf = rf_clf.predict(X_test)
y_pred_rf_list = y_pred_rf.tolist()

print("\n================================================")
print("AOC value = ", (np.round(roc_auc_score(y_test, y_pred_rf_list),4)))

Details of the Random Forest classifier using Imbalance dataset
 ________________
Test Result:
Accuracy Score: 86.14%
_______________________________________________
CLASSIFICATION REPORT:
                    0          1  accuracy   macro avg  weighted avg
precision    0.883191   0.411765  0.861413    0.647478      0.821701
recall       0.968750   0.145833  0.861413    0.557292      0.861413
f1-score     0.923994   0.215385  0.861413    0.569689      0.831567
support    320.000000  48.000000  0.861413  368.000000    368.000000
_______________________________________________
Confusion Matrix: 
 [[310  10]
 [ 41   7]]


AOC value =  0.5573


In [7]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(sampling_strategy='minority')
X = df.drop('Attrition', axis=1)
y = df.Attrition
X_smote, y_smote = sm.fit_resample(X, y)
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_smote, y_smote, test_size=0.25, random_state=42)
y_test_list = y_test.tolist()

In [8]:
from sklearn.ensemble import RandomForestClassifier

rf_clf_bal = RandomForestClassifier(n_estimators=100)
rf_clf_bal.fit(X_train_bal, y_train_bal)


print("Details of the Random Forest classifier using balance dataset\n ________________")
#print_score(rf_clf_bal, X_train_bal, y_train_bal, X_test, y_test, train=True)
print_score(rf_clf_bal, X_train_bal, y_train_bal, X_test_bal, y_test_bal, train=False)

y_pred_rf_bal = rf_clf_bal.predict(X_test_bal)
y_pred_rf_bal_list = y_pred_rf_bal.tolist()

print("\n================================================")
print("AOC value = ", (np.round(roc_auc_score(y_test_bal, y_pred_rf_bal_list),4)))

Details of the Random Forest classifier using balance dataset
 ________________
Test Result:
Accuracy Score: 88.49%
_______________________________________________
CLASSIFICATION REPORT:
                    0           1  accuracy   macro avg  weighted avg
precision    0.877358    0.892977  0.884927    0.885168      0.885104
recall       0.897106    0.872549  0.884927    0.884828      0.884927
f1-score     0.887122    0.882645  0.884927    0.884884      0.884902
support    311.000000  306.000000  0.884927  617.000000    617.000000
_______________________________________________
Confusion Matrix: 
 [[279  32]
 [ 39 267]]


AOC value =  0.8848


import shap

shap.initjs()
explainer = shap.TreeExplainer(rf_clf_bal)
shap_values = explainer.shap_values(X_smote)

shap.summary_plot(shap_values, X_smote)