## Imports

In [1]:
import warnings

import numpy as np
import pandas as pd

# Silence warnings before the datatile import so anything it emits while
# loading is suppressed as well.
warnings.filterwarnings('ignore')
from datatile.summary.df import DataFrameSummary

# Lift pandas' display truncation so wide/long frames render in full.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [2]:
import seaborn as sns
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib

This is a fictional data set created by IBM data scientists to explore employee attrition and retention. In practice, data like this would usually be gathered by the Human Resources department.

In [3]:
# Data source: https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset
# The CSV is expected to sit next to this notebook.
csv_path = './WA_Fn-UseC_-HR-Employee-Attrition.csv'
ec_df = pd.read_csv(csv_path)

In [4]:
# Preview the first rows to sanity-check column names and value formats.
ec_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [5]:
# (rows, columns) of the raw frame; the column count still includes the Attrition target.
ec_df.shape

(1470, 35)

### Preprocessing pipeline

In [6]:
# Grab every column label of the raw frame; the next cell walks these labels
# and splits them into categorical (object dtype) and numerical groups.
columns = ec_df.columns

In [7]:
# Flag each column once: True when pandas stored it with the generic
# 'object' dtype (treated here as categorical), False otherwise.
# NOTE(review): ec_df still holds the target at this point, so 'Attrition'
# presumably ends up in categorical_columns -- confirm before using the list
# as a feature list.
is_object_dtype = {name: ec_df[name].dtype == 'object' for name in columns}

categorical_columns = [name for name, flagged in is_object_dtype.items() if flagged]
numerical_columns = [name for name, flagged in is_object_dtype.items() if not flagged]

In [8]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Numeric branch: fill gaps with the column median, then rescale with MinMaxScaler.
numeric_pipeline = Pipeline(
    steps=[
        ('num_imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent level, then one-hot encode.
# NOTE(review): drop='first' together with handle_unknown='ignore' looks like it
# would make an unseen category indistinguishable from the dropped first level
# (both all zeros) -- confirm this is acceptable for the models used here.
category_pipeline = Pipeline(
    steps=[
        ('cat_imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first')),
    ]
)

# Route columns to a branch by dtype: object columns go to the categorical
# branch, every other column goes to the numeric branch.
preprocessing = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, make_column_selector(dtype_exclude='object')),
        ('cat', category_pipeline, make_column_selector(dtype_include='object')),
    ]
)

In [9]:
# Separate the label from the predictors; ec_df itself is left untouched.
target_name = 'Attrition'
y = ec_df[target_name]
X = ec_df.drop(columns=[target_name])

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
    stratify=y,
)

### Baseline Models

### Decision Tree Classifier

In [11]:
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: scikit-learn permutes candidate features at every split, so
# an unseeded tree can give different scores on each run. The seed matches the
# one used for the train/validation split.
dec_clf = DecisionTreeClassifier(random_state=100)

# Give this pipeline its own unfitted copy of the shared preprocessing
# definition, so fitting another pipeline built from the same `preprocessing`
# object cannot re-fit the transformer this pipeline relies on.
dec_pipe = Pipeline(steps=[('preprocessing', clone(preprocessing)), ('model', dec_clf)])

In [12]:
# Fit the preprocessing steps and the decision tree in one call, using only the training split.
dec_pipe.fit(X_train,y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('num_imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x13ae6d490>),
                                                 ('cat',
                                                  Pipeline(steps=[('cat_imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(drop='first',
        

In [13]:
from sklearn.metrics import roc_auc_score,confusion_matrix,accuracy_score

# Hard label predictions, used for the confusion matrices and accuracy.
train_preds = dec_pipe.predict(X_train)
val_preds = dec_pipe.predict(X_val)

# Per-class probabilities, used for ROC AUC. Column 1 is taken as the
# positive-class score -- assumes dec_pipe.classes_[1] is the "Yes"
# (attrition) label; TODO confirm.
train_preds_proba = dec_pipe.predict_proba(X_train)
val_preds_proba = dec_pipe.predict_proba(X_val)

print('Training & Validation ROC AUC Scores:\n', '-'*40)
print('Training   roc auc score= {:.4f}'.format(roc_auc_score(y_train, train_preds_proba[:, 1])))
print('Validation roc auc score= {:.4f}'.format(roc_auc_score(y_val, val_preds_proba[:, 1])))
print('')
print('Training & Validation Confusion Metrices:')
print('Training   confusion matrix:\n', confusion_matrix(y_train, train_preds))
print('Validation confusion matrix:\n', confusion_matrix(y_val, val_preds))
# Both accuracies are rounded to 4 decimals so they are reported at the same
# precision. The "Testing" label refers to the validation split (X_val / y_val).
print('Training Accuracy of the model:\n',round(accuracy_score(y_train,train_preds),4))
print('Testing Accuracy of the model:\n',round(accuracy_score(y_val,val_preds),4))

Training & Validation ROC AUC Scores:
 ----------------------------------------
Training   roc auc score= 1.0000
Validation roc auc score= 0.5882

Training & Validation Confusion Metrices:
Training   confusion matrix:
 [[863   0]
 [  0 166]]
Validation confusion matrix:
 [[305  65]
 [ 46  25]]
Training Accuracy of the model:
 1.0
Testing Accuracy of the model:
 0.7483


### Hyper-parameter tuning for DecisionTreeClassifier

In [14]:
import warnings
warnings.filterwarnings('ignore')
import time
from sklearn.model_selection import GridSearchCV
# roc_auc_score is imported here too so the cell does not rely on an earlier
# cell having brought it into scope.
from sklearn.metrics import accuracy_score, roc_auc_score

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; unlike time.time() it is unaffected by system clock changes.
start = time.perf_counter()

# Candidate settings for the pipeline step named 'model' (the decision tree).
parameters = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [2, 4, 6, 8],
    'model__min_samples_split': [2, 3, 4],
}

# The whole pipeline is searched, so preprocessing is re-fitted on each
# cross-validation training fold. Candidates are ranked by ROC AUC.
# NOTE(review): results can vary between runs if the tree inside dec_pipe has
# no fixed random_state.
clf = GridSearchCV(dec_pipe, parameters, scoring='roc_auc', n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.best_params_)

# With the default refit=True, predictions come from the best configuration
# re-fitted on the full training split.
predicted = clf.predict(X_val)
proba = clf.predict_proba(X_val)
print('Accuracy of the model after hyperparameter tuning')
print(round(accuracy_score(y_val, predicted),4))
print('AUC Score of the result is:')
print(round(roc_auc_score(y_val, proba[:, 1]),4))

# The measured interval covers the search, the validation predictions and the
# metric computation, in seconds.
end = time.perf_counter()
print('Execution time is:')
print(end - start)

{'model__criterion': 'entropy', 'model__max_depth': 2, 'model__min_samples_split': 2}
Accuracy of the model after hyperparameter tuning
0.8277
AUC Score of the result is:
0.6902
Execution time is:
1.5954921245574951


### KNN

In [15]:
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()

# Use an unfitted copy of the shared preprocessing definition rather than the
# same object that the decision-tree pipeline holds; otherwise fitting this
# pipeline would re-fit the transformer inside that other pipeline too.
knn_pipe = Pipeline(steps=[('preprocessing', clone(preprocessing)), ('model', knn_clf)])

In [16]:
# Fit the preprocessing steps and the KNN classifier in one call, using only the training split.
knn_pipe.fit(X_train,y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('num_imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x13ae6d490>),
                                                 ('cat',
                                                  Pipeline(steps=[('cat_imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(drop='first',
        

In [17]:
from sklearn.metrics import roc_auc_score,confusion_matrix,accuracy_score

# Hard label predictions for both splits (training first, then validation).
train_preds, val_preds = (knn_pipe.predict(split) for split in (X_train, X_val))

# Class-probability estimates for both splits, in the same order.
train_preds_proba, val_preds_proba = (
    knn_pipe.predict_proba(split) for split in (X_train, X_val)
)

# ROC AUC is scored on column 1 of the probability matrix.
train_auc = roc_auc_score(y_train, train_preds_proba[:, 1])
val_auc = roc_auc_score(y_val, val_preds_proba[:, 1])

print('Training & Validation ROC AUC Scores:\n', '-'*40)
print('Training   roc auc score= {:.4f}'.format(train_auc))
print('Validation roc auc score= {:.4f}'.format(val_auc))
print('')

print('Training & Validation Confusion Metrices:')
train_cm = confusion_matrix(y_train, train_preds)
val_cm = confusion_matrix(y_val, val_preds)
print('Training   confusion matrix:\n', train_cm)
print('Validation confusion matrix:\n', val_cm)

# Accuracy on each split, reported to four decimals.
train_acc = round(accuracy_score(y_train, train_preds), 4)
val_acc = round(accuracy_score(y_val, val_preds), 4)
print('Training Accuracy of the model:\n', train_acc)
print('Testing Accuracy of the model:\n', val_acc)

Training & Validation ROC AUC Scores:
 ----------------------------------------
Training   roc auc score= 0.8949
Validation roc auc score= 0.6546

Training & Validation Confusion Metrices:
Training   confusion matrix:
 [[851  12]
 [120  46]]
Validation confusion matrix:
 [[365   5]
 [ 61  10]]
Training Accuracy of the model:
 0.8717
Testing Accuracy of the model:
 0.8503


### Hyper-parameter tuning for KNN

In [18]:
import warnings
warnings.filterwarnings('ignore')
import time
from sklearn.model_selection import GridSearchCV
# roc_auc_score is imported here too so the cell does not rely on an earlier
# cell having brought it into scope.
from sklearn.metrics import accuracy_score, roc_auc_score

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; unlike time.time() it is unaffected by system clock changes.
start = time.perf_counter()

# Candidate settings for the pipeline step named 'model' (the KNN classifier).
parameters = {
    'model__n_neighbors': [3, 5, 10, 15, 20],
    'model__weights': ['uniform', 'distance'],
    'model__metric': ['euclidean', 'manhattan'],
}

# The whole pipeline is searched, so preprocessing is re-fitted on each
# cross-validation training fold. Candidates are ranked by ROC AUC.
clf = GridSearchCV(knn_pipe, parameters, scoring='roc_auc', n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.best_params_)

# With the default refit=True, predictions come from the best configuration
# re-fitted on the full training split.
predicted = clf.predict(X_val)
proba = clf.predict_proba(X_val)
print('Accuracy of the model after hyperparameter tuning')
print(round(accuracy_score(y_val, predicted),4))
print('AUC Score of the result is:')
print(round(roc_auc_score(y_val, proba[:, 1]),4))

# The measured interval covers the search, the validation predictions and the
# metric computation, in seconds.
end = time.perf_counter()
print('Execution time is:')
print(end - start)

{'model__metric': 'manhattan', 'model__n_neighbors': 20, 'model__weights': 'uniform'}
Accuracy of the model after hyperparameter tuning
0.8413
AUC Score of the result is:
0.769
Execution time is:
0.4786062240600586
