In [1]:
#import libraies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#import data
data = pd.read_csv('.\data\WA_Fn-UseC_-HR-Employee-Attrition.csv')

## Data Exploration

In [3]:
# Data Analysis
data.shape

(1470, 35)

In [4]:
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [6]:
# Unique values in each column
data.nunique().sort_values(ascending=True)

Over18                         1
StandardHours                  1
EmployeeCount                  1
Gender                         2
Attrition                      2
PerformanceRating              2
OverTime                       2
MaritalStatus                  3
Department                     3
BusinessTravel                 3
StockOptionLevel               4
EnvironmentSatisfaction        4
JobInvolvement                 4
JobSatisfaction                4
RelationshipSatisfaction       4
WorkLifeBalance                4
Education                      5
JobLevel                       5
EducationField                 6
TrainingTimesLastYear          7
JobRole                        9
NumCompaniesWorked            10
PercentSalaryHike             15
YearsSinceLastPromotion       16
YearsWithCurrManager          18
YearsInCurrentRole            19
DistanceFromHome              29
YearsAtCompany                37
TotalWorkingYears             40
Age                           43
HourlyRate

In [7]:
# split data to features and target
X = data.drop(['Attrition'], axis=1)
y = data['Attrition']

## Feature Engineering

In [8]:
# Drop unnecessary columns
X = X.drop(['EmployeeCount', 'StandardHours', 'Over18'], axis=1)

In [9]:
# mask for columns with unique values less than 10
# cahnge to categorical
mask = data.nunique() < 10
X.loc[:, mask] = X.loc[:, mask].astype('category')

In [10]:
# numeric columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
numeric_cols

Index(['Age', 'DailyRate', 'DistanceFromHome', 'EmployeeNumber', 'HourlyRate',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'PercentSalaryHike', 'TotalWorkingYears', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [11]:
# categorical columns
categorical_cols = X.select_dtypes(include=['category', 'object']).columns
categorical_cols

Index(['BusinessTravel', 'Department', 'Education', 'EducationField',
       'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobLevel',
       'JobRole', 'JobSatisfaction', 'MaritalStatus', 'OverTime',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TrainingTimesLastYear', 'WorkLifeBalance'],
      dtype='object')

In [12]:
# normalize numeric columns
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X.loc[:, numeric_cols] = scaler.fit_transform(X.loc[:, numeric_cols])
X.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0.547619,Travel_Rarely,0.71582,Sales,0.0,2,Life Sciences,0.0,2,Female,...,3,1,0,0.2,0,1,0.15,0.222222,0.0,0.294118
1,0.738095,Travel_Frequently,0.1267,Research & Development,0.25,1,Life Sciences,0.000484,3,Male,...,4,4,1,0.25,3,3,0.25,0.388889,0.066667,0.411765
2,0.452381,Travel_Rarely,0.909807,Research & Development,0.035714,2,Other,0.001451,4,Male,...,3,2,0,0.175,3,3,0.0,0.0,0.0,0.0
3,0.357143,Travel_Frequently,0.923407,Research & Development,0.071429,4,Life Sciences,0.001935,4,Female,...,3,3,0,0.2,3,3,0.2,0.388889,0.2,0.0
4,0.214286,Travel_Rarely,0.350036,Research & Development,0.035714,1,Medical,0.002903,1,Male,...,3,4,1,0.15,3,3,0.05,0.111111,0.133333,0.117647


In [13]:
# categorical columns unique values
X.loc[:, categorical_cols].nunique().sort_values(ascending=True)

PerformanceRating           2
Gender                      2
OverTime                    2
BusinessTravel              3
Department                  3
MaritalStatus               3
StockOptionLevel            4
RelationshipSatisfaction    4
JobSatisfaction             4
WorkLifeBalance             4
JobInvolvement              4
EnvironmentSatisfaction     4
JobLevel                    5
Education                   5
EducationField              6
TrainingTimesLastYear       7
JobRole                     9
dtype: int64

In [14]:
# normalize categorical columns
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X.loc[:, categorical_cols] = X.loc[:, categorical_cols].apply(le.fit_transform)
X.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0.547619,2,0.71582,2,0.0,1,1,0.0,1,0,...,0,0,0,0.2,0,0,0.15,0.222222,0.0,0.294118
1,0.738095,1,0.1267,1,0.25,0,1,0.000484,2,1,...,1,3,1,0.25,3,2,0.25,0.388889,0.066667,0.411765
2,0.452381,2,0.909807,1,0.035714,1,4,0.001451,3,1,...,0,1,0,0.175,3,2,0.0,0.0,0.0,0.0
3,0.357143,1,0.923407,1,0.071429,3,1,0.001935,3,0,...,0,2,0,0.2,3,2,0.2,0.388889,0.2,0.0
4,0.214286,2,0.350036,1,0.035714,0,3,0.002903,0,1,...,0,3,1,0.15,3,2,0.05,0.111111,0.133333,0.117647


In [15]:
y.head()

0    Yes
1     No
2    Yes
3     No
4     No
Name: Attrition, dtype: object

In [16]:
#normalize target
y = y.replace({'Yes': 1, 'No': 0})
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Attrition, dtype: int64

In [17]:
#split data to train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
# Importing classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

models = []
models.append(('LR', LogisticRegression()))
models.append(('RF', RandomForestClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVC', SVC()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('AB', AdaBoostClassifier()))
models.append(('GB', GradientBoostingClassifier()))
models.append(('BC', BaggingClassifier()))
models.append(('ET', ExtraTreesClassifier()))
models.append(('VC', VotingClassifier(estimators=models)))
models.append(('XGB', XGBClassifier()))



In [23]:
# Evaluate each model in turn
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, make_scorer, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import plot_confusion_matrix
#import model_selection
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_val_predict
results = []
names = []

# training model function
def train_predict(model, model_name, X_train, y_train, X_test, y_test):
    # fit the model
    model.fit(X_train, y_train)
    # make predictions
    y_pred = model.predict(X_test)
    # calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    # summarize metrics
    print('model: {}'.format(model_name))
    print('accuracy: {}'.format(accuracy))
    print('f1: {}'.format(f1))
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('roc_auc: {}'.format(roc_auc))
    print('confusion_matrix: {}'.format(confusion_matrix(y_test, y_pred)))
    print('classification_report: {}'.format(classification_report(y_test, y_pred)))
    # store metrics
    results.append([accuracy, f1, precision, recall, roc_auc])
    names.append(model_name)

In [24]:
# train models
for name, model in models:
    train_predict(model, name, X_train, y_train, X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


model: LR
accuracy: 0.9013605442176871
f1: 0.4727272727272727
precision: 0.8125
recall: 0.3333333333333333
roc_auc: 0.6607843137254902
confusion_matrix: [[252   3]
 [ 26  13]]
classification_report:               precision    recall  f1-score   support

           0       0.91      0.99      0.95       255
           1       0.81      0.33      0.47        39

    accuracy                           0.90       294
   macro avg       0.86      0.66      0.71       294
weighted avg       0.89      0.90      0.88       294

model: RF
accuracy: 0.8775510204081632
f1: 0.18181818181818182
precision: 0.8
recall: 0.10256410256410256
roc_auc: 0.5493212669683258
confusion_matrix: [[254   1]
 [ 35   4]]
classification_report:               precision    recall  f1-score   support

           0       0.88      1.00      0.93       255
           1       0.80      0.10      0.18        39

    accuracy                           0.88       294
   macro avg       0.84      0.55      0.56       294
weig

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


model: AB
accuracy: 0.8673469387755102
f1: 0.380952380952381
precision: 0.5
recall: 0.3076923076923077
roc_auc: 0.630316742081448
confusion_matrix: [[243  12]
 [ 27  12]]
classification_report:               precision    recall  f1-score   support

           0       0.90      0.95      0.93       255
           1       0.50      0.31      0.38        39

    accuracy                           0.87       294
   macro avg       0.70      0.63      0.65       294
weighted avg       0.85      0.87      0.85       294

model: GB
accuracy: 0.8877551020408163
f1: 0.4406779661016949
precision: 0.65
recall: 0.3333333333333333
roc_auc: 0.6529411764705881
confusion_matrix: [[248   7]
 [ 26  13]]
classification_report:               precision    recall  f1-score   support

           0       0.91      0.97      0.94       255
           1       0.65      0.33      0.44        39

    accuracy                           0.89       294
   macro avg       0.78      0.65      0.69       294
weighted a

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


: 

: 