In [2]:
#Liberary Section
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report 
from imblearn.over_sampling import SMOTE
from collections import Counter

In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv


In [4]:
# Let's go to import data
data = pd.read_csv("/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv")
print(data.shape)

(1470, 35)


In [5]:
# Check data type of columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [8]:
# Let's check if are missing value
data.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [None]:
We know that Attrition column is our target variable but it is categorized, we need to convert it to numerical

In [9]:
data['Attrition'] = data['Attrition'].apply(lambda x:1 if x == 'Yes' else 0) 

In [None]:
Other binary features are "Over18", "OverTime" and "Gender":

Over18 : Specifies if the worker is over 18 years old.

OverTime: Specifies if the worker is working overtime.

Gender: Specifies the worker gender.

In [10]:
data['Over18'] = data['Over18'].apply(lambda x: 1 if x == 'Yes' else 0)
data['OverTime'] = data['OverTime'].apply(lambda x: 1 if x == 'Yes' else 0)
data['Gender'] = data['Gender'].apply(lambda x: 1 if x == 'Female' else 0)

In [None]:
Let's apply the ".describe" method to find how workers who quit behave

In [13]:
data[data['Attrition'] == 1].describe()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,...,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0,237.0
mean,33.607595,1.0,750.362869,10.632911,2.839662,1.0,1010.345992,2.464135,0.367089,65.57384,...,2.599156,80.0,0.527426,8.244726,2.624473,2.658228,5.130802,2.902954,1.945148,2.852321
std,9.68935,0.0,401.899519,8.452525,1.008244,0.0,580.750572,1.169791,0.483031,20.099958,...,1.125437,0.0,0.856361,7.169204,1.254784,0.816453,5.949984,3.174827,3.153077,3.143349
min,18.0,1.0,103.0,1.0,1.0,1.0,1.0,1.0,0.0,31.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,28.0,1.0,408.0,3.0,2.0,1.0,514.0,1.0,0.0,50.0,...,2.0,80.0,0.0,3.0,2.0,2.0,1.0,0.0,0.0,0.0
50%,32.0,1.0,699.0,9.0,3.0,1.0,1017.0,3.0,0.0,66.0,...,3.0,80.0,0.0,7.0,2.0,3.0,3.0,2.0,1.0,2.0
75%,39.0,1.0,1092.0,17.0,4.0,1.0,1486.0,4.0,1.0,84.0,...,4.0,80.0,1.0,10.0,3.0,3.0,7.0,4.0,2.0,5.0
max,58.0,1.0,1496.0,29.0,5.0,1.0,2055.0,4.0,1.0,100.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,15.0,15.0,14.0


In [14]:
data[data['Attrition'] == 0].describe()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,...,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0,1233.0
mean,37.561233,0.0,812.504461,8.915653,2.927007,1.0,1027.656123,2.77129,0.406326,65.952149,...,2.733982,80.0,0.845093,11.862936,2.832928,2.781022,7.369019,4.484185,2.234388,4.367397
std,8.88836,0.0,403.208379,8.012633,1.027002,0.0,606.217074,1.071132,0.491346,20.380754,...,1.071603,0.0,0.841985,7.760719,1.293585,0.681907,6.096298,3.649402,3.234762,3.594116
min,18.0,0.0,102.0,1.0,1.0,1.0,2.0,1.0,0.0,30.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,31.0,0.0,477.0,2.0,2.0,1.0,483.0,2.0,0.0,48.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,0.0,817.0,7.0,3.0,1.0,1022.0,3.0,0.0,66.0,...,3.0,80.0,1.0,10.0,3.0,3.0,6.0,3.0,1.0,3.0
75%,43.0,0.0,1176.0,13.0,4.0,1.0,1574.0,4.0,1.0,83.0,...,4.0,80.0,1.0,16.0,3.0,3.0,10.0,7.0,3.0,7.0
max,60.0,0.0,1499.0,29.0,5.0,1.0,2068.0,4.0,1.0,100.0,...,4.0,80.0,3.0,38.0,6.0,4.0,37.0,18.0,15.0,17.0


In [None]:
A short analysis reveals the following key points:

For people who leave the company (on average):

. They are younger: 33 years
. They live further from their work: 11km
. Less satisfaction with the work environment: 2
. Lower level of work: 1
. Less satisfaction with work: 2
. Lower monthly salary: $ 4800.00
. Work more overtime: 0.5
. Less years in the company: 5
. Fewer years in current position: 2
. Fewer years with current manager: 2.8

**1.1 Relationship With the Boss**

In [19]:
fig = px.histogram(data, x="YearsWithCurrManager", color="Attrition", marginal="box")
fig.show() 

In [None]:
We can observe that employees who resign have less time with their manager than employees who keep their jobs.

**1.2 Bored and Unchallenged by the Work Itself**

In [20]:
job_satisfaction = data.groupby(["JobSatisfaction", "Attrition"]).agg(count_col=pd.NamedAgg(column="Attrition", aggfunc="count")).reset_index()
fig = px.histogram(job_satisfaction, x="JobSatisfaction", y = 'count_col' , color="Attrition")
fig.update_layout(barmode='group')
fig.show()

In [None]:
A high degree of attraction can be observed when job satisfaction is low, but also when the value it's high. 
This means that employees must leave the company for other reasons.

**1.3 Relationships With Coworkers**

In [22]:
fig = px.box(data, x = 'Attrition', y = 'JobSatisfaction', color = 'Attrition')
fig.update_layout(title = 'Relationships With Coworkers')
fig.show()

In [None]:
A majority of employees are grouped between quartile 1 and 2 which corresponds to a lower satisfaction rating with co-workers

**1.4 Salary and attrition**

In [23]:
fig = px.box(data, x = 'Attrition', y = 'MonthlyIncome', color = 'Attrition')
fig.update_layout(title = 'Relationships With Coworkers')
fig.show()

**1.5 Overtime and attrition**

In [24]:
job_satisfaction = data.groupby(['OverTime', 'Attrition']).agg(count_col=pd.NamedAgg(column="Attrition", aggfunc="count")).reset_index()
fig = px.histogram(job_satisfaction, x="OverTime", y = 'count_col', color="Attrition")
fig.update_layout(barmode='group')
fig.show()

**2. Feature Selection**

In [None]:
To Drop:
        EmployeeCount: All values have the same value.
        Over18: All values have the same value.
        StandartHours: All values have the same value.
        EmployeeNumber: Irrelevant variable, it is only an employee identifier.   
        Also drop DailyRate, HourlyRate and MonthlyRate
        
I decided to drop those variebles and keep only with "MonthlyIncome" that is the total salary.

In [27]:
data.drop(columns = ["EmployeeCount", "Over18", "StandardHours", "EmployeeNumber", "DailyRate", "HourlyRate", "MonthlyRate"], inplace = True)
data.shape

(1470, 28)

**2.1 Imput and Output variables (X & Y)**

In [29]:
# Create an object scaler
MMS = MinMaxScaler()
# get dummies
dummies = pd.get_dummies(data[data.columns.difference(["Attrition"])])
X = MMS.fit_transform(dummies)
y = data[['Attrition']].values.ravel()

In [31]:
# Split the data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 0, shuffle = True)

In [32]:
Counter(y_train)

Counter({0: 923, 1: 179})

**3. Modeling**

In [None]:
Can you predict who will leave the company?
To achieve this I used 2 models:

Logistic regresion
Random Forest

**3.1 Logistic Regression**

In [36]:
log_reg_model = LogisticRegression(max_iter=1000, solver = "newton-cg")
log_reg_model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, solver='newton-cg')

In [37]:
y_pred = log_reg_model.predict(X_test) 
print("Model Accuracy score: {}".format(accuracy_score(y_test, y_pred)))

Model Accuracy score: 0.8885869565217391


In [38]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94       310
           1       0.79      0.40      0.53        58

    accuracy                           0.89       368
   macro avg       0.84      0.69      0.73       368
weighted avg       0.88      0.89      0.87       368



In [None]:
We can see that the model predicts quite well the "none quite employees" (94% accuracy) 
but it doesn't predict as well the "quite employees" (53% accuracy).

**3.2 Random Forest Classifier**

In [39]:
random_forest_model = RandomForestClassifier(random_state = 0)
random_forest_model.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [40]:
y_pred = random_forest_model.predict(X_test)
print("Model Accuracy score: {}".format(accuracy_score(y_test,y_pred)))

Model Accuracy score: 0.8668478260869565


In [41]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93       310
           1       0.91      0.17      0.29        58

    accuracy                           0.87       368
   macro avg       0.89      0.58      0.61       368
weighted avg       0.87      0.87      0.83       368



In [None]:
Again, the model predicts quite well the "none quite employees" (92% accuracy)
but it have a poor prediction of "quite employees" (29% accuracy).

**3.3 SMOTE Data**

In [None]:
For the SMOTING technique I only followed one golden rule:

DON'T PUT SYNTHETIC DATA IN YOUR TEST DATA!!!

In [42]:
smt = SMOTE(random_state=0, sampling_strategy = 0.4)
X_train_SMOTE, y_train_SMOTE = smt.fit_sample(X_train, y_train)

In [43]:
Counter(y_train_SMOTE)

Counter({0: 923, 1: 369})

**3.3.1 Logistic Regression with SMOTE data**

In [44]:
log_reg_model = LogisticRegression(max_iter=1000, solver = "newton-cg")
log_reg_model.fit(X_train_SMOTE, y_train_SMOTE)

LogisticRegression(max_iter=1000, solver='newton-cg')

In [46]:
y_pred = log_reg_model.predict(X_test)
print("Model Accuracy Score: {}".format(accuracy_score(y_test, y_pred)))

Model Accuracy Score: 0.8885869565217391


In [47]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93       310
           1       0.65      0.62      0.64        58

    accuracy                           0.89       368
   macro avg       0.79      0.78      0.79       368
weighted avg       0.89      0.89      0.89       368



In [None]:
With the SMOTE technique it is possible to get a better precision in the attrition cases (62 %)

**3.3.2 Random Forest Classifier with SMOTE**

In [49]:
random_forest_model = RandomForestClassifier(random_state=0)
random_forest_model.fit(X_train_SMOTE, y_train_SMOTE)

RandomForestClassifier(random_state=0)

In [51]:
y_pred = random_forest_model.predict(X_test)
print("Model Accuracy Score: {}".format(accuracy_score(y_test, y_pred)))

Model Accuracy Score: 0.8777173913043478


In [52]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93       310
           1       0.84      0.28      0.42        58

    accuracy                           0.88       368
   macro avg       0.86      0.63      0.67       368
weighted avg       0.87      0.88      0.85       368



In [None]:
In case of RFC we have a better prediction of the "quite cases" but it doesn't better that logistic regresion.

In [None]:
Logistic regression proved to be a good tool to classify and predict which employees will not quit, however, the unbalance
of the data set does not help to predict which employees will quit. To compensate for this, the SMOTE technique was used
to generate synthetic data to compensate for the lack data from employees who quit.

I recommend using this tool carefully because it generates synthetic data around a cluster, which is not always good.