# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter



# Show every column when a DataFrame is displayed: None removes the column limit,
# whereas 0 only asks pandas to auto-detect a terminal width.
pd.set_option('display.max_columns', None)

ModuleNotFoundError: No module named 'plotly'

# Importing Dataset

In [None]:
# Read the Excel workbook "Hr_data.xls" (path relative to the working directory) into `df`
df=pd.read_excel("Hr_data.xls")

In [None]:
# Preview the first six rows of the loaded data
df.head(6)

In [None]:
# Show the column labels as a plain array
df.columns.values

# Interchange the Last Column of Dataset

In [None]:
# Copy the column labels into a mutable list so their order can be rearranged
column_interchange = list(df.columns)

In [None]:
# Display the column order before the swap
column_interchange

In [None]:
# Swap the last two column names in the list.
# NOTE(review): re-running this cell swaps them back, so the cell is not idempotent.
column_interchange[-1], column_interchange[-2] = column_interchange[-2], column_interchange[-1]

In [None]:
# Display the column order after the swap
column_interchange

In [None]:
# Rebuild the frame with its columns in the rearranged order, then display it
df = df.loc[:, column_interchange]
df

In [None]:
# Get the dimensions of the frame as (number of rows, number of columns)
df.shape

In [None]:
# List the data type pandas assigned to each column
df.dtypes

In [None]:
# Count the missing (NaN / None) entries in each column
missing_per_column = df.isnull().sum()
missing_per_column

In [None]:
# True if at least one cell anywhere in the frame is missing
missing_mask = df.isna()
missing_mask.to_numpy().any()

# Drop Emp Number Column

In [None]:
# Remove the 'EmpNumber' column. Rebinding `df` (instead of dropping in place) together
# with errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
df = df.drop(columns='EmpNumber', errors='ignore')

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes)
df.info()

In [None]:
# View summary statistics for the columns of the frame
df.describe()

In [None]:
# Encode the target: 1 where Attrition is 'Yes', 0 for every other value.
# The dtype guard keeps the cell idempotent; without it, re-running the cell on the
# already-encoded column would compare integers with 'Yes' and turn every value into 0.
if not pd.api.types.is_numeric_dtype(df['Attrition']):
    df['Attrition'] = (df['Attrition'] == 'Yes').astype(int)

In [None]:
# Binary-encode OverTime (1 for 'Yes') and Gender (1 for 'Female'); every other value
# becomes 0. Columns that are already numeric are skipped so that re-running the cell
# does not overwrite the encoded values with zeros.
for column, positive_label in (('OverTime', 'Yes'), ('Gender', 'Female')):
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = (df[column] == positive_label).astype(int)

# Get the Count of Employee Attrition (Stayed & Left the Company)

In [None]:
# 0 - Employees Stayed in the Company (every value other than 'Yes' was encoded as 0)
# 1 - Employees Left the Company (originally 'Yes')
df['Attrition'].value_counts()

In [None]:
# Summary statistics restricted to the rows whose Attrition value is 0
stayed_mask = df['Attrition'].eq(0)
df.loc[stayed_mask].describe()

In [None]:
# Summary statistics restricted to the rows whose Attrition value is 1
left_mask = df['Attrition'].eq(1)
df.loc[left_mask].describe()

In [None]:
# Re-check the (rows, columns) dimensions of the frame at this point
df.shape

# Histogram and Box Plot Attrition

# Visualize the Employees that Stay and Left the Company

In [None]:
# Bar chart of how many employees fall in each Attrition class.
# The variable is passed by keyword: recent seaborn releases no longer accept the
# plotted vector as a positional argument (the first positional slot is `data`).
sns.countplot(x='Attrition', data=df)

In [None]:
# Baseline: the accuracy obtained by always guessing "No" (0) for Attrition, i.e. the
# share of rows whose Attrition is 0. Computed from the data instead of hard-coded
# counts so it stays correct if the dataset changes.
(df['Attrition'] == 0).mean()

# Histogram

In [None]:
# Histogram of YearsWithCurrManager, coloured by Attrition, with a marginal box plot
manager_hist_options = dict(x="YearsWithCurrManager", color="Attrition", marginal="box")
fig = px.histogram(df, **manager_hist_options)
fig.show()

# Plot the number of Employees that left and Stayed in the Company by Age wise

In [None]:
# Count of employees per age, split by Attrition, drawn on a wide 16x6 figure
age_fig, age_ax = plt.subplots(figsize=(16, 6))
sns.countplot(data=df, x='Age', hue='Attrition', palette='colorblind', ax=age_ax)

Findings :

1) Employees aged 29 and 31 leave the company most often
2) Employees aged 34, 35 and 36 mostly stay with the company

In [None]:
# List the columns currently present in the frame
df.columns

# Bar Plot for Employee Job Satisfaction vs Attrition

In [None]:
# Count the rows for every (EmpJobSatisfaction, Attrition) combination
satisfaction_groups = df.groupby(["EmpJobSatisfaction", "Attrition"])
job_satisfaction = satisfaction_groups.agg(count_col=("Attrition", "count")).reset_index()

# Grouped bars: one bar per Attrition class at each satisfaction level
fig = px.histogram(job_satisfaction, x="EmpJobSatisfaction", y="count_col", color="Attrition")
fig.update_layout(barmode="group")
fig.show()

A high degree of attrition can be observed when job satisfaction is low, but also when it is high. This suggests that employees also leave the company for reasons other than job satisfaction.

# Relationships With Coworkers

In [None]:
# Box plot of EmpJobSatisfaction for each Attrition class.
# NOTE(review): the title refers to relationships with coworkers, but the plotted
# column is EmpJobSatisfaction - confirm which column was intended.
box_options = dict(x='Attrition', y='EmpJobSatisfaction', color='Attrition')
fig = px.box(df, **box_options)
fig.update_layout(title_text='Relationships With Coworkers')
fig.show()

A majority of employees are grouped between quartile 1 and 2 which corresponds to a lower satisfaction rating with co-workers

In [None]:
# Preview the first three rows of the current frame
df.head(3)

# Overtime and attrition

In [None]:
# Count the rows for every (OverTime, Attrition) combination. The result gets its own
# name instead of overwriting `job_satisfaction`, which holds the job-satisfaction counts.
overtime_counts = (
    df.groupby(["OverTime", "Attrition"])
    .agg(count_col=pd.NamedAgg(column="Attrition", aggfunc="count"))
    .reset_index()
)

# Grouped bars: one bar per Attrition class for each OverTime value
fig = px.histogram(overtime_counts, x="OverTime", y='count_col', color="Attrition")
fig.update_layout(barmode='group')
fig.show()

It is a fact that the number of employees who quit is higher when the employee works overtime

In [None]:
# Remove the 'EmpHourlyRate' column. Rebinding `df` with errors="ignore" (instead of an
# in-place drop) lets the cell be re-run without raising a KeyError once the column is gone.
df = df.drop(columns=["EmpHourlyRate"], errors="ignore")
df.shape

In [None]:
# Show the first row to confirm the remaining columns
df.head(1)

# Input and Output variables (X & Y)

In [None]:
# Min-max scaler used to rescale the encoded feature columns
MMS = MinMaxScaler()

# Every column except the target is a feature; one-hot encode the non-numeric ones
feature_columns = df.columns.difference(["Attrition"])
dummies = pd.get_dummies(df[feature_columns])

# Feature matrix: the scaler is fitted on, and applied to, every row of `dummies`,
# including rows that later end up in the test split
X = MMS.fit_transform(dummies)

# Target as a flat 1-D array
y = df["Attrition"].to_numpy()

In [None]:
# Split the encoded but still unscaled features first, so the scaler never sees the
# test rows. The same random_state and row count give the same train/test partition
# as splitting the pre-scaled matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    dummies, y, test_size=0.20, random_state=0, shuffle=True
)

# Fit the min-max scaler on the training rows only and reuse that fitted transform for
# the test rows; fitting on the full dataset leaks the test set's minima and maxima
# into training.
X_train = MMS.fit_transform(X_train_raw)
X_test = MMS.transform(X_test_raw)

In [None]:
# Class counts of the training target (shows the imbalance between 0 and 1)
Counter(y_train)

# Modeling

# Logistic regression

In [None]:
# Fit a logistic regression (newton-cg solver, up to 1000 iterations) on the training
# split; `fit` returns the estimator itself, which is then displayed.
log_reg_model = LogisticRegression(solver="newton-cg", max_iter=1000).fit(X_train, y_train)
log_reg_model

In [None]:
# Predict the held-out rows and report the share of correct predictions
y_pred = log_reg_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Model accruracy score: {test_accuracy}")

In [None]:
# Print the per-class classification report for the latest predictions in `y_pred`
print(classification_report(y_test, y_pred))

We can see that the model predicts the employees who did not quit quite well (93% accuracy), but it does not predict the employees who quit nearly as well (24% accuracy).

# Random Forest Classifier

In [None]:
# Fit a random forest with a fixed seed on the training split; `fit` returns the
# estimator itself, which is then displayed.
random_forest_model = RandomForestClassifier(random_state=0).fit(X_train, y_train)
random_forest_model

In [None]:
# Predict the held-out rows with the random forest and report the accuracy
y_pred = random_forest_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Model accruracy score: {test_accuracy}")

In [None]:
# Print the per-class classification report for the latest predictions in `y_pred`
print(classification_report(y_test, y_pred))

Again, the model predicts the employees who did not quit quite well (93% accuracy), but it has poor predictions for the employees who quit (0.6% accuracy).

# SMOTE Data

In [None]:
# Oversample the minority class of the training split with SMOTE. A float
# sampling_strategy is the desired minority/majority ratio after resampling.
smt = SMOTE(random_state=0, sampling_strategy=0.4)
# `fit_resample` is the current imbalanced-learn API; the older `fit_sample` alias
# has been removed and raises AttributeError on recent releases.
X_train_SMOTE, y_train_SMOTE = smt.fit_resample(X_train, y_train)

In [None]:
# Class counts of the training target after SMOTE resampling
Counter(y_train_SMOTE) #new shape of the target

# Logistic regression with SMOTE data

In [None]:
# Refit the logistic regression, this time on the SMOTE-resampled training data;
# `fit` returns the estimator itself, which is then displayed.
log_reg_model = LogisticRegression(solver="newton-cg", max_iter=1000).fit(
    X_train_SMOTE, y_train_SMOTE
)
log_reg_model

In [None]:
# Predict the held-out rows with the SMOTE-trained model and report the accuracy
y_pred = log_reg_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Model accruracy score: {test_accuracy}")

In [None]:
# Print the per-class classification report for the latest predictions in `y_pred`
print(classification_report(y_test, y_pred))

With the SMOTE technique it is possible to get a better precision in the attrition cases.

# Random Forest Classifier with SMOTE

In [None]:
# Refit the random forest, this time on the SMOTE-resampled training data;
# `fit` returns the estimator itself, which is then displayed.
random_forest_model = RandomForestClassifier(random_state=0).fit(
    X_train_SMOTE, y_train_SMOTE
)
random_forest_model

In [None]:
# Predict the held-out rows with the SMOTE-trained forest and report the accuracy
y_pred = random_forest_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Model accruracy score: {test_accuracy}")

In [None]:
# Print the per-class classification report for the latest predictions in `y_pred`
print(classification_report(y_test, y_pred))

# About the model


Logistic regression proved to be a good tool to classify and predict which employees will not quit; however, the imbalance of the data set does not help to predict which employees will quit. To compensate for this, the SMOTE technique was used to generate synthetic data that makes up for the lack of data from employees who quit.