# Employee Turnover Analysis

 <h3>Loading Libraries and Data</h3>

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
import plotly.graph_objs as go
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE 

In [2]:
df=pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [4]:
df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

## Data Visualization

In [5]:
age_att=df.groupby(["Age","Attrition"]).apply(lambda x:x['DailyRate'].count()).reset_index(name='Counts')
px.line(age_att,x="Age",y="Counts",color='Attrition',title="Agewise Counts of People in an Organization")

In [6]:
rate_att=df.groupby(['MonthlyIncome','Attrition']).apply(lambda x:x['MonthlyIncome'].count()).reset_index(name='Counts')
rate_att['MonthlyIncome']=round(rate_att['MonthlyIncome'],-3)
rate_att=rate_att.groupby(['MonthlyIncome','Attrition']).apply(lambda x:x['MonthlyIncome'].count()).reset_index(name='Counts')
px.line(rate_att,x="MonthlyIncome",y="Counts",color="Attrition",title='Monthly Income basis counts of People in an Organization')

In [7]:
dept_att=df.groupby(['Department','Attrition']).apply(lambda x:x['DailyRate'].count()).reset_index(name='Counts')
fig=px.bar(dept_att,x='Department',y='Counts',color='Attrition',title='Department wise Counts of People in an Organization')
fig.show()

In [9]:
sats_att=df.groupby(['EnvironmentSatisfaction',"Attrition"]).apply(lambda x:x["DailyRate"].count()).reset_index(name="Counts")
px.area(sats_att,x='EnvironmentSatisfaction',y='Counts',color='Attrition',title='Environment Satisfaction level Counts of People in an Organization')

In [41]:
stock_att=df.groupby(["StockOptionLevel","Attrition"]).apply(lambda x:x["DailyRate"].count()).reset_index(name="Counts")
px.bar(stock_att,x="StockOptionLevel",y="Counts",color="Attrition",title="Stock facilities level wise People in an Organization")

In [44]:
yrscr_att=df.groupby(['YearsInCurrentRole','Attrition']).apply(lambda x:x['DailyRate'].count()).reset_index(name='Counts')
px.line(yrscr_att,x='YearsInCurrentRole',y='Counts',color='Attrition',title='Counts of People working for years in an Organization')

## Label Encoder

In [3]:
label=LabelEncoder()

df_label = df.apply(lambda x: label.fit_transform(x) if x.dtype == "object" else x)


### Split The Data and Normlization


In [16]:
X=df_label.drop('Attrition',axis=1)
y=df_label["Attrition"]
# smote = SMOTE()
# X,y = smote.fit_resample(X,y)


In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state= 0)
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
stay = (y_train.value_counts()[0] / y_train.shape)[0]
leave = (y_train.value_counts()[1] / y_train.shape)[0]

print("===============TRAIN=================")
print(f"Staying Rate: {stay * 100:.2f}%")
print(f"Leaving Rate: {leave * 100 :.2f}%")

stay = (y_test.value_counts()[0] / y_test.shape)[0]
leave = (y_test.value_counts()[1] / y_test.shape)[0]

print("===============TEST=================")
print(f"Staying Rate: {stay * 100:.2f}%")
print(f"Leaving Rate: {leave * 100 :.2f}%")

Staying Rate: 83.77%
Leaving Rate: 16.23%
Staying Rate: 84.13%
Leaving Rate: 15.87%


## Model Prediction

In [19]:
logistic=LogisticRegression()
logistic.fit(X_train,y_train)
y_train_pred=logistic.predict(X_train)
y_test_pred=logistic.predict(X_test)

print('Accuracy Score on train data: ', metrics.accuracy_score(y_train, y_train_pred))
print('Accuracy Score on test data: ', metrics.accuracy_score(y_test, y_test_pred))

Accuracy Score on train data:  0.8756073858114675
Accuracy Score on test data:  0.8820861678004536


In [20]:
xgb=XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    seed=42,
    max_depth=4,
    n_estimators=100,
    learning_rate=0.1
    
)
xgb.fit(X_train,y_train)
y_train_pred=xgb.predict(X_train)
y_test_pred=xgb.predict(X_test)

print('Accuracy Score on train data: ', metrics.accuracy_score(y_train, y_train_pred))
print('Accuracy Score on test data: ', metrics.accuracy_score(y_test, y_test_pred))

Accuracy Score on train data:  0.9786200194363459
Accuracy Score on test data:  0.873015873015873


In [21]:
svm=SVC()
svm.fit(X_train,y_train)
y_train_pred=svm.predict(X_train)
y_test_pred=svm.predict(X_test)

print('Accuracy Score on train data: ', metrics.accuracy_score(y_train, y_train_pred))
print('Accuracy Score on test data: ', metrics.accuracy_score(y_test, y_test_pred))

Accuracy Score on train data:  0.8960155490767736
Accuracy Score on test data:  0.8571428571428571


In [22]:
gbc=GradientBoostingClassifier()
gbc.fit(X_train,y_train)
y_train_pred=gbc.predict(X_train)
y_test_pred=gbc.predict(X_test)

print('Accuracy Score on train data: ', metrics.accuracy_score(y_train, y_train_pred))
print('Accuracy Score on test data: ', metrics.accuracy_score(y_test, y_test_pred))

Accuracy Score on train data:  0.966958211856171
Accuracy Score on test data:  0.8752834467120182


In [23]:

rf=RandomForestClassifier( )
rf.fit(X_train,y_train)
y_train_pred=gbc.predict(X_train)
y_test_pred=gbc.predict(X_test)

print('Accuracy Score on train data: ', metrics.accuracy_score(y_train, y_train_pred))
print('Accuracy Score on test data: ', metrics.accuracy_score(y_test, y_test_pred))

Accuracy Score on train data:  0.966958211856171
Accuracy Score on test data:  0.8752834467120182


In [24]:
abc=AdaBoostClassifier()
abc.fit(X_train,y_train)
y_train_pred=abc.predict(X_train)
y_test_pred=abc.predict(X_test)

print('Accuracy Score on train data: ', metrics.accuracy_score(y_train, y_train_pred))
print('Accuracy Score on test data: ', metrics.accuracy_score(y_test, y_test_pred))

Accuracy Score on train data:  0.9135082604470359
Accuracy Score on test data:  0.8707482993197279


In [25]:
dtc=DecisionTreeClassifier()
dtc.fit(X_train,y_train)
y_train_pred=abc.predict(X_train)
y_test_pred=abc.predict(X_test)

print('Accuracy Score on train data: ', metrics.accuracy_score(y_train, y_train_pred))
print('Accuracy Score on test data: ', metrics.accuracy_score(y_test, y_test_pred))

Accuracy Score on train data:  0.9135082604470359
Accuracy Score on test data:  0.8707482993197279
