In [14]:
#Employee Attrition: Exploratory Data Analysis (EDA)

# The dataset collected contains the following fields that describe employee attrition for a company:

# satisfaction_level: value between 0 - 1 that describes how satisfied the employee was at their current role
# last_evaluation: value between 0 - 1 that describes how well the employee performed
# number_project: number of projects the employee was on
# average_montly_hours: average number of hours worked monthly time_spend_company: number of years at the company
# work_accident: boolean for if the employee has been in an accident at work
# left: boolean that describes if the employee has left the company
# role: employee's role
# salary: low, medium, high --> indicates how high employee salary was
    
#Can we describe which factors are the strongest indicator if an employee is going to attrit?


In [None]:
#Import the libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
from scipy import stats

In [3]:
#I decided to import the dataset (https://docs.google.com/spreadsheets/d/1lJL6IYuQqBV8xTdc5IZPjz_pKRluSUeHuKUOJss7uB8/edit#gid=1240005)

df= pd.read_csv('employee_attrition.csv')

In [6]:
#Exploring the data set further to see the missing values and proportion

df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,role,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [7]:
df.isnull().any()

df.isnull().sum()/df.shape[0]

satisfaction_level       0.0
last_evaluation          0.0
number_project           0.0
average_montly_hours     0.0
time_spend_company       0.0
Work_accident            0.0
left                     0.0
promotion_last_5years    0.0
role                     0.0
salary                   0.0
dtype: float64

In [10]:
#To evaluate the data types of the variables indicated

df.dtypes

satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
left                       int64
promotion_last_5years      int64
role                      object
salary                    object
dtype: object

In [11]:

# Next we need to convert role and salary to an integer value
# We'll use pd.Categorical to convert these to integer values

# Sets up the category codes for the given columns values
df.role = pd.Categorical(df.role)
df.salary = pd.Categorical(df.salary)

# Setting the categories as codes in the data frame
df['role'] = df.role.cat.codes
df['salary'] = df.salary.cat.codes


In [12]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,role,salary
0,0.38,0.53,2,157,3,0,1,0,7,1
1,0.8,0.86,5,262,6,0,1,0,7,2
2,0.11,0.88,7,272,4,0,1,0,7,2
3,0.72,0.87,5,223,5,0,1,0,7,1
4,0.37,0.52,2,159,3,0,1,0,7,1


In [15]:
df.dtypes


satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
left                       int64
promotion_last_5years      int64
role                        int8
salary                      int8
dtype: object

In [16]:
# List of columns that need to be converted to integers
columns = ['satisfaction_level', 'last_evaluation']

for column_name in columns:
    df[column_name] = pd.to_numeric(df[column_name], downcast='integer')

In [22]:
# iterating through the columns and printing the pearson r and the p value

for column_name, column in df.transpose().iterrows():
    print(column_name)
    print(stats.pearsonr(df[column_name],df['left']))
    print("\n")

satisfaction_level
(-0.3883749834241129, 0.0)


last_evaluation
(0.006567120447534072, 0.4212701963736362)


number_project
(0.023787185071773645, 0.003575213870937918)


average_montly_hours
(0.07128717878330057, 2.311303556750929e-18)


time_spend_company
(0.14482217493938518, 4.2076804576977094e-71)


Work_accident
(-0.15462163370512924, 6.613049400540801e-81)


left
(0.99999999999998, 0.0)


promotion_last_5years
(-0.061788106579200364, 3.624047224141857e-14)


role
(0.032105293633677874, 8.402000854899883e-05)


salary
(-0.001293716832934065, 0.8741188539785435)




In [None]:
#Stack-ranking both Pearson R and p-value, we rank attrition in a decreasing order of factor significance as:

#satisfaction_level>>Work_accident>>time_spend_company>>average_montly_hours>promotion_last_5years
#>>role>>number_project>last_evaluation>>salary

# Pearson's R tells us how linearly correlated two variables are, where as p-value tells use how significant our Pearson's R is.
# The p-value ranges from 0 - 1.0, with 0.00 being highly significant and 1.0 being not significant. 
# The p-value takes into account both the strength of the correlation, R, as well as the number of samples.