In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load Dataset

df = pd.read_csv('dataset_ass7.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [3]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4410 non-null   int64  
 1   Attrition                4410 non-null   object 
 2   BusinessTravel           4410 non-null   object 
 3   Department               4410 non-null   object 
 4   DistanceFromHome         4410 non-null   int64  
 5   Education                4410 non-null   int64  
 6   EducationField           4410 non-null   object 
 7   EmployeeCount            4410 non-null   int64  
 8   EmployeeID               4410 non-null   int64  
 9   Gender                   4410 non-null   object 
 10  JobLevel                 4410 non-null   int64  
 11  JobRole                  4410 non-null   object 
 12  MaritalStatus            4410 non-null   object 
 13  MonthlyIncome            4410 non-null   int64  
 14  NumCompaniesWorked      

In [5]:
df.isnull().any()

Age                        False
Attrition                  False
BusinessTravel             False
Department                 False
DistanceFromHome           False
Education                  False
EducationField             False
EmployeeCount              False
EmployeeID                 False
Gender                     False
JobLevel                   False
JobRole                    False
MaritalStatus              False
MonthlyIncome              False
NumCompaniesWorked          True
Over18                     False
PercentSalaryHike          False
StandardHours              False
StockOptionLevel           False
TotalWorkingYears           True
TrainingTimesLastYear      False
YearsAtCompany             False
YearsSinceLastPromotion    False
YearsWithCurrManager       False
dtype: bool

In [6]:
df.fillna(0,inplace =True)

In [7]:
df.isnull().any() # no null values

Age                        False
Attrition                  False
BusinessTravel             False
Department                 False
DistanceFromHome           False
Education                  False
EducationField             False
EmployeeCount              False
EmployeeID                 False
Gender                     False
JobLevel                   False
JobRole                    False
MaritalStatus              False
MonthlyIncome              False
NumCompaniesWorked         False
Over18                     False
PercentSalaryHike          False
StandardHours              False
StockOptionLevel           False
TotalWorkingYears          False
TrainingTimesLastYear      False
YearsAtCompany             False
YearsSinceLastPromotion    False
YearsWithCurrManager       False
dtype: bool

In [8]:
df.drop(['EmployeeCount','EmployeeID','StandardHours', 'Over18'],axis=1,inplace=True) # removing useless features

In [9]:
df.head() # unnecessary features removed

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,Female,1,Healthcare Representative,Married,131160,1.0,11,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,Female,1,Research Scientist,Single,41890,0.0,23,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,Male,4,Sales Executive,Married,193280,1.0,15,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,Male,3,Human Resources,Married,83210,3.0,11,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,Male,1,Sales Executive,Single,23420,4.0,12,2,9.0,2,6,0,4


In [11]:
use_cols = df[['Age','Attrition','BusinessTravel','DistanceFromHome','Education', 'EducationField','Gender', 'JobLevel', 'JobRole',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager']]

In [12]:
#Convert all the Categorical data into numerical data
# get unique values from each categorical feature
print(df['BusinessTravel'].unique())
print(df['EducationField'].unique())
print(df['Gender'].unique())
print(df['Department'].unique())
print(df['JobRole'].unique())
print(df['MaritalStatus'].unique())

['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
['Female' 'Male']
['Sales' 'Research & Development' 'Human Resources']
['Healthcare Representative' 'Research Scientist' 'Sales Executive'
 'Human Resources' 'Research Director' 'Laboratory Technician'
 'Manufacturing Director' 'Sales Representative' 'Manager']
['Married' 'Single' 'Divorced']


In [13]:
# sklearn used to convert data to numerical
from sklearn.preprocessing import LabelEncoder
cat_x = LabelEncoder()

df['BusinessTravel'] = cat_x.fit_transform(df['BusinessTravel'])
df['Department'] = cat_x.fit_transform(df['Department'])
df['EducationField'] = cat_x.fit_transform(df['EducationField'])
df['Gender'] = cat_x.fit_transform(df['Gender'])
df['JobRole'] = cat_x.fit_transform(df['JobRole'])
df['MaritalStatus'] = cat_x.fit_transform(df['MaritalStatus'])

In [14]:
# Attrition is dependent feature so put on y axis
from sklearn.preprocessing import LabelEncoder
cat_y = LabelEncoder()
df['Attrition'] = cat_y.fit_transform(df['Attrition'])

In [15]:
# checking categorical values again it's changed to numerical
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,0,2,2,6,2,1,0,1,0,1,131160,1.0,11,0,1.0,6,1,0,0
1,31,1,1,1,10,1,1,0,1,6,2,41890,0.0,23,1,6.0,3,5,1,4
2,32,0,1,1,17,4,4,1,4,7,1,193280,1.0,15,3,5.0,2,5,0,3
3,38,0,0,1,2,5,1,1,3,1,1,83210,3.0,11,3,13.0,5,8,7,5
4,32,0,2,1,10,1,3,1,1,7,2,23420,4.0,12,2,9.0,2,6,0,4


In [17]:
# attrition = Yes and no seprate
att_yes = df[df['Attrition']== 1]
att_no = df[df['Attrition']== 0]

# For this dataset we have 1 dependent categorical variable so we can perform 2 tests 
## 1. ManWhitney
## 2. Chi Square

In [48]:
# defining function for ManWhitney tests
def manwhitney(stats, p, b):
    print('\nH0 = There is no significant difference between Attrition_yes with',b, 'and Attrition_No with', b)
    print('H1 = There is significant difference between Attrition_yes with',b, 'and Attrition_No with', b, '\n')
    print(stats, 'P Value:', p,'\n')
    if p < 0.05:
        print('P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis')
    else:
        print("P-Value >= 0.05 hence H0 Accepted")
    print('-----------------------------------------------------------------------------------------------------------------')

In [49]:
from scipy.stats import mannwhitneyu
stats, p = mannwhitneyu(att_yes.Age, att_no.Age)
manwhitney(stats, p, 'Age')


H0 = There is no significant difference between Attrition_yes with Age and Attrition_No with Age
H1 = There is significant difference between Attrition_yes with Age and Attrition_No with Age 

961731.0 P Value: 2.9951588479067175e-30 

P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis
-----------------------------------------------------------------------------------------------------------------


In [50]:
from scipy.stats import mannwhitneyu
stats, p = mannwhitneyu(att_yes.DistanceFromHome, att_no.DistanceFromHome)
manwhitney(stats, p, 'DistanceFromHome')


H0 = There is no significant difference between Attrition_yes with DistanceFromHome and Attrition_No with DistanceFromHome
H1 = There is significant difference between Attrition_yes with DistanceFromHome and Attrition_No with DistanceFromHome 

1312110.0 P Value: 0.4629185205822659 

P-Value >= 0.05 hence H0 Accepted
-----------------------------------------------------------------------------------------------------------------


In [51]:
from scipy.stats import mannwhitneyu
stats, p = mannwhitneyu(att_yes.Education, att_no.Education)
manwhitney(stats, p, 'Education')


H0 = There is no significant difference between Attrition_yes with Education and Attrition_No with Education
H1 = There is significant difference between Attrition_yes with Education and Attrition_No with Education 

1280146.5 P Value: 0.12035477215449608 

P-Value >= 0.05 hence H0 Accepted
-----------------------------------------------------------------------------------------------------------------


In [52]:
from scipy.stats import mannwhitneyu
stats, p = mannwhitneyu(att_yes.MonthlyIncome, att_no.MonthlyIncome)
manwhitney(stats, p, 'MonthlyIncome')


H0 = There is no significant difference between Attrition_yes with MonthlyIncome and Attrition_No with MonthlyIncome
H1 = There is significant difference between Attrition_yes with MonthlyIncome and Attrition_No with MonthlyIncome 

1264900.5 P Value: 0.053577283839938566 

P-Value >= 0.05 hence H0 Accepted
-----------------------------------------------------------------------------------------------------------------


In [53]:
from scipy.stats import mannwhitneyu
stats, p = mannwhitneyu(att_yes.NumCompaniesWorked, att_no.NumCompaniesWorked)
manwhitney(stats, p, 'NumCompaniesWorked')


H0 = There is no significant difference between Attrition_yes with NumCompaniesWorked and Attrition_No with NumCompaniesWorked
H1 = There is significant difference between Attrition_yes with NumCompaniesWorked and Attrition_No with NumCompaniesWorked 

1259144.0 P Value: 0.03266173775282211 

P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis
-----------------------------------------------------------------------------------------------------------------


In [54]:
from scipy.stats import mannwhitneyu
stats, p = mannwhitneyu(att_yes.PercentSalaryHike, att_no.PercentSalaryHike)
manwhitney(stats, p, 'PercentSalaryHike')


H0 = There is no significant difference between Attrition_yes with PercentSalaryHike and Attrition_No with PercentSalaryHike
H1 = There is significant difference between Attrition_yes with PercentSalaryHike and Attrition_No with PercentSalaryHike 

1250640.0 P Value: 0.018660129917539733 

P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis
-----------------------------------------------------------------------------------------------------------------


In [55]:
from scipy.stats import mannwhitneyu
stats, p = mannwhitneyu(att_yes.TotalWorkingYears, att_no.TotalWorkingYears)
manwhitney(stats, p, 'TotalWorkingYears')


H0 = There is no significant difference between Attrition_yes with TotalWorkingYears and Attrition_No with TotalWorkingYears
H1 = There is significant difference between Attrition_yes with TotalWorkingYears and Attrition_No with TotalWorkingYears 

907502.5 P Value: 1.0203529765342384e-39 

P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis
-----------------------------------------------------------------------------------------------------------------


In [56]:
from scipy.stats import mannwhitneyu
stats, p = mannwhitneyu(att_yes.TrainingTimesLastYear, att_no.TrainingTimesLastYear)
manwhitney(stats, p, 'TrainingTimesLastYear')


H0 = There is no significant difference between Attrition_yes with TrainingTimesLastYear and Attrition_No with TrainingTimesLastYear
H1 = There is significant difference between Attrition_yes with TrainingTimesLastYear and Attrition_No with TrainingTimesLastYear 

1238940.0 P Value: 0.005167954938699059 

P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis
-----------------------------------------------------------------------------------------------------------------


In [57]:
from scipy.stats import mannwhitneyu
stats, p = mannwhitneyu(att_yes.YearsAtCompany, att_no.YearsAtCompany)
manwhitney(stats, p, 'YearsAtCompany')


H0 = There is no significant difference between Attrition_yes with YearsAtCompany and Attrition_No with YearsAtCompany
H1 = There is significant difference between Attrition_yes with YearsAtCompany and Attrition_No with YearsAtCompany 

923238.0 P Value: 6.047598261692858e-37 

P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis
-----------------------------------------------------------------------------------------------------------------


In [58]:
from scipy.stats import mannwhitneyu
stats, p = mannwhitneyu(att_yes.YearsWithCurrManager, att_no.YearsWithCurrManager)
manwhitney(stats, p, 'YearsWithCurrManager')


H0 = There is no significant difference between Attrition_yes with YearsWithCurrManager and Attrition_No with YearsWithCurrManager
H1 = There is significant difference between Attrition_yes with YearsWithCurrManager and Attrition_No with YearsWithCurrManager 

957253.5 P Value: 1.2365483142169853e-31 

P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis
-----------------------------------------------------------------------------------------------------------------


In [60]:
# defining function for CHI Square tests
def chi2(stats, p, b):
    print('\nHo= There is no dependency betweem Attrition and', b)
    print('H1= There is dependency betweem Attrition and', b, '\n')
    print(chitable,'\n')
    print(stats, 'P Value:', p,'\n')
    if p < 0.05:
        print('P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis')
    else:
        print("P-Value >= 0.05 hence H0 Accepted")
    print('-----------------------------------------------------------------------------------------------------------------')

In [61]:
# CHI Square testcan be perform with Categorical variable only
# Categorical variables : BusinessTravel , EducationField , Gender ,  Department , JobRole , MaritalStatus , JobLevel , StockOptionLevel
from scipy.stats import chi2_contingency

In [62]:
chitable = pd.crosstab(df.Attrition, df.BusinessTravel)
stats, p, dof, expected = chi2_contingency(chitable)
chi2(stats, p, 'BusinessTravel')


Ho= There is no dependency betweem Attrition and BusinessTravel
H1= There is dependency betweem Attrition and BusinessTravel 

BusinessTravel    0    1     2
Attrition                     
0               414  624  2661
1                36  207   468 

72.54724105696552 P Value: 1.764276972983189e-16 

P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis
-----------------------------------------------------------------------------------------------------------------


In [69]:
chitable = pd.crosstab(df.Attrition, df.EducationField)
stats, p, dof, expected = chi2_contingency(chitable)
chi2(stats, p, 'EducationField')


Ho= There is no dependency betweem Attrition and EducationField
H1= There is dependency betweem Attrition and EducationField 

EducationField   0     1    2     3    4    5
Attrition                                    
0               48  1515  402  1167  216  351
1               33   303   75   225   30   45 

46.194921001730584 P Value: 8.288917469574179e-09 

P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis
-----------------------------------------------------------------------------------------------------------------


In [70]:
chitable = pd.crosstab(df.Attrition, df.Gender)
stats, p, dof, expected = chi2_contingency(chitable)
chi2(stats, p, 'Gender')


Ho= There is no dependency betweem Attrition and Gender
H1= There is dependency betweem Attrition and Gender 

Gender        0     1
Attrition            
0          1494  2205
1           270   441 

1.349904410246582 P Value: 0.24529482862926827 

P-Value >= 0.05 hence H0 Accepted
-----------------------------------------------------------------------------------------------------------------


In [71]:
chitable = pd.crosstab(df.Attrition, df.Department)
stats, p, dof, expected = chi2_contingency(chitable)
chi2(stats, p, 'Department')


Ho= There is no dependency betweem Attrition and Department
H1= There is dependency betweem Attrition and Department 

Department    0     1     2
Attrition                  
0           132  2430  1137
1            57   453   201 

29.090274924488266 P Value: 4.820888218170406e-07 

P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis
-----------------------------------------------------------------------------------------------------------------


In [72]:
chitable = pd.crosstab(df.Attrition, df.JobRole)
stats, p, dof, expected = chi2_contingency(chitable)
chi2(stats, p, 'JobRole')


Ho= There is no dependency betweem Attrition and JobRole
H1= There is dependency betweem Attrition and JobRole 

JobRole      0    1    2    3    4    5    6    7    8
Attrition                                             
0          336  135  651  264  387  183  717  813  213
1           57   21  126   42   48   57  159  165   36 

25.116313674604072 P Value: 0.001485544744815264 

P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis
-----------------------------------------------------------------------------------------------------------------


In [73]:
chitable = pd.crosstab(df.Attrition, df.MaritalStatus)
stats, p, dof, expected = chi2_contingency(chitable)
chi2(stats, p, 'MaritalStatus')


Ho= There is no dependency betweem Attrition and MaritalStatus
H1= There is dependency betweem Attrition and MaritalStatus 

MaritalStatus    0     1     2
Attrition                     
0              882  1767  1050
1               99   252   360 

138.49102962254608 P Value: 8.45385940605786e-31 

P-Value < 0.05 hence H0 rejected, Accepting H1 Hypothesis
-----------------------------------------------------------------------------------------------------------------


In [74]:
chitable = pd.crosstab(df.Attrition, df.JobLevel)
stats, p, dof, expected = chi2_contingency(chitable)
chi2(stats, p, 'JobLevel')


Ho= There is no dependency betweem Attrition and JobLevel
H1= There is dependency betweem Attrition and JobLevel 

JobLevel      1     2    3    4    5
Attrition                           
0          1377  1317  558  267  180
1           252   285   96   51   27 

6.2691759264759925 P Value: 0.1799276801337184 

P-Value >= 0.05 hence H0 Accepted
-----------------------------------------------------------------------------------------------------------------


In [75]:
chitable = pd.crosstab(df.Attrition, df.StockOptionLevel)
stats, p, dof, expected = chi2_contingency(chitable)
chi2(stats, p, 'StockOptionLevel')


Ho= There is no dependency betweem Attrition and StockOptionLevel
H1= There is dependency betweem Attrition and StockOptionLevel 

StockOptionLevel     0     1    2    3
Attrition                             
0                 1575  1518  390  216
1                  318   270   84   39 

3.046265305068262 P Value: 0.38454683657380506 

P-Value >= 0.05 hence H0 Accepted
-----------------------------------------------------------------------------------------------------------------
