In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

In [2]:
df = pd.read_csv('general_data.csv')

In [3]:
dummy=pd.get_dummies(df['Attrition'])
df2=pd.concat((df,dummy),axis=1)
df2=df2.drop(['Attrition'],axis=1)
df2=df2.drop(['No'],axis=1)
df2=df2.rename(columns={"Yes":"Attrition"})
df2.head()

Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,JobLevel,...,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,51,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,1,...,Y,11,8,0,1.0,6,1,0,0,0
1,31,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,1,...,Y,23,8,1,6.0,3,5,1,4,1
2,32,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,4,...,Y,15,8,3,5.0,2,5,0,3,0
3,38,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,3,...,Y,11,8,3,13.0,5,8,7,5,0
4,32,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,1,...,Y,12,8,2,9.0,2,6,0,4,0


# Mann-Whitney test

### Attrition and Distance from home

In [4]:
from scipy.stats import mannwhitneyu
stats,p= mannwhitneyu(df2.Attrition,df2.DistanceFromHome)
print(stats,p)

221832.0 0.0


### Attrition and Education

In [5]:
from scipy.stats import mannwhitneyu
stats,p= mannwhitneyu(df2.Attrition,df2.Education)
print(stats,p)

181305.0 0.0


#H0: There is no significant differences in the attrition yes and no for Education
#Ha: There is significant differences in the attrition yes and no for Education
#The p value is 0.00 ,So p value is less than 0.5 
#Hence Ho (null hypothesis) is rejected and Ha (alternative hypothesis) accepted.
#There is significant differences in the attrition yes and no for Education

### Attrition and TotalWorkingYears

In [6]:
stats,p=mannwhitneyu(df2.Attrition,df2.TotalWorkingYears)
print(stats,p)

170527.5 0.0


#H0: There is no significant differences in the attrition yes and no for TotalWorkingYears
#Ha: There is significant differences in the attrition yes and no for TotalWorkingYears
#The p value is 0.00 ,So p value is less than 0.5 
#Hence Ho (null hypothesis) is rejected and Ha (alternative hypothesis) accepted.
#There is significant differences in the attrition yes and no for TotalWorkingYears

### Attrition and YearsAtCompany

In [7]:
stats,p=mannwhitneyu(df2.Attrition,df2.YearsAtCompany)
print(stats,p)

520357.5 0.0


#H0: There is no significant differences in the attrition yes and no for YearsAtCompany
#Ha: There is significant differences in the attrition yes and no for YearsAtCompany
#The p value is 0.00 ,So p value is less than 0.5 
#Hence Ho (null hypothesis) is rejected and Ha (alternative hypothesis) accepted.
#There is significant differences in the attrition yes and no for YearsAtCompany

### Attrition and YearsWithCurrManager

In [8]:
stats,p=mannwhitneyu(df2.Attrition,df2.YearsWithCurrManager)
print(stats,p)

2101288.5 0.0


#H0: There is no significant differences in the attrition yes and no for YearsWithCurrManager
#Ha: There is significant differences in the attrition yes and no for YearsWithCurrManager
#The p value is 0.00 ,So p value is less than 0.5 
#Hence Ho (null hypothesis) is rejected and Ha (alternative hypothesis) accepted.
#There is significant differences in the attrition yes and no for YearsWithCurrManager

# Chai-Square Test

### Attrition and Gender

In [9]:
from scipy.stats import chi2_contingency

In [10]:
chitable = pd.crosstab(df2.Attrition,df2.Gender)

In [11]:
chitable

Gender,Female,Male
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1494,2205
1,270,441


In [12]:
stats,p,dof,expected=chi2_contingency(chitable)

In [13]:
print(stats,p)

1.349904410246582 0.24529482862926827


#H0: There is no significant differences in the attrition yes and no for Gender
#Ha: There is significant differences in the attrition yes and no for Gender
#The p value is 0.24529482862926827 ,So p value is greater than 0.5 
#Hence Ho (null hypothesis) is accepted and Ha (alternative hypothesis) rejected.
#There is no significant differences in the attrition yes and no for Gender.

### Attrition and JobRole

In [14]:
chitable = pd.crosstab(df2.Attrition,df2.JobRole)

In [15]:
chitable

JobRole,Healthcare Representative,Human Resources,Laboratory Technician,Manager,Manufacturing Director,Research Director,Research Scientist,Sales Executive,Sales Representative
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,336,135,651,264,387,183,717,813,213
1,57,21,126,42,48,57,159,165,36


In [16]:
stats,p,dof,expected=chi2_contingency(chitable)

In [17]:
print(stats,p)

25.116313674604072 0.001485544744815264


#H0: There is no significant differences in the attrition yes and no for JobRole
#Ha: There is significant differences in the attrition yes and no for JobRole
#The p value is 0.001485544744815264 ,So p value is less than 0.5 
#Hence Ho (null hypothesis) is rejected and Ha (alternative hypothesis) accepted.
#There is significant differences in the attrition yes and no for JobRole

### Attrition and Department

In [18]:
chitable = pd.crosstab(df2.Attrition,df2.Department)

In [19]:
chitable

Department,Human Resources,Research & Development,Sales
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,132,2430,1137
1,57,453,201


In [20]:
stats,p,dof,expected=chi2_contingency(chitable)
print(stats,p)

29.090274924488266 4.820888218170406e-07


#H0: There is no significant differences in the attrition yes and no for Department
#Ha: There is significant differences in the attrition yes and no for Department
#The p value is 4.820888218170406e-07 ,i.e. 0.00000004820888218170406 So p value is less than 0.5 
#Hence Ho (null hypothesis) is rejected and Ha (alternative hypothesis) accepted.
#There is significant differences in the attrition yes and no for Department