In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the employee attrition dataset into `df`, the DataFrame read by the
# analysis cells below (they use columns such as 'Attrition', 'Department',
# 'JobRole', 'Age' and 'MonthlyIncome').
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv('employee_attrition.csv')

In [3]:
# Categorical features tested below: Department, JobRole, MaritalStatus,
# EducationField, Gender.
# A chi-square test of independence checks whether Attrition depends on each one.

from scipy.stats import chi2_contingency

# Department vs Attrition: build the table of counts, then run the test.
ct = pd.crosstab(index=df['Department'], columns=df['Attrition'])
chi2, p, dof, expected = chi2_contingency(ct)

# Report the test statistic, its p-value and the degrees of freedom, one per line.
for label, value in (("Chi-square:", chi2), ("p-value:", p), ("Degrees of freedom:", dof)):
    print(label, value)

Chi-square: 10.79600732241067
p-value: 0.004525606574479633
Degrees of freedom: 2


In [5]:
# JobRole vs Attrition: chi-square test of independence on the count table.
ct = pd.crosstab(index=df['JobRole'], columns=df['Attrition'])
chi2, p, dof, expected = chi2_contingency(ct)

# Report the test statistic, its p-value and the degrees of freedom, one per line.
for label, value in (("Chi-square:", chi2), ("p-value:", p), ("Degrees of freedom:", dof)):
    print(label, value)

Chi-square: 86.19025367670434
p-value: 2.752481638050657e-15
Degrees of freedom: 8


In [6]:
# MaritalStatus vs Attrition: chi-square test of independence on the count table.
ct = pd.crosstab(index=df['MaritalStatus'], columns=df['Attrition'])
chi2, p, dof, expected = chi2_contingency(ct)

# Report the test statistic, its p-value and the degrees of freedom, one per line.
for label, value in (("Chi-square:", chi2), ("p-value:", p), ("Degrees of freedom:", dof)):
    print(label, value)

Chi-square: 46.163676540848705
p-value: 9.45551106034083e-11
Degrees of freedom: 2


In [7]:
# EducationField vs Attrition: chi-square test of independence on the count table.
ct = pd.crosstab(index=df['EducationField'], columns=df['Attrition'])
chi2, p, dof, expected = chi2_contingency(ct)

# Report the test statistic, its p-value and the degrees of freedom, one per line.
for label, value in (("Chi-square:", chi2), ("p-value:", p), ("Degrees of freedom:", dof)):
    print(label, value)

Chi-square: 16.024674119585427
p-value: 0.006773980139025212
Degrees of freedom: 5


In [8]:
# Gender vs Attrition: chi-square test of independence on the count table.
# The recorded result has 1 degree of freedom, the case in which
# chi2_contingency applies Yates' continuity correction by default.
ct = pd.crosstab(index=df['Gender'], columns=df['Attrition'])
chi2, p, dof, expected = chi2_contingency(ct)

# Report the test statistic, its p-value and the degrees of freedom, one per line.
for label, value in (("Chi-square:", chi2), ("p-value:", p), ("Degrees of freedom:", dof)):
    print(label, value)

Chi-square: 1.1169671241970975
p-value: 0.29057244902890855
Degrees of freedom: 1


In [11]:
# Numerical features: Age, MonthlyIncome, YearsAtCompany

# Test whether the difference between employees who left ("Yes") and those
# who stayed ("No") is statistically significant.
from scipy.stats import ttest_ind, mannwhitneyu

# Split Age by Attrition. Missing values are dropped up front so a single NaN
# cannot propagate into the test statistics and p-values.
age_yes = df.loc[df['Attrition']=='Yes', 'Age'].dropna()
age_no  = df.loc[df['Attrition']=='No', 'Age'].dropna()

# T-test (parametric).
# NOTE(review): ttest_ind defaults to equal_var=True (pooled variance). If the
# two groups differ in variance, Welch's test (equal_var=False) is the safer
# choice - confirm before relying on this p-value.
t_stat, p_val = ttest_ind(age_yes, age_no)
print("Age T-test p-value:", p_val)

# Mann-Whitney (non-parametric, safe if data not normal).
# The two-sided alternative is requested explicitly so the p-value does not
# depend on the installed SciPy version's default.
u_stat, p_val_mw = mannwhitneyu(age_yes, age_no, alternative='two-sided')
print("Age Mann-Whitney p-value:", p_val_mw)


Age T-test p-value: 8.356308021103587e-10
Age Mann-Whitney p-value: 5.304342075341702e-11


In [13]:
# MonthlyIncome (Salary)
# Split MonthlyIncome by Attrition. Missing values are dropped up front so a
# single NaN cannot propagate into the test statistics and p-values.
MonthlyIncome_yes = df.loc[df['Attrition']=='Yes', 'MonthlyIncome'].dropna()
MonthlyIncome_no  = df.loc[df['Attrition']=='No', 'MonthlyIncome'].dropna()

# T-test (parametric).
# NOTE(review): ttest_ind defaults to equal_var=True (pooled variance). If the
# two groups differ in variance, Welch's test (equal_var=False) is the safer
# choice - confirm before relying on this p-value.
t_stat, p_val = ttest_ind(MonthlyIncome_yes, MonthlyIncome_no)
print("MonthlyIncome T-test p-value:", p_val)

# Mann-Whitney (non-parametric, safe if data not normal).
# The two-sided alternative is requested explicitly so the p-value does not
# depend on the installed SciPy version's default.
u_stat, p_val_mw = mannwhitneyu(MonthlyIncome_yes, MonthlyIncome_no, alternative='two-sided')
print("MonthlyIncome Mann-Whitney p-value:", p_val_mw)


MonthlyIncome T-test p-value: 7.147363985353811e-10
MonthlyIncome Mann-Whitney p-value: 2.950830917288873e-14


In [14]:
# YearsAtCompany (Tenure)
# Split YearsAtCompany by Attrition. Missing values are dropped up front so a
# single NaN cannot propagate into the test statistics and p-values.
YearsAtCompany_yes = df.loc[df['Attrition']=='Yes', 'YearsAtCompany'].dropna()
YearsAtCompany_no  = df.loc[df['Attrition']=='No', 'YearsAtCompany'].dropna()

# T-test (parametric).
# NOTE(review): ttest_ind defaults to equal_var=True (pooled variance). If the
# two groups differ in variance, Welch's test (equal_var=False) is the safer
# choice - confirm before relying on this p-value.
t_stat, p_val = ttest_ind(YearsAtCompany_yes, YearsAtCompany_no)
print("YearsAtCompany T-test p-value:", p_val)

# Mann-Whitney (non-parametric, safe if data not normal).
# The two-sided alternative is requested explicitly so the p-value does not
# depend on the installed SciPy version's default.
u_stat, p_val_mw = mannwhitneyu(YearsAtCompany_yes, YearsAtCompany_no, alternative='two-sided')
print("YearsAtCompany Mann-Whitney p-value:", p_val_mw)

YearsAtCompany T-test p-value: 2.3188716103863036e-07
YearsAtCompany Mann-Whitney p-value: 2.916191369956416e-13
