In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn
import numpy as np
import scipy.stats as st

## Chi-squared ($\chi^2$) formula

$ \chi^2 = \sum \frac{(O_i - E_i)^2}{E_i}$
- $\chi^2$ = chi squared
- $O_i$ = observed value
- $E_i$ = expected value



The **expected value** (or **mean**) of a discrete random variable $X$ is the probability-weighted sum of its possible values:

$
E(X) = \sum [x_i \cdot P(x_i)]
$

where:  
- $ x_i $ are the possible values of the variable (Job Satisfaction: 1, 2, 3, 4).  
- $ P(x_i) $ is the probability of each $x_i$, which is found by:

$
P(x_i) = \frac{\text{Total count for } x_i}{\text{Overall total count}}
$


In [10]:
# Load the HR employee-attrition dataset from a CSV file in the working directory.
data = pd.read_csv('WA_Fn-UseC_HR-Employee-Attrition.csv')
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [16]:
# Number of missing values in each column.
data.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [26]:
# Frequency of each Attrition label.
data['Attrition'].value_counts()

Attrition
No     1233
Yes     237
Name: count, dtype: int64

In [28]:
# Frequency of each JobSatisfaction level.
data['JobSatisfaction'].value_counts()

JobSatisfaction
4    459
3    442
1    289
2    280
Name: count, dtype: int64

In [34]:
# Contingency table of Attrition (rows) by JobSatisfaction (columns).
# margins=True appends the "All" row and column that hold the totals.
ct = pd.crosstab(
    index=data['Attrition'],
    columns=data['JobSatisfaction'],
    margins=True,
)

In [36]:
# Display the contingency table, including the "All" totals.
ct

JobSatisfaction,1,2,3,4,All
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
No,223,234,369,407,1233
Yes,66,46,73,52,237
All,289,280,442,459,1470


In [77]:
# First row of the table restricted to its first four columns (the "All" total is left out).
ct.iloc[0,0:4]

JobSatisfaction
1    223
2    234
3    369
4    407
Name: No, dtype: int64

## observed value

In [68]:
# Observed counts, flattened row by row from the top-left 2x4 block of the
# table (first row, then second row; the "All" totals are excluded).
obs = ct.iloc[0:2, 0:4].to_numpy().ravel()
obs

array([223, 234, 369, 407,  66,  46,  73,  52], dtype=int64)

## expected value

In [112]:
# Expected count for every cell: row total * column total / grand total.
# Cells are visited row by row so the order matches the flattened `obs` array.
exp = []
grand_total = ct.iloc[2, 4]
column_totals = ct.iloc[2, 0:4]
for row_total in ct.iloc[0:2, 4]:
    for column_total in column_totals:
        # Compute once, then both log the arithmetic and store the value.
        expected = column_total * row_total / grand_total
        print('{} * {} / {} = {:.2f}'.format(column_total, row_total, grand_total, expected))
        exp.append(expected)

289 * 1233 / 1470 = 242.41
280 * 1233 / 1470 = 234.86
442 * 1233 / 1470 = 370.74
459 * 1233 / 1470 = 385.00
289 * 237 / 1470 = 46.59
280 * 237 / 1470 = 45.14
442 * 237 / 1470 = 71.26
459 * 237 / 1470 = 74.00


In [138]:
# Convert the list of expected counts to an ndarray so it supports
# element-wise arithmetic, then check that it has the same type as `obs`.
exp = np.array(exp)
type(exp) == type(obs)

True

In [140]:
# Per-cell contribution to the chi-squared statistic: (O - E)^2 / E.
(obs-exp)**2 / exp

array([1.55358117e+00, 3.12825860e-03, 8.15490711e-03, 1.25738277e+00,
       8.08255523e+00, 1.62748644e-02, 4.24261623e-02, 6.54157365e+00])

In [163]:
# Chi-squared statistic: the sum over all cells of (O - E)^2 / E.
residuals = obs - exp
chi2 = np.sum(residuals ** 2 / exp)

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/3/35/Chi-square_pdf.svg/800px-Chi-square_pdf.svg.png" alt="Probability density function of the chi-squared distribution">

In [159]:
# Degrees of freedom = (rows - 1) * (columns - 1), measured on the table
# with its last row and last column (the "All" margins) stripped off.
n_rows, n_cols = ct.iloc[:-1, :-1].shape
dof = (n_rows - 1) * (n_cols - 1)
dof

3

In [183]:
# p-value: upper-tail probability of the chi-squared distribution at the
# computed statistic. The survival function is used instead of `1 - cdf`
# because subtracting a CDF value close to 1 from 1 throws away significant
# digits, which matters most for exactly the small p-values we care about.
p_value = st.chi2.sf(chi2, dof)
print(p_value)

0.000556300451038716


<p style="direction:rtl; text-align:right">
    چون مقدار p value کمتر از 5% شده پس میتونیم بگیم احتمال شانسی بودن رابطه بین رضایت شغلی و اصطکاک شغلی خیلی کم هست و این دو باهم دیگه ارتباط دارن
    <br>
    تذکر: اینجا صرفا داریم نشون میدیم یک ارتباطی هست و ارتباط علت و معلولی رو باید با A/B Testing متوجه بشیم
</p>

In [193]:
# --------------------------------

In [195]:
# Show the contingency table again before repeating the test with SciPy.
ct

JobSatisfaction,1,2,3,4,All
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
No,223,234,369,407,1233
Yes,66,46,73,52,237
All,289,280,442,459,1470


In [209]:
# Observed counts as a 2-D array: the first two rows and first four columns
# of the table, i.e. everything except the "All" totals.
d = ct.iloc[0:2, 0:4].to_numpy()
d

array([[223, 234, 369, 407],
       [ 66,  46,  73,  52]], dtype=int64)

In [215]:
# Run the same test with SciPy; the result bundles the statistic, p-value,
# degrees of freedom and the expected frequencies.
st.chi2_contingency(d)

Chi2ContingencyResult(statistic=17.505077010348, pvalue=0.0005563004510387556, dof=3, expected_freq=array([[242.40612245, 234.85714286, 370.73877551, 384.99795918],
       [ 46.59387755,  45.14285714,  71.26122449,  74.00204082]]))

## Attrition and Education

In [218]:
# Contingency table of Attrition (rows) by Education (columns), with the
# "All" totals appended by margins=True.
ct2 = pd.crosstab(
    index=data['Attrition'],
    columns=data['Education'],
    margins=True,
)
ct2

Education,1,2,3,4,5,All
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
No,139,238,473,340,43,1233
Yes,31,44,99,58,5,237
All,170,282,572,398,48,1470


In [220]:
# Observed counts as a 2-D array: the first two rows of the table with the
# trailing "All" column dropped.
d2 = ct2.iloc[0:2, :-1].to_numpy()
d2

array([[139, 238, 473, 340,  43],
       [ 31,  44,  99,  58,   5]], dtype=int64)

In [224]:
# Chi-squared test of independence on the Attrition x Education counts.
st.chi2_contingency(d2)

Chi2ContingencyResult(statistic=3.0739613982367193, pvalue=0.5455253376565949, dof=4, expected_freq=array([[142.59183673, 236.53469388, 479.77959184, 333.83265306,
         40.26122449],
       [ 27.40816327,  45.46530612,  92.22040816,  64.16734694,
          7.73877551]]))

<p style="direction:rtl">
    چون p value حدود 55% شده و از 5% بیشتره، رابطه تحصیلات با اصطکاک از لحاظ آماری معنی‌دار نیست
</p>


# **Step 1: What is Expected Value?**  
Expected value (𝔼[𝑋]) is a way to **predict the average outcome** of something happening **if we repeat an experiment many times**.  

### 🎲 **Example: Rolling a Die**
Imagine you roll a fair 6-sided die. The numbers you can get are **1, 2, 3, 4, 5, 6**. Each number has an **equal chance** (1/6 probability).  

The expected value of rolling a die is:  

$E[X] = 1 \cdot \frac{1}{6} + 2 \cdot \frac{1}{6} + 3 \cdot \frac{1}{6} + 4 \cdot \frac{1}{6} + 5 \cdot \frac{1}{6} + 6 \cdot \frac{1}{6}$

$E[X] = \frac{1+2+3+4+5+6}{6} = 3.5$

This means, **on average**, if you roll the die **many times**, you would expect to get **3.5** (even though you can never actually roll a 3.5).  

---

# **Step 2: General Formula for Expected Value**  
The general formula for expected value is:

$E[X] = \sum (x_i \cdot P(x_i))$

Where:  
- $x_i$ = possible values the random variable can take  
- $P(x_i)$ = probability of getting $x_i$  
- $\sum$ = sum over all possible values  

### **Another Example: Coin Flip**  
If you flip a fair coin:  
- You get **Heads (1)** with probability **0.5**  
- You get **Tails (0)** with probability **0.5**  

Expected value:

$E[X] = (1 \times 0.5) + (0 \times 0.5) = 0.5$

So, if you flip the coin many times, you expect to get **Heads 50% of the time**.

---

# **Step 3: How Does This Relate to Our Problem?**  
Now, let’s apply the **expected value concept** to our **Job Satisfaction & Attrition** data.

Instead of rolling dice or flipping coins, we are looking at how **people are distributed** in categories (Attrition vs. Job Satisfaction levels).  

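In our case, the expected *count* of each cell comes from assuming independence: $P(\text{row } i \text{ and column } j) = P(\text{row } i) \cdot P(\text{column } j) = \frac{\text{Row Total}}{\text{Grand Total}} \cdot \frac{\text{Column Total}}{\text{Grand Total}}$. Multiplying this probability by the grand total gives:  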

$E_{ij} = \frac{\text{Row Total} \times \text{Column Total}}{\text{Grand Total}}$

- This formula helps us **predict** how many people **should be in each category if there was no relationship** between job satisfaction and attrition.  
- It’s like saying, **“If attrition happened randomly, how many people should we expect in each group?”**  

---

# **Step 4: Why Do We Need Expected Values?**
Expected values help us **compare reality vs. expectation**:
1. If the **actual (observed) values** are **very different** from the **expected values**, it suggests that **job satisfaction and attrition might be related**.  
2. If the **observed values are close to the expected values**, then job satisfaction and attrition **might not be related** (it could just be random).  

---

# **Step 5: Final Example from Our Data**
We use:

$E_{ij} = \frac{\text{Row Total} \times \text{Column Total}}{\text{Grand Total}}$

For example, for **No Attrition & Job Satisfaction 1**:

$E_{(No, 1)} = \frac{(1233) \times (289)}{1470} = 242.41$

This means, if there was **no relationship**, we would expect **242** people to have **No Attrition and Job Satisfaction = 1**.  
But the actual number is **223**—this difference helps us test if there's a real relationship.

---

# **Step 6: Next Step – Chi-Square Test**
Now that we have **expected values**, we can calculate **how different** the observed data is using the **Chi-Square Test**:

$\chi^2 = \sum \frac{(O_i - E_i)^2}{E_i}$

Where:  
- $O_i$ = Observed value (actual data)  
- $E_i$ = Expected value (calculated)  

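If $\chi^2$ is **large** (relative to the chi-squared distribution with the table's degrees of freedom), it means the difference is **too big to be random**, so job satisfaction and attrition are **likely associated**. Note that the test shows association only, not that one causes the other.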

---

# **Final Summary**
- **Expected value** is the predicted average outcome if things were random.  
- **Formula**: $E[X] = \sum (x_i \cdot P(x_i))$ for general probability OR $E_{ij} = \frac{\text{Row Total} \times \text{Column Total}}{\text{Grand Total}}$ for tables.  
- **We use expected values** to compare with actual data.  
- **If observed values are very different from expected values, there might be a real relationship** between job satisfaction and attrition.  
- **Next step:** Use the **Chi-Square test** to measure the difference and check if it’s significant.
