In [2]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [3]:
### import data
### load the HR employee-attrition CSV; the bare file name is resolved against the current working directory
df_base = pd.read_csv("03_association_rule-HR-Employee-Attrition.csv")


In [4]:
### check data and data types
print(df_base.columns)
# NOTE: this bare expression is not the last line of the cell, so nothing is displayed for it;
# wrap it in print()/display() (or move it to its own cell) to actually see the dtypes.
df_base.dtypes
print(df_base["HourlyRate"].describe())


Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')
count    1470.000000
mean       65.891156
std        20.329428
min        30.000000
25%        48.000000
50%        66.000000
75%        83.750000
max       100.000000
Name: HourlyRate, dtype: float64


In [5]:
### data clean

### bin Age and HourlyRate into four quantile-based buckets each
### pd.qcut --> categorical column with the given labels

df_base['age_range'] = pd.qcut(
    x=df_base['Age'],
    q=4,
    labels=['<=30', '>30 <=36', '>36 <=43', '>43'],
)
df_base['HourlyRate_bin'] = pd.qcut(
    x=df_base['HourlyRate'],
    q=4,
    labels=['<=48','<=66','<=83.75','<=100'],
)

### get column list
def column_list(df=None):
    """Split a frame's columns into categorical ones to keep and the rest to drop.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df_base`` is used,
        so the original zero-argument call keeps working.

    Returns
    -------
    tuple of (list, list)
        ``(columns_in_use, columns_drop)``: the columns whose dtype is
        ``object`` or ``category``, then every other column. Both lists keep
        the frame's column order.
    """
    if df is None:
        df = df_base
    columns_in_use = []
    columns_drop = []
    # Walk (column name, dtype) pairs rather than ``dtypes[i]``: an integer key
    # on a label-indexed Series is a deprecated positional lookup in pandas 2.x.
    for column, dtype in df.dtypes.items():
        if dtype == 'O' or dtype.name == 'category':
            columns_in_use.append(column)
        else:
            columns_drop.append(column)
    return columns_in_use, columns_drop

### turn value to 0 or 1 if more than once (function)
def frequence_encode(x):
    """Binarise a count: return 1 when ``x`` is at least 1, otherwise 0.

    The previous two-branch version fell through and returned ``None`` for any
    input that was neither ``<= 0`` nor ``>= 1`` (fractions between 0 and 1,
    NaN). Those inputs now map to 0, so the result is always 0 or 1.
    """
    return 1 if x >= 1 else 0



In [6]:
### run function & drop columns
### column_list() splits the columns; everything in columns_drop is removed from the modelling frame
columns_in_use, columns_drop = column_list()
df_model = df_base.drop(columns_drop, axis="columns")
### example of apply,applymap UDF
#df_model = df_base.applymap(frequence_encode)

In [10]:
### turn objects, categories to dummy variable 
### get_dummies function
### alternative: one-hot encoding
### result: one indicator column per (column, value) pair, named <column>_<value>

df_model_new = pd.get_dummies(df_model)


In [11]:
### preview the first 10 rows of the dummy-encoded frame
df_model_new.head(10)

Unnamed: 0,Attrition_No,Attrition_Yes,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,...,OverTime_No,OverTime_Yes,age_range_<=30,age_range_>30 <=36,age_range_>36 <=43,age_range_>43,HourlyRate_bin_<=48,HourlyRate_bin_<=66,HourlyRate_bin_<=83.75,HourlyRate_bin_<=100
0,0,1,0,0,1,0,0,1,0,1,...,0,1,0,0,1,0,0,0,0,1
1,1,0,0,1,0,0,1,0,0,1,...,1,0,0,0,0,1,0,1,0,0
2,0,1,0,0,1,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,1
3,1,0,0,1,0,0,1,0,0,1,...,0,1,0,1,0,0,0,1,0,0
4,1,0,0,0,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
5,1,0,0,1,0,0,1,0,0,1,...,1,0,0,1,0,0,0,0,1,0
6,1,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
7,1,0,0,0,1,0,1,0,0,1,...,1,0,1,0,0,0,0,0,1,0
8,1,0,0,1,0,0,1,0,0,1,...,1,0,0,0,1,0,1,0,0,0
9,1,0,0,0,1,0,1,0,0,0,...,1,0,0,1,0,0,0,0,0,1


### Apriori method 

- ```df``` (pandas DataFrame)
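- ```min_support```: float, the minimum support (share of rows containing the itemset) an itemset needs in order to be returned, e.g. 0.05 means the pattern occurs in at least 5 out of every 100 rows of the dataset
- ```use_colnames```: bool(False), if True, the returned DataFrame uses column names instead of column indices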
- ```max_len```: int(None), maximum length of the itemsets generated
- ```verbose```:int(0),shows the number of iterations
- ```low_memory```: bool(False), if True, uses an iterator to search for combinations above min_support. ***Note that low_memory=True should only be used for large datasets when memory resources are limited, because this implementation is approx. 3-6x slower than the default.***

### association_rules

- ```df```: pandas DataFrame of frequent itemsets (the output of ```apriori```)
- ```metric```: string(confidence), 'support','confidence','lift'
- ```min_threshold```:float(0.8): minimal threshold for the evaluation metric
- ```support_only```: bool(False), Only computes the rule support and fills the other metric columns with NaNs. This is useful if:

a) the input DataFrame is incomplete, e.g., does not contain support values for all rule antecedents and consequents

b) you simply want to speed up the computation because you don't need the other metrics.

- more details: association rule part : https://www.edrawmind.com/app/editor/zQTGTYkqnd1t3YehB3a7x2Td4fZx4llu



In [95]:
def apply_rule(data,min_support,use_column,max_len,metric,min_threshold):
    """Mine frequent itemsets with apriori, then derive association rules from them.

    data          -- frame handed to ``apriori`` as ``df``
    min_support   -- forwarded to ``apriori`` as ``min_support``
    use_column    -- forwarded to ``apriori`` as ``use_colnames``
    max_len       -- forwarded to ``apriori`` as ``max_len``
    metric        -- forwarded to ``association_rules`` as ``metric``
    min_threshold -- forwarded to ``association_rules`` as ``min_threshold``

    Returns the pair ``(frequent itemsets, rules)``.
    """
    apriori_options = {
        "min_support": min_support,
        "use_colnames": use_column,
        "max_len": max_len,
    }
    itemsets = apriori(df=data, **apriori_options)
    derived_rules = association_rules(
        itemsets,
        metric=metric,
        min_threshold=min_threshold,
    )
    return itemsets, derived_rules

In [96]:
## data,min_support,use_column(True,False),max_len,metric(string),min_threshold
## NOTE: the input must be the dummy-encoded frame (df_model_new). df_model still holds the
## raw object/category columns, not the 0/1 item indicators that apriori works on.
frequent_items, rules = apply_rule(
    data=df_model_new,
    min_support=0.03,
    use_column=True,
    max_len=3,
    metric='lift',
    min_threshold=1,
)

### Apply filter to rules and interpretation result

- real-world use case: use transaction categories to find which categories are associated with high-value customers
- use case for the HR dataset: find patterns associated with attrition
- other use case: basket analysis, market research etc.

In [106]:
### split the rules by target item: non-attrition vs. attrition
target_variable = ['Attrition_No','Attrition_Yes']

### each entry of "consequents" is a set of item names, so test membership directly
### instead of regex/substring matching on the set's string representation
### (.astype(bool) keeps the mask boolean even when there are no rules)
rule_non_attrition = (
    rules[rules["consequents"].map(lambda items: target_variable[0] in items).astype(bool)]
    .sort_values(by=['confidence','lift'], ascending=False)
)

rule_attrition = (
    rules[rules["consequents"].map(lambda items: target_variable[1] in items).astype(bool)]
    .sort_values(by=['confidence','lift'], ascending=False)
)

In [105]:
### understand result - employees with the Manager job role and no overtime make up 5.1% of the whole dataset (antecedent support)
### 98.7% of those did not attrite (confidence)

rule_non_attrition.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1223,"(JobRole_Manager, OverTime_No)",(Attrition_No),0.05102,0.838776,0.05034,0.986667,1.176318,0.007545,12.091837
1257,"(JobRole_Research Director, OverTime_No)",(Attrition_No),0.038776,0.838776,0.038095,0.982456,1.171298,0.005571,9.189796
1132,"(JobRole_Research Director, Gender_Male)",(Attrition_No),0.031973,0.838776,0.031293,0.978723,1.166848,0.004475,7.577551
730,(JobRole_Research Director),"(Department_Research & Development, Attrition_No)",0.054422,0.563265,0.053061,0.975,1.730978,0.022407,17.469388
20,(JobRole_Research Director),(Attrition_No),0.054422,0.838776,0.053061,0.975,1.162409,0.007414,6.44898
726,"(Department_Research & Development, JobRole_Re...",(Attrition_No),0.054422,0.838776,0.053061,0.975,1.162409,0.007414,6.44898
1250,"(Over18_Y, JobRole_Research Director)",(Attrition_No),0.054422,0.838776,0.053061,0.975,1.162409,0.007414,6.44898
1254,(JobRole_Research Director),"(Over18_Y, Attrition_No)",0.054422,0.838776,0.053061,0.975,1.162409,0.007414,6.44898
1387,"(HourlyRate_bin_<=83.75, MaritalStatus_Divorced)",(Attrition_No),0.04966,0.838776,0.048299,0.972603,1.159551,0.006646,5.884694
608,"(BusinessTravel_Travel_Rarely, JobRole_Researc...",(Attrition_No),0.042177,0.838776,0.040816,0.967742,1.153756,0.005439,4.997959


In [111]:
### employees aged <= 30 who work overtime are 7% of the data (antecedent support)
### 50% of them attrited (confidence), i.e.
### they quit at about 3x the overall attrition rate (lift ~3.1)
### employees who are single and work overtime are about 9% of the data and also quit at about 3x the overall rate

### keep only the attrition rules with a lift of at least 2
rule_attrition[rule_attrition['lift']>=2]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1821,"(age_range_<=30, OverTime_Yes)",(Attrition_Yes),0.072109,0.161224,0.036054,0.5,3.101266,0.024429,1.677551
1773,"(MaritalStatus_Single, OverTime_Yes)",(Attrition_Yes),0.089116,0.161224,0.044218,0.496183,3.077592,0.02985,1.664842
1778,"(age_range_<=30, MaritalStatus_Single)",(Attrition_Yes),0.101361,0.161224,0.040136,0.395973,2.456036,0.023794,1.388639
1688,"(Department_Sales, OverTime_Yes)",(Attrition_Yes),0.087075,0.161224,0.032653,0.375,2.325949,0.018614,1.342041
1675,"(MaritalStatus_Single, Department_Sales)",(Attrition_Yes),0.104082,0.161224,0.036054,0.346405,2.148589,0.019274,1.283327
1737,"(Gender_Male, OverTime_Yes)",(Attrition_Yes),0.160544,0.161224,0.054422,0.338983,2.102553,0.028538,1.268917
1644,(JobRole_Laboratory Technician),"(Department_Research & Development, Attrition_...",0.17619,0.090476,0.042177,0.239382,2.645804,0.026236,1.19577
1671,(JobRole_Sales Executive),"(Attrition_Yes, Department_Sales)",0.221769,0.062585,0.038776,0.174847,2.793745,0.024896,1.136049
1673,(Department_Sales),"(JobRole_Sales Executive, Attrition_Yes)",0.303401,0.038776,0.038776,0.127803,3.295964,0.027011,1.102072
1701,(OverTime_Yes),"(EducationField_Life Sciences, Attrition_Yes)",0.282993,0.060544,0.034694,0.122596,2.024903,0.01756,1.070722


### How to evaluate the recommender system 

- A/B testing 
- north star metrics (example): % of purchase from recommendation list (Items recommended by the current system have typically been placed at the tops of lists)
- possible bias: position bias - fighting the bias: inverse propensity scoring
- solution (example): holdout lane