In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#for association
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the training data from a CSV file in the current working directory.
dataset_association = pd.read_csv("train.csv")

In [3]:
# Remove the employee_id column (presumably a per-row identifier, so of no use
# as an item for rule mining), then print the remaining columns and dtypes.
dataset_association = dataset_association.drop(columns=['employee_id'])
dataset_association.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   department            54808 non-null  object 
 1   region                54808 non-null  object 
 2   education             52399 non-null  object 
 3   gender                54808 non-null  object 
 4   recruitment_channel   54808 non-null  object 
 5   no_of_trainings       54808 non-null  int64  
 6   age                   54808 non-null  int64  
 7   previous_year_rating  50684 non-null  float64
 8   length_of_service     54808 non-null  int64  
 9   KPIs_met >80%         54808 non-null  int64  
 10  awards_won?           54808 non-null  int64  
 11  avg_training_score    54808 non-null  int64  
 12  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 5.4+ MB


## We need to convert every variable into a categorical one with as few unique values as possible, so that apriori can be applied and its output is easy to interpret

In [4]:
# Bin the number of trainings into three labels:
#   <= 3 -> 'low', 4..6 -> 'medium', > 6 -> 'high'.
conditions = [
    (dataset_association['no_of_trainings'] <= 3),
    (dataset_association['no_of_trainings'] > 3) & (dataset_association['no_of_trainings'] <= 6),
    (dataset_association['no_of_trainings'] > 6)
]

values = ['low', 'medium', 'high']

# np.select falls back to the integer 0 when no default is given. Mixing that
# integer with string labels is rejected by recent NumPy versions (no common
# dtype) and, on older versions, would silently label unmatched rows '0'.
# An explicit string default keeps the result a clean string column.
dataset_association['no_of_trainings'] = np.select(conditions, values, default='unknown')

In [5]:
# Bin age into three labels:
#   <= 30 -> 'young', 31..50 -> 'middle_age', > 50 -> 'old'.
conditions = [
    (dataset_association['age'] <= 30),
    (dataset_association['age'] > 30) & (dataset_association['age'] <= 50),
    (dataset_association['age'] > 50)
]

values = ['young', 'middle_age', 'old']

# np.select falls back to the integer 0 when no default is given. Mixing that
# integer with string labels is rejected by recent NumPy versions (no common
# dtype) and, on older versions, would silently label unmatched rows '0'.
# An explicit string default keeps the result a clean string column.
dataset_association['age'] = np.select(conditions, values, default='unknown')

In [6]:
# Bin the previous-year rating around the midpoint 3:
#   < 3 -> 'below_avg', == 3 -> 'avg', > 3 -> 'above_avg'.
conditions = [
    (dataset_association['previous_year_rating'] < 3),
    (dataset_association['previous_year_rating'] == 3),
    (dataset_association['previous_year_rating'] > 3)
]

values = ['below_avg', 'avg', 'above_avg']

# previous_year_rating contains missing values, and NaN fails all three
# comparisons above. Without an explicit default np.select would fall back to
# the integer 0 for those rows, which either raises a dtype error on recent
# NumPy versions or produces a meaningless '0' category. Label them 'unknown'
# so missing ratings form their own, clearly named category.
dataset_association['previous_year_rating'] = np.select(conditions, values, default='unknown')


In [7]:
# Bin length of service into four labels:
#   <= 5 -> 'short', 6..10 -> 'medium', 11..15 -> 'long', > 15 -> 'very_long'.
conditions = [
    (dataset_association['length_of_service'] <= 5),
    (dataset_association['length_of_service'] > 5) & (dataset_association['length_of_service'] <= 10),
    (dataset_association['length_of_service'] > 10) & (dataset_association['length_of_service'] <= 15),
    (dataset_association['length_of_service'] > 15)
]

values = ['short', 'medium', 'long', 'very_long']

# np.select falls back to the integer 0 when no default is given. Mixing that
# integer with string labels is rejected by recent NumPy versions (no common
# dtype) and, on older versions, would silently label unmatched rows '0'.
# An explicit string default keeps the result a clean string column.
dataset_association['length_of_service'] = np.select(conditions, values, default='unknown')

In [8]:
# Bin the average training score into three labels:
#   <= 50 -> 'low', 51..70 -> 'medium', 71..100 -> 'high'.
conditions = [
    (dataset_association['avg_training_score'] <= 50),
    (dataset_association['avg_training_score'] > 50) & (dataset_association['avg_training_score'] <= 70),
    (dataset_association['avg_training_score'] > 70) & (dataset_association['avg_training_score'] <= 100),
]

values = ['low', 'medium', 'high']

# The conditions stop at 100, so any score above that matches nothing and
# takes the default. Without an explicit default np.select would use the
# integer 0, which either raises a dtype error on recent NumPy versions or
# produces a meaningless '0' category. Use a named string label instead.
dataset_association['avg_training_score'] = np.select(conditions, values, default='unknown')

In [9]:
# Recode the KPI flag: cast to string, then rename '0' -> 'no' and '1' -> 'yes'.
dataset_association['KPIs_met >80%'] = (
    dataset_association['KPIs_met >80%']
    .astype(str)
    .replace({'0': 'no', '1': 'yes'})
)

In [10]:
# Recode the awards flag: cast to string, then rename '0' -> 'no' and '1' -> 'yes'.
dataset_association['awards_won?'] = (
    dataset_association['awards_won?']
    .astype(str)
    .replace({'0': 'no', '1': 'yes'})
)

In [11]:
# Recode the promotion flag: cast to string, then rename '0' -> 'no' and '1' -> 'yes'.
dataset_association['is_promoted'] = (
    dataset_association['is_promoted']
    .astype(str)
    .replace({'0': 'no', '1': 'yes'})
)

## Next, we one-hot encode all the categorical columns so that each category value becomes its own indicator column

In [4]:
# One-hot encode the frame: each category value becomes its own
# '<column>_<value>' indicator column, the input format apriori is given below.
dataset_association = pd.get_dummies(data=dataset_association)

## Now we apply the apriori algorithm

In [13]:
# The data is imbalanced, so we need to take this into account.
dataset_association.loc[:, 'is_promoted_no'].value_counts()

1    50140
0     4668
Name: is_promoted_no, dtype: int64

- There are far more cases of No than Yes, so we also need to take that into consideration when choosing our support threshold, since one class is much more common than the other.
- For No, let’s raise the minimum support threshold to 0.2 and filter the consequents column for is_promoted_no:

In [16]:
# Association rules whose consequent is "not promoted".
# "Not promoted" is the majority class, so a fairly high minimum support is
# used to keep only the frequent feature/outcome pairs.
min_support = 0.2
# max_len=2 restricts itemsets to pairs, so every rule is
# "single feature value -> single outcome".
frequent_items_no = apriori(dataset_association, use_colnames=True, min_support=min_support, max_len=2)
rules_no = association_rules(frequent_items_no, metric='lift', min_threshold=1)
target = 'is_promoted_no'
# Compare the consequent itemsets directly rather than pattern-matching on
# their string representation: the result is the same (rules whose consequent
# is exactly {target}), but it no longer depends on how frozensets are printed
# or on regex interpretation of the braces. astype(bool) keeps the mask a valid
# boolean indexer even when no rules were generated.
results_promoted_no = (
    rules_no[rules_no['consequents'].map(lambda items: items == {target}).astype(bool)]
    .sort_values(by='confidence', ascending=False)
)
results_promoted_no

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
119,(avg_training_score_low),(is_promoted_no),0.219001,0.91483,0.210353,0.96051,1.049933,0.010004,2.156743
111,(KPIs_met >80%_no),(is_promoted_no),0.648026,0.91483,0.622373,0.960413,1.049827,0.029539,2.151483
120,(avg_training_score_medium),(is_promoted_no),0.474256,0.91483,0.441322,0.930558,1.017193,0.007459,1.226496
13,(department_Sales & Marketing),(is_promoted_no),0.307254,0.91483,0.285123,0.927969,1.014362,0.004037,1.18241
101,(previous_year_rating_avg),(is_promoted_no),0.339695,0.91483,0.314972,0.927221,1.013545,0.004209,1.170255
117,(awards_won?_no),(is_promoted_no),0.976828,0.91483,0.901857,0.923251,1.009205,0.008226,1.109719
16,(region_region_2),(is_promoted_no),0.225204,0.91483,0.20716,0.919874,1.005513,0.001136,1.062946
29,(education_Bachelor's),(is_promoted_no),0.669045,0.91483,0.614162,0.917969,1.003431,0.0021,1.038265
51,(gender_m),(is_promoted_no),0.702379,0.91483,0.643975,0.916849,1.002206,0.001418,1.024276
58,(recruitment_channel_other),(is_promoted_no),0.555503,0.91483,0.508867,0.916048,1.001332,0.000677,1.01451


In [17]:
# Association rules whose consequent is "promoted".
# Promotions are the minority class (about 8.5% of rows), so the minimum
# support has to be much lower here or no rule involving it would survive.
min_support = 0.005
# max_len=2 restricts itemsets to pairs, so every rule is
# "single feature value -> single outcome".
frequent_items_yes = apriori(dataset_association, use_colnames=True, min_support=min_support, max_len=2)
rules_yes = association_rules(frequent_items_yes, metric='lift', min_threshold=1)
target = 'is_promoted_yes'
# Compare the consequent itemsets directly rather than pattern-matching on
# their string representation: the result is the same (rules whose consequent
# is exactly {target}), but it no longer depends on how frozensets are printed
# or on regex interpretation of the braces. astype(bool) keeps the mask a valid
# boolean indexer even when no rules were generated.
results_promoted_yes = (
    rules_yes[rules_yes['consequents'].map(lambda items: items == {target}).astype(bool)]
    .sort_values(by='confidence', ascending=False)
)
# Backward-compatible alias: the original (copy-pasted) name referred to
# "attrition", although this notebook analyses promotions.
results_attrition_yes = results_promoted_yes
results_promoted_yes.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1293,(awards_won?_yes),(is_promoted_yes),0.023172,0.08517,0.010199,0.440157,5.167984,0.008226,1.634084
1283,(KPIs_met >80%_yes),(is_promoted_yes),0.351974,0.08517,0.059517,0.169094,1.985374,0.029539,1.101003
1294,(avg_training_score_high),(is_promoted_yes),0.306744,0.08517,0.043589,0.142101,1.668437,0.017463,1.066361
1203,(previous_year_rating_above_avg),(is_promoted_yes),0.394431,0.08517,0.049354,0.125127,1.469146,0.01576,1.045672
579,(region_region_22),(is_promoted_yes),0.117282,0.08517,0.013392,0.114188,1.340705,0.003403,1.032759
289,(department_Technology),(is_promoted_yes),0.130236,0.08517,0.014013,0.107593,1.263275,0.00292,1.025127
887,(region_region_7),(is_promoted_yes),0.088363,0.08517,0.009415,0.106546,1.250974,0.001889,1.023925
987,(education_Master's & above),(is_promoted_yes),0.272314,0.08517,0.026839,0.098559,1.157208,0.003646,1.014853
180,(department_Procurement),(is_promoted_yes),0.130236,0.08517,0.012553,0.096386,1.131684,0.001461,1.012412
23,(department_Analytics),(is_promoted_yes),0.09765,0.08517,0.009342,0.095665,1.123226,0.001025,1.011605


Because the data is imbalanced, the insights are limited, but the rules still make sense:
- when avg_training_score is low, the employee is more likely not to be promoted (support = 0.210353, confidence = 0.960510)
- when KPIs_met >80% is no, the employee is more likely not to be promoted (support = 0.622373, confidence = 0.960413)
- when avg_training_score is high, the employee is more likely to be promoted than the 8.5% base rate (support = 0.043589, confidence = 0.142101)
- when KPIs_met >80% is yes, the employee is more likely to be promoted than the 8.5% base rate (support = 0.059517, confidence = 0.169094)

and so on, which agrees with what we deduced from the previous visualizations.