In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#for association
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [4]:
dataset_association = pd.read_csv('train.csv')

In [5]:
dataset_association = dataset_association.drop(['employee_id'], axis = 1)
dataset_association.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   department            54808 non-null  object 
 1   region                54808 non-null  object 
 2   education             52399 non-null  object 
 3   gender                54808 non-null  object 
 4   recruitment_channel   54808 non-null  object 
 5   no_of_trainings       54808 non-null  int64  
 6   age                   54808 non-null  int64  
 7   previous_year_rating  50684 non-null  float64
 8   length_of_service     54808 non-null  int64  
 9   KPIs_met >80%         54808 non-null  int64  
 10  awards_won?           54808 non-null  int64  
 11  avg_training_score    54808 non-null  int64  
 12  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 5.4+ MB


## We need to transform all the variables into categorical with minimum number of unique values in order to use apriori and understand its output well

In [6]:
conditions = [
    (dataset_association['no_of_trainings'] <= 3),
    (dataset_association['no_of_trainings'] > 3) & (dataset_association['no_of_trainings'] <= 6),
    (dataset_association['no_of_trainings'] > 6)
]

values = ['low', 'medium', 'high']

dataset_association['no_of_trainings'] = np.select(conditions, values)

In [7]:
conditions = [
    (dataset_association['age'] <= 30),
    (dataset_association['age'] > 30) & (dataset_association['age'] <= 50),
    (dataset_association['age'] > 50)
]

values = ['young', 'middle_age', 'old']

dataset_association['age'] = np.select(conditions, values)

In [8]:
conditions = [
    (dataset_association['previous_year_rating'] < 3),
    (dataset_association['previous_year_rating'] == 3),
    (dataset_association['previous_year_rating'] > 3)
]

values = ['below_avg', 'avg', 'above_avg']

dataset_association['previous_year_rating'] = np.select(conditions, values)


In [9]:
conditions = [
    (dataset_association['length_of_service'] <= 5),
    (dataset_association['length_of_service'] > 5) & (dataset_association['length_of_service'] <= 10),
    (dataset_association['length_of_service'] > 10) & (dataset_association['length_of_service'] <= 15),
    (dataset_association['length_of_service'] > 15)
]

values = ['short', 'medium', 'long','very_long']

dataset_association['length_of_service'] = np.select(conditions, values)

In [10]:
conditions = [
    (dataset_association['avg_training_score'] <= 50),
    (dataset_association['avg_training_score'] > 50) & (dataset_association['avg_training_score'] <= 70),
    (dataset_association['avg_training_score'] > 70) & (dataset_association['avg_training_score'] <= 100),
]

values = ['low', 'medium', 'high']

dataset_association['avg_training_score'] = np.select(conditions, values)

In [11]:
dataset_association['KPIs_met >80%'] = dataset_association['KPIs_met >80%'].astype('str')
dataset_association['KPIs_met >80%'] = dataset_association['KPIs_met >80%'].replace(['0', '1'], ['no', 'yes'])

In [12]:
dataset_association['awards_won?'] = dataset_association['awards_won?'].astype('str')
dataset_association['awards_won?'] = dataset_association['awards_won?'].replace(['0', '1'], ['no', 'yes'])

In [13]:
dataset_association['is_promoted'] = dataset_association['is_promoted'].astype('str')
dataset_association['is_promoted'] = dataset_association['is_promoted'].replace(['0', '1'], ['no', 'yes'])

## We need to transform all the columns back using one hot encoding

In [14]:
dataset_association = pd.get_dummies(dataset_association)

## Now we apply the apriori algorithm

In [15]:
#now the dataset is ready to be used for association rules (apriori)
print(dataset_association.shape)
#we have a total of 54808 rows, so we choose a min support of 0.005
#This means that only results that occurred more than (0.005*54808) times in our data will be considered
#Apriori min support

min_support = 0.005
frequent_items = apriori(dataset_association, use_colnames=True, min_support=min_support, max_len=2)
rules = association_rules(frequent_items, metric='lift', min_threshold=1)
rules.head(10).sort_values(by='confidence', ascending=False)

(54808, 74)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
8,(department_Analytics),(no_of_trainings_low),0.09765,0.98783,0.097066,0.994021,1.006267,0.000605,2.035392
5,(department_Analytics),(gender_m),0.09765,0.702379,0.08829,0.904148,1.287265,0.019703,3.105003
2,(department_Analytics),(education_Bachelor's),0.09765,0.669045,0.072581,0.743274,1.110948,0.007248,1.289136
6,(department_Analytics),(recruitment_channel_sourcing),0.09765,0.423661,0.041892,0.428999,1.012599,0.000521,1.009348
1,(department_Analytics),(region_region_22),0.09765,0.117282,0.020325,0.208146,1.77475,0.008873,1.114749
0,(region_region_22),(department_Analytics),0.117282,0.09765,0.020325,0.173304,1.77475,0.008873,1.091514
4,(gender_m),(department_Analytics),0.702379,0.09765,0.08829,0.125701,1.287265,0.019703,1.032084
3,(education_Bachelor's),(department_Analytics),0.669045,0.09765,0.072581,0.108484,1.110948,0.007248,1.012152
7,(recruitment_channel_sourcing),(department_Analytics),0.423661,0.09765,0.041892,0.09888,1.012599,0.000521,1.001365
9,(no_of_trainings_low),(department_Analytics),0.98783,0.09765,0.097066,0.098262,1.006267,0.000605,1.000679


In [16]:
dataset_association['is_promoted_no'].value_counts()
#The data is impalanced, so we need to take this into account

1    50140
0     4668
Name: is_promoted_no, dtype: int64

- There are way more cases of No than Yes so we’ll also need to take that in consideration choosing our threshold since one is more common than the other.
- For No, let’s increase the threshold to 0.1 and filter the consequents column for is_promoted_no:

In [17]:
#Apriori min support
min_support = 0.5
frequent_items_no = apriori(dataset_association, use_colnames=True, min_support=min_support, max_len=2)
rules_no = association_rules(frequent_items_no, metric='lift', min_threshold=1)
target = '{\'is_promoted_no\'}'
results_attrition_no = rules_no[rules_no['consequents'].astype(str).str.contains(target, na=False)].sort_values(by='confidence', ascending=False)
results_attrition_no.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
11,(KPIs_met >80%_no),(is_promoted_no),0.648026,0.91483,0.622373,0.960413,1.049827,0.029539,2.151483
13,(awards_won?_no),(is_promoted_no),0.976828,0.91483,0.901857,0.923251,1.009205,0.008226,1.109719
1,(education_Bachelor's),(is_promoted_no),0.669045,0.91483,0.614162,0.917969,1.003431,0.0021,1.038265
3,(gender_m),(is_promoted_no),0.702379,0.91483,0.643975,0.916849,1.002206,0.001418,1.024276
5,(recruitment_channel_other),(is_promoted_no),0.555503,0.91483,0.508867,0.916048,1.001332,0.000677,1.01451


In [20]:
#Apriori min support
min_support = 0.05
frequent_items_yes = apriori(dataset_association, use_colnames=True, min_support=min_support, max_len=2)
rules_yes = association_rules(frequent_items_yes, metric='lift', min_threshold=1)
target = '{\'is_promoted_yes\'}'
results_attrition_yes = rules_yes[rules_yes['consequents'].astype(str).str.contains(target, na=False)].sort_values(by='confidence', ascending=False)
results_attrition_yes.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
311,(KPIs_met >80%_yes),(is_promoted_yes),0.351974,0.08517,0.059517,0.169094,1.985374,0.029539,1.101003
240,(age_middle_age),(is_promoted_yes),0.617392,0.08517,0.053405,0.0865,1.01562,0.000821,1.001456
225,(no_of_trainings_low),(is_promoted_yes),0.98783,0.08517,0.084604,0.085647,1.005597,0.000471,1.000521
