# Association Rule Learning

### In this notebook, we perform association rule learning to extract statistiscally relevant associations between the professions.

### Import libraries

In [1]:
import os, sys, time
import pandas as pd, numpy as np
import random
import pickle
from natsort import natsorted
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

### Global variables

In [2]:
DATA = os.path.join('..', 'data')
RESULTS = os.path.join('..', 'results')

## Load the data

In [3]:
with open(os.path.join(DATA, 'dataset.pkl'), 'rb') as d:
    pkl_data = pickle.load(d)

In [4]:
type(pkl_data)

pandas.core.frame.DataFrame

In [5]:
print(f'Size of the data: {pkl_data.shape}\n\n')
pkl_data.head(3)

Size of the data: (325989, 6)




Unnamed: 0,worker_id,order_id,list_profession_2,list_profession_3,order_list_profession_3,order_list_profession_2
0,0003e59c-a459-4842-b0ec-67a996c9f2fc,88f6eb57-c59a-4808-a05f-23350cb17a08,[Vente],[Vendeur],"[Préparateur de commandes, Agent de conditionn...","[Magasinage, Magasinage, Manutention]"
1,00afbab3-b57c-4417-9bc1-4ebeb05496d4,88f6eb57-c59a-4808-a05f-23350cb17a08,"[Entretien, Manutention, Service en salle]","[Agent de nettoyage, Manutentionnaire, Serveur...","[Préparateur de commandes, Agent de conditionn...","[Magasinage, Magasinage, Manutention]"
2,06ac473d-3114-4e9a-9e58-8f187e3959d1,88f6eb57-c59a-4808-a05f-23350cb17a08,"[Magasinage, Magasinage, Manutention]","[Agent de conditionnement, Manutentionnaire, P...","[Préparateur de commandes, Agent de conditionn...","[Magasinage, Magasinage, Manutention]"


# Association Learning

We use statistical learning to extract the potential valid associations between the different professions.

# Step 1: Creating a list with the required data

### We will use only the column list_profession_3 that designates the jobs as such

In [6]:
dataset = pkl_data.list_profession_3.tolist() + pkl_data.order_list_profession_3.tolist()
dataset = [natsorted(list(set(el))) for el in dataset]
dataset = [[w for w in el if w and w != 'nan' and len(w.split())>0] for el in dataset]
random.shuffle(dataset)

In [7]:
dataset[:5]

[['Préparateur de commandes', 'Serveur et barman'],
 ['Commis de cuisine', 'Plongeur', 'Préparateur de commandes', 'Vendeur'],
 ['Vendeur'],
 ['Serveur et barman'],
 ['Préparateur de commandes']]

# Step 2: Convert list to dataframe with boolean values

In [8]:
te = TransactionEncoder()
te_array = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_array, columns=te.columns_)

In [9]:
print(f'Data size: {df.shape}\n\n')
df.head()

Data size: (651978, 183)




Unnamed: 0,Accompagnant éducatif et social,Accompagnateur,Acheteur,Administration des ventes,Affrètement,Agent d'entretien,Agent d'exploitation,Agent de conditionnement,Agent de distribution,Agent de nettoyage,...,Tourneur / Fraiseur,Traducteur,Transaction immobilière,Transport de personnes,Trésorerie,Tuyauteur,Téléprospecteur,Veilleur de nuit,Vendeur,Éducateur de jeunes enfants
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Step 3: Find frequently occurring itemsets using F-P Growth

### We can define a threshold for the number of occurrences of the itemset we want to retain.

In [10]:
min_occurences = 10
min_support = round(min_occurences / len(df), 5)
# min_support = 0.0001 * len(df)
min_support

2e-05

In [11]:
%%time
frequent_itemsets_fp = fpgrowth(df, min_support=min_support, use_colnames=True)

CPU times: user 3.21 s, sys: 24 ms, total: 3.23 s
Wall time: 3.23 s


In [12]:
frequent_itemsets_fp

Unnamed: 0,support,itemsets
0,0.375448,(Préparateur de commandes)
1,0.022867,(Serveur et barman)
2,0.166814,(Vendeur)
3,0.013527,(Commis de cuisine)
4,0.005730,(Plongeur)
...,...,...
70597,0.000069,"(Electrotechnicien, Electronicien, Technicien ..."
70598,0.000069,"(Mécanicien, Electromécanicien, Electronicien,..."
70599,0.000023,"(Préparateur de commandes, Electronicien, Manu..."
70600,0.000021,"(Electrotechnicien, Electronicien, Electricien)"


# Step 4: Mine the Association Rules

### In this final step we will perform the association rule mining in Python for the frequent itemsets which we calculated in Step 3.

In [13]:
%%time
rules_fp = association_rules(frequent_itemsets_fp, metric="lift", min_threshold=1) # lift > 1: associated professions are not independant
# we could consider another metric to set the treshold, e.g. the confidence: 
# rules_fp = association_rules(frequent_itemsets_fp, metric="confidence", min_threshold=0.8)

CPU times: user 33.2 s, sys: 1.99 s, total: 35.2 s
Wall time: 35.2 s


In [14]:
rules_fp.sort_values(['confidence', 'lift'], ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
85375,"(Chauffeur VL, Restauration diverse, Commis de...","(Agent de conditionnement, Cariste, Plongeur)",0.000021,0.000021,0.000021,1.000000,46569.857143,2.147266e-05,inf
85394,"(Agent de conditionnement, Cariste, Plongeur)","(Chauffeur VL, Restauration diverse, Commis de...",0.000021,0.000021,0.000021,1.000000,46569.857143,2.147266e-05,inf
85757,"(Chauffeur VL, Manutentionnaire, Commis de cui...","(Agent de conditionnement, Cariste, Plongeur)",0.000021,0.000021,0.000021,1.000000,46569.857143,2.147266e-05,inf
85768,"(Agent de conditionnement, Cariste, Plongeur)","(Chauffeur VL, Manutentionnaire, Commis de cui...",0.000021,0.000021,0.000021,1.000000,46569.857143,2.147266e-05,inf
85883,"(Préparateur de commandes, Chauffeur VL, Commi...","(Agent de conditionnement, Cariste, Plongeur)",0.000021,0.000021,0.000021,1.000000,46569.857143,2.147266e-05,inf
...,...,...,...,...,...,...,...,...,...
3634233,(Préparateur de commandes),"(Accompagnateur, Vendeur, Déménageur)",0.375448,0.000055,0.000021,0.000057,1.035799,7.421461e-07,1.000002
4425609,(Préparateur de commandes),"(Animatrice d'éveil, Agent de service hospital...",0.375448,0.000055,0.000021,0.000057,1.035799,7.421461e-07,1.000002
2810572,(Préparateur de commandes),"(Secrétaire médical / médicale, Hôte d'accueil...",0.375448,0.000057,0.000021,0.000057,1.007804,1.662858e-07,1.000000
2810784,(Préparateur de commandes),"(Hôte d'accueil, Opérateur de saisie, Assistan...",0.375448,0.000057,0.000021,0.000057,1.007804,1.662858e-07,1.000000


In [15]:
rules_fp_reduced = rules_fp.copy()
cols = rules_fp_reduced.columns
rules_fp_reduced['antecedents_length'] = rules_fp_reduced['antecedents'].map(lambda x: len(x))
rules_fp_reduced['consequents_length'] = rules_fp_reduced['consequents'].map(lambda x: len(x))
rules_fp_reduced = rules_fp_reduced[rules_fp_reduced['antecedents_length'] == 1]

for c in ['antecedents', 'consequents']:
    rules_fp_reduced[c] = rules_fp_reduced[c].apply(list)
    
rules_fp_reduced['antecedents'] = rules_fp_reduced['antecedents'].apply(lambda x: x[0])
rules_fp_reduced = rules_fp_reduced[cols]
rules_fp_reduced = rules_fp_reduced.sort_values(['confidence', 'lift'], ascending=False).reset_index(drop=True)

### We can add a confidence threshold to filter the rules we want to retain

In [16]:
min_conf_threshold = 0.5
rules_fp_reduced = rules_fp_reduced[rules_fp_reduced['confidence'] > min_conf_threshold]
rules_fp_reduced = rules_fp_reduced.sort_values(['confidence', 'lift'], ascending=False).reset_index(drop=True)
rules_fp_reduced = rules_fp_reduced.round(decimals={c: 2 for c in list(rules_fp_reduced.columns[2:])})
rules_fp_reduced

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,Infirmier de santé au travail,[Infirmier],0.00,0.00,0.00,0.88,845.99,0.00,8.49
1,Réceptionnaire,[Préparateur de commandes],0.00,0.38,0.00,0.82,2.17,0.00,3.38
2,Déménageur,[Préparateur de commandes],0.01,0.38,0.00,0.79,2.10,0.00,2.93
3,Agent de conditionnement,[Préparateur de commandes],0.11,0.38,0.09,0.78,2.07,0.04,2.81
4,Déménageur,[Manutentionnaire],0.01,0.19,0.00,0.75,3.87,0.00,3.19
...,...,...,...,...,...,...,...,...,...
544,Examinateur,[Assistant polyvalent / Secrétaire],0.00,0.05,0.00,0.51,10.63,0.00,1.94
545,Agent de distribution,[Préparateur de commandes],0.00,0.38,0.00,0.51,1.35,0.00,1.27
546,Réceptionnaire,"[Préparateur de commandes, Manutentionnaire]",0.00,0.14,0.00,0.50,3.69,0.00,1.74
547,Chauffeur SPL messagerie,[Chauffeur SPL national / international],0.00,0.00,0.00,0.50,625.91,0.00,2.01


### We reduce the rules by joining the list of consequents with the same antecedents
### Without taking into account the confidence, lift, leverage and conviction

In [17]:
antecedents = natsorted(list(set(rules_fp_reduced.antecedents)))
clusters = {}

for antecedent in antecedents:
    consequents_list = list(rules_fp_reduced[rules_fp_reduced['antecedents'] == antecedent]['consequents'])
    clusters[antecedent] = natsorted(list(set([consequent for consequents in consequents_list for consequent in consequents])))
    
clusters_df = pd.DataFrame.from_dict(clusters, orient='index')
professions = []

for idx in range(len(clusters_df)):
    professions.append(clusters_df.iloc[idx, :].tolist())
professions = [[prof for prof in profs if prof] for profs in professions]

clusters_df['consequents'] = professions
clusters_df = clusters_df['consequents']
clusters_df = clusters_df.reset_index()
clusters_df.columns = ['antecedents', 'consequents']
clusters_df['size'] = clusters_df['consequents'].apply(lambda x: len(x))
clusters_df = clusters_df.sort_values('size', ascending=False).reset_index(drop=True)
clusters_df = clusters_df.reset_index()
clusters_df.columns.values[0] = 'cluster'
clusters_df

Unnamed: 0,cluster,antecedents,consequents,size
0,0,Déménageur,"[Chauffeur VL, Hotliner, Manoeuvre, Manutentio...",9
1,1,Réceptionnaire,"[Cariste, Magasinier, Manutentionnaire, Prépar...",4
2,2,Electronicien,"[Electrotechnicien, Technicien de maintenance]",2
3,3,Manoeuvre,"[Manutentionnaire, Préparateur de commandes]",2
4,4,Chargé d'affaires,"[Chargé de clientèle, Commercial]",2
5,5,Agent de conditionnement,"[Manutentionnaire, Préparateur de commandes]",2
6,6,Examinateur,"[Assistant polyvalent / Secrétaire, Vendeur]",2
7,7,Agent de distribution,[Préparateur de commandes],1
8,8,Plaquiste,[Préparateur de commandes],1
9,9,Peintre industriel,[Préparateur de commandes],1


In [18]:
clusters_df['size'].value_counts()

1    19
2     5
9     1
4     1
Name: size, dtype: int64

# Conclusion

We can retain the rules whose rules have lift > 1 and  sufficiently "high" confidence at the same time as valid associations of professions.

## Save the result

In [19]:
clusters_df.to_csv(os.path.join(RESULTS, 'association-rules-professions-per-cluster.csv'), index=False)