# Jobs Referential

### In this notebook, we create the jobs refential from the differents clusters of the professions derived from the previous analyses.

### Import libraries

In [1]:
import os, sys, time
import pandas as pd, numpy as np
import random
from natsort import natsorted
import ast
from collections import Counter

### Global variables

In [2]:
RESULTS = os.path.join('..', 'results')

## STEP 0: Load the clusters

#### We load all the clusters obtained so far:

#### HDBSCAN clustering

In [3]:
hdbscan_clusters = pd.read_csv(os.path.join(RESULTS, 'hdbscan-professions-per-cluster.csv'))

print(f'Number of clusters: {len(hdbscan_clusters)}\n\n')
hdbscan_clusters.head()

Number of clusters: 27




Unnamed: 0,cluster,professions,size
0,25,"['Chauffeur VL', 'Déménageur', 'Hotliner', 'Ma...",7
1,12,"[""Chef d'atelier"", 'Electromécanicien', 'Elect...",7
2,14,"['Auditeur comptable', 'Comptable clients', 'C...",6
3,6,"['Infirmier', 'Infirmier anesthésiste', 'Infir...",5
4,23,"['Jardinier', 'Maçon', 'Nacelliste', 'Paysagis...",4


#### Graph clustering - Louvain algorithm (NetworkX)

In [4]:
louvain_clusters = pd.read_csv(os.path.join(RESULTS, 'graph-louvain-professions-per-cluster.csv'))

print(f'Number of clusters: {len(louvain_clusters)}\n\n')
louvain_clusters.head()

Number of clusters: 26




Unnamed: 0,cluster,professions,size
0,2,"['Acheteur', 'Administration des ventes', 'Age...",40
1,19,"['Carreleur', 'Charpentier', 'Coffreur', ""Cond...",14
2,23,"['Câbleur', ""Chef d'atelier"", ""Chef d'équipe t...",14
3,3,"['Affrètement', ""Agent d'exploitation"", 'Agent...",13
4,34,"['Audit', 'Chercheur', 'Ingénieur', 'Manipulat...",9


#### Association Rules

In [5]:
association_rules = pd.read_csv(os.path.join(RESULTS, 'association-rules-professions-per-cluster.csv'))

print(f'Number of association rules: {len(association_rules)}\n\n')
association_rules.head()

Number of association rules: 26




Unnamed: 0,cluster,antecedents,consequents,size
0,0,Déménageur,"['Chauffeur VL', 'Hotliner', 'Manoeuvre', 'Man...",9
1,1,Réceptionnaire,"['Cariste', 'Magasinier', 'Manutentionnaire', ...",4
2,2,Electronicien,"['Electrotechnicien', 'Technicien de maintenan...",2
3,3,Manoeuvre,"['Manutentionnaire', 'Préparateur de commandes']",2
4,4,Chargé d'affaires,"['Chargé de clientèle', 'Commercial']",2


# Step 1: Synthesize the clusters

### We perform a synthesis of the clusters to obtained final clusters correpondind to associations of the professions.
### For each profession, we determine all the related professions.

### Hypotheses:

##### To determine the final associations between the profession, we consider the 3 following strategies, from the less conservative to the most conservative:

##### 1. We retain the union of the clusters
##### 2. We proceed to a majority vote based on the three models: for each profession, we retain the related professions associated to it at least twice (over the three models)
##### 3. We retain only the intersection of the clusters

##### We distinguish the particular case of the associations rules, as they correspond to implications:

##### profesion A --> [profession B, profession C] means: when we see profession A, w can seuggest professions B and C, but there is no reciprocity in this relation.

##### So, for this particular case we do not consider the professions B and C to be related, at least directly.

## Step 1.0: We get all the professions and the associations

In [6]:
professions = hdbscan_clusters.professions.tolist() + louvain_clusters.professions.tolist() + association_rules.consequents.tolist()
professions = [ast.literal_eval(x) for x in professions]
professions = [prof for professions_list in professions for prof in professions_list]
# professions += association_rules.antecedents.tolist()
professions = natsorted(list(set(professions)))

print(f'Number of professions: {len(professions)}\n\n')
print('Sample of profesions:\n')
professions[:5]

Number of professions: 173


Sample of profesions:



['Accompagnant éducatif et social',
 'Accompagnateur',
 'Acheteur',
 'Administration des ventes',
 'Affrètement']

In [7]:
# we consider the clusters otained using HDBSCAN and Louvain (graph) clustering algorithms
clusters_hdbscan = hdbscan_clusters.professions.tolist()
clusters_hdbscan = [ast.literal_eval(x) for x in clusters_hdbscan]
clusters_louvain = louvain_clusters.professions.tolist()
clusters_louvain = [ast.literal_eval(x) for x in clusters_louvain]
clusters = clusters_hdbscan + clusters_louvain

# add the association rules
association_rules_dic = {association_rules.iloc[idx, 1]: association_rules.iloc[idx, 2] for idx in range(len(association_rules))}
association_rules_dic = {k: ast.literal_eval(v) for k, v in association_rules_dic.items()}

## Step 1.1: We consider the union of the clusters to group them

In [8]:
associations_union_dic = {}

# loop over the professions
for profession in professions:
    related_profs = []
    # get the related professions in the HDBSCAN and Louvain clusters
    for cluster in clusters:
        if profession in cluster:
            related_profs.append([x for x in cluster if x != profession])
    # get the related professions in association rules
    if profession in list(association_rules_dic.keys()):
        related_profs.append(association_rules_dic[profession])
        
    # flatten the list of related profesions
    related_profs = list(set([p for el in related_profs for p in el]))
    
    # append the related professions to the dictionary
    associations_union_dic[profession] = related_profs

# reduce the lists of related professions
associations_union_dic = {k: list(set(v)) for k, v in associations_union_dic.items() if list(set(v))}
    
# create the final dataframe
associations_union_df = pd.DataFrame.from_dict(associations_union_dic, orient='index')
profs = []

for idx in range(len(associations_union_df)):
    profs.append(associations_union_df.iloc[idx, :].tolist())
profs = [[p for p in el if p] for el in profs]

associations_union_df['related_professions'] = profs
associations_union_df = associations_union_df['related_professions']
associations_union_df = associations_union_df.reset_index()
associations_union_df.columns = ['profession', 'related_professions']
associations_union_df['size'] = associations_union_df['related_professions'].apply(lambda x: len(x))

print(f'Number of associations: {len(associations_union_df)}\n\n')
associations_union_df.head()

Number of associations: 173




Unnamed: 0,profession,related_professions,size
0,Accompagnant éducatif et social,"[Aide soignant, Moniteur éducateur, Educateur ...",3
1,Accompagnateur,[Kinesithérapeute],1
2,Acheteur,"[Assistant de direction, Transaction immobiliè...",39
3,Administration des ventes,"[Assistant de direction, Transaction immobiliè...",39
4,Affrètement,"[Educateur technique spécialisé, Chef d'équipe...",12


## Step 1.2: We consider the majority vote

In [9]:
associations_dic = {}

# loop over the professions
for profession in professions:
    related_profs = []
    # get the related professions in the HDBSCAN and Louvain clusters
    for cluster in clusters:
        if profession in cluster:
            related_profs.append([x for x in cluster if x != profession])
    # get the related professions in association rules
    if profession in list(association_rules_dic.keys()):
        related_profs.append(association_rules_dic[profession])
        
    # flatten the list of related profesions
    related_profs = [p for el in related_profs for p in el]
    
    # append the related professions to the dictionary
    associations_dic[profession] = related_profs

# reduce the lists of related professions
associations_dic = {k: v for k, v in associations_dic.items() if v}

# for each profession, we only retain the professions that appears at least twice in each list of its related professions:
associations_majority_dic = {prof: [p for p in associations_dic[prof] if Counter(associations_dic[prof])[p] > 1] for prof in list(associations_dic.keys())}
associations_majority_dic = {k: list(set(v)) for k, v in associations_majority_dic.items() if list(set(v))}

# create the final dataframe
associations_majority_df = pd.DataFrame.from_dict(associations_majority_dic, orient='index')
profs = []

for idx in range(len(associations_majority_df)):
    profs.append(associations_majority_df.iloc[idx, :].tolist())
profs = [[p for p in el if p] for el in profs]

associations_majority_df['related_professions'] = profs
associations_majority_df = associations_majority_df['related_professions']
associations_majority_df = associations_majority_df.reset_index()
associations_majority_df.columns = ['profession', 'related_professions']
associations_majority_df['size'] = associations_majority_df['related_professions'].apply(lambda x: len(x))

print(f'Number of associations: {len(associations_majority_df)}\n\n')
associations_majority_df.head()

Number of associations: 82




Unnamed: 0,profession,related_professions,size
0,Accompagnant éducatif et social,[Aide soignant],1
1,Affrètement,"[Agent de transit, Agent d'exploitation]",2
2,Agent d'entretien,[Agent de nettoyage],1
3,Agent d'exploitation,"[Agent de transit, Affrètement]",2
4,Agent de conditionnement,"[Préparateur de commandes, Manutentionnaire]",2


## Step 1.3: We consider the intersection of the clusters to group them

In [10]:
associations_intersect_dic = {}

# loop over the professions
for profession in professions:
    cluster_hdbscan = None
    cluster_louvain = None
    association_rule = None
    related_profs = []
    # get the related professions in the HDBSCAN clusters
    for cluster in clusters_hdbscan:
        if profession in cluster:
            cluster_hdbscan = [x for x in cluster if x != profession]
            break
    # get the related professions in the Louvain clusters
    for cluster in clusters_louvain:
        if profession in cluster:
            cluster_louvain = [x for x in cluster if x != profession]
            break
    # get the related professions in association rules
    if profession in list(association_rules_dic.keys()):
        association_rule = association_rules_dic[profession]
    
    if cluster_hdbscan and cluster_louvain and association_rule:
        related_profs.append(cluster_hdbscan)
        related_profs.append(cluster_louvain)
        related_profs.append(association_rule)
        # keep only the intersection
        related_profs = list(set.intersection(*map(set, related_profs)))
        # append the related professions to the dictionary
        associations_intersect_dic[profession] = related_profs

# reduce the lists of related professions
associations_intersect_dic = {k: list(set(v)) for k, v in associations_intersect_dic.items() if list(set(v))}

# create the final dataframe
associations_intersect_df = pd.DataFrame.from_dict(associations_intersect_dic, orient='index')
profs = []

for idx in range(len(associations_intersect_df)):
    profs.append(associations_intersect_df.iloc[idx, :].tolist())
profs = [[p for p in el if p] for el in profs]

associations_intersect_df['related_professions'] = profs
associations_intersect_df = associations_intersect_df['related_professions']
associations_intersect_df = associations_intersect_df.reset_index()
associations_intersect_df.columns = ['profession', 'related_professions']
associations_intersect_df['size'] = associations_intersect_df['related_professions'].apply(lambda x: len(x))

print(f'Number of associations: {len(associations_intersect_df)}\n\n')
associations_intersect_df.head()

Number of associations: 17




Unnamed: 0,profession,related_professions,size
0,Accompagnant éducatif et social,[Aide soignant],1
1,Agent de conditionnement,"[Préparateur de commandes, Manutentionnaire]",2
2,Animateur de ventes / Promotion,[Vendeur],1
3,Artistique,[Vendeur],1
4,Assistant de direction,[Assistant polyvalent / Secrétaire],1


## Step 1.4: We compare the referentials

##### We can interpret the differences between the refrentials as the associations of professions that are less strong.
##### Indeed, these associations are less robust as the professions involved appear in the same clusters or do not depending on the clustering methods used.
##### Here, we define as non-robust associations those that are present in table resulting from the union of the clusters but not in one resulting from the majority-vote.

In [11]:
# get the flatten list of job associations for the union
associations_union_flatten = [[profession, similar] for profession in list(associations_union_dic.keys()) for similar in associations_union_dic[profession]]
associations_union_flatten = [natsorted(el) for el in associations_union_flatten]
associations_union_flatten = [list(x) for x in set(tuple(x) for x in associations_union_flatten)]
associations_union_flatten = sorted(associations_union_flatten, key=lambda x: (x[0], x[1]))

union_flatten_df = pd.DataFrame(associations_union_flatten, columns=['profession', 'related'])

print(f'Number of flatten associations in the union: {len(union_flatten_df)}\n\n')
union_flatten_df.head()

Number of flatten associations in the union: 1262




Unnamed: 0,profession,related
0,Accompagnant éducatif et social,Aide soignant
1,Accompagnant éducatif et social,Educateur spécialisé
2,Accompagnant éducatif et social,Moniteur éducateur
3,Accompagnateur,Kinesithérapeute
4,Acheteur,Administration des ventes


In [12]:
# get the flatten list of job associations for the majority
associations_majority_flatten = [[profession, similar] for profession in list(associations_majority_dic.keys()) for similar in associations_majority_dic[profession]]
associations_majority_flatten = [natsorted(el) for el in associations_majority_flatten]
associations_majority_flatten = [list(x) for x in set(tuple(x) for x in associations_majority_flatten)]
associations_majority_flatten = sorted(associations_majority_flatten, key=lambda x: (x[0], x[1]))

majority_flatten_df = pd.DataFrame(associations_majority_flatten, columns=['profession', 'related'])

print(f'Number of flatten associations in the majority vote clustering: {len(majority_flatten_df)}\n\n')
majority_flatten_df.head()

Number of flatten associations in the majority vote clustering: 107




Unnamed: 0,profession,related
0,Accompagnant éducatif et social,Aide soignant
1,Affrètement,Agent d'exploitation
2,Affrètement,Agent de transit
3,Agent d'entretien,Agent de nettoyage
4,Agent d'exploitation,Agent de transit


### "Weak" associations
Tha associations that are in the majority vote clustering could be considered as strong.
Now, we define as weak associations those that are not in the majority vote clustering

In [13]:
# association that are in the union but not in the majority
weak = []
for el in associations_union_flatten:
    if el not in associations_majority_flatten:
        weak.append(el)
        
# create the final dataframe
weak_df = pd.DataFrame(weak, columns=['profession', 'related'])

print(f'Number of weak associations: {len(weak_df)}\n\n')
weak_df.head()

Number of weak associations: 1155




Unnamed: 0,profession,related
0,Accompagnant éducatif et social,Educateur spécialisé
1,Accompagnant éducatif et social,Moniteur éducateur
2,Accompagnateur,Kinesithérapeute
3,Acheteur,Administration des ventes
4,Acheteur,Agent de recouvrement


## Export the referentials

In [14]:
associations_intersect_df.to_csv(os.path.join(RESULTS, 'jobs-referential-intersect.csv'), index=False)
associations_union_df.to_csv(os.path.join(RESULTS, 'jobs-referential-union.csv'), index=False)
associations_majority_df.to_csv(os.path.join(RESULTS, 'jobs-referential-majority.csv'), index=False)
weak_df.to_csv(os.path.join(RESULTS, 'jobs-weak-or-less-probable-associations.csv'), index=False)