# Similarity / Dissimilarity

### In this notebook, we:

### - Calculate the similarity / dissimilarity matrices

### - Create the adjacency matrix (graph table)

### That we will be using later to compare and group the professions.

### Import libraries

In [1]:
import os, sys, time
import pandas as pd, numpy as np
import random
import pickle
from natsort import natsorted
from mlxtend.preprocessing import TransactionEncoder
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import itertools

### Global variables

In [2]:
# Input and output directories, both located one level above the notebook's
# working directory.
DATA, RESULTS = (os.path.join('..', folder) for folder in ('data', 'results'))

## Load the data

In [3]:
# Restore the pickled dataset stored in the data directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this on a file from a trusted source.
dataset_path = os.path.join(DATA, 'dataset.pkl')
with open(dataset_path, 'rb') as pkl_file:
    pkl_data = pickle.load(pkl_file)

In [4]:
# Show the type of the object restored from the pickle.
type(pkl_data)

pandas.core.frame.DataFrame

In [5]:
# Report the (rows, columns) shape of the loaded data, then preview its
# first three rows.
print(f'Size of the data: {pkl_data.shape}\n\n')
pkl_data.head(3)

Size of the data: (325989, 6)




Unnamed: 0,worker_id,order_id,list_profession_2,list_profession_3,order_list_profession_3,order_list_profession_2
0,0003e59c-a459-4842-b0ec-67a996c9f2fc,88f6eb57-c59a-4808-a05f-23350cb17a08,[Vente],[Vendeur],"[Préparateur de commandes, Agent de conditionn...","[Magasinage, Magasinage, Manutention]"
1,00afbab3-b57c-4417-9bc1-4ebeb05496d4,88f6eb57-c59a-4808-a05f-23350cb17a08,"[Entretien, Manutention, Service en salle]","[Agent de nettoyage, Manutentionnaire, Serveur...","[Préparateur de commandes, Agent de conditionn...","[Magasinage, Magasinage, Manutention]"
2,06ac473d-3114-4e9a-9e58-8f187e3959d1,88f6eb57-c59a-4808-a05f-23350cb17a08,"[Magasinage, Magasinage, Manutention]","[Agent de conditionnement, Manutentionnaire, P...","[Préparateur de commandes, Agent de conditionn...","[Magasinage, Magasinage, Manutention]"


# Step 1: Quick exploration of the data

### Process the data
### We will use only the columns list_profession_3 and order_list_profession_3, which designate the jobs as such (worker side and order side respectively)

In [6]:
# Seed for the row shuffle below, so that a Restart & Run All always yields
# the same row order (and therefore the same previews of the encoded frame).
SHUFFLE_SEED = 42

# Pool the two profession-list columns into one list of "transactions":
# first every entry of list_profession_3, then every entry of
# order_list_profession_3.
dataset = pkl_data.list_profession_3.tolist() + pkl_data.order_list_profession_3.tolist()
# Remove duplicate professions inside each transaction and put the remaining
# ones in natural sort order.
dataset = [natsorted(list(set(el))) for el in dataset]
# Drop empty strings, the literal string 'nan' and whitespace-only strings.
dataset = [[w for w in el if w and w != 'nan' and len(w.split())>0] for el in dataset]
# Shuffle the transactions in place with a dedicated, seeded generator: the
# order is reproducible and the global `random` state is left untouched.
random.Random(SHUFFLE_SEED).shuffle(dataset)

### Encode the columns
### For each row (transaction), we consider the presence or absence of the different jobs

In [7]:
# One-hot encode the transactions: one row per transaction, one column per
# profession seen during fitting, holding a presence/absence flag.
te = TransactionEncoder()
te.fit(dataset)
te_array = te.transform(dataset)
df = pd.DataFrame(data=te_array, columns=te.columns_)

In [8]:
# Report the shape of the encoded frame and preview its first rows.
print(f'Data size: {df.shape}\n\n')
df.head()

Data size: (651978, 183)




Unnamed: 0,Accompagnant éducatif et social,Accompagnateur,Acheteur,Administration des ventes,Affrètement,Agent d'entretien,Agent d'exploitation,Agent de conditionnement,Agent de distribution,Agent de nettoyage,...,Tourneur / Fraiseur,Traducteur,Transaction immobilière,Transport de personnes,Trésorerie,Tuyauteur,Téléprospecteur,Veilleur de nuit,Vendeur,Éducateur de jeunes enfants
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# Cast the presence/absence flags to integers (shown as 0/1 in the preview
# below) so the frame can be used in the matrix product that follows.
df = df.astype('int')

# The shape is unchanged by the cast; only the dtype differs.
print(f'Data size: {df.shape}\n\n')
df.head()

Data size: (651978, 183)




Unnamed: 0,Accompagnant éducatif et social,Accompagnateur,Acheteur,Administration des ventes,Affrètement,Agent d'entretien,Agent d'exploitation,Agent de conditionnement,Agent de distribution,Agent de nettoyage,...,Tourneur / Fraiseur,Traducteur,Transaction immobilière,Transport de personnes,Trésorerie,Tuyauteur,Téléprospecteur,Veilleur de nuit,Vendeur,Éducateur de jeunes enfants
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### We get the co-occurrences

In [10]:
%%time
cooccurrences = df.T.dot(df)
cooccurrences

CPU times: user 17.6 s, sys: 0 ns, total: 17.6 s
Wall time: 17.6 s


Unnamed: 0,Accompagnant éducatif et social,Accompagnateur,Acheteur,Administration des ventes,Affrètement,Agent d'entretien,Agent d'exploitation,Agent de conditionnement,Agent de distribution,Agent de nettoyage,...,Tourneur / Fraiseur,Traducteur,Transaction immobilière,Transport de personnes,Trésorerie,Tuyauteur,Téléprospecteur,Veilleur de nuit,Vendeur,Éducateur de jeunes enfants
Accompagnant éducatif et social,421,15,0,0,0,9,1,8,0,5,...,0,0,0,0,0,0,2,5,45,4
Accompagnateur,15,5985,6,6,12,214,80,329,21,212,...,3,7,0,53,7,0,112,37,2218,26
Acheteur,0,6,1307,51,8,6,91,23,0,6,...,0,0,0,1,6,0,34,0,158,0
Administration des ventes,0,6,51,3830,16,3,82,29,8,1,...,0,12,0,7,0,0,31,0,173,0
Affrètement,0,12,8,16,1352,5,351,66,1,18,...,0,0,0,22,0,0,4,0,89,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Tuyauteur,0,0,0,0,0,0,0,5,0,0,...,1,0,0,0,0,182,0,0,0,0
Téléprospecteur,2,112,34,31,4,57,45,112,12,46,...,0,5,0,2,3,0,3905,19,1077,9
Veilleur de nuit,5,37,0,0,0,69,8,37,12,68,...,0,10,0,18,0,0,19,1327,339,2
Vendeur,45,2218,158,173,89,1277,717,3292,267,1829,...,18,65,5,364,27,0,1077,339,108759,145


### We get the most frequent professions by sorting the dataframe by descending frequency, based on the diagonal terms of the co-occurrence matrix.

In [11]:
# Diagonal of the co-occurrence matrix, one value per profession.
diag = np.diag(cooccurrences)
# Positions of the professions ordered from the largest diagonal value to
# the smallest.
idx = np.argsort(-diag)
# Apply the same permutation to the rows and to the columns; row and column
# labels follow the permutation.
sorted_cooccurrences = cooccurrences.iloc[idx, idx]
sorted_cooccurrences

Unnamed: 0,Préparateur de commandes,Manutentionnaire,Vendeur,Employé de caisse,Agent de conditionnement,Cariste,Magasinier,Assistant polyvalent / Secrétaire,Commercial,Opérateur de fabrication,...,Infirmier anesthésiste,Technicien d'analyse biomédicale,Infirmier bloc opératoire,Infirmier préleveur,Technicien de l'intervention sociale et familiale,Orthoptiste,Psychomotricien,Médecin coordinateur,Pédiatre,Radiologue
Préparateur de commandes,244784,89142,24282,13256,56188,30040,21065,2884,4160,13361,...,2,0,0,0,0,0,0,0,0,0
Manutentionnaire,89142,125960,13311,9231,48385,9507,8946,1145,1484,10599,...,0,0,0,0,0,2,0,0,0,1
Vendeur,24282,13311,108759,16086,3292,1400,4068,3709,9517,6063,...,0,0,1,0,0,0,2,0,0,0
Employé de caisse,13256,9231,16086,74990,6913,1030,968,2410,2958,1686,...,1,0,1,0,0,0,0,0,0,0
Agent de conditionnement,56188,48385,3292,6913,72253,6649,7972,652,684,5096,...,0,0,0,0,3,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Orthoptiste,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,0,0,0,0
Psychomotricien,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4,0,0,0
Médecin coordinateur,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
Pédiatre,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Most frequent jobs in the offers

In [12]:
# Number of professions to list.
topK = 20
print(f'The top-{topK} most frequent professions and their associated occurrences are:\n')

# Names come from the frequency-sorted matrix; counts are the diagonal values
# sorted in descending order, so both sequences are in the same order.
top_names = list(sorted_cooccurrences.columns[:topK])
top_counts = sorted(diag, reverse=True)[:topK]
for prof, occ in zip(top_names, top_counts):
    print(f'{prof} ({occ})')

The top-20 most frequent professions and their associated occurrences are:

Préparateur de commandes (244784)
Manutentionnaire (125960)
Vendeur (108759)
Employé de caisse (74990)
Agent de conditionnement (72253)
Cariste (59051)
Magasinier (38323)
Assistant polyvalent / Secrétaire (31199)
Commercial (29141)
Opérateur de fabrication (29007)
Merchandiser (26535)
Gestionnaire de stock (25398)
Chauffeur VL (23104)
Agent de nettoyage (19938)
Hôte d'accueil (17727)
Manoeuvre (17706)
Agent d'exploitation (16696)
Chargé de clientèle (15808)
Serveur et barman (14909)
Hotliner (14847)


# Step 2: Similarity matrix

We calculate the similarities between the professions, based on their co-occurrences with one another (their context). Higher values correspond to closer or more related professions.

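Cosine similarity ranges over [-1, 1] in general, but because co-occurrence counts are never negative, the values here fall in the range [0, 1]:

0 corresponds to completely different professions (no shared context), and

1 corresponds to absolutely similar ones.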

In [13]:
%%time
cosine_similarities = cosine_similarity(cooccurrences)
cosine_similarities = pd.DataFrame(cosine_similarities, index=cooccurrences.columns, columns=cooccurrences.columns)
cosine_similarities

CPU times: user 4.45 ms, sys: 3.95 ms, total: 8.4 ms
Wall time: 24.1 ms


Unnamed: 0,Accompagnant éducatif et social,Accompagnateur,Acheteur,Administration des ventes,Affrètement,Agent d'entretien,Agent d'exploitation,Agent de conditionnement,Agent de distribution,Agent de nettoyage,...,Tourneur / Fraiseur,Traducteur,Transaction immobilière,Transport de personnes,Trésorerie,Tuyauteur,Téléprospecteur,Veilleur de nuit,Vendeur,Éducateur de jeunes enfants
Accompagnant éducatif et social,1.000000,0.113988,0.047666,0.026216,0.051995,0.071482,0.045654,0.088596,0.094443,0.048557,...,0.042389,0.065248,0.029391,0.063689,0.027064,0.018195,0.073016,0.086047,0.133977,0.088357
Accompagnateur,0.113988,1.000000,0.131318,0.076426,0.153120,0.196177,0.131938,0.252800,0.282939,0.143085,...,0.109789,0.201281,0.086069,0.205128,0.100207,0.049598,0.228897,0.240651,0.440546,0.263306
Acheteur,0.047666,0.131318,1.000000,0.154747,0.116853,0.078812,0.148713,0.122693,0.144565,0.053820,...,0.051763,0.141321,0.068646,0.095682,0.098810,0.026625,0.164355,0.105310,0.186210,0.108790
Administration des ventes,0.026216,0.076426,0.154747,1.000000,0.066737,0.034231,0.083556,0.033731,0.092458,0.019106,...,0.013550,0.190904,0.055573,0.048457,0.101500,0.005800,0.162968,0.050551,0.090522,0.055116
Affrètement,0.051995,0.153120,0.116853,0.066737,1.000000,0.110814,0.345675,0.223866,0.191568,0.084652,...,0.092913,0.096662,0.034137,0.159120,0.047007,0.048701,0.114463,0.137159,0.163120,0.136095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Tuyauteur,0.018195,0.049598,0.026625,0.005800,0.048701,0.048120,0.045351,0.117418,0.078773,0.032411,...,0.064152,0.019285,0.009315,0.055604,0.006126,1.000000,0.026472,0.051614,0.042650,0.048534
Téléprospecteur,0.073016,0.228897,0.164355,0.162968,0.114463,0.119118,0.111469,0.146229,0.195683,0.081362,...,0.059517,0.220612,0.081937,0.134258,0.092631,0.026472,1.000000,0.172168,0.339708,0.174432
Veilleur de nuit,0.086047,0.240651,0.105310,0.050551,0.137159,0.197729,0.122138,0.259065,0.266336,0.149214,...,0.111951,0.185458,0.068092,0.205136,0.060071,0.051614,0.172168,1.000000,0.339317,0.197355
Vendeur,0.133977,0.440546,0.186210,0.090522,0.163120,0.218370,0.134920,0.236002,0.392660,0.164103,...,0.119633,0.279819,0.190765,0.278188,0.126472,0.042650,0.339708,0.339317,1.000000,0.353633


# Step 3: Distance matrix

We calculate the distance matrix. Lower values correspond to closer or more related professions.

The values are comprised in the range [0, 1]:

0 corresponds to absolutely similar professions, and

1 to completely different ones.

In [14]:
%%time
cosine_distances = pairwise_distances(cooccurrences, metric='cosine')
cosine_distances = pd.DataFrame(cosine_distances, index=cooccurrences.columns, columns=cooccurrences.columns)
cosine_distances

CPU times: user 12.9 ms, sys: 89 µs, total: 13 ms
Wall time: 1.75 ms


Unnamed: 0,Accompagnant éducatif et social,Accompagnateur,Acheteur,Administration des ventes,Affrètement,Agent d'entretien,Agent d'exploitation,Agent de conditionnement,Agent de distribution,Agent de nettoyage,...,Tourneur / Fraiseur,Traducteur,Transaction immobilière,Transport de personnes,Trésorerie,Tuyauteur,Téléprospecteur,Veilleur de nuit,Vendeur,Éducateur de jeunes enfants
Accompagnant éducatif et social,0.000000,8.860117e-01,9.523340e-01,0.973784,0.948005,0.928518,0.954346,0.911404,0.905557,0.951443,...,0.957611,0.934752,0.970609,0.936311,0.972936,0.981805,0.926984,0.913953,0.866023,0.911643
Accompagnateur,0.886012,6.661338e-16,8.686821e-01,0.923574,0.846880,0.803823,0.868062,0.747200,0.717061,0.856915,...,0.890211,0.798719,0.913931,0.794872,0.899793,0.950402,0.771103,0.759349,0.559454,0.736694
Acheteur,0.952334,8.686821e-01,3.330669e-16,0.845253,0.883147,0.921188,0.851287,0.877307,0.855435,0.946180,...,0.948237,0.858679,0.931354,0.904318,0.901190,0.973375,0.835645,0.894690,0.813790,0.891210
Administration des ventes,0.973784,9.235735e-01,8.452527e-01,0.000000,0.933263,0.965769,0.916444,0.966269,0.907542,0.980894,...,0.986450,0.809096,0.944427,0.951543,0.898500,0.994200,0.837032,0.949449,0.909478,0.944884
Affrètement,0.948005,8.468802e-01,8.831468e-01,0.933263,0.000000,0.889186,0.654325,0.776134,0.808432,0.915348,...,0.907087,0.903338,0.965863,0.840880,0.952993,0.951299,0.885537,0.862841,0.836880,0.863905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Tuyauteur,0.981805,9.504025e-01,9.733755e-01,0.994200,0.951299,0.951880,0.954649,0.882582,0.921227,0.967589,...,0.935848,0.980715,0.990685,0.944396,0.993874,0.000000,0.973528,0.948386,0.957350,0.951466
Téléprospecteur,0.926984,7.711035e-01,8.356446e-01,0.837032,0.885537,0.880882,0.888531,0.853771,0.804317,0.918638,...,0.940483,0.779388,0.918063,0.865742,0.907369,0.973528,0.000000,0.827832,0.660292,0.825568
Veilleur de nuit,0.913953,7.593489e-01,8.946903e-01,0.949449,0.862841,0.802271,0.877862,0.740935,0.733664,0.850786,...,0.888049,0.814542,0.931908,0.794864,0.939929,0.948386,0.827832,0.000000,0.660683,0.802645
Vendeur,0.866023,5.594543e-01,8.137904e-01,0.909478,0.836880,0.781630,0.865080,0.763998,0.607340,0.835897,...,0.880367,0.720181,0.809235,0.721812,0.873528,0.957350,0.660292,0.660683,0.000000,0.646367


# Step 4: Graph Database

We create a table containing:

- The professions, as the nodes

- The similarity values, as the weights

In [15]:
# Every ordered (first, second) pair of professions, enumerated row by row so
# that the flat list lines up with the similarity matrix read in row-major
# order.
profession_names = list(cosine_similarities.columns)
edges = [(first, second) for first in profession_names for second in profession_names]
# Positions strictly above the main diagonal: each unordered pair is kept
# once and self-pairs are excluded.
indices = np.triu_indices(len(cosine_similarities), k=1)

In [16]:
# Similarity values located at the selected upper-triangle positions.
weights = cosine_similarities.to_numpy()[indices]
# Encode each pair as a single string so the pairs can be laid out as a
# square array and indexed with the same upper-triangle positions.
joined_pairs = ['-separator-'.join(el) for el in edges]
edges = np.array(joined_pairs).reshape(cosine_similarities.shape)[indices]

In [17]:
# Build one [first, second, weight] row per selected pair: split the encoded
# pair back into its two profession names and append the similarity rounded
# to two decimals.
# A comprehension over zip() is used instead of an index loop so that no loop
# variable leaks into the notebook namespace; the previous loop reused the
# name `idx`, overwriting the sort permutation defined in an earlier cell.
graph_table = [
    [*edge.split('-separator-'), round(weight, 2)]
    for edge, weight in zip(edges, weights)
]

In [18]:
# Edge list as a frame: two profession columns plus their similarity weight.
graph_columns = ['parent', 'child', 'weight']
graph_df = pd.DataFrame(data=graph_table, columns=graph_columns)
graph_df

Unnamed: 0,parent,child,weight
0,Accompagnant éducatif et social,Accompagnateur,0.11
1,Accompagnant éducatif et social,Acheteur,0.05
2,Accompagnant éducatif et social,Administration des ventes,0.03
3,Accompagnant éducatif et social,Affrètement,0.05
4,Accompagnant éducatif et social,Agent d'entretien,0.07
...,...,...,...
16648,Téléprospecteur,Vendeur,0.34
16649,Téléprospecteur,Éducateur de jeunes enfants,0.17
16650,Veilleur de nuit,Vendeur,0.34
16651,Veilleur de nuit,Éducateur de jeunes enfants,0.20


Display some stats

In [19]:
# Summary statistics of the edge weights.
graph_df['weight'].describe()

count    16653.000000
mean         0.117794
std          0.100079
min          0.000000
25%          0.040000
50%          0.090000
75%          0.170000
max          0.980000
Name: weight, dtype: float64

# Save the results

In [20]:
# Create the output directory if it is missing; to_csv does not create parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs(RESULTS, exist_ok=True)

# The two square matrices keep their index (the profession labels); the edge
# list is written without its positional index.
cosine_similarities.to_csv(os.path.join(RESULTS, 'cosine-similarities.csv'))
cosine_distances.to_csv(os.path.join(RESULTS, 'cosine-distances.csv'))
graph_df.to_csv(os.path.join(RESULTS, 'graph-table.csv'), index=False)