In [1]:
import sqlite3
import random
from string import punctuation

import pandas as pd
import numpy as np
from pandas.core.series import Series
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
offers_connection = sqlite3.connect('../../datasets/offers_dataset.db')
offers_frame = pd.read_sql_query('''SELECT * FROM offers''', offers_connection)
offers_connection.close()

with sqlite3.connect('../../datasets/skills_dataset.db') as skills_connection:
    skills_frame = pd.read_sql_query('SELECT * FROM skills', skills_connection)
    skills_frame.set_index('ID', inplace=True)

In [3]:
offers_frame.dropna(subset='Name',inplace=True)
offers_frame.drop_duplicates(inplace=True)

In [4]:
def remove_symbols(description: str, remove_map: dict) -> str:
    """
    """
    for old, new in remove_map.items():
        description = description.replace(old, new)
    return description.lower()


def extract_symbols(description: str, available_symbols: list) -> set:
    s = set()
    prev = ''
    for word in description.split():
        if word in available_symbols:
            s.add(word)
        elif f'{prev} {word}' in available_symbols:
            s.add(f'{prev} {word}')  
        prev = word
    return s 

def translate_skills(skills: set, toId = False) -> set:
    if toId:
        return skill_to_id(skills)
    else:
        return id_to_skill(skills)

def id_to_skill(skills: set[int]) -> set[str]:
    out = set()
    for skill in skills:
        out.add(skills_frame.loc[skill, 'SKILL'])
    
    return out

def skill_to_id(skills: set[str]) -> set[int]:
    out = set()
    for skill in skills:
        out.add(
            list(
                skills_frame.loc[
                    skills_frame['SKILL'].apply(lambda val: val.lower()) == skill.lower()
                ].index
            )[0]
        )
    
    return out

In [5]:
# Get Skill list
skills_list = [skill.lower() for skill in skills_frame['SKILL'].tolist()]

# Symbols to remove
punct = [p for p in punctuation]
punct.remove('+')
punct.remove('#')
# punct.remove('.')

removal = {p: ' ' for p in punct}
removal['\n'] = ''
removal['/'] = ' '
removal['('] = ' '
removal[')'] = ' '
removal[','] = ' '
removal['>'] = ' '
removal['.'] = ' .'

In [6]:
required_skills = [] 

# Extract skills from descriptions and add new column
for i, offer_description in enumerate(offers_frame.loc[:, 'Description']):
    desc = remove_symbols(offer_description, removal)
    offer_skills = extract_symbols(desc, skills_list)
    required_skills.append(translate_skills(offer_skills, toId=True))  
    #required_skills.append(offer_skills)
offers_frame.insert(len(offers_frame.columns), "RequiredSkills", required_skills)

# Remove offers with no required skills
offers_frame['RequiredSkills'] = offers_frame['RequiredSkills'].apply(lambda x: np.NaN if not x else x)
offers_frame.dropna(subset='RequiredSkills', inplace=True)
offers_frame.reset_index(drop=True, inplace=True)
offers_frame

Unnamed: 0,Name,Description,Location,RequiredSkills
0,Software Developer,Miniclip is a global leader in digital games w...,"Genova, Liguria","{100, 101}"
1,Junior Software Developer,"NETtoWORK, azienda italiana nata nel 2016, ope...",17100 Savona,"{2, 6, 7, 8, 9}"
2,Software Developer,We are looking for talented and passionate peo...,55100 Lucca,"{0, 3, 6, 44, 26}"
3,Software Developer,ARESYS is a R&D oriented company with nearly ...,"Matera, Basilicata","{8, 9, 2}"
4,Senior Software Developer,Il/la Candidato/a dovrà padroneggiare: \n \n- ...,"Catania, Sicilia","{0, 1, 3, 99, 6, 7, 104, 55, 57, 26}"
...,...,...,...,...
161,Senior Staff Product Engineer for Embedded Too...,Do you want to be part of a new project team w...,"Padova, Veneto","{8, 9}"
162,Internship Engineer for Advanced Process Control,Internship Engineer for Advanced Process Contr...,"Genova, Liguria","{8, 9, 12}"
163,DevOps Engineer Senior,ARGO LOGICA società di consulenza informatica ...,"Roma, Lazio","{80, 73, 61, 6}"
164,Software Quality Engineer,CentralReach is the #1 provider of SaaS softwa...,37121 Verona,"{0, 1, 99, 103}"


### Jaccard Distance

In [7]:
def _jaccard(s1: set, s2: set):
    return 1 - (len(s1.intersection(s2)) / len(s1.union(s2)))

def jaccard(row1: Series, row2: Series, col_name: str):
    if row1.keys().values is not row2.keys().values:
        raise ValueError(f'Rows have different columns: {row1.keys()} vs {row2.keys()}')   
    if col_name not in row1.keys() or col_name not in row2.keys():
        raise ValueError(f'Invalid Column Name {col_name}')  
    return _jaccard(row1[col_name], row2[col_name])

In [8]:
# Examples
r1 = offers_frame.loc[8, :]
r2 = offers_frame.loc[37, :]
r3 = offers_frame.loc[random.randint(0, len(offers_frame))]
r4 = offers_frame.loc[random.randint(0, len(offers_frame))]

print(f'Distance between {r1["Name"]} and {r1["Name"]}: {jaccard(r1, r1, "RequiredSkills"):.2f}\n {r1["RequiredSkills"]} and {r1["RequiredSkills"]}')
print(f'Distance between {r1["Name"]} and {r2["Name"]}: {jaccard(r1, r2, "RequiredSkills"):.2f}\n {r1["RequiredSkills"]} and {r2["RequiredSkills"]}')
print(f'Distance between {r3["Name"]} and {r4["Name"]}: {jaccard(r3, r4, "RequiredSkills"):.2f}\n {r3["RequiredSkills"]} and {r4["RequiredSkills"]}')

Distance between Frontend and Frontend: 0.00
 {0, 1, 99} and {0, 1, 99}
Distance between Frontend and Software Developer Degree Apprenticeship (Nottingham): 1.00
 {0, 1, 99} and {6, 7, 9, 57, 30}
Distance between Sistemista Junior - System Administrator and DevOps Engineer Senior: 0.86
 {104, 3, 6, 39} and {80, 73, 61, 6}


## Distance Matrix

In [9]:
def get_distance_matrix(frame: pd.DataFrame, col_name: str):
    distances = []
    for i, row1 in frame.iterrows():
        distances.append([])
        for j, row2 in frame.iterrows():
            if i != j:
                distances[-1].append(_jaccard(row1[col_name], row2[col_name]))
            else:
                distances[-1].append(0)#np.inf) 
    return pd.DataFrame(distances, index=frame.index, columns=frame.index)

distance_matrix = get_distance_matrix(offers_frame, 'RequiredSkills')

In [10]:
distance_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,156,157,158,159,160,161,162,163,164,165
0,0.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.666667,1.000000,1.000000,...,1.000000,0.857143,0.916667,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1,1.0,0.000000,0.888889,0.400000,0.846154,0.833333,0.600000,1.000000,1.000000,0.833333,...,0.923077,1.000000,0.933333,0.857143,0.846154,0.600000,0.666667,0.875000,1.000000,0.818182
2,1.0,0.888889,0.000000,1.000000,0.636364,0.833333,0.833333,1.000000,0.857143,0.833333,...,0.923077,1.000000,1.000000,0.857143,0.846154,1.000000,1.000000,0.875000,0.875000,0.916667
3,1.0,0.400000,1.000000,0.000000,1.000000,1.000000,0.750000,1.000000,1.000000,1.000000,...,0.909091,1.000000,0.923077,0.800000,0.916667,0.333333,0.500000,1.000000,1.000000,0.900000
4,1.0,0.846154,0.636364,1.000000,0.000000,0.800000,0.909091,1.000000,0.700000,0.909091,...,0.882353,0.933333,0.950000,0.916667,0.750000,1.000000,1.000000,0.923077,0.727273,0.875000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,1.0,0.600000,1.000000,0.333333,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,0.333333,1.000000,1.000000,1.000000
162,1.0,0.666667,1.000000,0.500000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.333333,0.000000,1.000000,1.000000,0.900000
163,1.0,0.875000,0.875000,1.000000,0.923077,0.800000,0.800000,1.000000,1.000000,0.800000,...,0.818182,0.888889,0.928571,1.000000,0.833333,1.000000,1.000000,0.000000,1.000000,0.666667
164,1.0,1.000000,0.875000,1.000000,0.727273,1.000000,1.000000,1.000000,0.250000,1.000000,...,0.916667,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,1.000000


**Distance matrix based recommendation**

In [11]:
def similar_offers(offer_id: int, threshold: float = 0.6) -> list:
    similar = []
    for item_id, distance in distance_matrix.iloc[offer_id].sort_values()[offer_id:].items():
        if distance > threshold:
            break
        similar.append(offers_frame.iloc[item_id])
    return similar

In [12]:
# Junior Software Developer: {c++, java, python, c, c#}
print(f'Offers Similar to {offers_frame.iloc[1]["Name"]}:\n')
for i, o in enumerate(similar_offers(1)):
    print(f'{i}> {o["Name"]} : {o["RequiredSkills"]}')

Offers Similar to Junior Software Developer:

0> Cyber Security Analyst - Undergraduate : {9, 2, 6}
1> Software Developer : {8, 9, 2}
2> Energy Harvesting Internship – Starting Summer 2024 : {8, 9, 2}
3> Computer Vision Alorithm / SW Engineer : {8, 9, 2}
4> Software Engineer : {8, 9, 6}
5> System Software Engineer, Summer Intern - 2024 Start (Fixed-Term Contract) : {2, 6, 104, 9, 8, 13}
6> Manufacturing Software Engineer / Relocation USA : {2, 3, 6, 7, 8, 19}
7> Software Engineer (University Grad) : {0, 8, 2, 6}
8> Cyber Threat Intelligence Analyst : {9, 2, 12, 6}
9> Software Application Engineer Intern : {104, 8, 6, 7}
10> Control System Engineer : {8, 9, 2, 63}
11> Software Developer - FULL REMOTE : {2, 6, 7, 30, 31}
12> Software Developer Degree Apprenticeship (Nottingham) : {6, 7, 9, 57, 30}
13> Emerging Threats Intelligence Intern (Remote) : {7, 8, 9, 43, 13}
14> Senior Staff Product Engineer for Embedded Tools (f/m/div) : {8, 9}
15> 2024 Intern - Software Development Engineering 

In [13]:
# Frontend: {css, javascript, html}
print(f'Offers Similar to {offers_frame.iloc[8]["Name"]}:\n')
for i, o in enumerate(similar_offers(8)):
    print(f'{i}> {o["Name"]} : {o["RequiredSkills"]}')

Offers Similar to Frontend:

0> Freelance Web & UX Developer : {0, 1, 99, 10, 48}
1> Frontend Software Developer : {0, 1, 99, 5, 41, 44}
2> Javascript Frontend : {0, 99, 44}
3> FrontEnd Javascript : {0, 99, 44}
4> JUNIOR DEVELOPER : {0, 1, 3, 99, 7, 42, 57}
5> Software Developer : {0, 1, 3, 4, 99, 7, 57}
6> Graduate Software Engineer (London) : {0, 1, 99, 6, 9, 30, 31}
7> L4 Software Developer Apprentice : {0, 1, 99, 100, 9, 10, 12}
8> SVILUPPATORE SOFTWARE : {0, 1, 3, 99, 39, 7, 44}
9> Martech Analyst : {0, 1, 2, 3}


**Clustering based recommendation**

In [14]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
labels = kmeans.fit_predict(distance_matrix)
offers_frame['Group'] = labels

  super()._check_params_vs_input(X, default_n_init=10)
[WinError 2] Impossibile trovare il file specificato
  File "C:\Users\anton\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\anton\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\anton\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\anton\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [15]:
def _similar_offers_with_cluster(offer_id: int) -> list:
    target_group = offers_frame.loc[offer_id, 'Group']
    return list(
        offers_frame[
            offers_frame['Group'] == target_group
        ].index
    )

def similar_offers_with_cluster(offer_id: int) -> list:
    return [offers_frame.iloc[oid] for oid in _similar_offers_with_cluster(offer_id)]

In [16]:
# Junior Software Developer: {c++, java, python, c, c#}
print(f'Offers Similar to {offers_frame.iloc[1]["Name"]}:\n')
for i, o in enumerate(similar_offers_with_cluster(1)):
    print(f'{i}> {o["Name"]} : {o["RequiredSkills"]}')

Offers Similar to Junior Software Developer:

0> Junior Software Developer : {2, 6, 7, 8, 9}
1> Software Developer : {8, 9, 2}
2> IBP Junior Algorithms Software Development : {2, 6}
3> Software Engineer : {8, 9, 6}
4> System Software Engineer, Summer Intern - 2024 Start (Fixed-Term Contract) : {2, 6, 104, 9, 8, 13}
5> Emerging Threats Intelligence Intern (Remote) : {7, 8, 9, 43, 13}
6> Cyber Security Analyst - Undergraduate : {9, 2, 6}
7> Energy Harvesting Internship – Starting Summer 2024 : {8, 9, 2}
8> Software Engineer (University Grad) : {0, 8, 2, 6}
9> Electrochemical Modelling - Energy Storage Internship – Starting Summer 2024 (12 months) : {2}
10> Macro Research Analyst and Associate Internship 2024 London : {2}
11> 2024 Intern - Software Development Engineering : {8, 6}
12> Network Production Engineer : {8, 2, 13}
13> AI Developer - MILANO [DIG] : {2, 67, 6, 8, 21, 31, 63}
14> Control System Engineer : {8, 9, 2, 63}
15> Computer Vision Alorithm / SW Engineer : {8, 9, 2}
16> DAT

In [17]:
# Frontend: {css, javascript, html}
print(f'Offers Similar to {offers_frame.iloc[8]["Name"]}:\n')
for i, o in enumerate(similar_offers_with_cluster(8)):
    print(f'{i}> {o["Name"]} : {o["RequiredSkills"]}')

Offers Similar to Frontend:

0> Senior Software Developer : {0, 1, 3, 99, 6, 7, 104, 55, 57, 26}
1> Frontend : {0, 1, 99}
2> JUNIOR DEVELOPER : {0, 1, 3, 99, 7, 42, 57}
3> L4 Software Developer Apprentice : {0, 1, 99, 100, 9, 10, 12}
4> Frontend Software Developer : {0, 1, 99, 5, 41, 44}
5> Senior Software Developer : {0, 57, 41, 7}
6> Graduate Software Engineer (London) : {0, 1, 99, 6, 9, 30, 31}
7> Software Developer : {0, 1, 3, 4, 99, 7, 57}
8> Front End Developer (100% Remote) : {0, 41, 12}
9> Software Developer : {1, 10, 103}
10> Web Developer : {0, 4, 41, 10, 48, 53}
11> Software Engineer (EU) - App Catalog : {0, 99, 100, 7, 41, 11, 80, 57, 31}
12> Web Development Manager : {0, 1, 99, 10, 48}
13> Freelance Web & UX Developer : {0, 1, 99, 10, 48}
14> Junior Web Developer : {0, 1, 99}
15> Frontend Developer : {0, 41, 10, 12, 78}
16> Web Developer Apprentice : {0, 9, 99, 1}
17> Sviluppatore Web : {1, 99, 10, 48, 19}
18> Sviluppatore Web - Middle/Senior : {0, 1, 99, 4, 41, 10, 74, 44

In [18]:
offers_frame

Unnamed: 0,Name,Description,Location,RequiredSkills,Group
0,Software Developer,Miniclip is a global leader in digital games w...,"Genova, Liguria","{100, 101}",1
1,Junior Software Developer,"NETtoWORK, azienda italiana nata nel 2016, ope...",17100 Savona,"{2, 6, 7, 8, 9}",0
2,Software Developer,We are looking for talented and passionate peo...,55100 Lucca,"{0, 3, 6, 44, 26}",2
3,Software Developer,ARESYS is a R&D oriented company with nearly ...,"Matera, Basilicata","{8, 9, 2}",0
4,Senior Software Developer,Il/la Candidato/a dovrà padroneggiare: \n \n- ...,"Catania, Sicilia","{0, 1, 3, 99, 6, 7, 104, 55, 57, 26}",3
...,...,...,...,...,...
161,Senior Staff Product Engineer for Embedded Too...,Do you want to be part of a new project team w...,"Padova, Veneto","{8, 9}",0
162,Internship Engineer for Advanced Process Control,Internship Engineer for Advanced Process Contr...,"Genova, Liguria","{8, 9, 12}",0
163,DevOps Engineer Senior,ARGO LOGICA società di consulenza informatica ...,"Roma, Lazio","{80, 73, 61, 6}",1
164,Software Quality Engineer,CentralReach is the #1 provider of SaaS softwa...,37121 Verona,"{0, 1, 99, 103}",3


### Silhouette Score

In [37]:
group = offers_frame[offers_frame['Group'] == 0]
offer_i = offers_frame.iloc[list(group.index)[0]]

den = len(list(group.index))
inner_distance = 0  # a(i)
for offer in group.iterrows():   
    if not offer[1].equals(offer_i):
        inner_distance += jaccard(offer_i, offer, 'RequiredSkills')
inner_distance = inner_distance/den

(3, Name                                             Software Developer
Description       ARESYS  is a R&D oriented company with nearly ...
Location                                         Matera, Basilicata
RequiredSkills                                            {8, 9, 2}
Group                                                             0
Name: 3, dtype: object)
(6, Name                     IBP Junior Algorithms Software Development
Description       Pirelli is looking for the following profile t...
Location                                               Bari, Puglia
RequiredSkills                                               {2, 6}
Group                                                             0
Name: 6, dtype: object)
(26, Name                                              Software Engineer
Description       Main Responsibilities  \n - Design and develop...
Location                                                Roma, Lazio
RequiredSkills                                         

In [38]:
def find_nearest_cluster(target_cluster_label, centroids):
    target_centroid = centroids[target_cluster_label]
    distances = np.linalg.norm(centroids - target_centroid, axis=1)
    nearest_cluster = np.argmin(distances)
    return nearest_cluster

array([[1.        , 0.56728778, 0.94209643, 0.50507246, 0.9555345 ,
        0.91299172, 0.65207039, 1.        , 0.99275362, 0.91299172,
        0.98774704, 0.88364389, 0.95466653, 0.99378882, 0.91299172,
        0.95628502, 0.98550725, 0.7423913 , 0.99178744, 0.98757764,
        0.85515873, 0.872343  , 1.        , 0.98647343, 0.91540718,
        0.80445135, 0.63768116, 0.94470753, 0.98913043, 1.        ,
        0.84954671, 0.92530742, 0.9889579 , 0.98291925, 0.88681379,
        0.98339921, 0.97681159, 0.85258642, 1.        , 0.88364389,
        0.93576605, 0.86521739, 0.61402692, 1.        , 1.        ,
        0.77109919, 0.85061171, 0.86226708, 0.58291925, 0.50507246,
        0.64327122, 0.99033816, 0.83405797, 0.98043478, 0.99033816,
        0.64451346, 0.98316611, 0.81062802, 0.97463768, 0.64451346,
        0.83863009, 0.91229688, 0.99456522, 0.99456522, 0.99275362,
        0.98007246, 0.99275362, 0.99275362, 0.98291925, 0.7426501 ,
        0.90728088, 0.86459627, 0.65289855, 1.  

In [None]:
centroids = kmeans.cluster_centers_
target_cluster = 0
nearest_cluster = find_nearest_cluster(target_cluster, centroids)