In [210]:
import sqlite3
import random
from string import punctuation

import pandas as pd
import numpy as np
from pandas.core.series import Series
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [211]:
offers_connection = sqlite3.connect('../../datasets/offers_dataset.db')
offers_frame = pd.read_sql_query('''SELECT * FROM offers''', offers_connection)
offers_connection.close()

with sqlite3.connect('../../datasets/skills_dataset.db') as skills_connection:
    skills_frame = pd.read_sql_query('SELECT * FROM skills', skills_connection)
    skills_frame.set_index('ID', inplace=True)

In [212]:
offers_frame.dropna(subset='Name',inplace=True)
offers_frame.drop_duplicates(inplace=True)

In [213]:
def remove_symbols(description: str, remove_map: dict) -> str:
    """
    """
    for old, new in remove_map.items():
        description = description.replace(old, new)
    return description.lower()


def extract_symbols(description: str, available_symbols: list) -> set:
    s = set()
    prev = ''
    for word in description.split():
        if word in available_symbols:
            s.add(word)
        elif f'{prev} {word}' in available_symbols:
            s.add(f'{prev} {word}')  
        prev = word
    return s 

In [214]:
# Get Skill list
skills_list = [skill.lower() for skill in skills_frame['SKILL'].tolist()]

# Symbols to remove
punct = [p for p in punctuation]
punct.remove('+')
punct.remove('#')
# punct.remove('.')

removal = {p: ' ' for p in punct}
removal['\n'] = ''
removal['/'] = ' '
removal['('] = ' '
removal[')'] = ' '
removal[','] = ' '
removal['>'] = ' '
removal['.'] = ' .'

In [215]:
required_skills = [] 

# Extract skills from descriptions and add new column
for i, offer_description in enumerate(offers_frame.loc[:, 'Description']):
    desc = remove_symbols(offer_description, removal)
    offer_skills = extract_symbols(desc, skills_list)
    required_skills.append(offer_skills)  
offers_frame.insert(len(offers_frame.columns), "RequiredSkills", required_skills)

# Remove offers with no required skills
offers_frame['RequiredSkills'] = offers_frame['RequiredSkills'].apply(lambda x: np.NaN if not x else x)
offers_frame.dropna(subset='RequiredSkills', inplace=True)
offers_frame.reset_index(drop=True, inplace=True)
offers_frame

Unnamed: 0,Name,Description,Location,RequiredSkills
0,Software Developer,Miniclip is a global leader in digital games w...,"Genova, Liguria","{git, travis}"
1,Junior Software Developer,"NETtoWORK, azienda italiana nata nel 2016, ope...",17100 Savona,"{c++, java, python, c, c#}"
2,Software Developer,We are looking for talented and passionate peo...,55100 Lucca,"{javascript, sql, java, oracle, angular}"
3,Software Developer,ARESYS is a R&D oriented company with nearly ...,"Matera, Basilicata","{python, c++, c}"
4,Senior Software Developer,Il/la Candidato/a dovrà padroneggiare: \n \n- ...,"Catania, Sicilia","{javascript, sql, java, oracle, html, c#, css,..."
...,...,...,...,...
161,Senior Staff Product Engineer for Embedded Too...,Do you want to be part of a new project team w...,"Padova, Veneto","{c++, c}"
162,Internship Engineer for Advanced Process Control,Internship Engineer for Advanced Process Contr...,"Genova, Liguria","{c++, c, go}"
163,DevOps Engineer Senior,ARGO LOGICA società di consulenza informatica ...,"Roma, Lazio","{spring, kubernetes, java, docker}"
164,Software Quality Engineer,CentralReach is the #1 provider of SaaS softwa...,37121 Verona,"{css, javascript, html, ios}"


### Jaccard Distance

In [219]:
def _jaccard(s1: set, s2: set):
    return 1 - (len(s1.intersection(s2)) / len(s1.union(s2)))

def jaccard(row1: Series, row2: Series, col_name: str):
    if row1.keys().values is not row2.keys().values:
        raise ValueError(f'Rows have different columns: {row1.keys()} vs {row2.keys()}')   
    if col_name not in row1.keys() or col_name not in row2.keys():
        raise ValueError(f'Invalid Column Name {col_name}')  
    return _jaccard(row1[col_name], row2[col_name])

In [220]:
# Examples
r1 = offers_frame.loc[8, :]
r2 = offers_frame.loc[37, :]
r3 = offers_frame.loc[random.randint(0, len(offers_frame))]
r4 = offers_frame.loc[random.randint(0, len(offers_frame))]

print(f'Distance between {r1["Name"]} and {r1["Name"]}: {jaccard(r1, r1, "RequiredSkills"):.2f}\n {r1["RequiredSkills"]} and {r1["RequiredSkills"]}')
print(f'Distance between {r1["Name"]} and {r2["Name"]}: {jaccard(r1, r2, "RequiredSkills"):.2f}\n {r1["RequiredSkills"]} and {r2["RequiredSkills"]}')
print(f'Distance between {r3["Name"]} and {r4["Name"]}: {jaccard(r3, r4, "RequiredSkills"):.2f}\n {r3["RequiredSkills"]} and {r4["RequiredSkills"]}')

Distance between Frontend and Frontend: 0.00
 {'css', 'javascript', 'html'} and {'css', 'javascript', 'html'}
Distance between Frontend and Software Developer Degree Apprenticeship (Nottingham): 1.00
 {'css', 'javascript', 'html'} and {'aws', 'java', 'c', 'c#', '.net'}
Distance between Sviluppatore Applicazioni Web and Software Developer - FULL REMOTE: 1.00
 {'mysql', 'react', 'laravel', 'php'} and {'aws', 'java', 'python', 'azure', 'c#'}


## Distance Matrix

In [249]:
def get_distance_matrix(frame: pd.DataFrame, col_name: str):
    distances = []
    for i, row1 in frame.iterrows():
        distances.append([])
        for j, row2 in frame.iterrows():
            if i != j:
                distances[-1].append(_jaccard(row1[col_name], row2[col_name]))
            else:
                distances[-1].append(0)#np.inf) 
    return pd.DataFrame(distances, index=frame.index, columns=frame.index)

distance_matrix = get_distance_matrix(offers_frame, 'RequiredSkills')

In [250]:
distance_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,156,157,158,159,160,161,162,163,164,165
0,0.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.666667,1.000000,1.000000,...,1.000000,0.857143,0.916667,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1,1.0,0.000000,0.888889,0.400000,0.846154,0.833333,0.600000,1.000000,1.000000,0.833333,...,0.923077,1.000000,0.933333,0.857143,0.846154,0.600000,0.666667,0.875000,1.000000,0.818182
2,1.0,0.888889,0.000000,1.000000,0.636364,0.833333,0.833333,1.000000,0.857143,0.833333,...,0.923077,1.000000,1.000000,0.857143,0.846154,1.000000,1.000000,0.875000,0.875000,0.916667
3,1.0,0.400000,1.000000,0.000000,1.000000,1.000000,0.750000,1.000000,1.000000,1.000000,...,0.909091,1.000000,0.923077,0.800000,0.916667,0.333333,0.500000,1.000000,1.000000,0.900000
4,1.0,0.846154,0.636364,1.000000,0.000000,0.800000,0.909091,1.000000,0.700000,0.909091,...,0.882353,0.933333,0.950000,0.916667,0.750000,1.000000,1.000000,0.923077,0.727273,0.875000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,1.0,0.600000,1.000000,0.333333,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,0.333333,1.000000,1.000000,1.000000
162,1.0,0.666667,1.000000,0.500000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.333333,0.000000,1.000000,1.000000,0.900000
163,1.0,0.875000,0.875000,1.000000,0.923077,0.800000,0.800000,1.000000,1.000000,0.800000,...,0.818182,0.888889,0.928571,1.000000,0.833333,1.000000,1.000000,0.000000,1.000000,0.666667
164,1.0,1.000000,0.875000,1.000000,0.727273,1.000000,1.000000,1.000000,0.250000,1.000000,...,0.916667,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,1.000000


In [291]:
def similar_offers(offer_id: int, threshold: float = 0.6) -> list:
    similar = []
    for similar_id, item in distance_matrix.iloc[offer_id].sort_values()[offer_id:].items():
        if item > threshold:
            break
        similar.append(offers_frame.iloc[similar_id])
    return similar

In [292]:
# Junior Software Developer: {c++, java, python, c, c#}
print(f'Offers Similar to {offers_frame.iloc[1]["Name"]}:\n')
for i, o in enumerate(similar_offers(1)):
    print(f'{i}> {o["Name"]} : {o["RequiredSkills"]}')

Offers Similar to Junior Software Developer:

0> Cyber Security Analyst - Undergraduate : {'python', 'c', 'java'}
1> Software Developer : {'python', 'c++', 'c'}
2> Energy Harvesting Internship – Starting Summer 2024 : {'python', 'c++', 'c'}
3> Computer Vision Alorithm / SW Engineer : {'python', 'c++', 'c'}
4> Software Engineer : {'c++', 'c', 'java'}
5> System Software Engineer, Summer Intern - 2024 Start (Fixed-Term Contract) : {'c++', 'java', 'python', 'rust', 'c', 'linux'}
6> Manufacturing Software Engineer / Relocation USA : {'mysql', 'c++', 'java', 'sql', 'python', 'c#'}
7> Software Engineer (University Grad) : {'python', 'javascript', 'c++', 'java'}
8> Cyber Threat Intelligence Analyst : {'python', 'c', 'go', 'java'}
9> Software Application Engineer Intern : {'linux', 'c++', 'java', 'c#'}
10> Control System Engineer : {'python', 'c++', 'c', 'tensorflow'}
11> Software Developer - FULL REMOTE : {'aws', 'java', 'python', 'azure', 'c#'}
12> Software Developer Degree Apprenticeship (No

In [293]:
# Frontend: {css, javascript, html}
print(f'Offers Similar to {offers_frame.iloc[8]["Name"]}:\n')
for i, o in enumerate(similar_offers(8)):
    print(f'{i}> {o["Name"]} : {o["RequiredSkills"]}')

Offers Similar to Frontend:

0> Freelance Web & UX Developer : {'javascript', 'wordpress', 'html', 'css', 'php'}
1> Frontend Software Developer : {'react', 'bash', 'javascript', 'angular', 'html', 'css'}
2> Javascript Frontend : {'angular', 'css', 'javascript'}
3> FrontEnd Javascript : {'angular', 'css', 'javascript'}
4> JUNIOR DEVELOPER : {'javascript', 'sql', 'jquery', 'html', 'c#', 'css', '.net'}
5> Software Developer : {'javascript', 'sql', 'html', 'typescript', 'c#', 'css', '.net'}
6> Graduate Software Engineer (London) : {'javascript', 'aws', 'java', 'c', 'azure', 'html', 'css'}
7> L4 Software Developer Apprentice : {'javascript', 'go', 'git', 'c', 'html', 'css', 'php'}
8> SVILUPPATORE SOFTWARE : {'javascript', 'sql', 'angular', 'html', 'c#', 'css', 'vmware'}
9> Martech Analyst : {'python', 'javascript', 'html', 'sql'}
