In [61]:
import sqlite3
import random
from string import punctuation

import pandas as pd
import numpy as np
from pandas.core.series import Series
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [62]:
offers_connection = sqlite3.connect('../../datasets/offers_dataset.db')
offers_frame = pd.read_sql_query('''SELECT * FROM offers''', offers_connection)
offers_connection.close()

with sqlite3.connect('../../datasets/skills_dataset.db') as skills_connection:
    skills_frame = pd.read_sql_query('SELECT * FROM skills', skills_connection)
    skills_frame.set_index('ID', inplace=True)

In [63]:
offers_frame.dropna(subset='Name',inplace=True)
offers_frame.drop_duplicates(inplace=True)

In [64]:
def remove_symbols(description: str, remove_map: dict) -> str:
    """
    """
    for old, new in remove_map.items():
        description = description.replace(old, new)
    return description.lower()


def extract_symbols(description: str, available_symbols: list) -> set:
    s = set()
    prev = ''
    for word in description.split():
        if word in available_symbols:
            s.add(word)
        elif f'{prev} {word}' in available_symbols:
            s.add(f'{prev} {word}')  
        prev = word
    return s 

def translate_skills(skills: set, toId = False) -> set:
    if toId:
        return skill_to_id(skills)
    else:
        return id_to_skill(skills)

def id_to_skill(skills: set[int]) -> set[str]:
    out = set()
    for skill in skills:
        out.add(skills_frame.loc[skill, 'SKILL'])
    
    return out

def skill_to_id(skills: set[str]) -> set[int]:
    out = set()
    for skill in skills:
        out.add(
            list(
                skills_frame.loc[
                    skills_frame['SKILL'].apply(lambda val: val.lower()) == skill.lower()
                ].index
            )[0]
        )
    
    return out

In [65]:
# Get Skill list
skills_list = [skill.lower() for skill in skills_frame['SKILL'].tolist()]

# Symbols to remove
punct = [p for p in punctuation]
punct.remove('+')
punct.remove('#')
# punct.remove('.')

removal = {p: ' ' for p in punct}
removal['\n'] = ''
removal['/'] = ' '
removal['('] = ' '
removal[')'] = ' '
removal[','] = ' '
removal['>'] = ' '
removal['.'] = ' .'

In [66]:
required_skills = [] 

# Extract skills from descriptions and add new column
for i, offer_description in enumerate(offers_frame.loc[:, 'Description']):
    desc = remove_symbols(offer_description, removal)
    offer_skills = extract_symbols(desc, skills_list)
    required_skills.append(translate_skills(offer_skills, toId=True))  
    #required_skills.append(offer_skills)
offers_frame.insert(len(offers_frame.columns), "RequiredSkills", required_skills)

# Remove offers with no required skills
offers_frame['RequiredSkills'] = offers_frame['RequiredSkills'].apply(lambda x: np.NaN if not x else x)
offers_frame.dropna(subset='RequiredSkills', inplace=True)
offers_frame.reset_index(drop=True, inplace=True)
offers_frame

Unnamed: 0,Name,Description,Location,RequiredSkills
0,Software Developer,Miniclip is a global leader in digital games w...,"Genova, Liguria","{100, 101}"
1,Junior Software Developer,"NETtoWORK, azienda italiana nata nel 2016, ope...",17100 Savona,"{2, 6, 7, 8, 9}"
2,Software Developer,We are looking for talented and passionate peo...,55100 Lucca,"{0, 3, 6, 44, 26}"
3,Software Developer,ARESYS is a R&D oriented company with nearly ...,"Matera, Basilicata","{8, 9, 2}"
4,Senior Software Developer,Il/la Candidato/a dovrà padroneggiare: \n \n- ...,"Catania, Sicilia","{0, 1, 99, 3, 6, 7, 104, 55, 57, 26}"
...,...,...,...,...
161,Senior Staff Product Engineer for Embedded Too...,Do you want to be part of a new project team w...,"Padova, Veneto","{8, 9}"
162,Internship Engineer for Advanced Process Control,Internship Engineer for Advanced Process Contr...,"Genova, Liguria","{8, 9, 12}"
163,DevOps Engineer Senior,ARGO LOGICA società di consulenza informatica ...,"Roma, Lazio","{80, 73, 61, 6}"
164,Software Quality Engineer,CentralReach is the #1 provider of SaaS softwa...,37121 Verona,"{0, 1, 99, 103}"


### Jaccard Distance

In [67]:
def _jaccard(s1: set, s2: set):
    return 1 - (len(s1.intersection(s2)) / len(s1.union(s2)))

def jaccard(row1: Series, row2: Series, col_name: str):
    if row1.keys().values is not row2.keys().values:
        raise ValueError(f'Rows have different columns: {row1.keys()} vs {row2.keys()}')   
    if col_name not in row1.keys() or col_name not in row2.keys():
        raise ValueError(f'Invalid Column Name {col_name}')  
    return _jaccard(row1[col_name], row2[col_name])

In [68]:
# Examples
r1 = offers_frame.loc[8, :]
r2 = offers_frame.loc[37, :]
r3 = offers_frame.loc[random.randint(0, len(offers_frame))]
r4 = offers_frame.loc[random.randint(0, len(offers_frame))]

print(f'Distance between {r1["Name"]} and {r1["Name"]}: {jaccard(r1, r1, "RequiredSkills"):.2f}\n {r1["RequiredSkills"]} and {r1["RequiredSkills"]}')
print(f'Distance between {r1["Name"]} and {r2["Name"]}: {jaccard(r1, r2, "RequiredSkills"):.2f}\n {r1["RequiredSkills"]} and {r2["RequiredSkills"]}')
print(f'Distance between {r3["Name"]} and {r4["Name"]}: {jaccard(r3, r4, "RequiredSkills"):.2f}\n {r3["RequiredSkills"]} and {r4["RequiredSkills"]}')

Distance between Frontend and Frontend: 0.00
 {0, 1, 99} and {0, 1, 99}
Distance between Frontend and Software Developer Degree Apprenticeship (Nottingham): 1.00
 {0, 1, 99} and {6, 7, 9, 57, 30}
Distance between Sistemista Junior - System Administrator and Martech Analyst: 0.86
 {104, 3, 6, 39} and {0, 1, 2, 3}


## Distance Matrix

In [69]:
def get_distance_matrix(frame: pd.DataFrame, col_name: str):
    distances = []
    for i, row1 in frame.iterrows():
        distances.append([])
        for j, row2 in frame.iterrows():
            if i != j:
                distances[-1].append(_jaccard(row1[col_name], row2[col_name]))
            else:
                distances[-1].append(0)#np.inf) 
    return pd.DataFrame(distances, index=frame.index, columns=frame.index)

distance_matrix = get_distance_matrix(offers_frame, 'RequiredSkills')

In [70]:
distance_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,156,157,158,159,160,161,162,163,164,165
0,0.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.666667,1.000000,1.000000,...,1.000000,0.857143,0.916667,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1,1.0,0.000000,0.888889,0.400000,0.846154,0.833333,0.600000,1.000000,1.000000,0.833333,...,0.923077,1.000000,0.933333,0.857143,0.846154,0.600000,0.666667,0.875000,1.000000,0.818182
2,1.0,0.888889,0.000000,1.000000,0.636364,0.833333,0.833333,1.000000,0.857143,0.833333,...,0.923077,1.000000,1.000000,0.857143,0.846154,1.000000,1.000000,0.875000,0.875000,0.916667
3,1.0,0.400000,1.000000,0.000000,1.000000,1.000000,0.750000,1.000000,1.000000,1.000000,...,0.909091,1.000000,0.923077,0.800000,0.916667,0.333333,0.500000,1.000000,1.000000,0.900000
4,1.0,0.846154,0.636364,1.000000,0.000000,0.800000,0.909091,1.000000,0.700000,0.909091,...,0.882353,0.933333,0.950000,0.916667,0.750000,1.000000,1.000000,0.923077,0.727273,0.875000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,1.0,0.600000,1.000000,0.333333,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,0.333333,1.000000,1.000000,1.000000
162,1.0,0.666667,1.000000,0.500000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.333333,0.000000,1.000000,1.000000,0.900000
163,1.0,0.875000,0.875000,1.000000,0.923077,0.800000,0.800000,1.000000,1.000000,0.800000,...,0.818182,0.888889,0.928571,1.000000,0.833333,1.000000,1.000000,0.000000,1.000000,0.666667
164,1.0,1.000000,0.875000,1.000000,0.727273,1.000000,1.000000,1.000000,0.250000,1.000000,...,0.916667,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,1.000000


In [71]:
def similar_offers(offer_id: int, threshold: float = 0.6) -> list:
    """
    get similar offers based only on distance matrix
    """
    similar = []
    for item_id, distance in distance_matrix.iloc[offer_id].sort_values()[offer_id:].items():
        if distance > threshold:
            break
        similar.append(offers_frame.iloc[item_id])
    return similar

### Clustering based recommendation

**Silhouette Score**

In [72]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean


def item_cluster_distance(frame: pd.DataFrame, item: Series, target: int, 
                          group_label: str = 'Group', 
                          distance_label: str = 'RequiredSkills'):
    """
    Computes distance between an item in a cluster X and the other items in a cluster Y.
    Used also when X and Y are the same clusters.
    """
    group = frame[frame[group_label] == target]
    den = len(list(group.index))
    distance = 0
    
    for it in group.iterrows():
        if not it[1].equals(item):
            distance += jaccard(item, it[1], distance_label)
    
    return distance/den

def find_nearest_cluster(centroids, target_label: int):
    nearest = None
    target_centroid = centroids.iloc[target_label]
    min_dist = float('inf')
    
    for label, vector in centroids[centroids.index != target_label].iterrows():
        dist = euclidean(target_centroid, vector)
        if dist < min_dist:
            min_dist = dist
            nearest = label
    
    return nearest


def kmeans_silhouette(frame: pd.DataFrame,
               clustering: KMeans,
               group_label: str = 'Group',
               distance_label: str = 'RequiredSkills'):
    
    silhouette_scores = []
    for target_cluster in list(np.unique(frame[group_label])): 
        nearest_cluster = find_nearest_cluster(
            pd.DataFrame(clustering.cluster_centers_),
            target_cluster
        )
        for _, item in frame.iterrows():
            a = item_cluster_distance(frame, item, target_cluster, group_label, distance_label)
            b = item_cluster_distance(frame, item, nearest_cluster, group_label, distance_label)
            silhouette_score = (b - a) / max(a, b)
            silhouette_scores.append(silhouette_score)
    
    return np.mean(silhouette_scores)

In [77]:
# --- See how many clusters perform the best
max_value = -2
best_value = 0
for k in range(2, 8):
    tmp_kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    labels = tmp_kmeans.fit_predict(distance_matrix)
    offers_frame['Group'] = labels
    
    value = kmeans_silhouette(offers_frame, tmp_kmeans)
    print(f'Value for {k}: {value:.3f}')
    if value > max_value:
        max_value = value
        best_value = k

Value for 2: -0.000
Value for 3: 0.012
Value for 4: 0.021
Value for 5: 0.019
Value for 6: 0.019
Value for 7: 0.012


In [74]:
# --- best
kmeans = KMeans(n_clusters=best_value, init='k-means++', n_init=10, random_state=42)
labels = kmeans.fit_predict(distance_matrix)
offers_frame['Group'] = labels

In [75]:
def _similar_offers_with_cluster(offer_id: int) -> list:
    target_group = offers_frame.loc[offer_id, 'Group']
    return list(
        offers_frame[
            offers_frame['Group'] == target_group
        ].index
    )

def similar_offers_with_cluster(offer_id: int) -> list:
    return [offers_frame.iloc[oid] for oid in _similar_offers_with_cluster(offer_id)]

In [76]:
# Frontend: {css, javascript, html}
print(f'Offers Similar to {offers_frame.iloc[8]["Name"]}:\n')
for i, o in enumerate(similar_offers_with_cluster(8)):
    print(f'{i}> {o["Name"]} : {o["RequiredSkills"]}')

Offers Similar to Frontend:

0> Senior Software Developer : {0, 1, 99, 3, 6, 7, 104, 55, 57, 26}
1> Frontend : {0, 1, 99}
2> JUNIOR DEVELOPER : {0, 1, 99, 3, 7, 42, 57}
3> L4 Software Developer Apprentice : {0, 1, 99, 100, 9, 10, 12}
4> Frontend Software Developer : {0, 1, 99, 5, 41, 44}
5> Senior Software Developer : {0, 41, 57, 7}
6> Graduate Software Engineer (London) : {0, 1, 99, 6, 9, 30, 31}
7> Software Developer : {0, 1, 99, 4, 3, 7, 57}
8> Front End Developer (100% Remote) : {0, 41, 12}
9> Software Developer : {1, 10, 103}
10> Web Developer : {0, 4, 41, 10, 48, 53}
11> Software Engineer (EU) - App Catalog : {0, 99, 100, 7, 41, 11, 80, 57, 31}
12> Web Development Manager : {0, 1, 99, 10, 48}
13> Freelance Web & UX Developer : {0, 1, 99, 10, 48}
14> Junior Web Developer : {0, 1, 99}
15> Frontend Developer : {0, 41, 10, 12, 78}
16> Web Developer Apprentice : {0, 9, 99, 1}
17> Sviluppatore Web : {1, 99, 10, 48, 19}
18> Sviluppatore Web - Middle/Senior : {0, 1, 99, 4, 41, 10, 74, 44