In [2]:
"""
Import statements
"""

import plotly.express as px
import numpy as np
import plotly.graph_objects as go

from gensim import corpora, models, similarities
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial

from ipynb.fs.full.Data_handling import *

In [3]:
"""
Functions: 
calculate similarity between a document and a bag-of-words
plot similarity with corresponding labels
"""

def calc_sim(bag, doc):
    """Score ``doc`` against every document in ``bag`` using TF-IDF similarity.

    A gensim dictionary and TF-IDF model are fitted on ``bag`` only; ``doc``
    is then mapped through that same dictionary and model and queried against
    a ``SparseMatrixSimilarity`` index built from the TF-IDF-weighted bag.

    Args:
        bag: Collection of tokenised documents (each an iterable of string
            tokens, as ``corpora.Dictionary`` expects). It is traversed
            twice, so it must be re-iterable.
        doc: Tokenised document to compare. Per gensim's ``doc2bow``, tokens
            that are absent from the dictionary built on ``bag`` are ignored.

    Returns:
        The index query result: one similarity score per document in ``bag``
        (cosine similarity according to the gensim documentation).
    """
    vocabulary = corpora.Dictionary(bag)
    bag_bows = [vocabulary.doc2bow(tokens) for tokens in bag]
    weighting = models.TfidfModel(bag_bows)

    # The index must know the vocabulary size up front.
    sim_index = similarities.SparseMatrixSimilarity(
        weighting[bag_bows],
        num_features=len(vocabulary.token2id),
    )

    query = vocabulary.doc2bow(doc)
    return sim_index[weighting[query]]


def plot_sim(sim, labels):
    """Show similarity scores as a filled, closed polar line chart.

    Args:
        sim: Radial values, one per label; the radial axis is fixed to [0, 1].
        labels: Angular category for each value in ``sim``.

    Returns:
        None. The figure is displayed via ``fig.show()``.

    NOTE(review): ``pd`` is not imported in this notebook's import cell; it
    presumably comes in through the star import from ``Data_handling`` -
    confirm, or import pandas explicitly.
    """
    polar_data = pd.DataFrame({"r": sim, "theta": labels})
    chart = px.line_polar(
        polar_data,
        r="r",
        theta="theta",
        line_close=True,
        range_r=[0, 1],
    )
    # Shade the area enclosed by the trace.
    chart.update_traces(fill="toself")
    chart.show()

In [4]:
"""
Functions to calculate similarity for documents in dictionary or list to specific bag-of-words
"""

def dict_to_sim_list(dictionary, bag):
    """Compute ``calc_sim`` for each document stored in ``dictionary``.

    Entries are looked up with the consecutive keys ``1 .. len(dictionary)``,
    and element ``[1]`` of each entry is passed to ``calc_sim`` as the
    document.

    Args:
        dictionary: Mapping indexed by 1-based consecutive integers whose
            values hold the document at position 1.
        bag: Bag-of-words corpus forwarded unchanged to ``calc_sim``.

    Returns:
        List with one ``calc_sim`` result per entry, in key order.
    """
    entry_keys = range(1, len(dictionary) + 1)
    return [calc_sim(bag, dictionary[key][1]) for key in entry_keys]
            
def list_to_sim_list(lists, bag):
    """Compute ``calc_sim`` for every document in ``lists``.

    Args:
        lists: Iterable of documents; each one is passed to ``calc_sim``.
        bag: Bag-of-words corpus forwarded unchanged to ``calc_sim``.

    Returns:
        List with one ``calc_sim`` result per document, in input order.
    """
    return [calc_sim(bag, document) for document in lists]

In [5]:
"""
Functions to search for most related cv and vacancy in lists of similarities and plot them  
"""

# searches for best match, using Euclidean distance as measure
def best_match(cvs, vacancies):
    """Find the CV/vacancy pair with the smallest Euclidean distance.

    Args:
        cvs: Indexable sequence of numeric vectors (lists or arrays).
        vacancies: Indexable sequence of numeric vectors compatible with
            the vectors in ``cvs``.

    Returns:
        ``[distance, cv_index, vac_index]`` for the closest pair. On ties
        the first pair encountered (CVs outer loop, vacancies inner) wins.

    Raises:
        ValueError: If no pair can be ranked, i.e. either input is empty or
            every distance is NaN.
    """
    # Start from +inf rather than a magic number so that arbitrarily large
    # distances are still accepted as candidates.
    best_result = float("inf")
    cv_index = vac_index = None

    for i_sim in range(len(cvs)):
        # asarray lets plain Python lists be subtracted element-wise.
        cv_vector = np.asarray(cvs[i_sim])
        for j_sim in range(len(vacancies)):
            result = np.linalg.norm(cv_vector - np.asarray(vacancies[j_sim]))
            if result < best_result:
                best_result = result
                cv_index = i_sim
                vac_index = j_sim

    if cv_index is None:
        raise ValueError(
            "best_match found no comparable CV/vacancy pair "
            "(empty input or all distances are NaN)"
        )
    return [best_result, cv_index, vac_index]

# searches for worst match, using Euclidean distance as measure
def worst_match(cvs, vacancies):
    """Find the CV/vacancy pair with the largest Euclidean distance.

    Args:
        cvs: Indexable sequence of numeric vectors (lists or arrays).
        vacancies: Indexable sequence of numeric vectors compatible with
            the vectors in ``cvs``.

    Returns:
        ``[distance, cv_index, vac_index]`` for the farthest pair. On ties
        the first pair encountered (CVs outer loop, vacancies inner) wins.

    Raises:
        ValueError: If no pair can be ranked, i.e. either input is empty or
            every distance is NaN.
    """
    # Start from -inf: with a starting value of 0 and a strict '>' test, a
    # set of pairs that are all at distance 0 would never be selected.
    worst_result = float("-inf")
    cv_index = vac_index = None

    for i_sim in range(len(cvs)):
        # asarray lets plain Python lists be subtracted element-wise.
        cv_vector = np.asarray(cvs[i_sim])
        for j_sim in range(len(vacancies)):
            result = np.linalg.norm(cv_vector - np.asarray(vacancies[j_sim]))
            if result > worst_result:
                worst_result = result
                cv_index = i_sim
                vac_index = j_sim

    if cv_index is None:
        raise ValueError(
            "worst_match found no comparable CV/vacancy pair "
            "(empty input or all distances are NaN)"
        )
    return [worst_result, cv_index, vac_index]

# searches for the best vacancy for 1 CV
def best_match_one_cv(cv, vacancies):
    """Find the vacancy closest (Euclidean distance) to a single CV.

    Args:
        cv: Numeric vector (list or array) describing the CV.
        vacancies: Indexable sequence of numeric vectors compatible with
            ``cv``.

    Returns:
        ``[distance, 0, vac_index]``. The middle element is always 0, which
        gives the result the same three-slot layout that ``best_match``
        returns. On ties the first vacancy encountered wins.

    Raises:
        ValueError: If no vacancy can be ranked, i.e. ``vacancies`` is empty
            or every distance is NaN.
    """
    # Start from +inf rather than a magic number so that arbitrarily large
    # distances are still accepted as candidates.
    best_result = float("inf")
    vac_index = None
    cv_vector = np.asarray(cv)  # convert once instead of once per vacancy

    for j_sim in range(len(vacancies)):
        result = np.linalg.norm(cv_vector - np.asarray(vacancies[j_sim]))
        if result < best_result:
            best_result = result
            vac_index = j_sim

    if vac_index is None:
        raise ValueError(
            "best_match_one_cv found no comparable vacancy "
            "(empty input or all distances are NaN)"
        )
    return [best_result, 0, vac_index]

# searches for the n best vacancy matches for 1 CV
def one_cv_best_vacancies(cv, vacancies, number):
    """Return the ``number`` vacancies closest (Euclidean distance) to a CV.

    Args:
        cv: Numeric vector (list or array) describing the CV.
        vacancies: Indexable sequence of numeric vectors compatible with
            ``cv``.
        number: How many matches to return. Values larger than
            ``len(vacancies)`` are clamped to the number of vacancies;
            zero or negative values yield an empty list.

    Returns:
        List of ``[vac_index, distance]`` pairs ordered from closest to
        farthest. Note the order inside each pair: index first, distance
        second. Vacancies at equal distance keep their original order.
    """
    cv_vector = np.asarray(cv)  # convert once instead of once per vacancy
    distances = [
        np.linalg.norm(cv_vector - np.asarray(vacancies[j_sim]))
        for j_sim in range(len(vacancies))
    ]

    # sorted() is stable, so ties resolve to the lowest index first - the
    # same order that repeatedly taking the minimum would produce.
    ranked = sorted(range(len(distances)), key=distances.__getitem__)

    # Clamp so that asking for more matches than exist does not fail.
    count = max(0, min(number, len(ranked)))
    return [[vac_index, distances[vac_index]] for vac_index in ranked[:count]]