# Cosine Similarity 

In [8]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install neo4j

Note: you may need to restart the kernel to use updated packages.


In [9]:
pip install docx2txt

Note: you may need to restart the kernel to use updated packages.


In [10]:
import re
import docx2txt
def clean_text_from_file(file_path):
    """Read a .docx file and return its text as one cleaned-up paragraph.

    Every character outside the allowed set (ASCII letters and digits,
    whitespace, a few accented letters and some punctuation) is dropped,
    then each run of whitespace is squeezed into a single space and the
    result is stripped.

    Returns the cleaned string. If the file is missing, or any other
    exception is raised while processing it, an error message is printed
    and None is returned instead.
    """
    try:
        raw_text = docx2txt.process(file_path)
        # Drop everything that is not on the character whitelist.
        whitelisted = re.sub(r'[^a-zA-Z0-9\s.,;:’èé+#Éàâ()ôÔ&/-]', '', raw_text)
        # Collapse whitespace so the document becomes a single paragraph.
        return re.sub(r'\s+', ' ', whitelisted).strip()
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as err:
        print(f"Error: An error occurred while processing the file '{file_path}': {err}")
    # Both failure paths end up here.
    return None

In [24]:
import numpy as np

def cosine_similarity(list1, list2):
    """Return the cosine similarity between two collections of skill names.

    Each collection is treated as a binary bag of skills: names are
    lower-cased and de-duplicated, so the cosine of the two 0/1 vectors
    reduces to ``|A & B| / sqrt(|A| * |B|)``.

    Args:
        list1: Iterable of skill strings (list, set, tuple, ...).
        list2: Iterable of skill strings.

    Returns:
        A plain ``float`` between 0.0 and 1.0. 0.0 is returned when either
        collection is empty.
    """
    # Sets give O(1) membership and remove case-only duplicates.
    skills1 = set(map(str.lower, list1))
    skills2 = set(map(str.lower, list2))

    if not skills1 or not skills2:
        return 0.0

    shared = len(skills1 & skills2)
    # Take ONE square root of the product of the sizes. Multiplying two
    # separately rounded norms (sqrt(3) * sqrt(3) == 2.9999999999999996)
    # makes identical sets score slightly above 1.
    return float(shared / np.sqrt(len(skills1) * len(skills2)))

In [13]:
import spacy
# Load the spaCy pipeline stored at 'output/model-best'. The extraction
# helpers in this file read entities labelled 'SKILLS' and 'NAME' from the
# documents it produces.
# NOTE(review): presumably this is the custom NER model trained for this
# project -- confirm the path exists before running this cell.
nlp=spacy.load('output/model-best')
def extraire_competences(doc):
    """Return the distinct texts of the entities labelled 'SKILLS' in *doc*.

    The result is a set, so a skill mentioned several times appears once.
    """
    return {entite.text for entite in doc.ents if entite.label_ == 'SKILLS'}


def extraire_Noms(doc):
    """Return the distinct texts of the entities labelled 'NAME' in *doc*.

    The result is a set: repeated names are merged and no order is kept.
    """
    textes_noms = (entite.text for entite in doc.ents if entite.label_ == 'NAME')
    return set(textes_noms)

In [16]:
def save_similarity(tx, nom1, nom2, similarity, skills1, skills2):
    """Upsert two Consultant nodes and a similarity relationship between them.

    Nothing is written unless ``similarity`` is strictly positive. Otherwise
    both consultants are merged by name, their ``skills`` property is set to
    the given collection (converted to a list), and a relationship whose
    type is ``SIMILARITY_<score rounded to 3 decimals>`` is merged from the
    first consultant to the second, with the raw score stored in ``r.score``.

    Args:
        tx: Object exposing ``run(query, **parameters)``.
        nom1, nom2: Names identifying the two Consultant nodes.
        similarity: Similarity score of the pair.
        skills1, skills2: Iterables of skills for each consultant.
    """
    if not similarity > 0.0:
        return

    rel_type = f"SIMILARITY_{round(similarity, 3)}"
    # Only the relationship type is spliced into the query text; every value
    # is passed as a query parameter.
    cypher = f"""
        MERGE (a:Consultant {{name: $nom1}})
        SET a.skills = $skills1
        MERGE (b:Consultant {{name: $nom2}})
        SET b.skills = $skills2
        MERGE (a)-[r:`{rel_type}`]->(b)
        SET r.score = $similarity
        """
    parametres = {
        "nom1": nom1,
        "nom2": nom2,
        "similarity": similarity,
        "skills1": list(skills1),
        "skills2": list(skills2),
    }
    tx.run(cypher, **parametres)

In [17]:
import os
def _charger_profil(chemin):
    """Return ``(consultant_name, skills)`` for the CV at *chemin*, or None.

    None is returned when the file could not be read or when the NLP
    pipeline finds no 'NAME' entity in it.
    """
    texte = clean_text_from_file(chemin)
    if texte is None:
        # clean_text_from_file has already printed the reason; skip the file
        # instead of passing None to the pipeline.
        return None

    doc = nlp(texte)
    # Take the first NAME entity the pipeline reports. Picking element 0 of a
    # list built from a set would give an arbitrary name that can change
    # from one run to the next.
    nom = next((ent.text for ent in doc.ents if ent.label_ == 'NAME'), None)
    if nom is None:
        return None
    return nom, extraire_competences(doc)


def comparer_tous_les_cv(dossier, uri, username, password):
    """Compare every pair of .docx CVs in *dossier* and store scores in Neo4j.

    Each CV is read and run through the NLP pipeline exactly once to get the
    consultant's name and skill set. CVs that cannot be read or that contain
    no 'NAME' entity are skipped. For every remaining pair, the cosine
    similarity of the skill sets is computed and handed to
    ``save_similarity`` inside a write transaction.

    Args:
        dossier: Directory containing the .docx files.
        uri: Neo4j connection URI.
        username: Neo4j user name.
        password: Neo4j password.
    """
    # Sorted so the pair order (and therefore the direction of the stored
    # relationship) does not depend on the arbitrary order of os.listdir.
    fichiers = sorted(f for f in os.listdir(dossier) if f.endswith('.docx'))

    # Parse each file once up front instead of once per pair.
    profils = []
    for fichier in fichiers:
        profil = _charger_profil(os.path.join(dossier, fichier))
        if profil is not None:
            profils.append(profil)

    # Both the driver and the session are context managers, so the
    # connection pool is released even if a write fails.
    with GraphDatabase.driver(uri, auth=(username, password)) as driver, \
            driver.session() as session:
        for i, (nom1, competences1) in enumerate(profils):
            for nom2, competences2 in profils[i + 1:]:
                similarite = cosine_similarity(competences1, competences2)
                # Save the result in the Neo4j database.
                session.execute_write(
                    save_similarity, nom1, nom2, similarite, competences1, competences2
                )



In [26]:
from neo4j import GraphDatabase

# Neo4j connection settings. Each value can be overridden through an
# environment variable so that credentials do not have to be edited in (or
# committed with) this file. The fallbacks are the original local defaults.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "12345678")

# Folder containing the .docx CVs to compare.
dossier = 'Cv_DataSet'

comparer_tous_les_cv(dossier, uri, username, password)