# Libraries: 

In [8]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ----- ---------------------------------- 1.8/12.8 MB 11.2 MB/s eta 0:00:01
     ------------ --------------------------- 3.9/12.8 MB 11.2 MB/s eta 0:00:01
     ---------------------- ----------------- 7.1/12.8 MB 11.8 MB/s eta 0:00:01
     ------------------------------ -------- 10.0/12.8 MB 11.9 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 12.0 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 10.2 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [58]:
import spacy
from spacy import displacy
from spacy.training import Example
import matplotlib.pyplot as plt
import pickle
import re

In [14]:
def check_overlapping_entities(data):
    for text, annotations in data:
        entities = sorted(annotations["entities"], key=lambda x: x[0])  # Sort by start index
        for i in range(len(entities) - 1):
            start1, end1, label1 = entities[i]
            start2, end2, label2 = entities[i + 1]
            if end1 > start2:  # Overlapping condition
                print(f"Overlap detected between {label1} ({start1}-{end1}) and {label2} ({start2}-{end2})")


In [15]:
check_overlapping_entities(TRAIN_DATA)

Overlap detected between Skills (1136-1248) and Companies worked at (1209-1215)
Overlap detected between Skills (1356-1793) and Companies worked at (1417-1423)
Overlap detected between Companies worked at (34-50) and Companies worked at (34-49)
Overlap detected between Skills (4121-4399) and Companies worked at (4186-4191)
Overlap detected between College Name (2116-2143) and Companies worked at (2128-2144)
Overlap detected between Skills (886-897) and Skills (894-897)
Overlap detected between Degree (1106-1183) and College Name (1153-1184)
Overlap detected between Skills (1734-1745) and Skills (1742-1745)
Overlap detected between Skills (1748-1790) and Skills (1765-1771)
Overlap detected between Skills (1794-1830) and Skills (1811-1815)
Overlap detected between Skills (1844-1873) and Skills (1844-1860)
Overlap detected between Skills (3466-3819) and Companies worked at (3535-3541)
Overlap detected between Skills (6305-7258) and Companies worked at (6861-6870)
Overlap detected between 

In [21]:
def clean_training_data(train_data):
    """Cleans training data by ensuring no overlapping entities and stripping spaces."""
    cleaned_data = []
    for text, annotations in train_data:
        entities = sorted(annotations["entities"], key=lambda x: (x[0], x[1]))  # Sort entities by start position
        has_overlap = False

        # Check for overlaps
        for i in range(len(entities) - 1):
            start1, end1, label1 = entities[i]
            start2, end2, label2 = entities[i + 1]

            if start2 < end1:  # Overlap detected
                has_overlap = True
                break  # Skip this resume

        if not has_overlap:
            # Trim spaces in entity spans
            new_entities = []
            for start, end, label in entities:
                trimmed_text = text[start:end].strip()
                new_start = text.find(trimmed_text, start)  # Get new start position
                new_end = new_start + len(trimmed_text)
                new_entities.append((new_start, new_end, label))

            cleaned_data.append((text, {"entities": new_entities}))  # Store cleaned entities

    return cleaned_data


# Load blank Spacy model
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Clean the training data
TRAIN_DATA_CLEANED = clean_training_data(TRAIN_DATA)
print(len(TRAIN_DATA_CLEANED))

154


# Preprocessing

In [30]:
# Import all datasets
from dataset_1 import TRAIN_DATA_1
from dataset_2 import TRAIN_DATA_2
from dataset_3 import TRAIN_DATA_3
from dataset_4 import TRAIN_DATA_4
from dataset_5 import TRAIN_DATA_5
from test_dataset_1 import test_1
from test_dataset_2 import test_2
# Merge all datasets into one
TRAIN_DATA_MERGED = TRAIN_DATA_1 + TRAIN_DATA_2 + TRAIN_DATA_3 + TRAIN_DATA_4 + TRAIN_DATA_5 +test_1+test_2

# Save the merged dataset to a new file
with open("merged_dataset.py", "w", encoding="utf-8") as f:
    f.write("TRAIN_DATA_MERGED = " + str(TRAIN_DATA_MERGED))


In [112]:
import re
from merged_dataset import TRAIN_DATA_MERGED

# Check a few examples
for text, annotations in TRAIN_DATA_MERGED[:1]:  # Print only the first sample
    entities = annotations.get("entities", [])  # Extract entity list
    print(entities)  # Print (start, end, label) tuples


[(224, 243, 'POSTE'), (245, 264, 'POSTE'), (3274, 3293, 'POSTE'), (3295, 3314, 'POSTE'), (6225, 6244, 'POSTE'), (6246, 6265, 'POSTE'), (13826, 13845, 'POSTE'), (13860, 13879, 'POSTE'), (27198, 27217, 'POSTE'), (27219, 27238, 'POSTE'), (28445, 28464, 'POSTE'), (28466, 28485, 'POSTE'), (28575, 28594, 'POSTE'), (30997, 31016, 'POSTE'), (31018, 31037, 'POSTE'), (38598, 38617, 'POSTE'), (38632, 38651, 'POSTE'), (51970, 51989, 'POSTE'), (51991, 52010, 'POSTE'), (53217, 53236, 'POSTE'), (53238, 53257, 'POSTE'), (53347, 53366, 'POSTE'), (55769, 55788, 'POSTE'), (55790, 55809, 'POSTE'), (63370, 63389, 'POSTE'), (63404, 63423, 'POSTE'), (76742, 76761, 'POSTE'), (76763, 76782, 'POSTE'), (77989, 78008, 'POSTE'), (78010, 78029, 'POSTE'), (78119, 78138, 'POSTE'), (80052, 80071, 'POSTE'), (80073, 80092, 'POSTE'), (83102, 83121, 'POSTE'), (83123, 83142, 'POSTE'), (86053, 86072, 'POSTE'), (86074, 86093, 'POSTE'), (93654, 93673, 'POSTE'), (93688, 93707, 'POSTE')]


In [33]:
print(len(TRAIN_DATA_MERGED))

21


In [34]:
!python -m spacy download fr_core_news_md  # For French model


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^


In [35]:
import spacy
from spacy.training.example import Example

In [36]:
def detect_overlapping_entities(train_data):
    overlapping_samples = []
    total_overlaps = 0
    texts_with_overlaps = 0
    
    for text, annotations in train_data:
        entities = annotations["entities"]
        entities_sorted = sorted(entities, key=lambda x: x[0])  # Sort entities by their start position
        
        overlaps_in_text = 0
        
        # Check for overlaps
        for i in range(len(entities_sorted) - 1):
            start_1, end_1, label_1 = entities_sorted[i]
            start_2, end_2, label_2 = entities_sorted[i + 1]
            
            if start_2 < end_1:  # Overlap detected
                overlapping_samples.append({
                    'text': text,
                    'overlap': [(start_1, end_1, label_1), (start_2, end_2, label_2)],
                    'positions': (start_1, end_1, start_2, end_2)
                })
                overlaps_in_text += 1
        
        # If overlaps are detected in this text, increase the counter
        if overlaps_in_text > 0:
            texts_with_overlaps += 1
            total_overlaps += overlaps_in_text

    return overlapping_samples, texts_with_overlaps, total_overlaps

In [37]:
# Call function to detect overlaps and count them
overlaps, texts_with_overlaps, total_overlaps = detect_overlapping_entities(TRAIN_DATA_MERGED)

# Output results
print(f"Number of texts with overlaps: {texts_with_overlaps}")
print(f"Total number of overlaps: {total_overlaps}")

# If you want to print the overlapping samples, uncomment this:
# for overlap in overlaps:
#     print(f"Overlap detected in: {overlap['text']}")
#     print(f"Overlapping Entities: {overlap['overlap']}")
#     print(f"Positions: {overlap['positions']}\n")

Number of texts with overlaps: 8
Total number of overlaps: 449


In [38]:
def has_overlap(entity1, entity2):
    """Check if two entity spans overlap"""
    return not (entity1[1] <= entity2[0] or entity1[0] >= entity2[1])

def fix_overlaps(data):
    """Remove or adjust overlapping entities"""
    cleaned_data = []
    
    for text, annotations in data:
        entities = sorted(annotations["entities"], key=lambda x: (x[0], x[1]))  # Sort by start position
        non_overlapping_entities = []

        for entity in entities:
            if all(not has_overlap(entity, existing) for existing in non_overlapping_entities):
                non_overlapping_entities.append(entity)

        cleaned_data.append((text, {"entities": non_overlapping_entities}))
    
    return cleaned_data

In [39]:
# Fix overlapping entities
fixed_train_data = fix_overlaps(TRAIN_DATA_MERGED)

In [40]:
# Call function to detect overlaps and count them
overlaps, texts_with_overlaps, total_overlaps = detect_overlapping_entities(fixed_train_data)

# Output results
print(f"Number of texts with overlaps: {texts_with_overlaps}")
print(f"Total number of overlaps: {total_overlaps}")

Number of texts with overlaps: 0
Total number of overlaps: 0


# Training: 

In [41]:
# Load blank Spacy model
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Add labels to the NER model
for _, annotations in fixed_train_data:
    for start, end, label in annotations["entities"]:
        ner.add_label(label)

# Train the model
optimizer = nlp.begin_training()
nlp.config["training"]["optimizer"]["learn_rate"] = 0.0001  

best_loss = float('inf')  
patience_counter = 0  
patience_threshold = 5 

# List to store the losses  
losses_list = []  

for epoch in range(50):  # Adjust epochs as needed
    losses = {}
    for text, annotations in fixed_train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
       
        nlp.update([example], drop=0.001, losses=losses)

    # Store the loss for the current epoch  
    losses_list.append(losses['ner'])
    print(f"Epoch {epoch} Loss: {losses['ner']}")

    # Early stopping condition  
    if losses['ner'] < best_loss:  
        best_loss = losses['ner']  
        patience_counter = 0  
    else:  
        patience_counter += 1  
    
    if patience_counter >= patience_threshold:  
        print(f"Early stopping at epoch {epoch}.")  
        break 

# Save the model
nlp.to_disk("custom_ner_model")

 Étudiant en Intelligence Artificie..." with entities "[(57, 79, 'COMPETENCES'), (81, 106, 'CONTACT'), (1...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


Epoch 0 Loss: 87682.27471699925
Epoch 1 Loss: 10407.436692103383
Epoch 2 Loss: 7733.415592869509
Epoch 3 Loss: 7017.307402287294
Epoch 4 Loss: 6516.3333772274655
Epoch 5 Loss: 7529.477312932133
Epoch 6 Loss: 6226.654511233508
Epoch 7 Loss: 5089.623256621429
Epoch 8 Loss: 6297.587068647356
Epoch 9 Loss: 4355.913330746995
Epoch 10 Loss: 3841.469888915218
Epoch 11 Loss: 3761.360140399487
Epoch 12 Loss: 3460.1770082560483
Epoch 13 Loss: 2713.702052448542
Epoch 14 Loss: 2338.266921237242
Epoch 15 Loss: 2512.3213707148266
Epoch 16 Loss: 2185.789734778288
Epoch 17 Loss: 2473.3136353169366
Epoch 18 Loss: 2115.1534215477404
Epoch 19 Loss: 2025.4219211069874
Epoch 20 Loss: 1975.3808352535443
Epoch 21 Loss: 1911.032777137726
Epoch 22 Loss: 1718.755414368
Epoch 23 Loss: 1819.2172411206334
Epoch 24 Loss: 1984.9557808629404
Epoch 25 Loss: 1805.861596494536
Epoch 26 Loss: 1887.4548255835
Epoch 27 Loss: 1723.4438220222096
Early stopping at epoch 27.


# Test: 

In [43]:
# Load trained model
nlp_ner = spacy.load("custom_ner_model")

In [44]:
# Import all test datasets
from test_dataset_3 import test_3


# Merge all datasets into one
TEST_DATA_MERGED = test_3

# Save the merged dataset to a new file
with open("merged_TEST_dataset.py", "w", encoding="utf-8") as f:
    f.write("TEST_DATA_MERGED = " + str(TEST_DATA_MERGED))


In [117]:
import re
from merged_TEST_dataset import TEST_DATA_MERGED

# Cleaning 

In [47]:
# Call function to detect overlaps and count them
overlaps, texts_with_overlaps, total_overlaps = detect_overlapping_entities(TEST_DATA_MERGED)

# Output results
print(f"Number of texts with overlaps: {texts_with_overlaps}")
print(f"Total number of overlaps: {total_overlaps}")

Number of texts with overlaps: 12
Total number of overlaps: 238


In [48]:
# Fix overlapping entities
fixed_test_data = fix_overlaps(TEST_DATA_MERGED)

In [49]:
# Call function to detect overlaps and count them
overlaps, texts_with_overlaps, total_overlaps = detect_overlapping_entities(fixed_test_data)

# Output results
print(f"Number of texts with overlaps: {texts_with_overlaps}")
print(f"Total number of overlaps: {total_overlaps}")


Number of texts with overlaps: 0
Total number of overlaps: 0


In [50]:
print(len(fixed_test_data))

17


In [60]:
import spacy
from spacy.training import Example
from spacy.scorer import Scorer

def evaluate_ner(nlp, test_data):
    """Evaluate a spaCy NER model using proper entity alignment."""
    scorer = Scorer()
    examples = []
    for text, annotations in test_data:
        # Create a Doc object for the text and apply the model's predictions
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        example.predicted = nlp(text)  # Process with the model
        examples.append(example)
    
    # Calculate scores using spaCy's scorer
    scores = scorer.score(examples)
    return {
        "precision": scores["ents_p"],
        "recall": scores["ents_r"],
        "f1_score": scores["ents_f"],
        "details": scores["ents_per_type"]  # Metrics per entity type
    }

In [61]:
# Load your trained NER model  
nlp_ner = spacy.load("custom_ner_model")

# Sample test data (replace with your actual test data)  
test_data = fixed_test_data 

metrics = evaluate_ner(nlp, test_data)
print(f"Overall Precision: {metrics['precision']:.2f}")
print(f"Overall Recall: {metrics['recall']:.2f}")
print(f"Overall F1: {metrics['f1_score']:.2f}")

Overall Precision: 0.23
Overall Recall: 0.09
Overall F1: 0.13


In [60]:
# Define your validation dataset (format: text, {"entities": [(start, end, "LABEL")]})
VALIDATION_DATA = fixed_train_data[2:6]

# Convert the validation data into spaCy's Example format
examples = []
for text, annotations in VALIDATION_DATA:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    examples.append(example)

# Evaluate the model on the validation dataset
metrics = nlp.evaluate(examples)

# Print the evaluation metrics: Precision, Recall, F1 score
print(f"Precision: {metrics['ents_p']:.4f}")
print(f"Recall: {metrics['ents_r']:.4f}")
print(f"F1 Score: {metrics['ents_f']:.4f}")


Precision: 0.8607
Recall: 0.9181
F1 Score: 0.8885


# Extracting from PDFs:

In [58]:
#!pip install pymupdf 
!pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
Downloading widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
   ---------------------------------------- 0.0/2.3 MB ? eta -:--:--
   ----------------------------------- ---- 2.1/2.3 MB 11.8 MB/s eta 0:00:01
   ---------------------------------------- 2.3/2.3 MB 10.2 MB/s eta 0:00:00
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.5 jupyterlab-widgets-3.0.13 widgetsnbextension-4.0.13



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [67]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"  # Extract text from each page
    return text

# Example usage
pdf_path = ["CV_GHIZLANE AIT LMOUDDEN.pdf","CV_GHIZLANE AIT LMOUDDEN.pdf"] # Replace with the actual PDF file path
extracted_text_list = []
for cv in pdf_path:
    
    cv_text = extract_text_from_pdf(cv)
    extracted_text_list.append(cv_text)

print(cv_text)  # Print extracted text

Langages de Programmation : Python, Java , CSS, HTML, PHP , JavaScript .
 Framework de Développement :JavaEE(Java Enterprise Edition) , Spring Boot. 
Bases de Données : MySQL , NoSQL  . 
Intelligence Artificielle et Big Data : Machine Learning , Deep Learning ,Traitement du
Langage Naturel (NLP) , Vision par Ordinateur , Big Data ( hadoop , Apache Pig ) . 
DevOps & DataOps .
GHIZLANE AIT LMOUDDEN
PROFIL
En tant qu'étudiante en deuxième année d'intelligence artificielle et d'ingénierie des données à
l'École Supérieure de Technologie de Nador, je suis activement à la recherche d'un stage de fin
d'études d'une durée de deux mois , à partir du mois d'avril. 
Ce stage représente pour moi une opportunité précieuse de mettre en pratique mes
connaissances académiques tout en acquérant une expérience professionnelle significative. 
 EXPÉRIENCE PROFESSIONNELLE 
 FORMATIONS 
    2ᵉ année en Intelligence Artificielle et Ingénierie des Données 
    École Supérieure de Technologie de Nador.
  2023  

# Ranking CVs based on the job description

In [98]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [99]:
# Load your trained SpaCy NER model
nlp = spacy.load("custom_ner_model")

In [104]:
from CVs import cvs_list as cvs_text

In [108]:
#Job posting:
job_text = """Date
31-07-2024
Direction
JUNIOR DATA SCIENTIST
Contrat
Localisation
Casablanca
Fonction
Autre fonction
Contexte du recrutement et définition de poste
We are looking for a Junior Data Scientist. They are expected to conduct in-depth predictive analyses using complex data to business units make informed decisions. Using statistical modeling and machine learning techniques, the Data Scientist will create predictive models and propose corresponding metrics in order to evaluate the efficiency of these models.

Key Responsibilities:

Workflow Integration:
Prepare and integrate developments into existing workflows.
Comprehensive Documentation:
Thoroughly document critical parts of the code.
Document global workflows and the significance and use of data.
Data Evaluation and Communication:
Evaluate data quality before usage.
Communicate date-related issues to the data management.
Coding Standards and Documentation:
Adhere to coding standards, stylistic conventions, and provided guidelines.
Critically evaluate and propose improvements to existing code base.
Prediction and Model Evaluation:
Help define business metrics to evaluate data prediction outputs.
Define technical metrics to evaluate model training.
Technical Procedures:
Document and model technical procedures specific to Data Science.
Profil recherché
Bachelor or Masters degree in Data Science, Computer Science, or a related field.
Knowledge in machine learning libraries (pandas, numpy, scikit-learn, pytorch, tensorflow and others).
Knowledge in Python programming language.
Understanding of machine learning models and data preprocessing.
Strong analytical and problem-solving skills.
Ability to work in a team."""

In [100]:
def extract_unique_entities(text):
    """Extract unique entities from text using the trained NER model."""
    doc = nlp(text)
    entities = {}
    for ent in doc.ents:
        if ent.label_ not in entities:
            entities[ent.label_] = set()  # Store values as sets to ensure uniqueness
        entities[ent.label_].add(ent.text.lower())  # Store lowercase for consistency
    
    # Convert sets back to lists
    return {key: list(values) for key, values in entities.items()}

In [102]:
def run_model_on_cvs(cv_texts):
    """Runs the NER model on a list of CV texts and returns the entities extracted."""
    cv_entities_list = []
    
    for cv_text in cv_texts:
        # Extract unique entities for each CV
        cv_entities = extract_unique_entities(cv_text)
        cv_entities_list.append(cv_entities)
    
    return cv_entities_list

def extract_unique_entities_for_all_cvs(cv_entities_list):
    """Extract unique entities from all CVs into a list."""
    unique_entities_list = []
    for cv_entities in cv_entities_list:
        unique_entities = {key: list(set(values)) for key, values in cv_entities.items()}  # Ensure uniqueness
        unique_entities_list.append(unique_entities)
    
    return unique_entities_list

In [101]:
def compute_similarity(cv_entities, job_entities, keyword_weights):
    """Computes similarity between CV entities and job description entities using TF-IDF & weighted keywords."""
    cv_text = " ".join([" ".join(values) for values in cv_entities.values()])
    job_text = " ".join([" ".join(values) for values in job_entities.values()])
    
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([cv_text, job_text])
    
    # Compute cosine similarity
    similarity_score = cosine_similarity(vectors[0], vectors[1])[0][0]
    
    # Apply recruiter-defined weights
    weighted_score = 0
    for keyword, weight in keyword_weights.items():
        if any(keyword in " ".join(cv_entities.get(k, [])) for k in cv_entities):
            weighted_score += weight
    
    final_score = similarity_score * weighted_score
    return final_score

In [103]:
def rank_cvs_by_similarity(cv_entities_list, job_entities, keyword_weights):
    """Compute similarity between each CV and the job posting and rank them."""
    ranked_cvs = []
    
    for i, cv_entities in enumerate(cv_entities_list):
        similarity_score = compute_similarity(cv_entities, job_entities, keyword_weights)
        ranked_cvs.append({"name": f"CV {i+1}", "score": similarity_score, "entities": cv_entities})
    
    # Rank CVs by similarity score (descending order)
    ranked_cvs.sort(key=lambda x: x["score"], reverse=True)
    
    return ranked_cvs

In [109]:
# Example recruiter-defined weights
keyword_weights = {"python": 0.8, "java": 0.4, "baccalaureate": 0.6}

# Example job posting
job_entities = extract_unique_entities(job_text)

In [111]:
# 1. Run model on CVs and extract entities
cv_entities_list = run_model_on_cvs(cvs_text)

# 2. Ensure entities in each CV are unique
unique_cv_entities_list = extract_unique_entities_for_all_cvs(cv_entities_list)

# 3. Rank the CVs based on similarity with job posting
ranked_cvs = rank_cvs_by_similarity(unique_cv_entities_list, job_entities, keyword_weights)

# Print ranked CVs
for rank, cv in enumerate(ranked_cvs, 1):
    print(f"{rank}. {cv['name']} - Similarity Score: {cv['score']:.4f}")

1. CV 2 - Similarity Score: 0.4068
2. CV 1 - Similarity Score: 0.1584
