## Reading files from OCR

In [1]:
!pip install pymupdf
!sudo apt install tesseract-ocr
!sudo apt install libtesseract-dev
!pip install pytesseract
!sudo apt install tesseract-ocr-fra
!pip install googletrans==4.0.0-rc1
!pip install transformers torch spacy pandas
!pip install textblob_fr

Collecting pymupdf
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.4
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 29 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http:

In [None]:
# pytesseract.pytesseract.tesseract_cmd = r'/kaggle/working/tesseract' # Set the path to your tesseract executable

In [5]:
import fitz  # PyMuPDF
from PIL import Image
import pandas as pd
import os
# from pdf2image import convert_from_path
import pytesseract
from googletrans import Translator
import re
from textblob import TextBlob
from textblob_fr import PatternTagger, PatternAnalyzer

def translate_french_to_english(text):
    translator = Translator()
    translation = translator.translate(text, src='fr', dest='en')
    return translation.text

def ocr_pdf(pdf_path, language='eng'):
    """
    Extracts text from a PDF file using OCR.

    Args:
    pdf_path: Path to the PDF file.
    language: Language of the text ('eng' for English, 'fra' for French).

    Returns:
    Extracted text as a string.
    """
    doc = fitz.open(pdf_path)
    text = ''
    for page in doc:
        pix = page.get_pixmap()
        # Create a PIL Image object from the pixmap bytes
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        text += pytesseract.image_to_string(img, lang=language)
    return text

def clean_text(text):
    """Cleans job description text before feeding it into a model."""
    # Remove unwanted newline characters
    text = text.replace("\n", " ").replace("\x0c", " ")

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Replace OCR-specific artifacts
    text = re.sub(r"\+ _", "•", text)  # Standardize bullet points
    text = re.sub(r"{|}", "", text)  # Remove curly braces
    text = re.sub(r"(?<!\d),(?!\d)", "", text)  # Remove misplaced commas

    return text

def correct_ocr_errors_fr(text):
    """Automatically correct OCR errors using TextBlob-FR."""

    # Remove unwanted characters
    text = text.replace("\n", " ").replace("\x0c", " ")
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove or standardize bullet points
    text = re.sub(r"\+\.", "", text)  # Remove '+.' (OCR bullet point misinterpretation)
    text = re.sub(r"\+", "", text)    # Remove standalone '+'
    # Apply French grammar and spelling correction
    blob = TextBlob(text, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    corrected_text = str(blob.correct())

    return corrected_text

def correct_ocr_errors_en(text):
    """Automatically corrects OCR errors using TextBlob for English text."""

    # Remove unwanted characters
    text = text.replace("\n", " ").replace("\x0c", " ")
    text = re.sub(r'\s+', ' ', text).strip()

    # Apply English grammar and spelling correction
    blob = TextBlob(text)
    corrected_text = str(blob.correct())

    return corrected_text

In [13]:
import fitz  # PyMuPDF
from PIL import Image
import pandas as pd
import os
import pytesseract
from googletrans import Translator
import re
from textblob import TextBlob
from textblob_fr import PatternTagger, PatternAnalyzer
import time

def translate_french_to_english(text):
    translator = Translator()
    # Add error handling and retry mechanism
    for _ in range(3):  # Retry up to 3 times
        try:
            translation = translator.translate(text, src='fr', dest='en')
            return translation.text  # Return if successful
        except TypeError as e:
            print(f"Translation error: {e}. Retrying in 5 seconds...")
            time.sleep(5)  # Wait before retrying
    return None  # Return None if all retries fail

def ocr_pdf(pdf_path, language='eng'):
    """
    Extracts text from a PDF file using OCR.

    Args:
    pdf_path: Path to the PDF file.
    language: Language of the text ('eng' for English, 'fra' for French).

    Returns:
    Extracted text as a string.
    """
    doc = fitz.open(pdf_path)
    text = ''
    for page in doc:
        pix = page.get_pixmap()
        # Create a PIL Image object from the pixmap bytes
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        text += pytesseract.image_to_string(img, lang=language)
    return text

def clean_text(text):
    """Cleans job description text before feeding it into a model."""
    # Remove unwanted newline characters
    text = text.replace("\n", " ").replace("\x0c", " ")

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Replace OCR-specific artifacts
    text = re.sub(r"\+ _", "•", text)  # Standardize bullet points
    text = re.sub(r"{|}", "", text)  # Remove curly braces
    text = re.sub(r"(?<!\d),(?!\d)", "", text)  # Remove misplaced commas

    return text

def correct_ocr_errors_fr(text):
    """Automatically correct OCR errors using TextBlob-FR."""

    # Remove unwanted characters
    text = text.replace("\n", " ").replace("\x0c", " ")
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove or standardize bullet points
    text = re.sub(r"\+\.", "", text)  # Remove '+.' (OCR bullet point misinterpretation)
    text = re.sub(r"\+", "", text)    # Remove standalone '+'
    # Apply French grammar and spelling correction
    blob = TextBlob(text, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    corrected_text = str(blob.correct())

    return corrected_text

def correct_ocr_errors_en(text):
    """Automatically corrects OCR errors using TextBlob for English text."""

    # Remove unwanted characters
    text = text.replace("\n", " ").replace("\x0c", " ")
    text = re.sub(r'\s+', ' ', text).strip()

    # Apply English grammar and spelling correction
    blob = TextBlob(text)
    corrected_text = str(blob.correct())

    return corrected_text

In [9]:
# Directory containing PDF files
pdf_directory = "/content/"  # Adjust this path

# List of PDF files
pdf_files = ["Scrum.pdf", "Data Engineer.pdf", "Data Analyst.pdf"]

# Initialize an empty list to store data
data = []

# Process each PDF file
for pdf in pdf_files:
    pdf_path = os.path.join(pdf_directory, pdf)

    # Extract text using OCR
    extracted_text = correct_ocr_errors_fr(clean_text(ocr_pdf(pdf_path, language='fra')))

    # Translate to English
    translated_text = correct_ocr_errors_en(translate_french_to_english(extracted_text))

    # Append data to the list
    data.append({
        "demand_id": os.path.splitext(pdf)[0],
        "extracted_text": extracted_text,
        "translated_text": translated_text
    })

# Convert list to DataFrame
df = pd.DataFrame(data)

'/content/Scrum.pdf'

In [15]:
df

Unnamed: 0,demand_id,extracted_text,translated_text
0,Scrum,Serum Master- Contrat de 12 moist Dieu : Montr...,Serum Master- Contract of 12 Lists God: Montre...
1,Data Engineer,Most: Data Engineer Type de contrat: Contrat ...,Most: Data Engineer Type of contract: Contract...
2,Data Analyst,Profit Data Analyst - Marketing Analytics. A p...,Profit Data Analyst - Marketing Analytics.Abou...


In [17]:
df['translated_text'][1]

'Most: Data Engineer Type of contract: Contract "Late Debut: Was as possible sure of the mandate: 12 months Late Limited of Depot: November 6 with Oh words of Enterprise Retailnova But A leader of the Detail Trade Letter with an Omnichannel distribution branch present in Traders all of Territory. Enterprise combined technological innovation and operational excellence for optimism its processesAnticipate YES Lesions des Customers and minimize the performance. In the Are of its Retailnova digital transformation strategic invested passively.Its transaction and customer operational donations.He Data Engineer will be responsible for the design of the development of the employment of the Donned pipelines necessary for the lamentation of the Data Take of Enterprise.In addition to the new RED treatment architecture in insurance in insurance in place in the Robustness to Calamity Well Performance of the solutions deployed.Detailed responsibilities: Design and develop optimism pipelines using Az

In [22]:
import re
import torch
import pandas as pd
from transformers import pipeline
# Load the pre-trained BERT model for NER
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
df["ner_results"] = df["translated_text"].apply(lambda text: ner_pipeline(text))

def extract_features(ner_results, translated_text): # Add translated_text as an argument
    skills, experience, languages, availability = [], [], [], []

    for entity in ner_results:
        text, label = entity['word'], entity['entity']

    # Extract Language Requirements using regex and keyword matching
    languages_regex = re.findall(r'(?i)(english|french|anglais|français)', translated_text) # Case-insensitive
    languages.extend(languages_regex)

    # Add language requirements to the dictionary
    language_requirements = {
        "languages": list(set(languages)) # Unique languages
    }

    return {
        "languages": language_requirements # Update languages to be a dictionary

    }

# Apply Feature Extraction (Updated)
df["features"] = df.apply(lambda row: extract_features(row["ner_results"], row["translated_text"]), axis=1)

# Access language requirements from the 'features' column
df["languages"] = df["features"].apply(lambda x: x["languages"]["languages"]) # Extract list of languages

# ... (Rest of the code) ...

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [25]:
df

Unnamed: 0,demand_id,extracted_text,translated_text,ner_results,features,english_required,french_required,languages,experience
0,Scrum,Serum Master- Contrat de 12 moist Dieu : Montr...,Serum Master- Contract of 12 Lists God: Montre...,"[{'entity': 'I-LOC', 'score': 0.98448527, 'ind...","{'languages': {'languages': ['French', 'Englis...",True,True,"[French, English]",
1,Data Engineer,Most: Data Engineer Type de contrat: Contrat ...,Most: Data Engineer Type of contract: Contract...,"[{'entity': 'I-ORG', 'score': 0.5615247, 'inde...","{'languages': {'languages': ['French', 'Englis...",True,True,"[French, English]",
2,Data Analyst,Profit Data Analyst - Marketing Analytics. A p...,Profit Data Analyst - Marketing Analytics.Abou...,"[{'entity': 'I-ORG', 'score': 0.98849547, 'ind...","{'languages': {'languages': ['French', 'Englis...",True,True,"[French, English]",


In [24]:
import re

def extract_experience(text):
    """Extracts experience requirement from text using regex."""
    experience_match = re.search(r"(\d+)\+? years?", text, re.IGNORECASE)
    if experience_match:
        return experience_match.group(1) + " years"  # Extract number and add "years"
    else:
        return None  # Return None if no experience requirement found

# Apply experience extraction to the translated text column
df["experience"] = df["translated_text"].apply(extract_experience)

In [29]:
 df["extracted_text"][0]

'Serum Master- Contrat de 12 moist Dieu : Montreal Late de démarrage : Was que possible Sure : 12 moist avec possibility de renouvellement Dans He care d\'un mandate d\'importance stratégique nous recherchons un(e) Serum Master experiment(e) pour guide une quite Agile multidisciplnaire au sein d\'un environment stimulant et en line transformation numérique. Tom de l\'enterprise : Alimora Troupe Slogan : Cuttiver l\'avenue burri l\'excellence Presentation de l\'enterprise Fond en 1998, Alimora Troupe est un after major de l\'agroalimentaire unable en Amérique du Word. Spécialisé dans la production la transformation et a distribution de products alimentaires de quality He group s\'engage à affair une lamentation spine et accessible tout en respecting l\'environment et les communautés Scales. Avec plus de 1 500 employs répartis sur 7 sites de production et une presence dans plus de 12 pays Alimora se distinguee par On innovation constant son excellence ‘opérationnelle et sa capacity à ant

In [28]:
 df["translated_text"][0]

'Serum Master- Contract of 12 Lists God: Montreal Late Starting: WAS that possible sure: 12 MONTH with possibility of renewal in he care of a mandate of strategic importance He are looking for a Serum Master Experiment (E) for Guide an Agile Multidpilar Quite within a stimulating Environment and in Line Digital Transformation.Tom de l\'Enterprise: Alimora Slogan troop: Cultivating Burri Avenue Excellence Presentation of the Enterprise Background in 1998, Alimora Troupe is an after major of the Word\'s united food industry.Specializing in production The processing and distribution of quality food products of Group undertakes to affect a spine and accessible lamentation while respecting the environment and the Scales communities.With more than 1,500 Employs spread over 7 production sites and a presence in more than 12 Alimora countries is distinguished by constant innovation His excellence ‘operational and its capacity to anticipate consolation tendencies.Remains of expertise: • Transfor

In [11]:
correct_ocr_errors_fr(clean_text(ocr_pdf('/content/Scrum.pdf', language='fra')))

'Serum Master- Contrat de 12 moist Dieu : Montreal Late de démarrage : Was que possible Sure : 12 moist avec possibility de renouvellement Dans He care d\'un mandate d\'importance stratégique nous recherchons un(e) Serum Master experiment(e) pour guide une quite Agile multidisciplnaire au sein d\'un environment stimulant et en line transformation numérique. Tom de l\'enterprise : Alimora Troupe Slogan : Cuttiver l\'avenue burri l\'excellence Presentation de l\'enterprise Fond en 1998, Alimora Troupe est un after major de l\'agroalimentaire unable en Amérique du Word. Spécialisé dans la production la transformation et a distribution de products alimentaires de quality He group s\'engage à affair une lamentation spine et accessible tout en respecting l\'environment et les communautés Scales. Avec plus de 1 500 employs répartis sur 7 sites de production et une presence dans plus de 12 pays Alimora se distinguee par On innovation constant son excellence ‘opérationnelle et sa capacity à ant

In [12]:
correct_ocr_errors_fr(clean_text(ocr_pdf('/content/Data Engineer.pdf', language='fra')))

'Most: Data Engineer  Type de contrat: Contrat " Late de debut: Was que possible  Sure du mandate: 12mois  Late limited de depot: 6 november à oh Propos de enterprise RetailNova out un Leader du setter du commerce de detail avec un rameau de distribution omnicanal present à traders tout He territory. L\'enterprise combine innovation technologique et excellence opérationnelle pour optimism ses processes anticiper Yes lesions des clients et maximiser la performance. Dans He care de sa strategic de transformation numérique RetailNova invested massivement dans A modernisation de ses. infrastructure de donned pour accélérer a rise de decision fond sur des analysis Context du mandate : RetailNova met en sure une nouvelle platform d\'enterprise base sur des technologies cloud akin de centralized sécuriser et valoriser ses donned opérationnelles transactionnelles et clients. He project stratégique vise à moderniser l\'écosystème analytique à automatiser Yes klux de donned à améliorer la gouver

In [10]:
correct_ocr_errors_fr(clean_text(ocr_pdf('/content/Data Analyst.pdf', language='fra')))

'Profit Data Analyst - Marketing Analytics. A propos de Veltrida Vettrixia Technologies And. estate enterprise specialise dans Yes solutions de donned etlinteligence marketing. Was à Montreal Veltrixia accompany depuis plus de 15 and des enterprises word-américaines dans l\'optimisation de pleurs performances grace à des technologies analytiques advances. L\'enterprise rise sur l\'innovation la collaboration et l\'excellence pour propulser ses clients very He future numérique. Context Dans He care du développement de notre quite analytique marketing nous recherchons une analyst de donned pour traveller sur des projects lips à la collected l\'analyse tell visualisation de donned marketing. Vous were responsible de la maintenance et de L\'evolution de yeux de donned covenant de sources varies akin de soutenir Yes squires dans peur rise de decision base sur Yes donned. Details du post  Sure : 10 moist (renouvelable)  Type: Temps plain " Rode de travail : Bride (2 hours par remained au bur

In [8]:
translate_french_to_english(extracted_text)

'Serum Master- Contract of 12 Mists God: Montreal Late Starting: WAS that possible sure: 12 MONTH with possibility of renewal in he care of a mandate of strategic importance We are looking for a Serum Master Experiment (E) for Guide an Agile Multidpilar Quite within a stimulating Environment and in Line Digital Transformation.Tom de l\'Enterprise: Alimora Slogan troop: Cutivating Burri Avenue Excellence Presentation of the Enterprise Background in 1998, Alimora Troupe is an after major of the Word\'s united food industry.Specializing in production The processing and distribution of quality food products HE Group undertakes to affect a spine and accessible lamentation while respecting the environment and the Scales communities.With more than 1,500 Employs spread over 7 production sites and a presence in more than 12 Alimora countries is distinguished by constant innovation His excellence ‘operational and its capacity to anticipate consolation tendencies.Domains of expertise: • Transform

In [None]:

# Display Named Entity Recognition results
df[["demand_id", "ner_results"]]

def extract_features(ner_results):
    skills, experience, languages, availability = [], [], [], []

    for entity in ner_results:
        text, label = entity['word'], entity['entity']

        # Extract Skills (Example: JOB_ROLE, SKILL)
        if "ORG" in label or "MISC" in label:
            skills.append(text)

        # Extract Experience (Detect numbers followed by "years" or "experience")
        if any(word in text.lower() for word in ["year", "experience", "months"]):
            experience.append(text)

        # Extract Language Requirements (Common languages mentioned)
        if text.lower() in ["english", "french"]:
            languages.append(text)

        # Extract Month Availability
        if text.lower() in [
            "january", "february", "march", "april", "may", "june",
            "july", "august", "september", "october", "november", "december"
        ]:
            availability.append(text)

    return {
        "skills": list(set(skills)),
        "experience": list(set(experience)),
        "languages": list(set(languages)),
        "availability": list(set(availability))
    }

# Apply Feature Extraction
df["features"] = df["ner_results"].apply(extract_features)

# Convert extracted features into separate columns
df["skills"] = df["features"].apply(lambda x: x["skills"])
df["experience"] = df["features"].apply(lambda x: x["experience"])
df["languages"] = df["features"].apply(lambda x: x["languages"])
df["availability"] = df["features"].apply(lambda x: x["availability"])

# Drop temporary columns
df.drop(columns=["ner_results", "features"], inplace=True)

# # Save to CSV
# df.to_csv("extracted_features.csv", index=False)

# # Display Final DataFrame
# import ace_tools as tools
# tools.display_dataframe_to_user(name="Extracted Features", dataframe=df)


In [None]:
df

Unnamed: 0,demand_id,extracted_text,translated_text,skills,experience,languages,availability
0,Scrum,Serum Master- Contrat de 12 moist Dieu : Montr...,Serum Master- Contract of 12 Lists God: Montre...,"[##e, English, Enterprise, ', Products, AB, Al...",[],[English],[]
1,Data Engineer,Most: Data Engineer + Type de contrat: Contrat...,Most: Data Engineer + Type of contract: Contra...,"[Re, V, Data, ##s, ##quire, ##bri, ##tti, A, F...",[],[],[]
2,Data Analyst,Profit Data Analyst - Marketing Analytics. A p...,Profit Data Analyst - Marketing Analytics.Abou...,"[##xia, ##ly, V, ##rix, American, I, English, ...",[],"[English, French]",[]


In [None]:
df['skills']

0    [##e, English, Enterprise, ', Products, AB, Al...
1    [Re, V, Data, ##s, ##quire, ##bri, ##tti, A, F...
2    [##xia, ##ly, V, ##rix, American, I, English, ...
Name: skills, dtype: object

In [None]:
df['translated_text'][1]

In [None]:
import spacy
import pandas as pd

# Load the NLP model
nlp = spacy.load("en_core_web_sm")

# Define a list of common skills for job descriptions (Can be extended)
custom_skills = {
    "Python", "SQL", "Machine Learning", "Deep Learning", "Data Science",
    "TensorFlow", "PyTorch", "Excel", "Power BI", "Tableau", "Big Data",
    "NLP", "Cloud Computing", "AWS", "GCP", "Azure", "Spark", "Kubernetes",
    "Docker", "Git", "Java", "C++", "Pandas", "NumPy", "SciPy"
}

def extract_skills(text):
    """Extracts skills from text using spaCy NER and keyword matching."""
    doc = nlp(text)
    skills = set()

    for ent in doc.ents:
        if ent.label_ in {"ORG", "PRODUCT", "WORK_OF_ART"}:  # Relevant NER labels
            skills.add(ent.text.strip())

    # Check for keyword matches in custom skill list
    words = text.split()
    matched_skills = {word for word in words if word in custom_skills}

    # Combine NER-extracted skills and keyword-matched skills
    skills.update(matched_skills)

    return list(skills)

# Apply skill extraction to the translated text column
df["skills"] = df["translated_text"].apply(extract_skills)


In [None]:
df['skills'][2]

['Solid',
 'Dax, Power Query',
 'MS SQL Server',
 'CRM',
 'Etlinteligence Marketing',
 'SQL',
 'Power Bi',
 'Visualization of marketing data',
 'Google Analytics']

In [None]:
import spacy
import pandas as pd
!python -m spacy download fr_core_news_sm

# Load the French NLP model
nlp = spacy.load("fr_core_news_sm")

# Define a list of common skills in French (Can be extended)
custom_skills_fr = {
    "Python", "SQL", "Apprentissage automatique", "Deep Learning", "Science des données",
    "TensorFlow", "PyTorch", "Excel", "Power BI", "Tableau", "Big Data",
    "NLP", "Cloud Computing", "AWS", "GCP", "Azure", "Spark", "Kubernetes",
    "Docker", "Git", "Java", "C++", "Pandas", "NumPy", "SciPy", "Statistiques",
    "Analyse de données", "Développement web", "Gestion de projet", "Cybersécurité"
}

def extract_skills_fr(text):
    """Extracts skills from French job descriptions using spaCy NER and keyword matching."""
    doc = nlp(text)
    skills = set()

    for ent in doc.ents:
        if ent.label_ in {"ORG", "MISC", "PRODUCT"}:  # Relevant French NER labels
            skills.add(ent.text.strip())

    # Check for keyword matches in custom skill list
    words = text.split()
    matched_skills = {word for word in words if word in custom_skills_fr}

    # Combine NER-extracted skills and keyword-matched skills
    skills.update(matched_skills)

    return list(skills)

# Apply skill extraction to the French text column
df["skills"] = df["extracted_text"].apply(extract_skills_fr)

Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
df['skills'][2]

['Les solutions de données\netlinteligence marketing',
 'Google Analytics',
 'ETL',
 'Concevoir des modèles',
 'MS SQL Server',
 'Optimiserles',
 'Databricks et SSMS',
 'CRM',
 'Solide',
 'SQL',
 'DAX',
 'Power Bi',
 'Power Query',
 'Power BL',
 'Power BI']

In [None]:
df['extracted_text'][0]

'Scrum Master- Contrat de 12 mois\n\n \n\nLieu : Montréal\nDate de démarrage : Dès que possible\nDurée : 12 mois, avec possibilité de renouvellement\n\nDans Le cadre d\'un mandat d\'importance stratégique, nous recherchons un(e) Serum\nMaster expérimenté(e) pour guider une équipe Agile multidisciplnaire au sein d\'un\nenvironnement stimulant et en pleine transformation numérique.\n\nNom de l\'entreprise : Alimora Groupe\n\nStogan : Cuttiver l\'avenir, nourri l\'excellence\n\n \n\nPrésentation de l\'entreprise\n\nFondé en 1998, Alimora Groupe est un acteur majeur de l\'agroalimentaire durable en\nAmérique du Nord. Spécialisé dans la production, la transformation et a distribution de\nproduits alimentaires de qualité, Le groupe s\'engage à offrir une alimentation saine et\naccessible, tout en respectant l\'environnement et les communautés Locales.\n\nAvec plus de 1 500 employés répartis sur 7 sites de production et une présence dans plus\nde 12 pays, Alimora se distingue par Son innovati

In [None]:
!pip install symspellpy textblob language-tool-python
!python -m textblob.download_corpora

Collecting symspellpy
  Downloading symspellpy-6.9.0-py3-none-any.whl.metadata (3.9 kB)
Collecting language-tool-python
  Downloading language_tool_python-2.9.0-py3-none-any.whl.metadata (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting editdistpy>=0.1.3 (from symspellpy)
  Downloading editdistpy-0.1.5-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Downloading symspellpy-6.9.0-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading language_tool_python-2.9.0-py3-none-any.whl (49 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading editdistpy-0.1.5-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylin

In [None]:
import re
import language_tool_python
from symspellpy import SymSpell, Verbosity
from textblob import TextBlob

# Initialize LanguageTool for French grammar correction
tool = language_tool_python.LanguageToolPublicAPI("fr")

# Load SymSpell for OCR spell correction
sym_spell = SymSpell()
sym_spell.load_dictionary("https://raw.githubusercontent.com/atebits/Vocabulary/master/french_words.txt", 0, 1)

def correct_text(text):
    """Automatically correct OCR errors and clean text."""

    # 1. Fix spacing and remove unwanted newlines
    text = text.replace("\n", " ").replace("\x0c", " ")
    text = re.sub(r'\s+', ' ', text).strip()

    # 2. Use SymSpell to correct words
    words = text.split()
    corrected_words = []
    for word in words:
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        corrected_words.append(suggestions[0].term if suggestions else word)
    text = " ".join(corrected_words)

    # 3. Apply French grammar correction with LanguageTool
    text = tool.correct(text)

    # 4. Use TextBlob for spell correction
    blob = TextBlob(text)
    text = str(blob.correct())

    return text

# Example: Apply automatic OCR correction
cleaned_text = correct_text(extracted_text)
print(cleaned_text)


2025-03-26 05:46:28,577: E symspellpy.symspellpy] Dictionary file not found at https:/raw.githubusercontent.com/atebits/Vocabulary/master/french_words.txt.


Profit Data Analyse - Marketing Analytics. À propos de Vélarisa Pétrifia Technologies And. es tune enterprise specialise dans Yes solutions de donned mésintelligence marketing. Was à Montreal, Restrictif accompany depuis plus de 15 and des enterprises word-américaines dans l'optimisation de pleurs performances grace à des technologies analytiques advances. L'enterprise rise sur l'innovation, la collaboration et l'excellence pour propulser ses clients very He future numérique. Context Dans He care du développement de notre quite analytique marketing, nous recherchons une analyst de donned pour traveller sur des projects lips à la collected, l'analyse ta visualisation de donned marketing. Vous were responsible de la maintenance et de L'evolution de yeux de donned covenant de sources varies, akin de soutenir Yes squires dans peur rise de decision base sur Yes donned. Details du post +. Sure : 10 moist (renouvelable) + Type : Temps plain +" Rode de travail : Bride (2 hours par remained au 

In [None]:
extracted_text

'Profil\n\nData Analyst - Marketing Analytics.\n\nA propos de Veltrida\n\nVettrixia Technologies Inc. estune entreprise spécialisée dans Les solutions de données\netlinteligence marketing. Basé à Montréal, Veltrixia accompagne depuis plus de 15 ans\ndes entreprises nord-américaines dans l\'optimisation de leurs performances grâce à des\ntechnologies analytiques avancées. L\'entreprise mise sur l\'innovation, la collaboration et\nl\'excellence pour propulser ses clients vers Le futur numérique.\n\nContexte\n\nDans Le cadre du développement de notre équipe analytique marketing, nous recherchons\nune analyste de données pour travailler sur des projets liés à la collecte, l\'analyse etla\nvisualisation de données marketing. Vous serez responsable de la maintenance et de\nL\'évolution de jeux de données provenant de sources variées, afin de soutenir Les équipes\ndans leur prise de décision basée sur Les données.\n\n \n\nDétails du poste\n+. Durée : 10 mois (renouvelable)\n+ Type: Temps plei

In [None]:
!pip install Spellchecker

Collecting Spellchecker
  Downloading spellchecker-0.4.tar.gz (3.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting inexactsearch (from Spellchecker)
  Downloading inexactsearch-1.0.2.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting soundex>=1.0 (from inexactsearch->Spellchecker)
  Downloading soundex-1.1.3.tar.gz (9.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting silpa_common>=0.3 (from inexactsearch->Spellchecker)
  Downloading silpa_common-0.3.tar.gz (9.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: Spellchecker, inexactsearch, silpa_common, soundex
  Building wheel for Spellchecker (setup.py) ... [?25l[?25hdone
  Created wheel for Spellchecker: filename=spellchecker-0.4-py3-none-any.whl size=3966499 sha256=9554ed52f37b

In [None]:
!pip install indexer

Collecting indexer
  Downloading indexer-0.6.2.tar.gz (14 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [None]:
import re
from spellchecker import SpellChecker

# Initialize French spell checker
spell = SpellChecker(language="fr")

def correct_ocr_errors(text):
    """Automatically corrects OCR errors using pyspellchecker."""

    # Remove unwanted characters
    text = text.replace("\n", " ").replace("\x0c", " ")
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize text into words
    words = text.split()

    # Correct misspelled words
    corrected_words = [spell.correction(word) if spell.correction(word) else word for word in words]

    # Reconstruct the text
    corrected_text = " ".join(corrected_words)

    return corrected_text

# Example: Apply automatic OCR correction
cleaned_text = correct_ocr_errors(extracted_text)
print(cleaned_text)


ModuleNotFoundError: No module named 'indexer'

In [None]:
import re
from textblob import TextBlob
from textblob_fr import PatternTagger, PatternAnalyzer

def correct_ocr_errors(text):
    """Automatically correct OCR errors using TextBlob-FR."""

    # Remove unwanted characters
    text = text.replace("\n", " ").replace("\x0c", " ")
    text = re.sub(r'\s+', ' ', text).strip()

    # Apply French grammar and spelling correction
    blob = TextBlob(text, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    corrected_text = str(blob.correct())

    return corrected_text

# Example: Apply automatic OCR correction
cleaned_text = correct_ocr_errors(extracted_text)
print(cleaned_text)


Profit Data Analyst - Marketing Analytics. A propos de Veltrida Vettrixia Technologies And. estate enterprise specialise dans Yes solutions de donned etlinteligence marketing. Was à Montreal, Veltrixia accompany depuis plus de 15 and des enterprises word-américaines dans l'optimisation de pleurs performances grace à des technologies analytiques advances. L'enterprise rise sur l'innovation, la collaboration et l'excellence pour propulser ses clients very He future numérique. Context Dans He care du développement de notre quite analytique marketing, nous recherchons une analyst de donned pour traveller sur des projects lips à la collected, l'analyse tell visualisation de donned marketing. Vous were responsible de la maintenance et de L'evolution de yeux de donned covenant de sources varies, akin de soutenir Yes squires dans peur rise de decision base sur Yes donned. Details du post +. Sure : 10 moist (renouvelable) + Type: Temps plain +" Rode de travail : Bride (2 hours par remained au b

Profil Data Analyst - Marketing Analytics. A propos de Veltrida Vettrixia Technologies Inc. estune entreprise spécialisée dans Les solutions de données etlinteligence marketing. Basé à Montréal Veltrixia accompagne depuis plus de 15 ans des entreprises nord-américaines dans l'optimisation de leurs performances grâce à des technologies analytiques avancées. L'entreprise mise sur l'innovation la collaboration et l'excellence pour propulser ses clients vers Le futur numérique. Contexte Dans Le cadre du développement de notre équipe analytique marketing nous recherchons une analyste de données pour travailler sur des projets liés à la collecte l'analyse etla visualisation de données marketing. Vous serez responsable de la maintenance et de L'évolution de jeux de données provenant de sources variées afin de soutenir Les équipes dans leur prise de décision basée sur Les données. Détails du poste +. Durée : 10 mois (renouvelable) + Type: Temps plein +" Mode de travai : Hybride (2 jours par se