In [63]:
import os
import re
import unicodedata
import random
import csv
from pathlib import Path

import pandas as pd
import numpy as np

import spacy
from gensim.models import FastText
from gensim.models.phrases import Phrases, Phraser

import nltk
from unidecode import unidecode


# ----------------------------------------
# 1. Load French NLP Pipeline
# ----------------------------------------
nlp = spacy.load("fr_core_news_sm", disable=["parser","ner"])
french_sw = nlp.Defaults.stop_words

# ----------------------------------------
# 1. Paths & Parameters
# ----------------------------------------
project_root = Path().resolve().parent
attribute_file = project_root / "data" / "attribute_base_processedV2.xlsx"
jocas_dir = "/Users/alfonso/Desktop/JOCAS"  # adjust if needed
sample_size     = 50
random_seed     = 42

# FastText & phrase-detector hyperparams
vector_size      = 100
window_size      = 5
min_count        = 5
workers          = 4
phrase_min_count = 10
phrase_threshold = 10

# French stop-words (for PREPROCESSING ONLY)
french_stopwords = set(stopwords.words("french"))

# ----------------------------------------
# 2. — Define preprocessing function
# ----------------------------------------
def preprocess_text(text: str) -> str:
    """
    Lowercase, strip accents, tokenize & lemmatize, remove stop-words.
    Returns a space-joined string of lemmas.
    """
    # Lowercase and remove accents
    txt = unidecode(text).lower()
    # Lemmatize & filter
    doc = nlp_stop(txt)
    lemmas = [
        tok.lemma_.lower()
        for tok in doc
        if tok.is_alpha
           and tok.lemma_.lower() not in french_stopwords
           and len(tok.lemma_) > 1
    ]
    return " ".join(lemmas)

# ----------------------------------------
# 3. — Load & prepare attribute base
# ----------------------------------------
attribute_df = (
    pd.read_excel(attribute_file)
      .loc[:, ["attribute_category_fr", "expression"]]
      .dropna()
)

# Build lemma-sets per category for later filtering
cat_to_lemmas = {}
for cat, grp in attribute_df.groupby("attribute_category_fr"):
    lemmas = set()
    for expr in grp["expression"].astype(str):
        for tok in nlp(expr):
            if tok.is_alpha and len(tok.lemma_) > 1:
                lemmas.add(tok.lemma_.lower())
    cat_to_lemmas[cat] = lemmas



In [64]:
cat_to_lemmas

{'Autonomie dans l’exécution des tâches': {'action',
  'autogestion',
  'autonome',
  'autonomie',
  'capacité',
  'dans',
  'direct',
  'force',
  'gestion',
  'indépendance',
  'initiative',
  'le',
  'liberter',
  'mission',
  'organisation',
  'personnel',
  'poste',
  'prise',
  'professionnel',
  'proposition',
  'responsable',
  'sans',
  'supervision',
  'travail'},
 'Bel emplacement': {'agréable',
  'attractif',
  'bord',
  'cadre',
  'centre',
  'commodité',
  'de',
  'dynamique',
  'emplacement',
  'environnement',
  'exceptionnel',
  'géographique',
  'idéal',
  'imprenable',
  'le',
  'localisation',
  'mer',
  'nature',
  'pittoresque',
  'priser',
  'privilégier',
  'proche',
  'proximité',
  'quartier',
  'rechercher',
  'secteur',
  'site',
  'situation',
  'verdoyer',
  'ville',
  'vue',
  'zone'},
 'Bonne rémunération des heures supplémentaires': {'compensation',
  'de',
  'en',
  'heure',
  'majoration',
  'majorer',
  'paiement',
  'payer',
  'plus',
  'pour',
  'r

In [2]:
# ----------------------------------------
# 3. Ingest & Aggregate JOCAS Job Ads
# ----------------------------------------

# Initialize container
jocas_list = []


# Walk though source folders
for source_folder in os.listdir(jocas_dir):
    source_path = os.path.join(jocas_dir, source_folder)
    
    if os.path.isdir(source_path):
        # Collect all csv files under this source
        csv_files = []
        for root, dirs, files in os.walk(source_path):
            for file in files:
                
                # Case-insensitive check for .csv files
                if file.lower().endswith('.csv'):
                    csv_files.append(os.path.join(root, file))
        
        print(f"Found {len(csv_files)} CSV files in source folder: {source_folder}")
        
        if not csv_files:
            continue
        
        # Select up to 50 csv files from this source
        if len(csv_files) > 50:
            selected_files = random.sample(csv_files, 50)
        else:
            selected_files = csv_files
        
        print(f"Processing {len(selected_files)} files from source folder: {source_folder}")
        
        # Process each selected CSV file
        for file_path in selected_files:
            try:
                df = pd.read_csv(
                    file_path,
                    header=0,
                    sep=';',
                    on_bad_lines='skip',
                    low_memory=False,
                    quoting=csv.QUOTE_MINIMAL,
                    escapechar='\\'
                )
                jocas_list.append(df)
            except Exception as e:
                print(f"Failed to read {file_path}: {e}")

        print(f"Finished processing source folder: {source_folder}")

# Aggregate all dataframes into one
df_job = pd.concat(jocas_list, ignore_index=True)
print(f"Total job ads: {len(df_job)}")


Found 361 CSV files in source folder: regionsjob_nord
Processing 50 files from source folder: regionsjob_nord
Finished processing source folder: regionsjob_nord
Found 357 CSV files in source folder: regionsjob_paris
Processing 50 files from source folder: regionsjob_paris
Finished processing source folder: regionsjob_paris
Found 359 CSV files in source folder: regionsjob_rhonealpes
Processing 50 files from source folder: regionsjob_rhonealpes
Finished processing source folder: regionsjob_rhonealpes
Found 362 CSV files in source folder: apec
Processing 50 files from source folder: apec
Finished processing source folder: apec
Found 169 CSV files in source folder: keljob
Processing 50 files from source folder: keljob
Finished processing source folder: keljob
Found 360 CSV files in source folder: regionsjob_sudouest
Processing 50 files from source folder: regionsjob_sudouest
Finished processing source folder: regionsjob_sudouest
Found 361 CSV files in source folder: regionsjob_paca
Process

  df_job = pd.concat(jocas_list, ignore_index=True)


Total job ads: 1824535


In [62]:
# ----------------------------------------
# 5. — Gather & (optionally) sample descriptions
# ----------------------------------------
if "description_full" not in df_job.columns:
    raise KeyError("Missing 'description_full' in JOCAS data")

texts = df_job["description_full"].dropna().astype(str)
if isinstance(sample_size, int):
    texts = texts.sample(n=sample_size, random_state=random_seed).tolist()
else:
    texts = texts.tolist()

print(f"Using {len(texts)} job-ad descriptions")

# ----------------------------------------
# 6. — Preprocess & tokenize for phrase detection
# ----------------------------------------
# Each sentence = list of lemmas
sentences = [preprocess_text(doc).split() for doc in texts]

# ----------------------------------------
# 7. — Phrase detection (bigrams & trigrams)
# ----------------------------------------
phrases = Phrases(
    sentences,
    min_count=phrase_min_count,
    threshold=phrase_threshold
)
phraser = Phraser(phrases)
sent_phrased = [phraser[s] for s in sentences]

# ----------------------------------------
# 8. — Inject lemmatized seed expressions
# ----------------------------------------
for expr in attribute_df["expression"].astype(str):
    toks = preprocess_text(expr).split()
    if len(toks) > 1:
        sent_phrased.append(toks)

# ----------------------------------------
# 9. — Train FastText model
# ----------------------------------------
model = FastText(
    sentences=sent_phrased,
    vector_size=vector_size,
    window=window_size,
    min_count=min_count,
    workers=workers,
    seed=random_seed
)

# ----------------------------------------
# 10. — Embedding helper
# ----------------------------------------
def embed(phrase: str) -> np.ndarray | None:
    toks = preprocess_text(phrase).split()
    vecs = [model.wv[t] for t in toks if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else None

# ----------------------------------------
# 11. — Compute category centroids
# ----------------------------------------
cat_centroids = {}
for cat, grp in attribute_df.groupby("attribute_category_fr"):
    vecs = [embed(expr) for expr in grp["expression"]]
    vecs = [v for v in vecs if v is not None]
    if vecs:
        cat_centroids[cat] = np.mean(vecs, axis=0)

# ----------------------------------------
# 12. — Retrieve & filter suggestions
# ----------------------------------------
results = []
for cat, cent in cat_centroids.items():
    raw = model.wv.similar_by_vector(cent, topn=200)
    seeds = cat_to_lemmas[cat]
    kept = []
    for term, sim in raw:
        parts = term.split("_")
        # only keep expressions ≥4 lemmas that share ≥1 seed lemma
        if len(parts) >= 4 and any(p in seeds for p in parts):
            kept.append((term, sim))
        if len(kept) >= 10:
            break
    for term, sim in kept:
        results.append({
            "attribute_category_fr": cat,
            "suggested_expression": term.replace("_", " "),
            "similarity": sim
        })

# ----------------------------------------
# 13. — Preview top-5 per category
# ----------------------------------------
results_df = pd.DataFrame(results)
preview = results_df.groupby("attribute_category_fr").head(5).reset_index(drop=True)
print(preview.to_string(index=False))

Using 50 job-ad descriptions


NameError: name 'nlp_stop' is not defined

In [55]:
results_df