In [None]:
!pip install -q transformers datasets accelerate scikit-learn pandas torch

In [None]:

import inspect

import pandas as pd
import torch
from datasets import ClassLabel, Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)


In [None]:

# Remote CSV with one row per post; the code below relies on a text column
# "post" and a class-name column "target".
URL = "https://raw.githubusercontent.com/anan181991ba-glitch/000/refs/heads/main/job_dataset_1k_en_fr.csv"
df = pd.read_csv(URL)

# Class name -> integer id used as the training label.
label_map = {"hiring": 1, "job_seeker": 0}
df["label"] = df["target"].map(label_map)

# Series.map yields NaN for any target that is not a key of label_map, which
# would silently turn the label column into floats. Fail loudly instead.
unmapped_targets = df.loc[df["label"].isna(), "target"].unique()
if len(unmapped_targets):
    raise ValueError(f"Unmapped target values: {unmapped_targets.tolist()}")

df = (
    df[["post", "label"]]
    .dropna(subset=["post"])        # rows without text cannot be tokenized
    .astype({"label": "int64"})     # integer class ids, never floats
    .reset_index(drop=True)         # keep a clean RangeIndex after dropping rows
)
df.head()


In [None]:

# `train_test_split(stratify_by_column=...)` only accepts ClassLabel columns;
# a plain integer column raises ValueError. Cast the label column first, with
# class names ordered by their integer id in label_map (ids assumed to be 0..n-1).
class_names = sorted(label_map, key=label_map.get)

full_dataset = Dataset.from_pandas(df, preserve_index=False)
full_dataset = full_dataset.cast_column("label", ClassLabel(names=class_names))

# 80/20 split with the same class proportions in both parts; fixed seed for
# reproducibility.
dataset = full_dataset.train_test_split(
    test_size=0.2,
    stratify_by_column="label",
    seed=42,
)

dataset


In [None]:

# Checkpoint name used for both the tokenizer (here) and the classifier loaded later.
MODEL_NAME = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [None]:

def tokenize(batch):
    """Tokenize the "post" entries of a batch.

    The tokenizer is asked to pad to `max_length` and to truncate, with
    `max_length` set to 128. Returns whatever the tokenizer returns.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    posts = batch["post"]
    return tokenizer(posts, **encode_options)


In [None]:

# Tokenize every split in batches, then drop the raw text column, which is no
# longer needed once the token columns exist.
dataset = dataset.map(tokenize, batched=True).remove_columns(["post"])

# Return torch tensors when rows are accessed.
dataset.set_format(type="torch")

dataset


In [None]:

# Store the class names in the model config so the saved checkpoint reports
# "hiring" / "job_seeker" instead of the generic LABEL_0 / LABEL_1.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_map),
    id2label={class_id: name for name, class_id in label_map.items()},
    label2id=dict(label_map),
)


In [None]:

def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (logits, labels). The predicted
            class of each row is the index of its largest logit along axis 1.

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted)
    return metrics


In [None]:

# `evaluation_strategy` was renamed to `eval_strategy`; newer transformers
# releases reject the old keyword with a TypeError, older ones only know the
# old name. Use whichever the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",            # must match the evaluation strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    report_to="none",                 # no external experiment trackers
    **{_eval_strategy_key: "epoch"},  # evaluate once per epoch
)


In [None]:

# The Trainer's `tokenizer` argument was superseded by `processing_class` and
# is dropped in newer transformers releases. Pass the tokenizer under the name
# the installed version understands.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)


In [None]:

# Fine-tune the model using the Trainer configured above.
trainer.train()


In [None]:

# Score the model on the Trainer's eval_dataset (dataset["test"]) and display
# the resulting metrics, including those from compute_metrics.
trainer.evaluate()


In [None]:

def predict(text, max_length=128):
    """Classify a single post as "hiring" or "job_seeker".

    Args:
        text: The post to classify (one string).
        max_length: Truncation limit passed to the tokenizer; the default
            matches the length used when tokenizing the training data.

    Returns:
        "hiring" when the highest-scoring class id is 1, otherwise "job_seeker".
    """
    # Inference only: make sure dropout is disabled even if the model was left
    # in training mode.
    model.eval()

    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    # Training may have moved the model to an accelerator while freshly
    # tokenized tensors are created on CPU; put them on the model's device to
    # avoid a device-mismatch error.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**encoded).logits

    # Softmax is monotonic, so the argmax of the logits is the predicted class.
    # Row 0 is the single input text.
    label = logits[0].argmax().item()
    return "hiring" if label == 1 else "job_seeker"

# Smoke-test the classifier on two French posts and one English post.
# The first sample previously contained mis-decoded text ("d√©veloppeur");
# it is restored to the intended "développeur".
print(predict("Recherche développeur React pour startup"))
print(predict("Disponible pour un poste de data analyst"))
print(predict("Hiring backend engineer ASAP"))


In [None]:

# Write the fine-tuned model and its tokenizer into the same directory so the
# checkpoint can be reloaded from a single path.
EXPORT_DIR = "job_intent_model"
trainer.save_model(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)
