<a href="https://colab.research.google.com/github/anan181991ba-glitch/Model/blob/main/job_intent_training_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the notebook's dependencies quietly (-q). Versions are not pinned,
# so a fresh run uses whatever releases pip resolves at that time.
!pip install -q transformers datasets accelerate scikit-learn pandas torch

In [2]:

import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, f1_score

In [3]:

# Source CSV: the code below reads a free-text "post" column and a "target"
# column holding the class name.
URL = "https://raw.githubusercontent.com/anan181991ba-glitch/000/refs/heads/main/job_dataset_1k_en_fr.csv"
df = pd.read_csv(URL)

# Integer class ids used as training labels.
label_map = {"hiring": 1, "job_seeker": 0}
df["label"] = df["target"].map(label_map)

# Series.map leaves NaN for any value that is missing from label_map, which
# silently turns the label column into floats. Fail loudly instead of training
# on corrupted labels.
unmapped = df.loc[df["label"].isna(), "target"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected values in 'target' column: {list(unmapped)}")
df["label"] = df["label"].astype("int64")

# NOTE(review): duplicate or near-duplicate posts would leak between the
# train and test splits created later -- worth checking df["post"].duplicated().
df = df[["post", "label"]]
df.head()

Unnamed: 0,post,label
0,Recherche développeur UI/UX designer pour startup,1
1,Hiring machine learning engineer developer ASAP,1
2,Looking for a mobile Flutter developer,1
3,Disponible pour un poste de mobile Flutter,0
4,Développeur mobile Flutter junior open to work,0


In [5]:

# Wrap the DataFrame in a Hugging Face Dataset, then carve out a 20% test
# split. The fixed seed keeps the split reproducible across runs.
full_dataset = Dataset.from_pandas(df)
dataset = full_dataset.train_test_split(test_size=0.2, seed=42)

dataset

DatasetDict({
    train: Dataset({
        features: ['post', 'label'],
        num_rows: 800
    })
    test: Dataset({
        features: ['post', 'label'],
        num_rows: 200
    })
})

In [6]:

# Checkpoint identifier; the same name is reused further down when the
# sequence-classification model is loaded.
MODEL_NAME = "xlm-roberta-base"
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [7]:

def tokenize(batch):
    """Tokenize the ``"post"`` texts of a batch.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        batch: Mapping with a ``"post"`` entry holding the texts to encode.

    Returns:
        The tokenizer output for the batch.
    """
    texts = batch["post"]
    encoded = tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoded

In [8]:

# Tokenize both splits in batches, then drop the raw text column, which is
# no longer needed once the token ids exist.
dataset = dataset.map(tokenize, batched=True).remove_columns(["post"])

# Have the dataset hand back torch tensors when indexed.
dataset.set_format("torch")

dataset

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [9]:

# Store the class-name <-> id mapping in the model config so the saved model
# reports "hiring" / "job_seeker" instead of generic LABEL_0 / LABEL_1.
id2label = {idx: name for name, idx in label_map.items()}

# Load the pretrained encoder with a fresh classification head sized to the
# number of classes in label_map.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_map),
    id2label=id2label,
    label2id=dict(label_map),
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:

def compute_metrics(eval_pred):
    """Compute accuracy and F1 for an evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with ``"accuracy"`` and ``"f1"`` entries, both computed with the
        scikit-learn metric functions using their default settings.
    """
    scores, gold = eval_pred
    # Predicted class = index of the highest logit along axis 1.
    predicted = scores.argmax(axis=1)
    accuracy = accuracy_score(gold, predicted)
    f1 = f1_score(gold, predicted)
    return {"accuracy": accuracy, "f1": f1}

In [12]:

training_args = TrainingArguments(
    # Where checkpoints are written; no external experiment tracker.
    output_dir="./results",
    report_to="none",
    # Optimisation hyper-parameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Log the training loss every 50 optimisation steps.
    logging_steps=50,
)

In [13]:

# Assemble the training loop from the model, arguments, splits and metrics
# defined in the earlier cells.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is
# deprecated in recent transformers releases and emits a warning here.
# NOTE(review): the "test" split is also used for per-epoch evaluation and
# best-checkpoint selection, so the reported scores are not from a truly
# held-out set -- consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [14]:

# Run fine-tuning with the Trainer built in the previous cell; the returned
# TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4053,0.013499,1.0,1.0
2,0.0135,0.00047,1.0,1.0


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4053,0.013499,1.0,1.0
2,0.0135,0.00047,1.0,1.0
3,0.0044,0.000406,1.0,1.0


TrainOutput(global_step=150, training_loss=0.14109171241521834, metrics={'train_runtime': 425.7181, 'train_samples_per_second': 5.638, 'train_steps_per_second': 0.352, 'total_flos': 157866633216000.0, 'train_loss': 0.14109171241521834, 'epoch': 3.0})

In [17]:

# Score the model on the Trainer's eval_dataset (dataset["test"]) and display
# the resulting metrics dict.
trainer.evaluate()

{'eval_loss': 0.0004059255588799715,
 'eval_accuracy': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 1.3677,
 'eval_samples_per_second': 146.228,
 'eval_steps_per_second': 9.505,
 'epoch': 3.0}

In [19]:

def predict(text):
    """Classify one post (or a sequence of posts) as hiring vs. job seeker.

    Args:
        text: A single string, or a list/tuple of strings.

    Returns:
        ``"hiring"`` or ``"job_seeker"`` for a single string; a list of those
        labels (one per input, in order) when a sequence is given.
    """
    single = isinstance(text, str)
    texts = [text] if single else list(text)
    if not texts:
        return []

    # Run on whatever device the model's parameters live on (CPU or GPU).
    device = next(model.parameters()).device

    # Make inference independent of whatever ran before this call: in train
    # mode dropout would make the predictions non-deterministic.
    model.eval()

    # Same truncation length as the tokenize() step used for training;
    # padding=True only pads up to the longest text in this call.
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding=True,
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # argmax over the class dimension, one result per input row. Softmax is
    # monotonic, so it is not needed just to pick the top class.
    class_ids = logits.argmax(dim=-1).tolist()
    labels = ["hiring" if class_id == 1 else "job_seeker" for class_id in class_ids]

    return labels[0] if single else labels

# Smoke-test the classifier on a few French and English posts
samples = (
    "Recherche développeur React pour startup",
    "Disponible pour un poste de data analyst",
    "Hiring backend engineer ASAP",
)
for sample in samples:
    print(predict(sample))

hiring
job_seeker
hiring


In [20]:

# Write the model and its tokenizer into the same directory so the pair can
# be reloaded together later.
EXPORT_DIR = "job_intent_model"
trainer.save_model(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)

('job_intent_model/tokenizer_config.json',
 'job_intent_model/special_tokens_map.json',
 'job_intent_model/sentencepiece.bpe.model',
 'job_intent_model/added_tokens.json',
 'job_intent_model/tokenizer.json')