<a href="https://colab.research.google.com/github/abidlifiras/QA_LLM/blob/master/bert_camembert_multilabel_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/abidlifiras/QA_LLM.git

fatal: destination path 'QA_LLM' already exists and is not an empty directory.


In [5]:
!pip install transformers==4.17 datasets evaluate numpy==1.26.4 --quiet
!pip uninstall -y sentence-transformers thinc

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m113.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 4.1.0 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.17.0 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mFound existing installation: sentence-transformers 4.1.0
Uninstalling sentence-transformers-4.1.0:
  Successfully uninstalled sentence-transformers-4.1.0
Found existing installation: thinc 8.3.6
Uninstalling thinc-8.3.6:
  Successfully uninstalled thinc-8.3.6


In [2]:
import transformers
# Sanity check: the version printed here should be the one pinned in the install cell.
print(transformers.__version__)

4.17.0


In [15]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import evaluate
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from sklearn.model_selection import train_test_split
import os

In [4]:
# Load the train / dev / test splits from the cloned repository (one JSON file each).
df_train, df_dev, df_test = (
    pd.read_json(split_path)
    for split_path in (
        'QA_LLM/dataset/train.json',
        'QA_LLM/dataset/dev.json',
        'QA_LLM/dataset/test.json',
    )
)

In [5]:
# Preprocessing
def preprocess_dataset(df):
    """Flatten every question into one binary example per answer choice.

    Args:
        df: DataFrame with the columns ``question`` (text), ``answers``
            (mapping from answer key to answer text) and ``correct_answers``
            (iterable of the answer keys that are correct).

    Returns:
        A DataFrame with one row per (question, answer choice) pair and two
        columns: ``prompt`` (question and answer choice combined into one
        text) and ``label`` (1 when the answer key is listed in
        ``correct_answers``, otherwise 0).
    """
    def _expand(row):
        # All records produced by a single question row.
        right_keys = set(row['correct_answers'])
        stem = row['question']
        return [
            {
                "prompt": f"Question: {stem}\nAnswer choice: {choice_text}",
                "label": int(choice_key in right_keys),
            }
            for choice_key, choice_text in row['answers'].items()
        ]

    records = []
    for _, row in df.iterrows():
        records.extend(_expand(row))
    return pd.DataFrame(records)

# Expand all three splits into per-answer-choice examples.
df_train_prep, df_dev_prep, df_test_prep = map(
    preprocess_dataset,
    (df_train, df_dev, df_test),
)


In [6]:
# Tokenization
# `model_name` is the checkpoint identifier used for the tokenizer here and
# for the classification model loaded from the same name later on.
model_name = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize the ``prompt`` entries of a batch with the notebook-level tokenizer.

    Truncation is enabled and ``padding="max_length"`` is requested without an
    explicit ``max_length``.
    """
    prompts = batch["prompt"]
    return tokenizer(prompts, truncation=True, padding="max_length")

def _tokenized_torch_dataset(frame):
    """Turn a preprocessed DataFrame into a tokenized Dataset exposing torch tensors."""
    dataset = Dataset.from_pandas(frame).map(tokenize, batched=True)
    # Only the model inputs and the target are exposed as tensors.
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    return dataset


train_dataset, dev_dataset, test_dataset = map(
    _tokenized_torch_dataset,
    (df_train_prep, df_dev_prep, df_test_prep),
)


Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/10855 [00:00<?, ? examples/s]

Map:   0%|          | 0/1560 [00:00<?, ? examples/s]

Map:   0%|          | 0/3110 [00:00<?, ? examples/s]

In [7]:
# Evaluation Metrics
def compute_metrics(eval_pred):
    """Compute accuracy and F1 for the two-logit classifier.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each
            example is the argmax of its logits along axis 1.

    Returns:
        dict with the keys ``accuracy`` and ``f1``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    accuracy = accuracy_score(gold, predicted)
    f1 = f1_score(gold, predicted)
    return {"accuracy": accuracy, "f1": f1}


In [8]:
# Load base model
# num_labels=2 gives a two-class head; the argmax-based compute_metrics defined
# above picks between these two logits (label 1 = answer choice marked correct,
# label 0 = otherwise, as assigned in preprocess_dataset).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weig

In [9]:
# Fine-tuning
# Experiment-tracker integrations (e.g. Weights & Biases) are switched off with
# `report_to="none"`. The WANDB_DISABLED environment variable used before is
# deprecated by transformers (it logs a warning asking for `report_to` instead).
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    # Checkpoint at the same cadence as evaluation so the best epoch can be restored.
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    # After training, reload the checkpoint with the best dev-set accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the training set  don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: prompt. If prompt are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10855
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2037


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6546,0.608445,0.662179,0.631211
2,0.6383,0.602673,0.677564,0.633649
3,0.6197,0.599807,0.679487,0.622926


The following columns in the evaluation set  don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: prompt. If prompt are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1560
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-679
Configuration saved in ./results/checkpoint-679/config.json
Model weights saved in ./results/checkpoint-679/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-679/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-679/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: prompt. If prompt are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1

TrainOutput(global_step=2037, training_loss=0.6362451863277175, metrics={'train_runtime': 2911.0326, 'train_samples_per_second': 11.187, 'train_steps_per_second': 0.7, 'total_flos': 8568211517798400.0, 'train_loss': 0.6362451863277175, 'epoch': 3.0})

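Corrected CamemBERT model for questions with multiple correct answers: each (question, answer choice) pair is scored independently with a single sigmoid output.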

In [18]:
def preprocess_multilabel(df, num_choices=5):
    """Expand every question into one single-target example per answer slot.

    Answer slots are keyed by consecutive lowercase letters starting at
    ``"a"`` (``"a"`` .. ``"e"`` for the default of five choices).

    Args:
        df: DataFrame with the columns ``question`` (text), ``answers``
            (mapping from answer key to answer text) and ``correct_answers``
            (iterable of the answer keys that are correct).
        num_choices: number of answer slots generated per question.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        A DataFrame with ``num_choices`` rows per question and two columns:
        ``input`` (question and answer text joined by a space) and ``label``
        (a one-element list holding 1.0 when the slot's key is in
        ``correct_answers``, else 0.0). A slot whose key is missing from
        ``answers`` still produces a row, with an empty answer text.
    """
    choice_keys = [chr(ord("a") + offset) for offset in range(num_choices)]
    processed = []
    for _, row in df.iterrows():
        question = row['question']
        correct = set(row['correct_answers'])
        answers = row['answers']
        for key in choice_keys:
            ans = answers.get(key, "")
            processed.append({
                "input": f"{question} {ans}",
                # Wrapped in a list so each target matches the single output logit.
                "label": [1.0 if key in correct else 0.0]
            })
    return pd.DataFrame(processed)

# Expand the train and dev splits into per-answer-slot examples.
df_train_proc, df_dev_proc = map(preprocess_multilabel, (df_train, df_dev))

# Tokenization
tokenizer = AutoTokenizer.from_pretrained("camembert-base")

def tokenize_function(examples):
    """Tokenize the ``input`` texts of a batch, padded/truncated to 256 tokens."""
    encode_options = dict(padding="max_length", truncation=True, max_length=256)
    texts = examples["input"]
    return tokenizer(texts, **encode_options)

def _prepare_split(frame):
    """Tokenize a preprocessed DataFrame and expose it as a torch-formatted Dataset."""
    split = (
        Dataset.from_pandas(frame)
        .map(tokenize_function, batched=True)
        # The target column is renamed from "label" to "labels" before training.
        .rename_column("label", "labels")
    )
    split.set_format("torch")
    return split


train_ds = _prepare_split(df_train_proc)
dev_ds = _prepare_split(df_dev_proc)

# Load model: a single output logit per (question, answer) example.
model = AutoModelForSequenceClassification.from_pretrained(
    "camembert-base",
    problem_type="multi_label_classification",
    num_labels=1,
)

# Metrics
def compute_metrics(pred):
    """Compute accuracy and F1 for the single-logit (sigmoid) classifier.

    Args:
        pred: pair ``(logits, labels)``. Presumably both arrive with one
            column per example, i.e. shape (n_examples, 1) -- TODO confirm.

    Returns:
        dict with the keys ``accuracy`` and ``f1``; an example is predicted
        positive when the sigmoid of its logit exceeds 0.5.
    """
    logits, labels = pred
    # Flatten with reshape(-1) rather than squeeze(): squeeze() turns a
    # single-example evaluation set into a 0-d value, which the sklearn
    # metrics below cannot score.
    probs = torch.sigmoid(torch.tensor(logits)).reshape(-1)
    preds = (probs > 0.5).int().numpy()
    labels = np.asarray(labels).astype(int).reshape(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

# Training configuration for the single-logit model.
training_args = TrainingArguments(
    # Where checkpoints and logs go; no external experiment tracker.
    output_dir="./results_multilabel",
    logging_dir="./logs",
    report_to="none",
    # Optimisation hyper-parameters.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch, then reload the best-F1 checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

loading configuration file https://huggingface.co/camembert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f459e43c5ebb871abbf9209195563bff6a11547fd9532047739667c394833221.e23d229c54bcc6f67d337b8b2dd111b0e3dc01fa854bfecd3efdeb8c955749e6
Model config CamembertConfig {
  "_name_or_path": "camembert-base",
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32005
}

loading file https://hugg

Map:   0%|          | 0/10855 [00:00<?, ? examples/s]

Map:   0%|          | 0/1560 [00:00<?, ? examples/s]

loading configuration file https://huggingface.co/camembert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f459e43c5ebb871abbf9209195563bff6a11547fd9532047739667c394833221.e23d229c54bcc6f67d337b8b2dd111b0e3dc01fa854bfecd3efdeb8c955749e6
Model config CamembertConfig {
  "_name_or_path": "camembert-base",
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "multi_label_classification",
  "tr

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6603,0.603147,0.673077,0.636234
2,0.6388,0.596745,0.685256,0.634944
3,0.6153,0.587216,0.696795,0.613878
4,0.5934,0.604259,0.691026,0.569643
5,0.5594,0.611701,0.687821,0.579084


The following columns in the evaluation set  don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: input. If input are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1560
  Batch size = 32
Saving model checkpoint to ./results_multilabel/checkpoint-679
Configuration saved in ./results_multilabel/checkpoint-679/config.json
Model weights saved in ./results_multilabel/checkpoint-679/pytorch_model.bin
tokenizer config file saved in ./results_multilabel/checkpoint-679/tokenizer_config.json
Special tokens file saved in ./results_multilabel/checkpoint-679/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: input. If input are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message

TrainOutput(global_step=3395, training_loss=0.608482057461858, metrics={'train_runtime': 2298.9294, 'train_samples_per_second': 23.609, 'train_steps_per_second': 1.477, 'total_flos': 7140112156070400.0, 'train_loss': 0.608482057461858, 'epoch': 5.0})

In [19]:
import shutil

# Write the model and tokenizer files into one directory, then zip that directory.
export_dir = "./camembert_multilabel_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)

shutil.make_archive("camembert_multilabel_model", 'zip', export_dir)

Configuration saved in ./camembert_multilabel_model/config.json
Model weights saved in ./camembert_multilabel_model/pytorch_model.bin
tokenizer config file saved in ./camembert_multilabel_model/tokenizer_config.json
Special tokens file saved in ./camembert_multilabel_model/special_tokens_map.json


'/content/camembert_multilabel_model.zip'