<a href="https://colab.research.google.com/github/abidlifiras/QA_LLM/blob/master/google_flan_t5_base_first_try.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/abidlifiras/QA_LLM.git

In [None]:
!pip install transformers==4.17 datasets evaluate --quiet

In [None]:
import transformers
# Print the installed transformers version so it can be compared with the
# version requested in the install cell.
print(transformers.__version__)

In [None]:
import json
import re

import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMultipleChoice, TrainingArguments, Trainer

In [None]:
# Read the three JSON splits shipped with the cloned repository into DataFrames.
df_train, df_dev, df_test = (
    pd.read_json(split_path)
    for split_path in (
        'QA_LLM/dataset/train.json',
        'QA_LLM/dataset/dev.json',
        'QA_LLM/dataset/test.json',
    )
)

In [None]:
# Checkpoint used for the zero-shot baseline; the model and its tokenizer are
# both loaded from the same identifier.
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def format_prompt(question, answers_dict):
    """Build the multiple-choice prompt fed to the model.

    Args:
        question: Text of the question.
        answers_dict: Mapping from option letter ('a'..'e') to the option text.
            Letters missing from the mapping are rendered with an empty text.

    Returns:
        A string made of a "Question:" line, one "<letter>) <text>" line per
        option from a to e, and a final "Answer:" line.
    """
    prompt_lines = [f"Question: {question}"]
    for letter in "abcde":
        prompt_lines.append(f"{letter}) {answers_dict.get(letter, '')}")
    prompt_lines.append("Answer:")
    return "\n".join(prompt_lines)

def get_prediction(prompt):
    """Generate the model's short answer for a single prompt.

    Uses the module-level `tokenizer` and `model`.

    Args:
        prompt: Prompt text; it is truncated by the tokenizer if too long.

    Returns:
        The decoded generation (at most 5 new tokens), lower-cased and stripped
        of surrounding whitespace, with special tokens removed.
    """
    # Send the encoded inputs to the device the model lives on; otherwise
    # generation fails with a device mismatch once the model has been moved to
    # a GPU (for example after it has been trained).
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=5)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()
    return response.strip()

# Zero-shot evaluation on the test split. `df_test` is the frame read in the
# data-loading cell, so it is not read from disk a second time here.
letter_to_index = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}

# An option letter only counts when it stands alone ("b", "b)", "answer: c").
# Scanning character by character would pick the "a" of "answer" or the "e" of
# "the" and report a letter the model never chose.
choice_pattern = re.compile(r"\b([a-e])\b")

correct = 0
results = []

for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    prompt = format_prompt(row["question"], row["answers"])
    pred = get_prediction(prompt)

    # First standalone option letter in the prediction, or None when there is none.
    match = choice_pattern.search(pred)
    pred_letter = match.group(1) if match else None
    # Only the first listed correct answer is used as the reference.
    true_letter = row["correct_answers"][0]

    is_correct = (pred_letter == true_letter)
    correct += int(is_correct)
    results.append({
        "question": row["question"],
        "predicted": pred_letter,
        "correct": true_letter,
        "success": is_correct
    })

# Guard against an empty test frame instead of raising ZeroDivisionError.
accuracy = correct / len(df_test) if len(df_test) else 0.0
print(f"Accuracy: {accuracy:.2%}")


In [None]:
# Checkpoint identifier for the fine-tuning section; the tokenizer is loaded
# again here from that identifier.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def format_input(example):
    """Turn one dataset record into a (prompt, target) text pair.

    Args:
        example: Record with a "question" string, an "answers" mapping from
            option letter to option text, and a "correct_answers" sequence.

    Returns:
        A dict with "input_text" (a "Question:" line, the options a-e, and a
        final "Answer:" line) and "target_text" (the first entry of
        "correct_answers").
    """
    question_text = example["question"]
    options = example["answers"]
    # Only the first listed correct answer is used as the training target.
    target_letter = example["correct_answers"][0]

    option_lines = []
    for letter in ("a", "b", "c", "d", "e"):
        # Letters absent from the record are rendered with an empty option text.
        option_lines.append(f"{letter}) {options.get(letter, '')}")

    body = "\n".join(option_lines)
    return {
        "input_text": f"Question: {question_text}\n" + body + "\nAnswer:",
        "target_text": target_letter,
    }

# Wrap each pandas split in a Hugging Face Dataset and add the
# "input_text" / "target_text" columns produced by format_input.
train_ds, val_ds = (
    Dataset.from_pandas(frame).map(format_input)
    for frame in (df_train, df_dev)
)


In [None]:
def tokenize(example):
    """Tokenize one formatted example for seq2seq training.

    Uses the module-level `tokenizer`.

    Args:
        example: Record with "input_text" (the prompt) and "target_text"
            (the expected answer).

    Returns:
        The tokenized prompt (truncated to 512 tokens, not padded) with an
        extra "labels" entry holding the target token ids padded to length 5.
    """
    # No static padding on the inputs: the DataCollatorForSeq2Seq handed to the
    # trainer pads every batch to its own longest sequence, so padding each
    # example to 512 tokens here only wasted memory and compute.
    inputs = tokenizer(
        example["input_text"],
        truncation=True,
        max_length=512
    )
    targets = tokenizer(
        example["target_text"],
        padding="max_length",
        truncation=True,
        max_length=5
    )
    # NOTE(review): the labels keep their pad token ids, so padded positions
    # take part in the loss. Masking them with -100 is the usual remedy, but any
    # metric code that decodes the labels must then map -100 back to the pad id
    # first; change both together.
    inputs["labels"] = targets["input_ids"]
    return inputs


# Tokenize both splits and drop every pre-existing column so that only the
# tokenizer outputs remain. The tuple of source datasets is built before either
# name is rebound.
train_ds, val_ds = (
    split.map(tokenize, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)



In [None]:
import os

from sklearn.metrics import accuracy_score
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Pads inputs and labels batch by batch. `model` is whatever that name refers to
# when this cell runs (on a fresh top-to-bottom run: the checkpoint loaded for
# the zero-shot baseline).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Disable the automatic Weights & Biases logging.
os.environ["WANDB_DISABLED"] = "true"

# `datasets.load_metric` was deprecated and then removed from recent `datasets`
# releases, and the install cell does not pin `datasets`, so importing it can
# fail. `evaluate` (imported at the top of the notebook) is its replacement;
# the old name is kept as an alias for any code that still calls it.
load_metric = evaluate.load
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    """Compute first-letter accuracy from generated predictions.

    Uses the module-level `tokenizer`.

    Args:
        eval_preds: Pair (predictions, labels) of token-id arrays as passed by
            the trainer. Predictions may also arrive as a tuple whose first
            element holds the generated ids.

    Returns:
        {"accuracy": value}, where each prediction and label is reduced to the
        first character of its decoded, stripped, lower-cased text ("?" when
        that text is empty).
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the "ignore" sentinel used when padding labels and when batches
    # of different widths are concatenated. It is not a valid token id and makes
    # batch_decode fail, so map it back to the pad id, which
    # skip_special_tokens then drops.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels, ignoring special tokens.
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Keep the first character only, or '?' when the decoded text is empty.
    def first_letter_or_question_mark(text):
        text = text.strip().lower()
        return text[0] if text else "?"

    decoded_preds = [first_letter_or_question_mark(p) for p in decoded_preds]
    decoded_labels = [first_letter_or_question_mark(l) for l in decoded_labels]

    accuracy = accuracy_score(decoded_labels, decoded_preds)

    return {"accuracy": accuracy}



# Fresh copy of the pretrained checkpoint; this is the model that gets fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Bind the collator to the model that is actually trained rather than to a
# model object loaded earlier in the notebook.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-finetuned-qa",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    # Generate sequences during evaluation so compute_metrics receives token ids.
    predict_with_generate=True,
    logging_dir="./logs",
    # T5-family checkpoints are known to overflow under fp16 mixed precision
    # (NaN or zero loss), and fp16=True also raises on a CPU-only runtime, so
    # train in full precision. On GPUs with bfloat16 support, bf16=True is the
    # safe mixed-precision option.
    fp16=False
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()