In [None]:
# Install required packages
!pip uninstall -y fsspec gcsfs
!pip install -q fsspec[http]==2024.6.1 gcsfs==2024.6.1
!pip install -q datasets transformers torch accelerate bitsandbytes

import os
import json
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import Dataset
from typing import List, Dict, Tuple
from google.colab import drive

class MIRFlanT5TrainerFullText:
    def __init__(self,
                 base_model: str = "google/flan-t5-base",
                 output_dir: str = "mir_flan_t5_fulltext"):
        """Initialize the FLAN-T5 trainer for full-text MIR questions"""
        print("Initializing MIR FLAN-T5 Full Text Trainer...")
        self.base_model = base_model
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

        print(f"Loading base model: {base_model}")
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(base_model)

        # Move model to GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        self.model = self.model.to(self.device)

    def format_question(self, item: Dict) -> Dict:
        """Format a question for full-text training"""
        try:
            # Remove any prefixes and clean up the input text
            question_text = item['input'].replace("Medical Question: ", "").strip()
            options = item['context']['all_options']
            answer_num = item['context']['numeric_answer']

            # Format input with question and all options
            input_text = f"""Responde esta pregunta médica de manera completa y detallada.

Pregunta: {question_text}

Opciones disponibles:
A) {options[0]}
B) {options[1]}
C) {options[2]}
D) {options[3]}

Respuesta:"""

            # Get the correct answer text
            correct_answer = options[answer_num - 1]

            return {
                'input': input_text,
                'output': correct_answer
            }
        except Exception as e:
            print(f"Error formatting question: {str(e)}")
            print(f"Item structure: {json.dumps(item, indent=2)}")
            return None

    def prepare_training_data(self, questions: List[Dict]) -> Tuple[Dataset, Dataset]:
        """Prepare and split training data"""
        print("Preparing training data...")
        training_data = []

        for i, question in enumerate(questions):
            try:
                formatted = self.format_question(question)
                if formatted:
                    training_data.append(formatted)

                if (i + 1) % 100 == 0:
                    print(f"Processed {i + 1} questions...")

            except Exception as e:
                print(f"Error processing question {i}: {str(e)}")
                continue

        print(f"Created {len(training_data)} training examples")

        if len(training_data) == 0:
            raise ValueError("No valid training examples were created!")

        # Convert to DataFrame
        df = pd.DataFrame(training_data)

        # Tokenize inputs and outputs
        inputs = list(df["input"])
        outputs = list(df["output"])

        # Tokenize with padding and truncation
        tokenized_inputs = self.tokenizer(
            inputs, padding=True, truncation=True, max_length=1024, return_tensors="pt"
        )
        tokenized_outputs = self.tokenizer(
            outputs, padding=True, truncation=True, max_length=256, return_tensors="pt"
        )

        # Create tokenized dataset
        tokenized_data = {
            "input_ids": tokenized_inputs["input_ids"],
            "attention_mask": tokenized_inputs["attention_mask"],
            "labels": tokenized_outputs["input_ids"],
        }

        # Split data
        total_examples = len(tokenized_data["input_ids"])
        train_size = int(0.9 * total_examples)

        train_data = {k: v[:train_size] for k, v in tokenized_data.items()}
        eval_data = {k: v[train_size:] for k, v in tokenized_data.items()}

        train_dataset = Dataset.from_dict(train_data)
        eval_dataset = Dataset.from_dict(eval_data)

        return train_dataset, eval_dataset

    def train(self, train_dataset: Dataset, eval_dataset: Dataset):
        """Train the model"""
        print("Starting training...")

        # Disable W&B integration
        os.environ["WANDB_DISABLED"] = "true"

        training_args = TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=5,  # Increased epochs for better learning
            per_device_train_batch_size=4,  # Reduced batch size due to longer sequences
            per_device_eval_batch_size=4,
            warmup_steps=200,  # Increased warmup steps
            weight_decay=0.01,
            logging_dir=f'{self.output_dir}/logs',
            logging_steps=10,
            evaluation_strategy="steps",
            eval_steps=100,
            save_strategy="steps",
            save_steps=100,
            load_best_model_at_end=True,
            save_total_limit=2,
            fp16=torch.cuda.is_available(),
            remove_unused_columns=False,
            learning_rate=2e-5,  # Slightly lower learning rate
            gradient_accumulation_steps=4,  # Added gradient accumulation
            max_grad_norm=0.5,   # Gradient clipping
        )

        data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
            padding=True,
            max_length=1024
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer
        )

        print("Training model...")
        trainer.train()

        print("Saving model...")
        trainer.save_model()
        self.tokenizer.save_pretrained(self.output_dir)
        print(f"Model saved to {self.output_dir}")

def main():
    print("Starting MIR FLAN-T5 Full Text Training Pipeline...")

    try:
        # Mount Google Drive
        drive.mount('/content/drive')

        # Define paths
        base_dir = '/content/drive/MyDrive/TFM2'
        training_data_path = f"{base_dir}/meli-training-content/full_context/flan_t5_training.json"
        output_dir = f"{base_dir}/models/mir_flan_t5_fulltext"

        # Initialize trainer
        trainer = MIRFlanT5TrainerFullText(
            base_model="google/flan-t5-base",
            output_dir=output_dir
        )

        # Load training data
        print("\nLoading training data...")
        with open(training_data_path, 'r', encoding='utf-8') as f:
            questions = json.load(f)
        print(f"Loaded {len(questions)} questions")

        # Print sample question for debugging
        if questions:
            print("\nSample question format:")
            print(json.dumps(questions[0], indent=2))

        # Prepare and split training data
        train_dataset, eval_dataset = trainer.prepare_training_data(questions)

        print(f"\nSplit sizes:")
        print(f"Training examples: {len(train_dataset)}")
        print(f"Validation examples: {len(eval_dataset)}")

        # Train the model
        trainer.train(train_dataset, eval_dataset)

        print("\nTraining complete! Model saved to:", output_dir)

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        raise

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        raise

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hStarting MIR FLAN-T5 Full Text Training Pipeline...
Mounted at /content/drive
Initializing MIR FLAN-T5 Full Text Trainer...
Loading base model: google/flan-t5-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Using device: cuda

Loading training data...
Loaded 619 questions

Sample question format:
{
  "id": "MED2019-P105",
  "input": "Answer this medical multiple choice question.\nSelect the best answer: A, B, C, or D.\n\nQuestion: 000 UIL. Se aporta imagen de la ecograf\u00eda transvaginal. Qu\u00e9 indicar\u00eda en ese momento:\n\nOptions:\nA: Repetir seriadamente cada 2 d\u00edas la ecograf\u00eda y la \u00df-HCG.\nB: Tranquilizar a la paciente, indicar reposo, antiem\u00e9ticos y repetir la ecograf\u00eda en una semana.\nC: Legrado uterino.\nD: Tratamiento con misoprostol por v\u00eda sist\u00e9mica. - 2 029102 ANICIDEM ESF",
  "output": "The correct answer is D: Tratamiento con misoprostol por v\u00eda sist\u00e9mica. - 2 029102 ANICIDEM ESF",
  "context": {
    "year": "2019",
    "question_number": "105",
    "all_options": [
      "Repetir seriadamente cada 2 d\u00edas la ecograf\u00eda y la \u00df-HCG.",
      "Tranquilizar a la paciente, indicar reposo, antiem\u00e9ticos y repet

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



Split sizes:
Training examples: 557
Validation examples: 62
Starting training...


  trainer = Trainer(


Training model...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
100,0.0,




Saving model...
Model saved to /content/drive/MyDrive/TFM2/models/mir_flan_t5_fulltext

Training complete! Model saved to: /content/drive/MyDrive/TFM2/models/mir_flan_t5_fulltext



Inspect training examples:



In [None]:
# Load and display some training examples
base_dir = '/content/drive/MyDrive/TFM2'
training_data_path = f"{base_dir}/meli-training-content/full_context/flan_t5_training.json"

# Load the training data
with open(training_data_path, 'r', encoding='utf-8') as f:
    questions = json.load(f)
print(f"Loaded {len(questions)} questions")

# Display a few examples
num_examples = 5
print(f"\nDisplaying {num_examples} training examples:")

for i, question in enumerate(questions[:num_examples]):
    print(f"\n{'='*80}")
    print(f"Example {i+1}:")
    print(f"\nOriginal format:")
    print(json.dumps(question, indent=2))

    print(f"\nInput format for training:")
    input_text = f"""Responde esta pregunta médica de manera completa y detallada.

Pregunta: {question['input'].replace('Medical Question: ', '').strip()}

Opciones disponibles:
A) {question['context']['all_options'][0]}
B) {question['context']['all_options'][1]}
C) {question['context']['all_options'][2]}
D) {question['context']['all_options'][3]}

Respuesta:"""
    print(input_text)

    print(f"\nExpected output:")
    correct_answer = question['context']['all_options'][question['context']['numeric_answer'] - 1]
    print(correct_answer)
    print(f"{'='*80}")

Loaded 619 questions

Displaying 5 training examples:

Example 1:

Original format:
{
  "id": "MED2019-P105",
  "input": "Answer this medical multiple choice question.\nSelect the best answer: A, B, C, or D.\n\nQuestion: 000 UIL. Se aporta imagen de la ecograf\u00eda transvaginal. Qu\u00e9 indicar\u00eda en ese momento:\n\nOptions:\nA: Repetir seriadamente cada 2 d\u00edas la ecograf\u00eda y la \u00df-HCG.\nB: Tranquilizar a la paciente, indicar reposo, antiem\u00e9ticos y repetir la ecograf\u00eda en una semana.\nC: Legrado uterino.\nD: Tratamiento con misoprostol por v\u00eda sist\u00e9mica. - 2 029102 ANICIDEM ESF",
  "output": "The correct answer is D: Tratamiento con misoprostol por v\u00eda sist\u00e9mica. - 2 029102 ANICIDEM ESF",
  "context": {
    "year": "2019",
    "question_number": "105",
    "all_options": [
      "Repetir seriadamente cada 2 d\u00edas la ecograf\u00eda y la \u00df-HCG.",
      "Tranquilizar a la paciente, indicar reposo, antiem\u00e9ticos y repetir la e