In [1]:
from google.colab import drive
drive.mount('/content/drive')
project_path = "/content/drive/MyDrive/STUDY/2024Fall/ECE570/TERM-PROJECT"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install faiss-cpu datasets transformers torch tensorboard evaluate rouge_score nltk

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.many

In [3]:
import os
import time
import numpy as np
from tqdm import tqdm
import torch
from torch import nn, optim
from transformers import RagTokenizer, RagTokenForGeneration, RagRetriever, DPRContextEncoder, DPRContextEncoderTokenizerFast, AutoTokenizer, AutoModel
from evaluate import load
from datasets import load_dataset
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
import logging
import sys
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu


In [4]:
# Enhanced CustomLogger with timestamp and better formatting
class CustomLogger:
    def __init__(self, log_dir_base, logger_name):
        self.log_dir_base = log_dir_base
        self.logger_name = logger_name
        self.logger = logging.getLogger(logger_name)

        if not os.path.exists(log_dir_base):
            os.makedirs(log_dir_base)

        self.log_dir = f"{log_dir_base}/{logger_name}"
        if not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir)

        self.logger.setLevel(logging.INFO)

        # Prevent adding multiple handlers if logger is already configured
        if not self.logger.handlers:
            formatter = logging.Formatter('%(asctime)s - [%(levelname)s] - %(message)s')

            # Create file handler
            fh = logging.FileHandler(f"{self.log_dir}/{logger_name}.log")
            fh.setLevel(logging.INFO)
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)

            # Create console handler
            ch = logging.StreamHandler(sys.stdout)
            ch.setLevel(logging.INFO)
            ch.setFormatter(formatter)
            self.logger.addHandler(ch)

    def get_logger(self):
        return self.logger

    def get_log_dir(self):
        return self.log_dir

In [5]:
!export CUDA_LAUNCH_BLOCKING=1

In [6]:
class FineTuneRAGModel(nn.Module):
    def __init__(self, log_dir='./log', model_name='facebook/rag-token-base', args=None, resume_pth=None, start_epoch=0):
        super(FineTuneRAGModel, self).__init__()
        # Initialize logger
        self.logger = CustomLogger(log_dir_base=log_dir, logger_name="fine_tune_rag")
        self.log_dir = self.logger.get_log_dir()
        self.log = self.logger.get_logger()

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.log.info(f"Using device: {self.device}")

        # Load the passages dataset
        self.log.info("Loading passages from rag-datasets/rag-mini-wikipedia...")
        passages_dataset = load_dataset('rag-datasets/rag-mini-wikipedia', 'text-corpus', split='passages')

        self.start_epoch = start_epoch
        if resume_pth == None:
          # Load pre-trained RAG model and tokenizer
          self.tokenizer = RagTokenizer.from_pretrained(model_name, cache_dir=f'{project_path}/hugging_face_models')
          self.model = RagTokenForGeneration.from_pretrained(model_name, cache_dir=f'{project_path}/hugging_face_models')

          # Set up the retriever with the passages
          self.log.info("Setting up retriever...")
          self.retriever = RagRetriever.from_pretrained(
              model_name,
              index_name='exact',
              passages_dataset=passages_dataset,
              use_dummy_dataset=True,  # Avoid loading full dataset
              cache_dir=f'{project_path}/hugging_face_models'
          )
        elif resume_pth != None:
          self.log.info("Resuming training from checkpoint.")
          self.load_model(resume_pth)

        # Set the retriever in the model
        self.model.set_retriever(self.retriever)

        # Move model to device
        self.model.to(self.device)

        # Load the QA dataset
        self.log.info("Loading question-answer dataset...")
        qa_dataset = load_dataset('rag-datasets/rag-mini-wikipedia', 'question-answer', cache_dir=f'{project_path}/datasets')

        # Split the data into training and evaluation sets
        train_test_split = qa_dataset['test'].train_test_split(test_size=0.2)
        self.train_dataset = train_test_split['train']
        self.eval_dataset = train_test_split['test']

        self.log.info("Initialization complete.")

    def train_model(self, epochs=30, batch_size=8, learning_rate=1e-5, checkpoint_interval=10, writer=None):
        """Train the RAG model."""
        self.model.train()
        optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.log.info("Starting training.")

        train_dataloader = DataLoader(self.train_dataset, batch_size=batch_size, shuffle=True)
        total_steps = len(train_dataloader) * epochs

        for epoch in range(self.start_epoch, epochs):
            epoch_start_time = time.time()
            total_loss = 0.0
            self.log.info(f"Epoch {epoch + 1}/{epochs} started.")

            for i, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")):
                questions = batch['question']
                answers = batch['answer']

                # Tokenize the questions and answers
                inputs = self.tokenizer(
                    questions,
                    return_tensors='pt',
                    padding=True,
                    truncation=True
                ).to(self.device)

                labels = self.tokenizer(
                    answers,
                    return_tensors='pt',
                    padding=True,
                    truncation=True
                ).input_ids.to(self.device)

                # Replace padding token ids with -100 to ignore them during loss computation
                labels[labels == self.tokenizer.generator.pad_token_id] = -100

                optimizer.zero_grad()

                # **Pass attention_mask to the model**
                outputs = self.model(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],  # Added attention_mask
                    labels=labels
                )

                # **Ensure loss is scalar by computing the mean**
                loss = outputs.loss.mean()  # Compute mean over batch

                loss.backward()
                optimizer.step()

                total_loss += loss.item()

                if writer:
                    writer.add_scalar("Training Loss", loss.item(), epoch * len(train_dataloader) + i)

                if i % 100 == 0 and i != 0:
                    self.log.info(f"Epoch {epoch + 1}, Step {i}: Loss = {loss.item():.4f}")

            average_loss = total_loss / len(train_dataloader)
            self.log.info(f"Epoch {epoch + 1} complete. Average Loss: {average_loss:.4f}")
            if writer:
                writer.add_scalar("Average Epoch Loss", average_loss, epoch + 1)

            epoch_duration = time.time() - epoch_start_time
            self.log.info(f"Epoch {epoch + 1} duration: {epoch_duration:.2f} seconds")

            # Save checkpoint
            if (epoch + 1) % checkpoint_interval == 0:
                self.save_model(os.path.join(self.log_dir, f"checkpoint_{epoch + 1}"))

    def evaluate_model1(self, batch_size=8, writer=None):
        """Evaluate the RAG model."""
        self.model.eval()
        eval_dataloader = DataLoader(self.eval_dataset, batch_size=batch_size)
        total_bleu_score = 0.0
        total_rouge_score = 0.0
        total_exact_match = 0.0
        total_samples = 0

        rouge_metric = load("rouge")
        self.log.info("Starting evaluation.")

        with torch.no_grad():
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                questions = batch['question']
                answers = batch['answer']

                inputs = self.tokenizer(
                    questions,
                    return_tensors='pt',
                    padding=True,
                    truncation=True
                ).to(self.device)

                # **Pass attention_mask to the generate function**
                generated_ids = self.model.generate(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],  # Added attention_mask
                    max_length=50,
                    num_beams=5,
                    early_stopping=True
                )
                generated_answers = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

                for generated_answer, reference in zip(generated_answers, answers):
                    total_samples += 1

                    # Exact Match
                    if generated_answer.strip() == reference.strip():
                        total_exact_match += 1

                    # BLEU Score
                    bleu_score = sentence_bleu([reference.split()], generated_answer.split())
                    total_bleu_score += bleu_score

                    # ROUGE-L Score
                    rouge_result = rouge_metric.compute(
                        predictions=[generated_answer],
                        references=[reference],
                        rouge_types=["rougeL"]
                    )
                    total_rouge_score += rouge_result["rougeL"]

        avg_exact_match = (total_exact_match / total_samples) * 100
        avg_bleu_score = (total_bleu_score / total_samples) * 100
        avg_rouge_score = (total_rouge_score / total_samples) * 100

        self.log.info(f"Evaluation Results:")
        self.log.info(f"  Exact Match: {avg_exact_match:.2f}%")
        self.log.info(f"  Average BLEU Score: {avg_bleu_score:.2f}%")
        self.log.info(f"  Average ROUGE-L Score: {avg_rouge_score:.2f}%")

        if writer:
            writer.add_scalar("Evaluation/Exact_Match", avg_exact_match)
            writer.add_scalar("Evaluation/BLEU_Score", avg_bleu_score)
            writer.add_scalar("Evaluation/ROUGE_L_Score", avg_rouge_score)

    def evaluate_model(self, batch_size=8, writer=None):
        """Evaluate the RAG model."""
        self.model.eval()
        eval_dataloader = DataLoader(self.eval_dataset, batch_size=batch_size)
        total_bleu_score = 0.0
        total_rouge_score = 0.0
        total_exact_match = 0.0
        total_samples = 0

        rouge_metric = load("rouge")
        self.log.info("Starting evaluation.")

        with torch.no_grad():
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                questions = batch['question']
                answers = batch['answer']

                inputs = self.tokenizer(
                    questions,
                    return_tensors='pt',
                    padding=True,
                    truncation=True
                ).to(self.device)

                # **Pass attention_mask to the generate function**
                generated_ids = self.model.generate(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],  # Added attention_mask
                    max_length=50,
                    num_beams=5,
                    early_stopping=True
                )
                generated_answers = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

                for generated_answer, reference in zip(generated_answers, answers):
                    total_samples += 1

                    # Exact Match
                    if generated_answer.strip() == reference.strip():
                        total_exact_match += 1

                    # BLEU Score
                    bleu_score = sentence_bleu([reference.split()], generated_answer.split())
                    total_bleu_score += bleu_score

                    # ROUGE-L Score
                    rouge_result = rouge_metric.compute(
                        predictions=[generated_answer],
                        references=[reference],
                        rouge_types=["rougeL"]
                    )
                    total_rouge_score += rouge_result["rougeL"]

        avg_exact_match = (total_exact_match / total_samples) * 100
        avg_bleu_score = (total_bleu_score / total_samples) * 100
        avg_rouge_score = (total_rouge_score / total_samples) * 100

        self.log.info(f"Evaluation Results:")
        self.log.info(f"  Exact Match: {avg_exact_match:.2f}%")
        self.log.info(f"  Average BLEU Score: {avg_bleu_score:.2f}%")
        self.log.info(f"  Average ROUGE-L Score: {avg_rouge_score:.2f}%")

        if writer:
            writer.add_scalar("Evaluation/Exact_Match", avg_exact_match)
            writer.add_scalar("Evaluation/BLEU_Score", avg_bleu_score)
            writer.add_scalar("Evaluation/ROUGE_L_Score", avg_rouge_score)


    def inference(self, query):
        """Generate an answer for a given query."""
        self.model.eval()
        with torch.no_grad():
            inputs = self.tokenizer(query, return_tensors='pt').to(self.device)
            generated_ids = self.model.generate(input_ids=inputs['input_ids'])
            generated_answer = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        return generated_answer

    def save_model(self, save_directory):
        """Save the model and tokenizer."""
        if not os.path.exists(save_directory):
            os.makedirs(save_directory)

        model_pth = os.path.join(save_directory, "model")
        tokenizer_pth = os.path.join(save_directory, "tokenizer")
        retriever_pth = os.path.join(save_directory, "retriever")

        self.model.save_pretrained(model_pth)
        self.tokenizer.save_pretrained(tokenizer_pth)
        self.retriever.save_pretrained(retriever_pth)

        self.log.info(f"Model saved to {save_directory}")
        !cp -r {self.log_dir} {project_path}/log/

    def load_model(self, load_directory):
        """Load the model and tokenizer."""
        model_pth = os.path.join(load_directory, "model")
        tokenizer_pth = os.path.join(load_directory, "tokenizer")
        retriever_pth = os.path.join(load_directory, "retriever")
        # Define paths to the sub-tokenizers
        generator_tokenizer_path = os.path.join(tokenizer_pth, "question_encoder_tokenizer")
        question_encoder_tokenizer_path = os.path.join(tokenizer_pth, "generator_tokenizer")

        # Load the tokenizer
        generator_tokenizer = AutoTokenizer.from_pretrained(generator_tokenizer_path)
        question_encoder_tokenizer = AutoTokenizer.from_pretrained(question_encoder_tokenizer_path)
        self.tokenizer = RagTokenizer(question_encoder_tokenizer, generator_tokenizer)
        self.log.info(f"Tokenizer loaded from {load_directory}")

        # Load the model
        self.model = RagTokenForGeneration.from_pretrained(model_pth)
        self.log.info(f"Model loaded from {load_directory}")

        # Load the retriever
        self.retriever = RagRetriever.from_pretrained(retriever_pth)
        self.log.info(f"Retriever loaded from {load_directory}")

    def pipeline(self,
                 num_train_epochs=3,
                 batch_size=8,
                 learning_rate=1e-5,
                 save_model=False,
                 checkpoint_interval=1):
        writer = SummaryWriter(os.path.join(self.log_dir, "tensorboard"))

        self.train_model(
            epochs=num_train_epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            checkpoint_interval=checkpoint_interval,
            writer=writer
        )

        if save_model:
            save_path = os.path.join(self.log_dir, "fine_tuned_rag_model")
            self.save_model(save_directory=save_path)

        self.evaluate_model(batch_size=batch_size, writer=writer)

        writer.close()
        self.log.info("Pipeline finished.")

In [7]:
def train_pipeline(
    num_train_epochs,
    learning_rate,
    model_path
):
    log_dir = './log'
    model = FineTuneRAGModel(log_dir=log_dir, resume_pth=model_path)
    model.pipeline(
        num_train_epochs=1,
        batch_size=8,
        learning_rate=1e-5,
        save_model=True,
        checkpoint_interval=10
    )

    query = "What is the capital of France?"
    answer = model.inference(query)
    print(f"Q: {query}\nA: {answer}")

def do_evaluate(model_path):
    log_dir = './log'
    model = FineTuneRAGModel(log_dir=log_dir, resume_pth=model_path)
    model.evaluate_model()

def do_inference(
    model_path,
    query="What is the capital of France?"
):
    log_dir = './log'
    model = FineTuneRAGModel(log_dir=log_dir, resume_pth=model_path)

    answer = model.inference(query)
    print(f"Q: {query}\nA: {answer}")

In [8]:
if __name__ == "__main__":
    mode = "eval"

    if mode == "train":
        train_pipeline(
            num_train_epochs=3,
            learning_rate=1e-5,
            model_path=None
        )
    elif mode == "resume":
        train_pipeline(
            num_train_epochs=3,
            learning_rate=1e-5,
            model_path=f"{project_path}/log/fine_tune_rag/fine_tuned_rag_model"
        )
    elif mode == "eval":
        do_evaluate(
            model_path=f"{project_path}/log/fine_tune_rag/fine_tuned_rag_model"
        )
    elif mode == "inference":
        do_inference(
            resume_pth=f"{project_path}/log/fine_tune_rag/fine_tuned_rag_model",
            query="What is the capital of France?"
        )

2024-11-01 18:54:19,684 - [INFO] - Using device: cuda


INFO:fine_tune_rag:Using device: cuda


2024-11-01 18:54:19,686 - [INFO] - Loading passages from rag-datasets/rag-mini-wikipedia...


INFO:fine_tune_rag:Loading passages from rag-datasets/rag-mini-wikipedia...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/719 [00:00<?, ?B/s]

part.0.parquet:   0%|          | 0.00/797k [00:00<?, ?B/s]

Generating passages split:   0%|          | 0/3200 [00:00<?, ? examples/s]

2024-11-01 18:54:25,676 - [INFO] - Resuming training from checkpoint.


INFO:fine_tune_rag:Resuming training from checkpoint.


2024-11-01 18:54:32,883 - [INFO] - Tokenizer loaded from /content/drive/MyDrive/STUDY/2024Fall/ECE570/TERM-PROJECT/log/fine_tune_rag/fine_tuned_rag_model


INFO:fine_tune_rag:Tokenizer loaded from /content/drive/MyDrive/STUDY/2024Fall/ECE570/TERM-PROJECT/log/fine_tune_rag/fine_tuned_rag_model


2024-11-01 18:55:22,775 - [INFO] - Model loaded from /content/drive/MyDrive/STUDY/2024Fall/ECE570/TERM-PROJECT/log/fine_tune_rag/fine_tuned_rag_model


INFO:fine_tune_rag:Model loaded from /content/drive/MyDrive/STUDY/2024Fall/ECE570/TERM-PROJECT/log/fine_tune_rag/fine_tuned_rag_model


README.md:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

wiki_dpr.py:   0%|          | 0.00/8.63k [00:00<?, ?B/s]

The repository for wiki_dpr contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wiki_dpr.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


train-00000-of-00001.parquet:   0%|          | 0.00/40.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

  0%|          | 0/10 [00:00<?, ?it/s]

2024-11-01 18:55:41,701 - [INFO] - Retriever loaded from /content/drive/MyDrive/STUDY/2024Fall/ECE570/TERM-PROJECT/log/fine_tune_rag/fine_tuned_rag_model


INFO:fine_tune_rag:Retriever loaded from /content/drive/MyDrive/STUDY/2024Fall/ECE570/TERM-PROJECT/log/fine_tune_rag/fine_tuned_rag_model


2024-11-01 18:55:54,290 - [INFO] - Loading question-answer dataset...


INFO:fine_tune_rag:Loading question-answer dataset...


2024-11-01 18:56:00,036 - [INFO] - Initialization complete.


INFO:fine_tune_rag:Initialization complete.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

2024-11-01 18:56:02,026 - [INFO] - Starting evaluation.


INFO:fine_tune_rag:Starting evaluation.
Evaluating:   0%|          | 0/23 [00:00<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
%load_ext tensorboard
%tensorboard --logdir=/content/drive/MyDrive/STUDY/2024Fall/ECE570/TERM-PROJECT/log/fine_tune_rag/tensorboard

In [None]:
!cp -r /content/logs/ /content/drive/MyDrive/STUDY/2024Fall/ECE570/TERM-PROJECT/

In [None]:
!tree /content/logs/fine_tune_rag/fine_tuned_rag_model