<a href="https://colab.research.google.com/github/alessioborgi/DL_Project/blob/main/Source/InfoRetrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

==================================================

**Project Name:** Neural inverted index for fast and effective information retrieval\
**Course:** Deep Learning\
**University:** Sapienza Università di Roma

**Authors:**
  - [Alessio Borgi] (<tt>1952442</tt>)
  - [Eugenio Bugli] (<tt>1934824</tt>)
  - [Damiano Imola] (<tt>2109063</tt>)

**Date:** [November 2024 - Completion Date]


**Implementations**
*   Differentiable Search Index architecture
*   DSI-Multi dataset generation


**Novelties**
*   Dynamic pruning
*   Semantic and Stopwords Augmentation
*   Part Of Speech Masked Language Model (POS-MLM) Augmentation




==================================================

# 0: INSTALL & IMPORT LIBRARIES

In [1]:
%%capture
!pip install -q --upgrade pip
# it takes quite a lot
!pip install -q pyserini==0.12.0
# !pip install -q pyserini==0.21.0
# !pip install -q faiss-gpu
!pip install -q pytorch-lightning transformers datasets torch wandb

In [7]:
# base
import json
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from collections import Counter
from typing import List, Tuple
from tqdm import tqdm

# for pyserini stuffs
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

# cool plots
import wandb

# torch
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

# lightning
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor, StochasticWeightAveraging, DeviceStatsMonitor, ModelPruning


# HF and similar
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer, AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration, EncoderDecoderCache

# sklearn
from sklearn.preprocessing import normalize
from sklearn.cluster import AgglomerativeClustering, KMeans

# pyserini
# import faiss
from pyserini.index import IndexReader
from pyserini.search import SimpleSearcher
# from pyserini.search.lucene import LuceneSearcher

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, RegexpTokenizer

# PEFT imports
from peft import LoraConfig, get_peft_model, TaskType

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Currently using {device}")

Currently using cuda


In [11]:
wandb.login()
# wandb.login(key="b3bce19a09c51bdf8a19eb3dc58f7c44de929e13") #(ALESSIO)
# wandb.login(key="6d550e12a1b8f716ebe580082f495c01ed2adf6c") #(DAMIANO)
wandb.login(key="551e67be2c716a42ea3230a7c4fc639fc985f98f") #(EUGENIO)
wandb.init(project="IR_DSI", resume="allow")

[34m[1mwandb[0m: Currently logged in as: [33meugeniobugli15[0m ([33madavit[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1: LOAD TOKENIZED DATASET

In [13]:
# copy to local memory
!cp '/content/drive/MyDrive/deep-learning-files/train_data_tokenized.pt' '/content/train_data_tokenized.pt'
!cp '/content/drive/MyDrive/deep-learning-files/validation_data_tokenized.pt' '/content/validation_data_tokenized.pt'
!cp '/content/drive/MyDrive/deep-learning-files/test_data_tokenized.pt' '/content/test_data_tokenized.pt'

In [14]:
class DatasetLoader(torch.utils.data.Dataset):
    def __init__(self, file_name, up_to_k=5000):
        self.data = torch.load(file_name)[:up_to_k]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [15]:
train_dataset = DatasetLoader('/content/train_data_tokenized.pt', up_to_k=5)
val_dataset = DatasetLoader('/content/validation_data_tokenized.pt', up_to_k=5)
test_dataset = DatasetLoader('/content/test_data_tokenized.pt', up_to_k=5)

  self.data = torch.load(file_name)[:up_to_k]


In [16]:
train_dataloader = DataLoader(train_dataset, batch_size=3, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=3)
test_dataloader = DataLoader(test_dataset, batch_size=3)

In [17]:
print(len(train_dataset), len(val_dataset), len(test_dataset))
print(len(train_dataset) + len(val_dataset) + len(test_dataset))

5 5 5
15


# 2: MODEL

In [18]:
max_encoder_squence_len = 1797 # 48
max_decoder_squence_len = 4
max_decoder_squence_len_1000 = max_decoder_squence_len * 1000

In [19]:
class DSIT5Model(pl.LightningModule):
    def __init__(self, model_name="t5-small", learning_rate=5e-5, max_decoder_squence_len=max_decoder_squence_len, max_decoder_squence_len_1000=max_decoder_squence_len_1000):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(model_name) # transformer
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.learning_rate = learning_rate


    def training_step(self, batch, batch_idx):
        queries = torch.Tensor(batch['query']).squeeze(1).to(device)
        input_ids = torch.Tensor(batch['input_ids']).squeeze(1)
        decoder_input_ids = torch.Tensor(batch['decoder_input_ids']).squeeze(1)
        decoder_1000_input_ids = torch.Tensor(batch['decoder_ranked_input_ids']).squeeze(1)

        # indexing task
        index_output = self.model(input_ids=input_ids, labels=decoder_input_ids)
        index_loss = index_output.loss

        # retrieval task
        retrieval_output = self.model(input_ids=input_ids, labels=queries)
        retrieval_loss = retrieval_output.loss

        loss = index_loss + retrieval_loss

        # metrics = self.compute_metrics(input_ids, decoder_input_ids, decoder_1000_input_ids)

        self.log("train_loss", loss, on_epoch=True)
        return loss


    def validation_step(self, batch, batch_idx):
        queries = torch.Tensor(batch['query']).squeeze(1).to(device)
        input_ids = torch.Tensor(batch['input_ids']).squeeze(1)
        decoder_input_ids = torch.Tensor(batch['decoder_input_ids']).squeeze(1)
        decoder_1000_input_ids = torch.Tensor(batch['decoder_ranked_input_ids']).squeeze(1)

        # indexing task
        index_output = self.model(input_ids=input_ids, labels=decoder_input_ids)
        index_loss = index_output.loss

        # retrieval task
        retrieval_output = self.model(input_ids=input_ids, labels=queries)
        retrieval_loss = retrieval_output.loss

        loss = index_loss + retrieval_loss

        metrics = self.compute_metrics(input_ids, decoder_input_ids, decoder_1000_input_ids)

        self.log("validation_loss", loss, on_epoch=True)
        return loss



    def test_step(self, batch, batch_idx):
        queries = torch.Tensor(batch['query']).squeeze(1).to(device)
        input_ids = torch.Tensor(batch['input_ids']).squeeze(1)
        decoder_input_ids = torch.Tensor(batch['decoder_input_ids']).squeeze(1)
        decoder_1000_input_ids = torch.Tensor(batch['decoder_ranked_input_ids']).squeeze(1)

        # indexing task
        index_output = self.model(input_ids=input_ids, labels=decoder_input_ids)
        index_loss = index_output.loss

        # retrieval task
        retrieval_output = self.model(input_ids=input_ids, labels=queries)
        retrieval_loss = retrieval_output.loss

        loss = index_loss + retrieval_loss

        metrics = self.compute_metrics(queries, decoder_input_ids, decoder_1000_input_ids)

        self.log("test_loss", loss, on_epoch=True)
        return loss




    def compute_metrics(self, input_ids, decoder_input_ids, decoder_1000_input_ids):
        # infer top 1000 documents
        predicted_docids_tokenized = self.model.generate(input_ids, max_length=max_decoder_squence_len_1000)

        predicted_docids = []
        predicted_1000_docids = []
        for prediction in predicted_docids_tokenized:
            # retrieve top 1
            # decoded = self.tokenizer.decode(prediction[:max_decoder_squence_len], skip_special_tokens=True)

            decoded_top1 = self.tokenizer.decode(prediction[:max_decoder_squence_len], skip_special_tokens=True)
            splitted = decoded_top1.split()
            predicted_docids.append(splitted[0] if splitted else "")
            # predicted_docids.append(decoded.split()[0])

            # retrieve top 1000
            decoded_1000 = self.tokenizer.decode(prediction[:max_decoder_squence_len_1000], skip_special_tokens=True)
            predicted_1000_docids.append(decoded_1000.split())


        # ground truth
        target_docids = []
        for ground_truth in decoder_input_ids:
            decoded = self.tokenizer.decode(ground_truth, skip_special_tokens=True)
            target_docids.append(decoded)


        # compute metrics
        recall_at_1000 = self.compute_recall_at_1000(predicted_1000_docids, decoder_1000_input_ids)
        map = self.compute_map(predicted_docids, target_docids)

        return {
            "recall_at_1000": recall_at_1000,
            "map": map
        }




    def compute_recall_at_1000(self, predicted_docids, target_docids):
        recalls = []

        for predicted, target in zip(predicted_docids, target_docids):
            predicted_set = set(predicted[:1000])
            target_set = set(target)

            if not target_set:
                recalls.append(0)
                continue

            recall = len(predicted_set.intersection(target_set)) / len(target_set)
            recalls.append(recall)

        return np.mean(recalls)



    def compute_map(self, predicted_docids, target_docids):
        aps = []
        for predicted, target in zip(predicted_docids, target_docids):
            target_set = set(target.split())

            if not target_set:
                aps.append(0)
                continue

            precision_at_k = []
            num_hits = 0

            for i, doc in enumerate(predicted_docids):
                if doc in target_set:
                    num_hits += 1
                    precision_at_k.append(num_hits / (i + 1))

            # Average Precision for this query
            aps.append(np.mean(precision_at_k) if precision_at_k else 0)

        return np.mean(aps)


    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

# 3: TRAINING

In [25]:
model = DSIT5Model()

logger = WandbLogger(project="IR_DSI_Project")

# ===== CALLBACKS =====
checkpoint_callback = ModelCheckpoint(
    monitor='validation_loss',
    dirpath='checkpoints/',
    filename='dsi-t5-{epoch:02d}-{val_loss:.2f}',
    save_top_k=1,
    mode='min'
)

early_stopping_callback = EarlyStopping(monitor='validation_loss', patience=3, mode='min')

# ===== DYNAMIC PRUNING ====
# removes individual weights based on magnitude or importance
# removes the ones with smallest L1 norm
# amount removes 20% of the smallest magnitude weights
pruning_callback = ModelPruning("l1_unstructured", amount=0.5)

trainer = pl.Trainer(
    max_epochs=5,
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping_callback, pruning_callback],

    accelerator="auto",

    # gradient accumulation
    accumulate_grad_batches=4, # gradient is computed after 4 batches

    # mixed precision
    precision='16-mixed', # 16 bit precision of my model
)

# Train the model
trainer.fit(model, train_dataloader, val_dataloaders=val_dataloader)

wandb.finish()

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M | eval
------------------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)
0         Modules in train mode
277       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:
Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [None]:
# save locally
torch.save(model, '/content/checkpoints/<CHECKPOINT_NAME>')
# save to GDrive
!cp '/content/model.pth' '/content/drive/MyDrive/deep-learning-files/checkpoint_<EPOCH>.pth'


# save locally
torch.save(model, '/content/model.pth')
# save to GDrive
!cp '/content/model.pth' '/content/drive/MyDrive/deep-learning-files/model_<EPOCH>.pth'


# save locally
torch.save(model.state_dict(), '/content/model_state_dict.pth')
# save to GDrive
!cp '/content/model_state_dict.pth' '/content/drive/MyDrive/deep-learning-files/model_state_dict_<EPOCH>.pth'

# PEFT (Parameter Efficient Finetuning) with LORA (adapter)

In [31]:
class DSIT5ModelLORA(pl.LightningModule):
    def __init__(self, model_name="t5-small", learning_rate=5e-5, max_decoder_squence_len=max_decoder_squence_len, max_decoder_squence_len_1000=max_decoder_squence_len_1000, lora_r=8, lora_alpha=32, lora_dropout=0.1):
        super().__init__()

        self.T5 = T5ForConditionalGeneration.from_pretrained(model_name) # transformer

        # PEFT sets requires_grad=False on all original T5 layers
        # LoRA parameters (the low-rank adapters) are injected into the T5 attention layers and do requires_grad=True
        # during training, only these small LoRA adapters get updated, leaving the main T5 weights untouched
        self.peft_config = LoraConfig(
            task_type=TaskType.SEQ_2_SEQ_LM,
            inference_mode=False,
            r=lora_r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout
        )

        self.model = get_peft_model(self.T5, self.peft_config)

        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.learning_rate = learning_rate


    def training_step(self, batch, batch_idx):
        queries = torch.Tensor(batch['query']).squeeze(1).to(device)
        input_ids = torch.Tensor(batch['input_ids']).squeeze(1)
        decoder_input_ids = torch.Tensor(batch['decoder_input_ids']).squeeze(1)
        decoder_1000_input_ids = torch.Tensor(batch['decoder_ranked_input_ids']).squeeze(1)

        # indexing task
        index_output = self.model(input_ids=input_ids, labels=decoder_input_ids)
        index_loss = index_output.loss

        # retrieval task
        retrieval_output = self.model(input_ids=input_ids, labels=queries)
        retrieval_loss = retrieval_output.loss

        loss = index_loss + retrieval_loss

        # metrics = self.compute_metrics(input_ids, decoder_input_ids, decoder_1000_input_ids)

        self.log("train_loss", loss, on_epoch=True)
        return loss


    def validation_step(self, batch, batch_idx):
        queries = torch.Tensor(batch['query']).squeeze(1).to(device)
        input_ids = torch.Tensor(batch['input_ids']).squeeze(1)
        decoder_input_ids = torch.Tensor(batch['decoder_input_ids']).squeeze(1)
        decoder_1000_input_ids = torch.Tensor(batch['decoder_ranked_input_ids']).squeeze(1)

        # indexing task
        index_output = self.model(input_ids=input_ids, labels=decoder_input_ids)
        index_loss = index_output.loss

        # retrieval task
        retrieval_output = self.model(input_ids=input_ids, labels=queries)
        retrieval_loss = retrieval_output.loss

        loss = index_loss + retrieval_loss

        metrics = self.compute_metrics(input_ids, decoder_input_ids, decoder_1000_input_ids)

        self.log("validation_loss", loss, on_epoch=True)
        return loss



    def test_step(self, batch, batch_idx):
        queries = torch.Tensor(batch['query']).squeeze(1).to(device)
        input_ids = torch.Tensor(batch['input_ids']).squeeze(1)
        decoder_input_ids = torch.Tensor(batch['decoder_input_ids']).squeeze(1)
        decoder_1000_input_ids = torch.Tensor(batch['decoder_ranked_input_ids']).squeeze(1)

        # indexing task
        index_output = self.model(input_ids=input_ids, labels=decoder_input_ids)
        index_loss = index_output.loss

        # retrieval task
        retrieval_output = self.model(input_ids=input_ids, labels=queries)
        retrieval_loss = retrieval_output.loss

        loss = index_loss + retrieval_loss

        metrics = self.compute_metrics(queries, decoder_input_ids, decoder_1000_input_ids)

        self.log("test_loss", loss, on_epoch=True)
        return loss



    def compute_metrics(self, input_ids, decoder_input_ids, decoder_1000_input_ids):
        # get_base_model().generate(...) uses the LoRA adapters in the forward pass
        # We don’t lose the LoRA modifications by calling the base model, because LoRA modifies the internals of T5’s attention layers

        # infer top 1000 documents
        predicted_docids_tokenized = self.model.get_base_model().generate(input_ids, max_length=max_decoder_squence_len_1000)

        predicted_docids = []
        predicted_1000_docids = []
        for prediction in predicted_docids_tokenized:

            # retrieve top 1
            # decoded = self.tokenizer.decode(prediction[:max_decoder_squence_len], skip_special_tokens=True)

            decoded_top1 = self.tokenizer.decode(prediction[:max_decoder_squence_len], skip_special_tokens=True)
            splitted = decoded_top1.split()
            predicted_docids.append(splitted[0] if splitted else "")
            # predicted_docids.append(decoded.split()[0])

            # retrieve top 1000
            decoded_1000 = self.tokenizer.decode(prediction[:max_decoder_squence_len_1000], skip_special_tokens=True)
            predicted_1000_docids.append(decoded_1000.split())

        # ground truth
        target_docids = []
        for ground_truth in decoder_input_ids:
            decoded = self.tokenizer.decode(ground_truth, skip_special_tokens=True)
            target_docids.append(decoded)


        # compute metrics
        recall_at_1000 = self.compute_recall_at_1000(predicted_1000_docids, decoder_1000_input_ids)
        map = self.compute_map(predicted_docids, target_docids)

        return {
            "recall_at_1000": recall_at_1000,
            "map": map
        }




    def compute_recall_at_1000(self, predicted_docids, target_docids):
        recalls = []

        for predicted, target in zip(predicted_docids, target_docids):
            predicted_set = set(predicted[:1000])
            target_set = set(target)

            if not target_set:
                recalls.append(0)
                continue

            recall = len(predicted_set.intersection(target_set)) / len(target_set)
            recalls.append(recall)

        return np.mean(recalls)



    def compute_map(self, predicted_docids, target_docids):
        aps = []
        for predicted, target in zip(predicted_docids, target_docids):
            target_set = set(target.split())

            if not target_set:
                aps.append(0)
                continue

            precision_at_k = []
            num_hits = 0

            for i, doc in enumerate(predicted_docids):
                if doc in target_set:
                    num_hits += 1
                    precision_at_k.append(num_hits / (i + 1))

            # Average Precision for this query
            aps.append(np.mean(precision_at_k) if precision_at_k else 0)

        return np.mean(aps)


    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

In [33]:
model = DSIT5ModelLORA(
    lora_r=8,
    lora_alpha=32,
    lora_dropout=0.1
)

logger = WandbLogger(project="IR_DSI_Project")

# ===== CALLBACKS =====
checkpoint_callback = ModelCheckpoint(
    monitor='validation_loss',
    dirpath='checkpoints/',
    filename='dsi-t5-lora-{epoch:02d}-{val_loss:.2f}',
    save_top_k=1,
    mode='min'
)

early_stopping_callback = EarlyStopping(monitor='validation_loss', patience=3, mode='min')

# ===== DYNAMIC PRUNING ====
# removes individual weights based on magnitude or importance
# removes the ones with smallest L1 norm
# amount removes 20% of the smallest magnitude weights
pruning_callback = ModelPruning("l1_unstructured", amount=0.5)

trainer = pl.Trainer(
    max_epochs=5,
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping_callback],#pruning_callback],

    accelerator="auto",

    # gradient accumulation
    accumulate_grad_batches=4, # gradient is computed after 4 batches

    # mixed precision
    precision='16-mixed', # 16 bit precision of my model
)

# Train the model
trainer.fit(model, train_dataloader, val_dataloaders=val_dataloader)

wandb.finish()

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode 
-------------------------------------------------------------
0 | T5    | T5ForConditionalGeneration | 60.8 M | eval 
1 | model | PeftModelForSeq2SeqLM      | 60.8 M | train
-------------------------------------------------------------
294 K     Trainable params
60.5 M    Non-trainable params
60.8 M    Total params
243.206   Total estimated model params size (MB)
362       Modules in train mode
277       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

0,1
epoch,▁▁▁▁▁▁▁▁▃▃▆▆▁▁▁▃▃▆▆██
train_loss_epoch,▁▁▁▁█▆▁▁▁▁
trainer/global_step,▁▁▁▁▁▁▁▁▃▃▆▆▁▁▁▃▃▆▆██
validation_loss,▁▁▁▁█▇▁▁▁▁▁

0,1
epoch,3.0
train_loss_epoch,12.58074
trainer/global_step,3.0
validation_loss,13.29458


# QLORA

In [20]:
!pip uninstall -y bitsandbytes accelerate peft


Found existing installation: bitsandbytes 0.45.1
Uninstalling bitsandbytes-0.45.1:
  Successfully uninstalled bitsandbytes-0.45.1
Found existing installation: accelerate 1.3.0
Uninstalling accelerate-1.3.0:
  Successfully uninstalled accelerate-1.3.0
Found existing installation: peft 0.14.0
Uninstalling peft-0.14.0:
  Successfully uninstalled peft-0.14.0


In [11]:
!python -m bitsandbytes

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++ BUG REPORT INFORMATION ++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++ OTHER +++++++++++++++++++++++++++
CUDA specs: CUDASpecs(highest_compute_capability=(7, 5), cuda_version_string='121', cuda_version_tuple=(12, 1))
PyTorch settings found: CUDA_VERSION=121, Highest Compute Capability: (7, 5).
To manually override the PyTorch CUDA version please see: https://github.com/TimDettmers/bitsandbytes/blob/main/docs/source/nonpytorchcuda.mdx
The directory listed in your path is found to be non-existent: /sys/fs/cgroup/memory.events /var/colab/cgroup/jupyter-children/memory.events
The directory listed in your path is found to be non-existent: //172.28.0.1
The directory listed in your path is found to be non-existent: 8013
The directory listed in your path is found to be non-existent: //colab.research.google.com/tun/m/cc48301118ce562b961b3c22d803539adc1e0c19/

In [21]:
!pip install bitsandbytes -U -q
!pip install accelerate peft -q
!pip install -U gdown transformers -q

In [22]:
#!pip install bitsandbytes -q -U
#!pip install bitsandbytes==0.39.1
#!pip install bitsandbytes

import transformers as tra
import peft as pf
import bitsandbytes as bnb

from transformers import T5ForConditionalGeneration, T5Tokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from bitsandbytes.optim import AdamW8bit

print(pf.__version__)
print(tra.__version__)
print(bnb.__version__)

0.14.0
4.48.1
0.45.1


In [23]:
class DSIT5ModelQLORA(pl.LightningModule):
    def __init__(self, model_name="t5-small", learning_rate=5e-5, max_decoder_squence_len=max_decoder_squence_len, max_decoder_squence_len_1000=max_decoder_squence_len_1000, lora_r=8, lora_alpha=32, lora_dropout=0.1):
        super().__init__()

        # 4-bit quantization
        bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.float16)

        # load T5 in 4-bit precision
        self.base_t5 = T5ForConditionalGeneration.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")

        self.peft_config = LoraConfig(
            task_type=TaskType.SEQ_2_SEQ_LM,
            inference_mode=False,
            r=lora_r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout
        )

        self.model = get_peft_model(self.base_t5, self.peft_config)

        self.tokenizer = T5Tokenizer.from_pretrained(model_name)

        self.learning_rate = learning_rate
        self.max_decoder_squence_len = max_decoder_squence_len
        self.max_decoder_squence_len_1000 = max_decoder_squence_len_1000

    def training_step(self, batch, batch_idx):
        # Expecting your batch to have these keys
        queries = batch['query'].squeeze(1).to(device)
        input_ids = batch['input_ids'].squeeze(1).to(device)
        decoder_input_ids = batch['decoder_input_ids'].squeeze(1).to(device)
        decoder_1000_input_ids = batch['decoder_ranked_input_ids'].squeeze(1).to(device)

        # indexing task
        index_output = self.model(input_ids=input_ids, labels=decoder_input_ids)
        index_loss = index_output.loss

        # retrieval task
        retrieval_output = self.model(input_ids=input_ids, labels=queries)
        retrieval_loss = retrieval_output.loss

        loss = index_loss + retrieval_loss

        self.log("train_loss", loss, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        queries = batch['query'].squeeze(1).to(device)
        input_ids = batch['input_ids'].squeeze(1).to(device)
        decoder_input_ids = batch['decoder_input_ids'].squeeze(1).to(device)
        decoder_1000_input_ids = batch['decoder_ranked_input_ids'].squeeze(1).to(device)

        index_output = self.model(input_ids=input_ids, labels=decoder_input_ids)
        index_loss = index_output.loss

        retrieval_output = self.model(input_ids=input_ids, labels=queries)
        retrieval_loss = retrieval_output.loss

        loss = index_loss + retrieval_loss

        metrics = self.compute_metrics(input_ids, decoder_input_ids, decoder_1000_input_ids)

        self.log("validation_loss", loss, on_epoch=True)
        return loss

    def test_step(self, batch, batch_idx):
        queries = batch['query'].squeeze(1).to(device)
        input_ids = batch['input_ids'].squeeze(1).to(device)
        decoder_input_ids = batch['decoder_input_ids'].squeeze(1).to(device)
        decoder_1000_input_ids = batch['decoder_ranked_input_ids'].squeeze(1).to(device)

        index_output = self.model(input_ids=input_ids, labels=decoder_input_ids)
        index_loss = index_output.loss

        retrieval_output = self.model(input_ids=input_ids, labels=queries)
        retrieval_loss = retrieval_output.loss

        loss = index_loss + retrieval_loss

        metrics = self.compute_metrics(queries, decoder_input_ids, decoder_1000_input_ids)

        self.log("test_loss", loss, on_epoch=True)
        return loss

    def compute_metrics(self, input_ids, decoder_input_ids, decoder_1000_input_ids):
        # For generation, we call get_base_model() to access T5ForConditionalGeneration.generate(...)
        predicted_docids_tokenized = self.model.get_base_model().generate(input_ids=input_ids, max_length=self.max_decoder_squence_len_1000)

        predicted_docids = []
        predicted_1000_docids = []

        for prediction in predicted_docids_tokenized:
            # top 1
            decoded_top1 = self.tokenizer.decode(prediction[:self.max_decoder_squence_len], skip_special_tokens=True)
            splitted = decoded_top1.split()
            predicted_docids.append(splitted[0] if splitted else "")

            # top 1000
            decoded_1000 = self.tokenizer.decode(prediction[:self.max_decoder_squence_len_1000], skip_special_tokens=True)
            predicted_1000_docids.append(decoded_1000.split())

        # ground truth
        target_docids = []
        for ground_truth in decoder_input_ids:
            decoded = self.tokenizer.decode(ground_truth, skip_special_tokens=True)
            target_docids.append(decoded)

        # compute metrics
        recall_at_1000 = self.compute_recall_at_1000(predicted_1000_docids, decoder_1000_input_ids)
        map_val = self.compute_map(predicted_docids, target_docids)

        return {
            "recall_at_1000": recall_at_1000,
            "map": map_val
        }

    def compute_recall_at_1000(self, predicted_docids, target_docids):
        recalls = []
        for predicted, target in zip(predicted_docids, target_docids):
            predicted_set = set(predicted[:1000])
            target_set = set(target)
            if not target_set:
                recalls.append(0)
                continue
            recall = len(predicted_set.intersection(target_set)) / len(target_set)
            recalls.append(recall)
        return np.mean(recalls)

    def compute_map(self, predicted_docids, target_docids):
        aps = []
        for predicted, target in zip(predicted_docids, target_docids):
            target_set = set(target.split())
            if not target_set:
                aps.append(0)
                continue

            precision_at_k = []
            num_hits = 0
            for i, doc in enumerate(predicted):
                if doc in target_set:
                    num_hits += 1
                    precision_at_k.append(num_hits / (i + 1))

            avg_precision = np.mean(precision_at_k) if precision_at_k else 0
            aps.append(avg_precision)

        return np.mean(aps)

    def configure_optimizers(self):
        optimizer = AdamW8bit(self.parameters(), lr=self.learning_rate)
        return optimizer

In [24]:
model = DSIT5ModelQLORA(
    lora_r=8,
    lora_alpha=32,
    lora_dropout=0.1
)

logger = WandbLogger(project="IR_DSI_Project")

# ===== CALLBACKS =====
checkpoint_callback = ModelCheckpoint(
    monitor='validation_loss',
    dirpath='checkpoints/',
    filename='dsi-t5-lora-{epoch:02d}-{val_loss:.2f}',
    save_top_k=1,
    mode='min'
)

early_stopping_callback = EarlyStopping(monitor='validation_loss', patience=3, mode='min')

# ===== DYNAMIC PRUNING ====
# removes individual weights based on magnitude or importance
# removes the ones with smallest L1 norm
# amount removes 20% of the smallest magnitude weights
pruning_callback = ModelPruning("l1_unstructured", amount=0.5)

trainer = pl.Trainer(
    max_epochs=5,
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping_callback], #pruning_callback],

    accelerator="auto",

    # gradient accumulation
    accumulate_grad_batches=4, # gradient is computed after 4 batches

    # mixed precision
    precision='16-mixed', # 16 bit precision of my model
)

# Train the model
trainer.fit(model, train_dataloader, val_dataloaders=val_dataloader)

wandb.finish()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_z

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

0,1
epoch,▁▁▃▃▆▆██
train_loss_epoch,█▁▆▁
trainer/global_step,▁▁▃▃▆▆██
validation_loss,▁▁▁▁

0,1
epoch,3.0
train_loss_epoch,13.39073
trainer/global_step,3.0
validation_loss,13.63606


# EMPTY GPU RAM

In [None]:
# dump everything in your GPU (call 3 times to work)
import gc
gc.collect()
# del model, trainer
torch.cuda.empty_cache()

# 8: FURTHER IMPROVEMENTS AND TODOS

In [None]:
# TODO
# 1. refactoring totale (EUGENIO)
# 2. mean number of words in passages with and without stopwords/punktuation (plot)
# 3. (DAMIANO) Adversarial Natural Language Inference
# 4. Generate Dataset of top1000 results by using a ranker (Faiss)