==================================================

**Project Name:** Neural inverted index for fast and effective information retrieval\
**Course:** Deep Learning\
**University:** Sapienza Università di Roma

**Authors:**
  - [Alessio Borgi] (<tt>1952442</tt>)
  - [Eugenio Bugli] (<tt>1934824</tt>)
  - [Damiano Imola] (<tt>2109063</tt>)

**Date:** [November 2024 - Completion Date]

==================================================

## 0: INSTALL & IMPORT LIBRARIES

In [1]:
!pip install pyserini==0.12.0
!pip install pytorch-lightning transformers datasets torch

Collecting pyserini==0.12.0
  Downloading pyserini-0.12.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pyjnius>=1.2.1 (from pyserini==0.12.0)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading pyserini-0.12.0-py3-none-any.whl (67.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.5/67.5 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyjnius, pyserini
Successfully installed pyjnius-1.6.1 pyserini-0.12.0
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading 

In [2]:
import torch
import numpy as np
import pytorch_lightning as pl
import torch.nn.functional as F
from datasets import load_dataset
from pytorch_lightning import Trainer
from torch.utils.data import DataLoader, Dataset
from sklearn.cluster import AgglomerativeClustering
from transformers import AutoModel, AutoTokenizer, AutoTokenizer, AutoModelForSequenceClassification




## 1: DOWNLOADING DATASET




In [3]:
# Load the MS MARCO (100K) Dataset -- train split
ms_marco = load_dataset("microsoft/ms_marco", "v1.1", split="train")

# Display a sample
print(ms_marco[0])

# Training subset: 80,000 examples from a seeded shuffle of the train split.
train_data = ms_marco.shuffle(seed=42).select(range(80000))

# Validation subset: taken from the dataset's own validation split.
# Selecting it from the same seeded shuffle of the train split (as before) made it the
# first 8,000 rows of train_data, i.e. every validation example was also trained on.
validation_data = (
    load_dataset("microsoft/ms_marco", "v1.1", split="validation")
    .shuffle(seed=42)
    .select(range(8000))
)


README.md:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/175M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/10047 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/82326 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9650 [00:00<?, ? examples/s]

{'answers': ['Results-Based Accountability is a disciplined way of thinking and taking action that communities can use to improve the lives of children, youth, families, adults and the community as a whole.'], 'passages': {'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0], 'passage_text': ["Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site.", "The Reserve Bank of Australia (RBA) came into being on 14 January 1960 as Australia 's central bank and banknote issuing authority, when the Reserve Bank Act 1959 removed the central banking functions from the Commonw

In [5]:
# Load the MS MARCO (100K) Dataset -- train split
ms_marco = load_dataset("microsoft/ms_marco", "v1.1", split="train")

# Split into train and validation.
# Training subset: 80,000 examples from a seeded shuffle of the train split.
train_data = ms_marco.shuffle(seed=42).select(range(80000))

# Validation subset: taken from the dataset's own validation split so that it is disjoint
# from train_data. Selecting both from the same seeded shuffle of the train split made
# the validation rows a subset of the training rows.
validation_data = (
    load_dataset("microsoft/ms_marco", "v1.1", split="validation")
    .shuffle(seed=42)
    .select(range(8000))
)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                          | Params | Mode
---------------------------------------------------------------
0 | model | BertForSequenceClassification | 109 M  | eval
---------------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.93

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:
Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [None]:
# PyTorch Dataset class
class MSMARCODataset(Dataset):
    """Map-style dataset of tokenized (query, first passage) pairs.

    Each example is expected to expose ``"query"`` and a ``"passages"`` mapping with
    parallel ``"passage_text"`` and ``"is_selected"`` sequences. Only the entry at
    position 0 of those sequences is used.

    Args:
        data: Indexable collection of examples.
        tokenizer: Callable that tokenizes a (query, passage) pair.
        max_length: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and a binary ``label`` for example ``idx``."""
        example = self.data[idx]
        query_text = example["query"]
        first_passage = example["passages"]["passage_text"][0]
        first_is_selected = example["passages"]["is_selected"][0]

        # Encode the pair as a single fixed-length sequence.
        encoded = self.tokenizer(
            query_text,
            first_passage,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer returns a leading batch dimension of size 1; drop it.
        sample = {
            key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")
        }
        # Label is 1 when the first passage is flagged as selected, 0 otherwise.
        sample["label"] = torch.tensor(1 if first_is_selected else 0, dtype=torch.long)
        return sample


In [None]:
# PyTorch Lightning Data Module
class MSMarcoDataModule(pl.LightningDataModule):
    """Lightning data module that wraps the train/validation subsets in ``MSMARCODataset``.

    Args:
        train_data: Examples used for training.
        validation_data: Examples used for validation.
        tokenizer: Tokenizer handed to every ``MSMARCODataset``.
        batch_size: Batch size used by both dataloaders.
    """

    def __init__(self, train_data, validation_data, tokenizer, batch_size=32):
        super().__init__()
        self.train_data = train_data
        self.validation_data = validation_data
        self.tokenizer = tokenizer
        self.batch_size = batch_size

    def setup(self, stage=None):
        """Build the tokenizing datasets; ``stage`` is accepted but not used."""
        self.train_dataset = MSMARCODataset(self.train_data, self.tokenizer)
        self.val_dataset = MSMARCODataset(self.validation_data, self.tokenizer)

    def _loader(self, dataset, shuffle):
        """Create a ``DataLoader`` over ``dataset`` with the configured batch size."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    def train_dataloader(self):
        """Return the shuffled training dataloader."""
        return self._loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the validation dataloader (no shuffling)."""
        return self._loader(self.val_dataset, shuffle=False)



## 4: MODEL

In [None]:
class MSMarcoClassifier(pl.LightningModule):
    """Binary relevance classifier over tokenized (query, passage) pairs.

    Wraps a Hugging Face sequence-classification model with two output labels and
    optimizes cross-entropy against the ``"label"`` entry of each batch.

    Args:
        model_name: Checkpoint name passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        learning_rate: Learning rate given to the AdamW optimizer.
    """

    def __init__(self, model_name="bert-base-uncased", learning_rate=2e-5):
        super().__init__()
        self.save_hyperparameters()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=2
        )
        # from_pretrained() hands the network back in eval mode (dropout disabled), and
        # the recorded Lightning model summary lists this submodule with mode "eval".
        # Switch it to train mode explicitly so fine-tuning runs with dropout enabled;
        # Lightning still toggles eval/train around validation.
        self.model.train()
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return its output object (exposes ``.logits``)."""
        return self.model(input_ids=input_ids, attention_mask=attention_mask)

    def _step(self, batch, metric_name):
        """Compute the cross-entropy loss for ``batch`` and log it under ``metric_name``."""
        outputs = self(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
        loss = F.cross_entropy(outputs.logits, batch["label"])
        self.log(metric_name, loss, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch (logged as ``train_loss``)."""
        return self._step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for one batch (logged as ``val_loss``)."""
        return self._step(batch, "val_loss")

    def configure_optimizers(self):
        """Optimize every parameter with AdamW at the configured learning rate."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)


## 5: TRAINING

In [None]:
# Initialize the data module
data_module = MSMarcoDataModule(train_data, validation_data, tokenizer)

# Initialize the model
model = MSMarcoClassifier()

# Initialize the Trainer.
# - `pl.Trainer` is referenced explicitly because a later cell in this notebook imports
#   a different `Trainer` from `transformers`, which would shadow the Lightning one if
#   cells are run out of order.
# - `devices` is always 1: the previous CPU fallback passed `devices=None`, which
#   Lightning 2.x does not accept (its default is "auto").
trainer = pl.Trainer(
    max_epochs=3,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    enable_progress_bar=True
)
# Train the model.
trainer.fit(model, data_module)

# OLD

## I-PYSERINI INSPECTION

In [None]:
from pyserini.search import get_topics

# Fetch the MS MARCO passage dev-subset topics and report how many were returned.
topics = get_topics('msmarco-passage-dev-subset')
print('{} queries total'.format(len(topics)))

In [None]:
from pyserini.search import SimpleSearcher

# Open the prebuilt MS MARCO passage index.
searcher = SimpleSearcher.from_prebuilt_index('msmarco-passage')

# Run a sample query against the index.
hits = searcher.search('What is machine learning?')

# Print each hit's 1-based rank, document id and score, followed by its raw content.
for rank, hit in enumerate(hits, start=1):
    print(f"Rank {rank}: {hit.docid} - {hit.score}")
    print(hit.raw)

## 2: BERT EMBEDDING

In [None]:

# Load a pre-trained model for embeddings
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Generate embeddings for documents
def embed_text(text):
    """Embed ``text`` as the mean of the model's last-layer token vectors.

    Args:
        text: Input passed straight to the module-level ``tokenizer`` (a string, or a
            batch of strings, which the tokenizer pads to a common length).

    Returns:
        A NumPy array holding the pooled embedding(s), with size-1 dimensions squeezed
        out.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Weight each token by its attention mask so padding positions (present whenever a
    # batch of unequal-length texts is embedded) do not dilute the average. For a single
    # text the mask is all ones and this is the plain mean over tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    pooled = (hidden * mask).sum(dim=1) / token_counts
    return pooled.squeeze().numpy()

## Model Implementation

### T5 Transformer

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, TrainerCallback

# Checkpoint shared by the T5 model and its tokenizer.
model_name = "t5-base"

# Load the seq2seq model and the matching tokenizer, caching downloads under ./cache.
# Note: this rebinds the notebook-level `model` and `tokenizer` names.
model = T5ForConditionalGeneration.from_pretrained(model_name, cache_dir='cache')
tokenizer = T5Tokenizer.from_pretrained(model_name, cache_dir='cache')

### Bert (12 layers)
Used to generate the docid embeddings.

In [None]:
!pip install transformers

In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Load the pre-trained BERT encoder together with its matching tokenizer.
# Note: this rebinds the notebook-level `model` and `tokenizer` names.
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Example sentence used to inspect the tokenizer output.
text = "Transformers are powerful models for NLP tasks."

# Put the model in evaluation mode before running any inference with it.
model.eval()

# Tokenize the sentence into PyTorch tensors and show the encoded input.
inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
print(inputs)

### Inputs2Target

In [None]:
class IndexingTrainDataset(Dataset):
    """Dataset of (document text -> docid) pairs for the indexing task.

    Records are read from JSON file(s); each record must provide a ``'text'`` field and
    a ``'docid'`` field.

    Args:
        path_to_data: Path(s) to the JSON data, forwarded as ``data_files``.
        max_length: Length the document text is padded/truncated to.
        cache_dir: Cache directory forwarded to ``load_dataset``.
        tokenizer: Tokenizer applied to both the document text and the docid.
    """

    # The special methods must use double underscores; with `_init_` / `_getitem_`
    # the class could neither be constructed with arguments nor indexed.
    def __init__(self, path_to_data, max_length, cache_dir, tokenizer):
        super().__init__()

        # `load_dataset` is the name the notebook imports (the `datasets` module itself
        # is not imported). The deprecated `ignore_verifications=False` argument is
        # dropped: False was its default, so the same checks still run.
        self.train_data = load_dataset(
            'json',
            data_files=path_to_data,
            cache_dir=cache_dir
        )['train']

        self.max_length = max_length
        self.tokenizer = tokenizer
        self.total_len = len(self.train_data)

    def __len__(self):
        """Return the number of documents."""
        return self.total_len

    def __getitem__(self, idx):
        """Return ``input_ids``/``attention_mask`` for the text and ``labels`` for the docid."""
        # Retrieve document data (stored on `self.train_data`; `self.data` never existed).
        doc = self.train_data[idx]
        doc_text = doc['text']
        docid = doc['docid']

        # Tokenize input (document text)
        source = self.tokenizer(
            doc_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Tokenize target (docid). Cast to str so numeric ids are accepted as well.
        target = self.tokenizer(
            str(docid),
            max_length=10,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Prepare input-output pair. squeeze(0) removes only the batch dimension, so a
        # length-1 sequence dimension is never collapsed by accident.
        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': target['input_ids'].squeeze(0)
        }

### Training

In [None]:
# Configure and launch training through a custom `IndexingTrainer`.
# NOTE(review): `IndexingTrainer`, `IndexingCollator`, `QueryEvalCallback`,
# `compute_metrics`, `restrict_decode_vocab`, `train_dataset`, `eval_dataset` and
# `test_dataset` are not defined anywhere in the visible part of this notebook, and
# `wandb` is only imported in a later cell -- confirm where they come from before
# running this cell on a fresh kernel.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=0.0005,
    warmup_steps=10000,
    # weight_decay=0.01,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # Evaluate every `eval_steps` optimizer steps.
    evaluation_strategy='steps',
    eval_steps=1000,
    # Training length is bounded by step count.
    max_steps=1000000,
    dataloader_drop_last=False,  # necessary
    # Metrics are reported to Weights & Biases every `logging_steps` steps.
    report_to='wandb',
    logging_steps=50,
    # No checkpoints are written during training.
    save_strategy='no',
    # fp16=True,  # gives 0/nan loss at some point during training, seems this is a transformers bug.
    dataloader_num_workers=10,
    # gradient_accumulation_steps=2
)

# `model` and `tokenizer` are whichever objects the most recently executed cell bound to
# those names.
trainer = IndexingTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Batches are padded to the longest sequence in each batch.
    data_collator=IndexingCollator(
        tokenizer,
        padding='longest',
    ),
    compute_metrics=compute_metrics,
    callbacks=[QueryEvalCallback(test_dataset, wandb, restrict_decode_vocab, training_args, tokenizer)],
    restrict_decode_vocab=restrict_decode_vocab
)

trainer.train()

### Training (from GPT)

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import wandb

# Start a Weights & Biases (W&B) run that the training metrics are logged to
wandb.init(project="DSI-Training")

# 1. Load the Pre-trained T5 Model and Tokenizer
# (rebinds the notebook-level `model` and `tokenizer` names)
model_name = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# 2. Prepare the Dataset
class IndexingTrainDataset(torch.utils.data.Dataset):
    """Dataset of (document text -> docid) pairs for seq2seq indexing.

    Args:
        data: Indexable collection of records with ``'text'`` and ``'docid'`` fields.
        tokenizer: Tokenizer applied to both the document text and the docid.
        max_length: Length the document text is padded/truncated to.
    """

    # The special methods must use double underscores; with `_init_` / `_len_` /
    # `_getitem_` the class could not be constructed with arguments, sized or indexed.
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``/``attention_mask`` for the text and ``labels`` for the docid."""
        item = self.data[idx]
        doc_text = item['text']
        docid = item['docid']

        # Tokenize the document text (input)
        source = self.tokenizer(
            doc_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Tokenize the document ID (target). Cast to str so numeric ids are accepted.
        target = self.tokenizer(
            str(docid),
            max_length=10,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # squeeze(0) removes only the batch dimension added by return_tensors='pt'.
        labels = target['input_ids'].squeeze(0)

        # Replace label padding with -100 so the loss ignores padded positions; the
        # stock Trainer used below does not do this itself.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is not None:
            labels = labels.masked_fill(labels == pad_id, -100)

        # Prepare input-output pair
        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': labels
        }

# Load your dataset (e.g., Natural Questions)
# NOTE: "path/to/your/dataset" is a placeholder and must be replaced before running.
# NOTE: `train_data` is also the name of the MS MARCO training subset defined earlier in
# this notebook; running this cell rebinds it.
dataset = load_dataset("path/to/your/dataset")
train_data = IndexingTrainDataset(dataset['train'], tokenizer)
eval_data = IndexingTrainDataset(dataset['validation'], tokenizer)

# 3. Define Training Arguments
training_args = TrainingArguments(
    output_dir="./dsi_checkpoints",
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_steps=1000,
    save_total_limit=2,
    report_to="wandb"  # Enable logging to W&B
)

# 4. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer
)

try:
    # 5. Start Training
    trainer.train()

    # 6. Save the Fine-tuned Model
    model.save_pretrained("./fine_tuned_dsi")
    tokenizer.save_pretrained("./fine_tuned_dsi")
finally:
    # 7. End Logging with W&B
    # Runs even when training is interrupted or raises, so the W&B run is always closed.
    wandb.finish()