==================================================

**Project Name:** Neural inverted index for fast and effective information retrieval\
**Course:** Deep Learning\
**University:** Sapienza Università di Roma

**Authors:**
  - [Alessio Borgi] (<tt>1952442</tt>)
  - [Eugenio Bugli] (<tt>1934824</tt>)
  - [Damiano Imola] (<tt>2109063</tt>)

**Date:** [November 2024 - Completion Date]

==================================================

## Install & Import Libraries

In [None]:
%%capture
!pip install pyserini==0.12.0

import numpy as np

## Dataset (MS Marco 'Passage')

In [None]:
# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

In [None]:
from pyserini.search import get_topics

# Load the topic set that Pyserini registers under the name
# 'msmarco-passage-dev-subset' and report how many entries it contains.
# NOTE(review): `topics` is presumably a mapping keyed by query id — confirm
# against the installed Pyserini version.
topics = get_topics('msmarco-passage-dev-subset')
print(f'{len(topics)} queries total')

Download the pre-built searcher, used to gather the top-K hits for a given 'title' (i.e., the query text of a topic, not a document).

In [None]:
from pyserini.search import SimpleSearcher

# Create a searcher backed by the prebuilt index named 'msmarco-passage'.
# NOTE(review): the index is presumably fetched on first use and needs a
# working Java runtime (see the commented-out JAVA_HOME cell) — confirm.
searcher = SimpleSearcher.from_prebuilt_index('msmarco-passage')

## Model Implementation

### T5 Transformer

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, TrainerCallback

# Checkpoint identifier shared by the tokenizer and the model below.
model_name = "t5-base"

# Load the pretrained T5 tokenizer and conditional-generation model; both are
# given cache_dir='cache' so downloaded files are kept in that local directory.
tokenizer = T5Tokenizer.from_pretrained(model_name, cache_dir='cache')
model = T5ForConditionalGeneration.from_pretrained(model_name, cache_dir='cache')

### BERT (12 layers)
Used to generate the docid embeddings.

In [None]:
!pip install transformers

In [1]:
import torch
from transformers import BertTokenizer, BertModel

# Load pre-trained BERT model and tokenizer ('bert-base-uncased' checkpoint).
# NOTE(review): this rebinds the notebook-level names `tokenizer` and `model`,
# which the T5 cell also assigns. When cells run top to bottom, every later
# cell that uses `tokenizer` / `model` sees the BERT objects, not T5 — confirm
# this is intended or use distinct names.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [2]:
# Set model to evaluation mode
model.eval()

# Tokenize a single example sentence. return_tensors='pt' asks for PyTorch
# tensors; truncation and padding are enabled, although with one short
# sentence the printed output below shows no padding tokens were added.
text = "Transformers are powerful models for NLP tasks."
inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

# Display tokenized input (input_ids, token_type_ids, attention_mask)
print(inputs)

{'input_ids': tensor([[  101, 19081,  2024,  3928,  4275,  2005, 17953,  2361,  8518,  1012,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


### Inputs2Target

In [None]:
from torch.utils.data import Dataset


class IndexingTrainDataset(Dataset):
    """Map-style dataset of (document text -> docid) pairs for indexing.

    Records are read from JSON file(s); each record must provide a ``'text'``
    field (the document) and a ``'docid'`` field (its identifier). Both are
    tokenized with the same tokenizer and returned as fixed-length tensors.

    Args:
        path_to_data: Path (or paths) handed to ``datasets.load_dataset`` as
            ``data_files``.
        max_length: Maximum number of tokens for the document text; shorter
            inputs are padded and longer ones truncated to this length.
        cache_dir: Directory used by ``datasets`` to cache the loaded data.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keyword arguments.
        max_target_length: Maximum number of tokens for the docid target.
            Defaults to 10, the value previously hard-coded.
    """

    def __init__(self, path_to_data, max_length, cache_dir, tokenizer,
                 max_target_length=10):
        super().__init__()

        # Imported locally so this cell does not rely on an earlier cell
        # having imported the Hugging Face `datasets` package.
        import datasets

        # `ignore_verifications=False` was the default and the keyword has
        # been removed from recent `datasets` releases, so it is not passed.
        self.train_data = datasets.load_dataset(
            'json',
            data_files=path_to_data,
            cache_dir=cache_dir,
        )['train']

        self.max_length = max_length
        self.max_target_length = max_target_length
        self.tokenizer = tokenizer
        self.total_len = len(self.train_data)

    def __len__(self):
        """Return the number of records in the training split."""
        return self.total_len

    def __getitem__(self, idx):
        """Return the tokenized input/label tensors for record ``idx``.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` for the
            document text and ``'labels'`` holding the tokenized docid.
        """
        # Retrieve document data
        doc = self.train_data[idx]
        doc_text = doc['text']
        # Docids may be stored as integers; the tokenizer needs text.
        docid = str(doc['docid'])

        # Tokenize input (document text)
        source = self.tokenizer(
            doc_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Tokenize target (docid)
        # NOTE(review): padded label positions keep the tokenizer's pad id.
        # Hugging Face seq2seq losses only ignore -100, so confirm that the
        # collator (or the trainer) masks them if that is required.
        target = self.tokenizer(
            docid,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch dimension added by return_tensors='pt'.
        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': target['input_ids'].squeeze(0)
        }

### Training

In [None]:
# Configure and launch training with a custom trainer.
# NOTE(review): IndexingTrainer, IndexingCollator, QueryEvalCallback,
# compute_metrics, restrict_decode_vocab, train_dataset, eval_dataset and
# test_dataset are not defined in the visible part of this notebook, and
# `wandb` is not imported before this cell — they must be provided before
# this cell can run.
# NOTE(review): when cells run top to bottom, `model` and `tokenizer` were last
# bound to the BERT objects, not T5 — confirm which model is meant here.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=0.0005,
    warmup_steps=10000,
    # weight_decay=0.01,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # Evaluate every `eval_steps` optimisation steps.
    evaluation_strategy='steps',
    eval_steps=1000,
    max_steps=1000000,
    dataloader_drop_last=False,  # necessary
    report_to='wandb',
    logging_steps=50,
    # No checkpoints are written during training.
    save_strategy='no',
    # fp16=True,  # gives 0/nan loss at some point during training, seems this is a transformers bug.
    dataloader_num_workers=10,
    # gradient_accumulation_steps=2
)

trainer = IndexingTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Batches are padded to the longest sequence they contain.
    data_collator=IndexingCollator(
        tokenizer,
        padding='longest',
    ),
    compute_metrics=compute_metrics,
    callbacks=[QueryEvalCallback(test_dataset, wandb, restrict_decode_vocab, training_args, tokenizer)],
    # Custom keyword of IndexingTrainer; presumably limits which tokens may be
    # generated when decoding docids — confirm against its definition.
    restrict_decode_vocab=restrict_decode_vocab
)

trainer.train()

### Training (from GPT)

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import wandb

# Initialize Weights & Biases (W&B) for logging under the "DSI-Training" project
wandb.init(project="DSI-Training")

# 1. Load the Pre-trained T5 Model and Tokenizer
# This rebinds `model_name`, `tokenizer` and `model` at notebook level; unlike
# the earlier T5 cell, no cache_dir is passed here.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# 2. Prepare the Dataset
class IndexingTrainDataset(torch.utils.data.Dataset):
    """Map-style dataset of (document text -> docid) pairs for indexing.

    Args:
        data: Indexable, sized collection of records; each record must
            provide a ``'text'`` field and a ``'docid'`` field.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keyword arguments.
        max_length: Maximum number of tokens for the document text; shorter
            inputs are padded and longer ones truncated to this length.
        max_target_length: Maximum number of tokens for the docid target.
            Defaults to 10, the value previously hard-coded.
    """

    def __init__(self, data, tokenizer, max_length=128, max_target_length=10):
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of records in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized input/label tensors for record ``idx``.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` for the
            document text and ``'labels'`` holding the tokenized docid.
        """
        item = self.data[idx]
        doc_text = item['text']
        # Docids may be stored as integers; the tokenizer needs text.
        docid = str(item['docid'])

        # Tokenize the document text (input)
        source = self.tokenizer(
            doc_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Tokenize the document ID (target)
        # NOTE(review): padded label positions keep the tokenizer's pad id.
        # Hugging Face seq2seq losses only ignore -100, so confirm whether
        # these positions should be masked before training.
        target = self.tokenizer(
            docid,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch dimension added by return_tensors='pt'.
        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': target['input_ids'].squeeze(0)
        }

# Load your dataset (e.g., Natural Questions)
# NOTE: "path/to/your/dataset" is a placeholder and must be replaced. The
# loaded object is indexed with 'train' and 'validation' below, and the
# dataset wrapper reads the 'text' and 'docid' fields of each record.
dataset = load_dataset("path/to/your/dataset")
train_data = IndexingTrainDataset(dataset['train'], tokenizer)
eval_data = IndexingTrainDataset(dataset['validation'], tokenizer)

# 3. Define Training Arguments
training_args = TrainingArguments(
    output_dir="./dsi_checkpoints",
    # Evaluate every `eval_steps` optimisation steps.
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Checkpoint every 1000 steps, keeping at most `save_total_limit` of them.
    save_steps=1000,
    save_total_limit=2,
    report_to="wandb"  # Enable logging to W&B
)

# 4. Initialize Trainer
# No data_collator is passed, so the Trainer's default collation is used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer
)

# 5. Start Training
trainer.train()

# 6. Save the Fine-tuned Model
# Model weights and tokenizer files go to the same directory.
model.save_pretrained("./fine_tuned_dsi")
tokenizer.save_pretrained("./fine_tuned_dsi")

# 7. End Logging with W&B
wandb.finish()