In [None]:
import numpy as np
import torch
from datasets import load_dataset

# Split configuration (kept together so a reader can tune it in one place)
SEED = 42
N_TRAIN = 80000
N_VALIDATION = 8000

# Load the MS MARCO dataset.
# The Hub id is "microsoft/ms_marco"; the previous "microsoft/msmarco" spelling
# does not appear to resolve to a dataset repository.
ms_marco = load_dataset("microsoft/ms_marco", "v2.1", split="train")

# Display a sample
print(ms_marco[0])

# Shuffle once and carve out two *disjoint* index ranges. Shuffling twice with
# the same seed and selecting range(8000) would make the validation rows a
# subset of the training rows (evaluation on seen data).
shuffled = ms_marco.shuffle(seed=SEED)
train_data = shuffled.select(range(N_TRAIN))
validation_data = shuffled.select(range(N_TRAIN, N_TRAIN + N_VALIDATION))



In [None]:
from sklearn.cluster import AgglomerativeClustering
from transformers import AutoModel, AutoTokenizer

# Checkpoint that supplies both the tokenizer and the encoder used to embed documents
ENCODER_CHECKPOINT = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(ENCODER_CHECKPOINT)
model = AutoModel.from_pretrained(ENCODER_CHECKPOINT)

# Generate embeddings for documents
def embed_text(text):
    """Embed text as the mean of the encoder's last hidden states.

    Uses the notebook-level ``tokenizer`` and ``model`` that are bound at the
    time of the call.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array: one vector for a single string (the batch axis is
        dropped), or one row per string for a batch of several strings.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Run on whatever device the encoder lives on (CPU unless it was moved).
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Average over real tokens only: with padding=True, shorter texts in a
    # batch carry pad positions that would otherwise dilute the mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    # squeeze(0) drops only a size-1 batch axis, never a feature axis.
    return pooled.squeeze(0).cpu().numpy()

# Number of semantic clusters; each cluster id doubles as a document identifier
N_CLUSTERS = 100


def _passage_text(doc):
    """Return the passage string for one record, whatever its layout."""
    # Flat layouts: a record with a single passage, or a record this cell has
    # already converted (so re-running the cell does not raise KeyError).
    for key in ('passage', 'text'):
        if key in doc:
            return doc[key]
    # NOTE(review): MS MARCO v2.1 rows appear to nest their candidates as
    # doc['passages'] = {'passage_text': [...], 'is_selected': [...], ...};
    # confirm against the sample printed when the dataset was loaded.
    passages = doc['passages']
    texts = passages['passage_text']
    if not texts:
        return ''
    # Prefer the passage marked as selected; fall back to the first candidate.
    for candidate, selected in zip(texts, passages.get('is_selected') or []):
        if selected:
            return candidate
    return texts[0]


train_texts = [_passage_text(doc) for doc in train_data]
validation_texts = [_passage_text(doc) for doc in validation_data]

# Create embeddings
embeddings = np.stack([embed_text(text) for text in train_texts])

# Apply clustering
# NOTE(review): agglomerative clustering without a connectivity constraint
# needs memory that grows quadratically with the number of documents; verify
# it fits in RAM at this sample size or subsample before clustering.
clustering = AgglomerativeClustering(n_clusters=N_CLUSTERS)
labels = clustering.fit_predict(embeddings)

# Assign semantically structured identifiers
ssids = [f"Cluster_{label}" for label in labels]
train_data = [{'text': text, 'docid': ssid} for text, ssid in zip(train_texts, ssids)]

# Give validation documents the same {'text', 'docid'} layout so the dataset
# wrapper and the evaluation loop can read them. AgglomerativeClustering has no
# predict(), so each validation document takes the id of the nearest training
# cluster centroid.
validation_embeddings = np.stack([embed_text(text) for text in validation_texts])
centroids = np.stack([embeddings[labels == k].mean(axis=0) for k in range(N_CLUSTERS)])
# argmin_k ||v - c_k||^2 == argmin_k (||c_k||^2 - 2 v.c_k); ||v||^2 is constant per row.
centroid_scores = (centroids ** 2).sum(axis=1)[None, :] - 2.0 * (validation_embeddings @ centroids.T)
validation_labels = centroid_scores.argmin(axis=1)
validation_data = [
    {'text': text, 'docid': f"Cluster_{label}"}
    for text, label in zip(validation_texts, validation_labels)
]

In [None]:
from torch.utils.data import Dataset

class DSIInputs2TargetDataset(Dataset):
    """Map-style dataset pairing document text (input) with its identifier (target).

    Each record in ``data`` must be a mapping with a ``'text'`` string and a
    ``'docid'`` string. Both are tokenized with the same tokenizer and padded
    to a fixed length.

    Args:
        data: Indexable collection of ``{'text': ..., 'docid': ...}`` records.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors='pt'``.
        max_length: Fixed token length of the encoded document text.
        max_target_length: Fixed token length of the encoded identifier.
    """

    def __init__(self, data, tokenizer, max_length=128, max_target_length=10):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for record ``idx``."""
        record = self.data[idx]

        # Tokenize input (document text)
        source = self.tokenizer(
            record['text'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Tokenize target (structured identifier)
        target = self.tokenizer(
            record['docid'],
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Padding positions in the target are set to -100, the index the
        # cross-entropy loss ignores; otherwise the model is also trained to
        # predict pad tokens and the loss is dominated by padding.
        labels = target['input_ids'].squeeze(0)
        labels = labels.masked_fill(target['attention_mask'].squeeze(0) == 0, -100)

        # squeeze(0) removes only the batch axis added by return_tensors='pt';
        # an axis-less squeeze() would also collapse a length-1 sequence.
        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': labels
        }


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

# Seq2seq model that learns to emit a document's identifier from its text.
# Gotcha: `tokenizer` and `model` are rebound here, so any notebook-level code
# that reads those names after this point gets the T5 objects.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Wrap both splits with the same tokenizer
train_dataset, eval_dataset = (
    DSIInputs2TargetDataset(split, tokenizer)
    for split in (train_data, validation_data)
)

# Hyper-parameters, grouped by concern
# NOTE(review): the `evaluation_strategy` keyword (and `tokenizer=` on Trainer
# below) are spelled differently in newer transformers releases; confirm
# against the installed version.
evaluation_schedule = dict(evaluation_strategy="steps", eval_steps=500)
batch_sizes = dict(per_device_train_batch_size=8, per_device_eval_batch_size=8)
optimisation = dict(num_train_epochs=3, learning_rate=2e-5, weight_decay=0.01)

training_args = TrainingArguments(
    output_dir="./dsi_output",
    logging_dir="./logs",
    **evaluation_schedule,
    **batch_sizes,
    **optimisation,
)

# Build the Trainer and run fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)
trainer.train()


In [None]:
from sklearn.metrics import accuracy_score

def evaluate(model, dataset, tokenizer=None):
    """Compute and print Hits@1: the share of records whose generated id matches.

    Args:
        model: Generation model exposing ``eval()``, ``generate()`` and ``device``.
        dataset: Either an iterable of ``{'text': ..., 'docid': ...}`` records,
            or a wrapper exposing such records as ``dataset.data`` (as
            ``DSIInputs2TargetDataset`` does).
        tokenizer: Tokenizer used to encode the text and decode the output.
            Defaults to ``dataset.tokenizer`` when present, otherwise to the
            notebook-level ``tokenizer``.

    Returns:
        Hits@1 as a fraction in ``[0, 1]``; ``0.0`` for an empty dataset.
    """
    if tokenizer is None:
        tokenizer = getattr(dataset, 'tokenizer', None)
    if tokenizer is None:
        tokenizer = globals()['tokenizer']  # original behaviour: notebook-level tokenizer

    # Indexing the dataset wrapper yields tensors without 'text'/'docid', so
    # read the raw records it wraps; plain record lists are used as they are.
    records = getattr(dataset, 'data', dataset)
    # Use the same truncation length as training when the wrapper declares one.
    max_length = getattr(dataset, 'max_length', None)

    model.eval()
    hits_at_1 = 0
    total = len(records)

    for record in records:
        inputs = tokenizer(
            record['text'],
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        # After training the model may sit on an accelerator; inputs must follow it.
        inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model.generate(**inputs)
        predicted_docid = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if predicted_docid == record['docid']:
            hits_at_1 += 1

    # Guard against ZeroDivisionError on an empty dataset.
    accuracy = hits_at_1 / total if total else 0.0
    print(f"Hits@1 Accuracy: {accuracy * 100:.2f}%")
    return accuracy

# Score the current model on the evaluation dataset (prints Hits@1)
evaluate(model=model, dataset=eval_dataset)
