# Retrieval Augmented Generation (RAG) model

[link to documentation](https://huggingface.co/docs/transformers/main/en/model_doc/rag#transformers.RagModel)

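RAG is a different type of model: it combines a separate retriever, which fetches relevant passages from a knowledge dataset, with a seq2seq model that generates the answer from them.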

[Paper](https://arxiv.org/pdf/2005.11401.pdf)

In [1]:
# Install the runtime dependencies into the kernel's environment: PyTorch (from
# the pytorch-nightly channel), the CPU build of FAISS, and the Hugging Face
# `datasets` / `transformers` libraries (only `transformers` is version-pinned).
# NOTE(review): the recorded output of this cell shows both %conda commands
# failing with "unrecognized arguments" - confirm the packages are really
# installed before relying on this cell.
%conda install pytorch torchvision torchaudio -c pytorch-nightly
%conda install -c pytorch faiss-cpu
%pip install datasets transformers==4.28.0

usage: conda [-h] [-V] command ...
conda: error: unrecognized arguments: pytorch torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.
usage: conda [-h] [-V] command ...
conda: error: unrecognized arguments: faiss-cpu

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
# solving my dependency issues
# Installs `chardet` through conda and force-reinstalls (with upgrade) the
# `charset-normalizer` package through pip.
# NOTE(review): this cell has no execution count (In [None]) - presumably it is
# an optional, machine-specific workaround; confirm whether a clean environment
# needs it.
%conda install chardet
%pip install -U --force-reinstall charset-normalizer

In [1]:
import logging
import os
from dataclasses import dataclass, field
from functools import partial
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import List, Optional

import torch
from datasets import Features, Sequence, Value, load_dataset

import faiss
from transformers import (
    DPRContextEncoder,
    DPRContextEncoderTokenizerFast,
    HfArgumentParser,
    RagRetriever,
    RagSequenceForGeneration,
    RagTokenizer,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Module-level logger used for the "Step N - ..." banners in the cells below.
logger = logging.getLogger(__name__)
# Without any configuration the effective level is WARNING, so every
# ``logger.info(...)`` banner would be silently dropped. Install a root handler
# (other libraries stay at WARNING) and let this notebook's logger emit INFO.
logging.basicConfig(level=logging.WARNING)
logger.setLevel(logging.INFO)

# Switch autograd off globally: no gradients are recorded for any tensor
# operation executed after this point.
torch.set_grad_enabled(False)


def split_text(text: str, n=100, character=" ") -> List[str]:
    """Cut ``text`` into chunks of at most ``n`` ``character``-separated pieces.

    The text is split on ``character``; every run of ``n`` consecutive pieces
    is glued back together with ``character`` and stripped of surrounding
    whitespace. The last chunk may hold fewer than ``n`` pieces.
    """
    pieces = text.split(character)
    chunks = []
    for start in range(0, len(pieces), n):
        window = pieces[start : start + n]
        chunks.append(character.join(window).strip())
    return chunks


def split_documents(documents: dict) -> dict:
    """Expand a batch of documents into a batch of passages.

    ``documents`` holds parallel ``"title"`` and ``"text"`` lists. Each text is
    cut into passages with ``split_text``; every passage is paired with its
    document's title (``""`` when the title is ``None``). Documents whose text
    is ``None`` contribute nothing.
    """
    pairs = [
        ("" if title is None else title, passage)
        for title, text in zip(documents["title"], documents["text"])
        if text is not None
        for passage in split_text(text)
    ]
    return {
        "title": [title for title, _ in pairs],
        "text": [passage for _, passage in pairs],
    }


def embed(
    documents: dict,
    ctx_encoder: DPRContextEncoder,
    ctx_tokenizer: DPRContextEncoderTokenizerFast,
) -> dict:
    """Compute the DPR embeddings of document passages.

    Args:
        documents: batch with parallel ``"title"`` and ``"text"`` lists.
        ctx_encoder: DPR context encoder used to embed the passages.
        ctx_tokenizer: tokenizer matching ``ctx_encoder``.

    Returns:
        ``{"embeddings": array}`` where ``array`` is the encoder's
        ``pooler_output`` for the batch, moved to the CPU and converted to
        NumPy.
    """
    # Titles and texts are passed to the tokenizer together; inputs are
    # truncated and padded to the longest one in the batch.
    input_ids = ctx_tokenizer(
        documents["title"],
        documents["text"],
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )["input_ids"]
    # Run on whatever device the encoder lives on instead of reading a
    # module-level ``device`` variable that is only defined in a later cell.
    embeddings = ctx_encoder(
        input_ids.to(device=ctx_encoder.device), return_dict=True
    ).pooler_output
    return {"embeddings": embeddings.detach().cpu().numpy()}

In [5]:
######################################
logger.info("Step 1 - Create the dataset")
######################################

# RAG needs a knowledge dataset with three columns:
#   title      (string)               - title of the document
#   text       (string)               - text of a passage of the document
#   embeddings (array of dimension d) - DPR representation of the passage
# This cell reads "title" and "text" from a CSV file.

csv_path = "../../data/data-wiki.csv"

# Load the comma-separated file as the "train" split, naming the two columns
# explicitly.
# NOTE(review): because column names are supplied here, the file presumably
# has no header row - confirm.
dataset = load_dataset(
    "csv",
    split="train",
    data_files=[csv_path],
    column_names=["title", "text"],
    delimiter=",",
)

# The datasets documentation has more on loading csv files:
# https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

Downloading and preparing dataset csv/default to /Users/egabasova/.cache/huggingface/datasets/csv/default-f8c7785d3f9a1ddb/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 4215.38it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 457.89it/s]
                                                        

Dataset csv downloaded and prepared to /Users/egabasova/.cache/huggingface/datasets/csv/default-f8c7785d3f9a1ddb/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.




In [10]:
# Break every document into passages (split_text's default is 100 words each).
# NOTE: the notebook author marked these map settings as changed.
dataset = dataset.map(split_documents, batch_size=50, num_proc=1, batched=True)

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the DPR context encoder checkpoint and its tokenizer.
dpr_ctx_encoder_model_name = "facebook/dpr-ctx_encoder-multiset-base"
ctx_encoder = DPRContextEncoder.from_pretrained(dpr_ctx_encoder_model_name)
ctx_encoder = ctx_encoder.to(device=device)
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
    dpr_ctx_encoder_model_name
)

# Optional explicit schema: stores the embeddings as float32 instead of
# float64 to save space.
new_features = Features(
    {
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32")),
    }
)

# Embed the passages in batches of 16 and add the result as "embeddings".
dataset = dataset.map(
    partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
    features=new_features,
    batch_size=16,
    batched=True,
)

# Persist the embedded passages to disk.
passages_path = os.path.join("../../data", "faiss_wiki_knowledge_dataset")
dataset.save_to_disk(passages_path)
# To reload the dataset later:
# from datasets import load_from_disk
# dataset = load_from_disk(passages_path)


[A
[AThe tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
Downloading pytorch_model.bin:   0%|          | 0.00/438M [04:54<?, ?B/s]
Downloading pytorch_model.bin:   0%|          | 0.00/438M [04:47<?, ?B/s]
                                                                                             

In [11]:
# =================================================
# Checkpoint: read the passages dataset back from the directory it was saved
# to, presumably so the splitting / embedding cell need not be re-run.
# =================================================
from datasets import load_from_disk

passages_path = os.path.join("../../data", "faiss_wiki_knowledge_dataset")
dataset = load_from_disk(passages_path)  # replaces the in-memory dataset

In [12]:
######################################
logger.info("Step 2 - Index the dataset")
######################################

# File the index is written to once it has been built.
index_path = os.path.join(
    "../../data/",
    "faiss_wiki_knowledge_dataset_hnsw_index.faiss",
)

# HNSW arguments for FAISS
d = 768  # dimensionality of the embedding
m = 128  # bi-directional links created for every new element at build time

# Faiss' HNSW implementation gives fast approximate nearest neighbor search;
# the index is built over the "embeddings" column with the inner-product metric.
index = faiss.IndexHNSWFlat(d, m, faiss.METRIC_INNER_PRODUCT)
dataset.add_faiss_index("embeddings", custom_index=index)

# Save the index to disk.
dataset.get_index("embeddings").save(index_path)
# To reload the index later:
# dataset.load_faiss_index("embeddings", index_path)

100%|██████████| 1/1 [00:00<00:00, 51.74it/s]


In [14]:
# Load the saved FAISS index file back onto the dataset under the name
# "embeddings".
index_path = os.path.join(
    "../../data/",
    "faiss_wiki_knowledge_dataset_hnsw_index.faiss",
)
dataset.load_faiss_index("embeddings", index_path)

In [15]:
######################################
logger.info("Step 3 - Load RAG")
######################################

# NOTE(review): the checkpoint is the "rag-token" variant while the model class
# below is RagSequenceForGeneration - confirm this pairing is intended.
rag_model_name = "facebook/rag-token-nq"

tokenizer = RagTokenizer.from_pretrained(rag_model_name)

# Easy way to load the model: give the retriever the in-memory indexed dataset.
retriever = RagRetriever.from_pretrained(
    rag_model_name,
    indexed_dataset=dataset,
    index_name="custom",
)
model = RagSequenceForGeneration.from_pretrained(rag_model_name, retriever=retriever)

# For distributed fine-tuning the dataset and the index are loaded separately,
# so pass their paths instead:
# retriever = RagRetriever.from_pretrained(rag_model_name, index_name="custom", passages_path=passages_path, index_path=index_path)

Downloading (…)lve/main/config.json: 100%|██████████| 4.60k/4.60k [00:00<00:00, 11.2MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 546kB/s]
Downloading (…)_tokenizer/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.10MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 1.40MB/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
Downloading (…)okenizer_config.json: 100%|███

In [20]:
######################################
logger.info("Step 4 - Have fun")
######################################

# Alternative questions (commented out):
# question = "What is the new starter checklist? "
# question = "What are regular events at the Turing?"
question = "Some of the regular events organised by REG are"

# Encode the question, generate, and decode the first generated sequence.
input_ids = tokenizer.question_encoder(question, return_tensors="pt")["input_ids"]
generated = model.generate(input_ids)
generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

print(f"Q: {question}")
print(f"A: {generated_string}")

Q: Some of the regular events organised by REG are
A:  hack week


In [19]:
# Print the full list of decoded generations, not just the first entry.
# NOTE(review): this cell's execution count (19) is lower than that of the
# generation cell above it (20) and its saved output differs from that cell's
# answer, so `generated` here came from an earlier run - re-run to refresh.
print(tokenizer.batch_decode(generated, skip_special_tokens=True))

[' all - reg meeting']
