In [1]:
from datasets import load_dataset, Dataset
import torch
from transformers import AutoTokenizer, AutoModel, RagTokenizer, RagRetriever, RagSequenceForGeneration
import faiss
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the SQuAD v2 dataset
dataset = load_dataset("squad_v2")

# Split the dataset into train and validation sets
train_dataset = dataset['train']

# Select a subset of the dataset
contexts = [example["context"] for example in train_dataset.select(range(1000))]
questions = [example["question"] for example in train_dataset.select(range(1000))]


In [3]:
# Load a pre-trained model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to get embeddings
def embed_texts(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings

# Embed the contexts
context_embeddings = embed_texts(contexts).numpy()

In [5]:
# Print dimensions for debugging
print("Embedding dimension:", context_embeddings.shape)

Embedding dimension: (1000, 384)


In [6]:
len(contexts)

1000

In [7]:
# Normalize embeddings for cosine similarity
context_embeddings /= np.linalg.norm(context_embeddings, axis=1, keepdims=True)

In [8]:
# Create a new dataset with required columns
data_dict = {
    "title": [""] * len(contexts),  # Dummy titles as SQuAD does not provide titles
    "text": contexts,
    "embeddings": context_embeddings.tolist()  # Convert to list for serialization
}
new_dataset = Dataset.from_dict(data_dict)

# Save the new dataset to disk
dataset_path = "squad_v2_embeddings_v1"
new_dataset.save_to_disk(dataset_path)


Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 250690.57 examples/s]


In [9]:
# Create and save the FAISS index
distance_type = "Cosine"
index_path = "faiss_index_v1.faiss"
dimension = context_embeddings.shape[1]

# Create the FAISS index
if distance_type == "L2":
    faiss_index = faiss.IndexFlatL2(dimension)
elif distance_type == "Cosine":
    faiss_index = faiss.IndexFlatIP(dimension)

# Add the context embeddings to the index
faiss_index.add(context_embeddings)
faiss.write_index(faiss_index, index_path)

print(faiss_index.ntotal)

1000


In [16]:
# Load RAG components
generator_model_name = "facebook/rag-sequence-nq"
rag_tokenizer = RagTokenizer.from_pretrained(generator_model_name)
rag_model = RagSequenceForGeneration.from_pretrained(generator_model_name)

# Initialize RAG retriever
rag_retriever = RagRetriever.from_pretrained(
    generator_model_name,
    index_name="custom",
    passages_path=dataset_path,
    index_path=index_path,
    use_dummy_dataset=False
)

# Associate the retriever with the RAG model
rag_model.set_retriever(rag_retriever)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [11]:
faiss_index.is_trained

True

In [61]:
context_embeddings.shape

(1000, 384)

In [17]:

def generate_answer(question):
    inputs = rag_tokenizer(question, return_tensors="pt")
    with torch.no_grad():
        result = rag_model.generate(input_ids=inputs["input_ids"], num_return_sequences=1)
    return rag_tokenizer.decode(result[0], skip_special_tokens=True)

# Example question
print(generate_answer("What is the capital of France?"))

AssertionError: 

In [13]:
dimension = 384
faiss_index = faiss.IndexFlatIP(dimension)
assert faiss_index.d == dimension, f"Expected dimension {dimension}, but got {faiss_index.d}"


In [5]:
from datasets import load_dataset

# Load the SQuAD v2 dataset
dataset = load_dataset("squad_v2")

# Split the dataset into train and validation sets
train_dataset = dataset['train']
validation_dataset = dataset['validation']

Downloading readme: 100%|██████████| 8.92k/8.92k [00:00<00:00, 6.66MB/s]
Downloading data: 100%|██████████| 16.4M/16.4M [00:01<00:00, 10.2MB/s]
Downloading data: 100%|██████████| 1.35M/1.35M [00:00<00:00, 2.44MB/s]
Generating train split: 100%|██████████| 130319/130319 [00:00<00:00, 1539811.04 examples/s]
Generating validation split: 100%|██████████| 11873/11873 [00:00<00:00, 1878922.86 examples/s]


In [18]:
contexts = [example for example in train_dataset[:1000]["context"]]
questions = [example for example in train_dataset[:1000]["question"]]

In [19]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load a pre-trained model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to get embeddings
def embed_texts(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings

# Embed the contexts
context_embeddings = embed_texts(contexts)

In [23]:
context_embeddings.shape[1]

384

In [40]:
# Create a new dataset with required columns
data_dict = {
    "title": [""] * len(contexts),  # Dummy titles as SQuAD does not provide titles
    "text": contexts,
    "embeddings": context_embeddings.tolist()  # Convert to list for serialization
}
new_dataset = Dataset.from_dict(data_dict)

# Save the new dataset to disk
dataset_path = "squad_v2_embeddings"
new_dataset.save_to_disk(dataset_path)

Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 114180.43 examples/s]


In [None]:
import faiss

# Save the FAISS index to disk
distance_type = "Cosine"
index_path = "faiss_index.faiss"
dimension = context_embeddings.shape[1]

# Create the FAISS index
if distance_type == "L2":
    faiss_index = faiss.IndexFlatL2(dimension)
elif distance_type == "Cosine":
    faiss_index = faiss.IndexFlatIP(dimension)

# Add the context embeddings to the index
faiss_index.add(context_embeddings)
faiss.write_index(faiss_index, index_path)

In [31]:
from transformers import AutoTokenizer, AutoModel, RagTokenizer, RagRetriever, RagSequenceForGeneration

In [41]:
generator_model_name = "facebook/rag-token-nq"
rag_tokenizer = RagTokenizer.from_pretrained(generator_model_name)
rag_model = RagSequenceForGeneration.from_pretrained(generator_model_name)
# Load the dataset and index paths into the retriever
rag_retriever = RagRetriever.from_pretrained(
    generator_model_name,
    index_name="custom",
    passages_path=dataset_path,
    index_path=index_path,
    use_dummy_dataset=False
)
# Associate the retriever with the RAG model
rag_model.set_retriever(rag_retriever)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [42]:
# Ensure retriever is set
assert rag_model.retriever == rag_retriever, "Retriever is not set correctly."

In [43]:
# Function to generate answers using RAG
def generate_answer(question):
    inputs = rag_tokenizer(question, return_tensors="pt")
    with torch.no_grad():
        result = rag_model.generate(input_ids=inputs["input_ids"], num_return_sequences=1)
    return rag_tokenizer.decode(result[0], skip_special_tokens=True)

In [44]:
# Generate answers
for question in questions[:10]:  # Use a subset for demonstration
    answer = generate_answer(question)
    print(f"Question: {question}\nAnswer: {answer}\n")

AssertionError: 

In [None]:
from langchain.chains import RagChain
from langchain.llms import HuggingFaceLLM
from langchain.retrievers import FaissRetriever
from langchain.tokenizers import HuggingFaceTokenizer

# Create the retriever
retriever = FaissRetriever(index=index, texts=contexts, embed_fn=embed_texts)

# Create the LLM (generator)
generator_model_name = "facebook/rag-token-nq"
llm = HuggingFaceLLM(generator_model_name)

# Create the tokenizer for the generator model
tokenizer = HuggingFaceTokenizer(generator_model_name)

# Create the RAG chain
rag_chain = RagChain(
    retriever=retriever,
    llm=llm,
    tokenizer=tokenizer
)
