In [None]:
# Importing necessary classes and libraries from the Transformers library
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration, RagModel, RagSequenceForGeneration

# Importing pandas library for working with data frames
import pandas as pd

# Importing numpy library for numerical operations
import numpy as np

# Importing SentenceTransformer for creating sentence embeddings
from sentence_transformers import SentenceTransformer

# Importing Dataset class from datasets library
from datasets import Dataset

# Importing torch library for PyTorch operations
import torch as torch

# Importing specific classes and functions from datasets
from datasets import Features, Sequence, Value, load_dataset

# Importing partial function for creating partial functions
from functools import partial

# Importing List and Optional types from typing module
from typing import List, Optional

# Importing specific classes from Transformers library
from transformers import (
    DPRContextEncoder,
    DPRContextEncoderTokenizerFast,
    RagRetriever,
    RagSequenceForGeneration,
    RagTokenizer,
)

In [None]:
# Read a JSONL file(tot corpus jsonl file) into a Pandas DataFrame
df = pd.read_json("./corpus.jsonl", lines=True)

# Select specific columns from the DataFrame
selected_columns = ["page_title", "text", "page_source"]

# Combine 'text' and 'page_source' columns into a single 'text' column
df['text'] = df['text'].str.cat(df['page_source'], sep=' ')

# Create a new DataFrame with the selected columns
df_t = df.loc[:, selected_columns]

# Create a copy of the DataFrame
new_df = df_t.copy()

# Rename the 'page_title' column to 'title'
new_df.rename(columns={'page_title': 'title'}, inplace=True)

# Save the new DataFrame to a CSV file for ref
new_df.to_csv("./my_knowledge_dataset.csv")

In [None]:
def split_text(text: str, n=100, character=" ") -> List[str]:
    """
    Split the text every ``n``-th occurrence of ``character``.

    Args:
        text: The input text to be split.
        n (int): The number of occurrences before splitting.
        character (str): The character to split the text on.

    Returns:
        List[str]: A list of split text passages.
    """
    text = text.split(character)
    return [character.join(text[i : i + n]).strip() for i in range(0, len(text), n)]

def split_documents(documents: dict) -> dict:
    """
    Split documents into passages.

    Args:
        documents (dict): A dictionary containing 'title' and 'text' keys.

    Returns:
        dict: A dictionary with 'title' and 'text' keys containing split passages.
    """
    titles, texts = [], []
    for title, text in zip(documents["title"], documents["text"]):
        if text is not None:
            for passage in split_text(text):
                titles.append(title if title is not None else "")
                texts.append(title + " is the film name and its description is" + passage)
    return {"title": titles, "text": texts}

def custom_embeddings_in_batches(texts, titles, batch_size=8, max_length=512):
    """
    Generate embeddings for texts in batches using DPRContextEncoder.

    Args:
        texts (list): List of text passages.
        titles (list): List of titles corresponding to each text passage.
        batch_size (int): Batch size for processing.
        max_length (int): Maximum length of the input.

    Returns:
        numpy.ndarray: Concatenated embeddings of the input texts.
    """
    ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-multiset-base").to("cuda")
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-multiset-base")

    embeddings_list = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_titles = titles[i:i+batch_size]

        input_ids = ctx_tokenizer(
            batch_texts, padding="longest", return_tensors="pt", max_length=max_length, truncation=True
        )["input_ids"].to("cuda")

        with torch.no_grad():
            embeddings = ctx_encoder(input_ids, return_dict=True).pooler_output.cpu().numpy()

        embeddings_list.append(embeddings)

    return np.concatenate(embeddings_list, axis=0)

# Split the documents into passages
updated_cols = split_documents(new_df)

# Create a new DataFrame with updated columns
t_df = pd.DataFrame()
t_df["title"] = updated_cols["title"]
t_df["text"] = updated_cols["text"]

# Clear GPU cache
torch.cuda.empty_cache()

# Generate and save custom embeddings for the text passages
embeddings = custom_embeddings_in_batches(t_df["text"].values.tolist(), t_df["title"].values.tolist(), batch_size=8)
torch.save(embeddings, f"./custom_embeddings.pth")


In [None]:

loaded_embeddings = torch.load(f"./custom_embeddings.pth")

In [None]:
# Add the embeddings column to the DataFrame
t_df['embeddings'] = embeddings.tolist()

# Create a Dataset object from the Pandas DataFrame
datasets_obj = Dataset.from_pandas(t_df)

# Save the Dataset object to disk
datasets_obj.save_to_disk('./dataset')

# Add a FAISS index to the 'embeddings' column
datasets_obj.add_faiss_index("embeddings")

# Save the FAISS index to disk
datasets_obj.get_index("embeddings").save('./index.faiss')

In [None]:
# Initialize a RagRetriever from pretrained model
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",
    index_name="custom",
    passages_path="./dataset",
    index_path="./index.faiss",
    indexed_dataset=datasets_obj
)

# Initialize a RagTokenizer from pretrained model
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")

# Initialize a RagSequenceForGeneration from pretrained model
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)


In [None]:
# Encode the input question using the tokenizer
input_dict = tokenizer.question_encoder(
    "Movie from the early 2000s I believe about three people living in an apartment but never running into each other",
    return_tensors="pt"
).to("cuda")

# Move the model to GPU
model.to("cuda")

# Clear GPU cache
torch.cuda.empty_cache()

# Generate an answer using the model
generated = model.generate(input_ids=input_dict["input_ids"])

# Print the generated answer
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])


In [1]:
# Encode the input question using the tokenizer
input_dict = tokenizer.question_encoder(
    "name of a movie where a group of super heroes fighting together",
    return_tensors="pt"
).to("cuda")

# Move the model to GPU
model.to("cuda")

# Clear GPU cache
torch.cuda.empty_cache()

# Generate an answer using the model
generated = model.generate(input_ids=input_dict["input_ids"])


# Print the generated answers
print(tokenizer.batch_decode(generated, skip_special_tokens=True))


NameError: name 'tokenizer' is not defined