# Imports and setup

In [None]:
!pip install transformers sentence-transformers datasets cohere pinecone

In [None]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

# Part 3 - RAG Implementation



## Accessing APIs

In [None]:
def _read_api_key(path):
    """Return the text stored in the file at `path`, stripped of surrounding whitespace."""
    with open(path) as key_file:
        return key_file.read().strip()


# Keys live in local text files so they never appear in the notebook itself.
# NOTE(review): "chohere" looks like a misspelling of "cohere" - confirm the
# file name on disk before renaming it here.
COHERE_API_KEY = _read_api_key("Assignment 1/chohere_api_key.txt")
PINECONE_API_KEY = _read_api_key("Assignment 1/pinecone_api_key.txt")

## Embedding Model

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers checkpoint loaded on the next line.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# Encoder instance; the RAG cell later passes it to `augment_prompt` as `model=model`.
model = SentenceTransformer(EMBEDDING_MODEL)

In [None]:

from datasets import load_dataset
from sentence_transformers import SentenceTransformer

def load_and_embedd_dataset(
    dataset_name: str = 'natural_questions',
    split: str = 'train',
    model: SentenceTransformer = None,
    text_field: str = 'context',
    rec_num: int = 400
) -> tuple:
    """
    Load a dataset, drop records whose text is duplicated, and embed the first
    `rec_num` remaining texts with a sentence-transformer model.

    Args:
        dataset_name: The name of the dataset passed to `load_dataset`
        split: The split of the dataset to load
        model: The model to use for embedding. When None, an
            'all-MiniLM-L6-v2' SentenceTransformer is created on demand
            (instead of at function-definition time).
        text_field: The field in the dataset that contains the text. It is used
            both for de-duplication and for embedding.
        rec_num: The number of de-duplicated records to embed
    Returns:
        tuple: `(dataset, embeddings)`. `dataset` is the complete de-duplicated
        dataset, ordered by sorted text value (the order `np.unique` returns).
        `embeddings` is the tensor produced by
        `model.encode(..., convert_to_tensor=True)` for its first `rec_num`
        rows, so row `i` of `embeddings` belongs to record `i` of `dataset`.
    """
    print("Loading and embedding the dataset")

    # Build the default encoder only when the caller did not supply one.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load the dataset
    dataset = load_dataset(dataset_name, split=split)

    # Keep the first record for every distinct text. De-duplicate on the
    # caller-selected column rather than a hard-coded 'context', so the
    # function also works for datasets whose text lives in another field.
    texts = dataset[text_field]
    _, first_occurrence = np.unique(texts, return_index=True)
    dataset = dataset.select(first_occurrence.tolist())

    # Embed the first `rec_num` rows of the de-duplicated dataset
    embeddings = model.encode(dataset[text_field][:rec_num], convert_to_tensor=True)

    print("Done!")
    return dataset, embeddings



In [None]:
# Example usage
DATASET_NAME = 'squad'
SPLIT = 'validation'

# Reuse the encoder created in the "Embedding Model" cell instead of loading
# the same 'all-MiniLM-L6-v2' checkpoint a second time. This also guarantees
# that passages and queries are embedded by the very same model object.
MODEL = model

dataset, embeddings = load_and_embedd_dataset(
    dataset_name=DATASET_NAME,
    split=SPLIT,
    model=MODEL,
    text_field='context',  # For SQuAD, 'context' is the field containing the passages
    rec_num=400
)

# Kept as a global: the index-creation cell reads the embedding width from shape[1].
shape = embeddings.shape

Loading and embedding the dataset
Done!


In [None]:
# Convert the de-duplicated dataset returned above into a pandas DataFrame for inspection.
pd_dataset = dataset.to_pandas()
# Request up to 500 rows; the rendered table below elides the middle rows.
pd_dataset.head(500)

Unnamed: 0,id,title,context,question,answers
0,5725c604271a42140099d185,Apollo_program,Seamans' establishment of an ad-hoc committee...,Who led the committee established by Seaman?,"{'text': ['Nicholas E. Golovin', 'Nicholas E. ..."
1,57267c63dd62a815002e86d6,Newcastle_upon_Tyne,"""Bairn"" and ""hyem"", meaning ""child"" and ""home""...","""Bairn"" and ""hyem"" have origins from what cult...","{'text': ['Scandinavia', 'Scandinavia', 'Scand..."
2,572805363acd2414000df26d,Doctor_Who,"""Official"" reconstructions have also been rele...",Who has released official reconstructions of D...,"{'text': ['the BBC', 'BBC', 'BBC'], 'answer_st..."
3,5705edcd52bb8914006896ca,Southern_California,"""Southern California"" is not a formal geograph...","Geographically speaking, where is California's...","{'text': ['37° 9' 58.23""', '37° 9' 58.23""', '3..."
4,57302bd0b2c2fd14005689db,Islamism,"""The Islamic State"", formerly known as the ""Is...",What type of group is The Islamic State?,{'text': ['Wahhabi/Salafi jihadist extremist m...
...,...,...,...,...,...
495,572758c3dd62a815002e9b78,Genghis_Khan,"Genghis Khan, the title is spelled in variety ...",What is the Mongolian spelling of Genghis Khan?,"{'text': ['Chinggis Khaan', 'Chinggis Khaan', ..."
496,57308cf88ab72b1400f9c576,Imperialism,Geographical theories such as environmental de...,Which theory suggested people in the tropics w...,"{'text': ['environmental determinism', 'enviro..."
497,572677e7708984140094c723,Geology,"Geologists use a number of field, laboratory, ...",What is petrology?,"{'text': ['the study of rocks', 'study of rock..."
498,57300911947a6a140053cfb6,Rhine,Germanic tribes crossed the Rhine in the Migra...,When did Germanic tribes cross the Rhine to mi...,"{'text': ['5th century', 'Migration period', '..."


In [None]:
# Sanity check: number of embedded records and the embedding width.
print(f"The embeddings shape: {embeddings.shape}")

The embeddings shape: torch.Size([400, 384])


## LLM without RAG

Create hard queries for the LLM:


In [None]:
import cohere

query1 = "what can the process of squaring a num can be reduced to"
query2 = "{xx | x is any binary string} can be solved in what time?"
query3 = "give me two well-known complexity resources"

# Build the Cohere client once instead of once per query; the RAG cell further
# down reuses this same `co` object.
co = cohere.Client(api_key=COHERE_API_KEY)

# Baseline: send every query to the LLM as-is, without any retrieved context.
for query in [query1, query2, query3]:
    response = co.chat(
        model='command-r-plus',
        message=query,
    )
    print(response.text,'\n','______________________','\n')

The process of squaring a number can be reduced to:

- Multiplying the number by itself.
- Using exponent rules to rewrite the expression.
- Applying special cases for squaring certain types of numbers.

For example, let's consider the number $a$:

- The square of $a$ is denoted as $a^2$, which means multiplying $a$ by itself: $a^2 = a \times a$.
- Using exponent rules, we can also express $a^2$ as $(a)^2 = a^{2-1} \times a^1 = a^1 \times a = a \times a$.
- For special cases, squaring a number like $5$ can be quickly calculated as $5^2 = 25$.

So, squaring a number involves either direct multiplication or applying exponent rules and special cases to simplify the expression. 
 ______________________ 

The given regular expression is {xx | x is any binary string}. To solve this, we can break it down as follows:

- "xx" represents a pair of binary digits, which can be either 00, 01, 10, or 11.
- "|" is the alternation operator, indicating that we have a choice between the options before a

## RAG Pipeline

### VectorDB
We will use Pinecone's free-to-use vector database.

In [None]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        cloud: str = 'aws',
        region: str = 'us-east-1',
):
    """
    Create a serverless pinecone index if one with this name does not exist yet.

    Args:
        index_name: The name of the index
        dimension: The dimension of the vectors the index will store
        metric: The similarity metric to use for the index
        cloud: The cloud provider for the serverless index (default keeps the
            previously hard-coded 'aws')
        region: The cloud region for the serverless index (default keeps the
            previously hard-coded 'us-east-1')
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    """
    # `Pinecone` and `ServerlessSpec` come from the notebook's top-level import
    # cell; no function-local import is needed.
    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Only create the index when it is missing, so re-running the cell is harmless.
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(
                cloud=cloud,
                region=region
            )
        )
    print("Done!")
    return pc

In [None]:
INDEX_NAME = 'squad-index'

# Make sure the vector database exists. Its dimension must equal the embedding
# width, i.e. the second entry of the embeddings shape computed earlier.
pc = create_pinecone_index(index_name=INDEX_NAME, dimension=shape[1])

Creating a Pinecone index...
Done!


Now that we have created the vector database, let's add some data to it!

In [None]:
def upsert_vectors(
        index: Pinecone,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'context',  # Adjust based on your dataset structure
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index

    Vector `i` is stored under the id `str(i)` with the metadata
    `{text_field: dataset[text_field][i]}`.

    Args:
        index: The pinecone index object
        embeddings: The embeddings to upsert; any 2-D object exposing `.shape`
            and `.tolist()` works
        dataset: The dataset containing the metadata; `dataset[text_field]`
            must hold at least as many entries as there are embeddings
        text_field: The field in the dataset that contains the metadata
        batch_size: The batch size to use for upserting
    Returns:
        The same pinecone index object, after the upserts
    Raises:
        IndexError: If the dataset has fewer records than there are embeddings
    """
    print("Upserting the embeddings to the Pinecone index...")
    num_vectors = embeddings.shape[0]

    # Read the text column once. Looking up dataset[text_field] for every
    # record repeats the column access num_vectors times (for a Hugging Face
    # `Dataset` this presumably re-materializes the column each time - TODO confirm).
    texts = dataset[text_field][:num_vectors]
    if len(texts) < num_vectors:
        raise IndexError(
            f"dataset[{text_field!r}] has {len(texts)} records but {num_vectors} embeddings were given"
        )

    ids = [str(i) for i in range(num_vectors)]
    meta = [{text_field: text} for text in texts]

    # Convert the embeddings to nested lists of plain Python numbers
    embeddings_list = embeddings.tolist()

    # create list of (id, vector, metadata) tuples to be upserted
    to_upsert = list(zip(ids, embeddings_list, meta))

    # Slicing clamps at the end of the list, so the last batch may be shorter.
    for start in tqdm(range(0, num_vectors, batch_size)):
        index.upsert(vectors=to_upsert[start:start + batch_size])
    return index




In [None]:
# Example usage
INDEX_NAME = 'squad-index'  # Same index name that was created above

# `embeddings` is the tensor returned by `load_and_embedd_dataset`, and
# `dataset` is the de-duplicated dataset whose first rows were embedded.

# Open a handle to the index through the client returned by
# `create_pinecone_index`, instead of constructing a second Pinecone client.
index = pc.Index(INDEX_NAME)

# Upsert the embeddings to the Pinecone index
index_upserted = upsert_vectors(index, embeddings, dataset)


Upserting the embeddings to the Pinecone index...


100%|██████████| 4/4 [00:02<00:00,  1.70it/s]


Let's view the index statistics!

In [None]:
# Show the index statistics (dimension, fullness, per-namespace and total vector counts).
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 400}},
 'total_vector_count': 400}

### Final RAG pipeline

In [None]:
def augment_prompt(
        query: str,
        model: SentenceTransformer = None,
        index=None,
        k=3,
        text_field: str = 'context',
) -> tuple:
    """
    Augment the prompt with the top `k` results from the knowledge base
    Args:
        query: The query to augment
        model: The SentenceTransformer model for encoding queries; it should be
            the model that produced the vectors stored in `index`. When None,
            an 'all-MiniLM-L6-v2' model is loaded on first use and cached.
        index: The Pinecone index object (required, despite the None default)
        k: The number of nearest matches to retrieve
        text_field: The metadata key under which each match stores its text
    Returns:
        tuple: `(augmented_prompt, source_knowledge)` - the prompt to send to
        the LLM, and the retrieved texts joined by blank lines
    Raises:
        ValueError: If no index is given
    """
    if index is None:
        raise ValueError("augment_prompt requires a Pinecone index, got index=None")

    if model is None:
        # Load the default encoder lazily (not at definition time) and keep it
        # on the function object so later default calls reuse it.
        if not hasattr(augment_prompt, "_default_model"):
            augment_prompt._default_model = SentenceTransformer('all-MiniLM-L6-v2')
        model = augment_prompt._default_model

    print("Augmenting the prompt with knowledge base results...")

    # Embed the query and convert it to a plain list of Python floats
    query_vector = model.encode(query).tolist()

    # get the top k results from the knowledge base; only the metadata is
    # used below, so the stored vectors themselves are not requested
    query_results = index.query(
        vector=query_vector,
        top_k=k,
        include_values=False,
        include_metadata=True
    )['matches']

    # get the text from the results
    text_matches = [match['metadata'][text_field] for match in query_results]
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""

    return augmented_prompt, source_knowledge


Using RAG to improve LLM response to hard queries:

In [None]:

query1 = "what can the process of squaring a num can be reduced to?"
query2 = "{xx | x is any binary string} can be solved in what time?"
query3 = "give me two well-known complexity resources"

queries = [query1, query2, query3]
# Number of passages to retrieve for each query, paired by position with `queries`.
ks = [5000, 10, 1000]

# Same questions as the baseline, now answered from the retrieved contexts.
for query, k in zip(queries, ks):
    augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index, k=k)
    response = co.chat(model='command-r-plus', message=augmented_prompt)
    print(response.text,'\n','______________________','\n')

Augmenting the prompt with knowledge base results...
The process of squaring a number can be reduced to repeated multiplication. 
 ______________________ 

Augmenting the prompt with knowledge base results...
The context states that the language {xx | x is any binary string} can be solved in linear time on a multi-tape Turing machine. 
 ______________________ 

Augmenting the prompt with knowledge base results...
The two most well-known complexity resources are time and space. 
 ______________________ 

