Source:
- https://huggingface.co/learn/cookbook/en/advanced_rag
- https://arc.net/l/quote/vntkseji

- Flare:
  - https://ayushtues.medium.com/flare-advanced-rag-implemented-from-scratch-07ca75c89800

In [None]:
# https://arc.net/l/quote/vntkseji
# https://huggingface.co/learn/cookbook/en/advanced_rag

# Flare+T5: https://ayushtues.medium.com/flare-advanced-rag-implemented-from-scratch-07ca75c89800
!pip install -q torch transformers transformers accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl

In [None]:
pip install -U "transformers==4.38.0" --upgrade

In [None]:
!pip install unstructured

In [None]:
!pip install torch

In [None]:
# fix colab error: https://stackoverflow.com/questions/56081324/why-are-google-colab-shell-commands-not-working
import locale


def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" unconditionally, ignoring `do_setlocale`.

    Drop-in replacement for `locale.getpreferredencoding`; the argument is
    accepted only so the call signature stays compatible.
    """
    return "UTF-8"


# Install the override on the locale module (Colab shell-command workaround).
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [None]:
!pip install ragatouille

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt

pd.set_option(
    "display.max_colwidth", None
)

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
%cd drive/MyDrive/ANLP
!ls

### Load your knowledge base

In [None]:
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:

webpages = 'data/webpages'
# https://python.langchain.com/docs/modules/data_connection/document_loaders/file_directory

loader = DirectoryLoader('data/webpages/', glob="**/*.txt")
docs = loader.load()
len(docs)

# 1. Retriever - embeddings

### 1.1 Split the documents into chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

EMBEDDING_MODEL_NAME = "thenlper/gte-base"


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    chunk_overlap: Optional[int] = None,
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.

    Args:
        chunk_size: Maximum chunk length, measured with the tokenizer named
            by `tokenizer_name`.
        knowledge_base: Documents to split.
        chunk_overlap: Overlap between consecutive chunks. When omitted,
            one tenth of `chunk_size` (rounded down) is used.
        tokenizer_name: Name of the Hugging Face tokenizer used to measure
            chunk length.

    Returns:
        The chunks in the order they were produced, keeping only the first
        chunk for each distinct `page_content`.
    """
    # The fallback has to be a plain int. A trailing comma here would turn it
    # into a one-element tuple and break the splitter's overlap handling.
    if chunk_overlap is None:
        chunk_overlap = int(chunk_size / 10)

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
        strip_whitespace=True,
    )

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicates: keep the first chunk seen for every distinct text.
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)

    return docs_processed_unique


In [None]:
# Chunk the loaded knowledge base: at most 512 tokens per chunk with an
# overlap of 50, measured with the embedding model's tokenizer.
docs_processed = split_documents(
    chunk_size = 512,
    chunk_overlap = 50,
    knowledge_base = docs,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)


# Sanity check: tokenize every chunk with the embedding model's tokenizer and
# plot a histogram of the resulting token counts.
# NOTE: this binds the module-level name `tokenizer`, which the reader-model
# cell later rebinds to the T5 tokenizer.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
lengths = [len(tokenizer.encode(doc.page_content)) for doc in tqdm(docs_processed)]
fig = pd.Series(lengths).hist()
plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
plt.show()

### 1.2 Building the vector database

##### Nearest Neighbor search algorithm

[FAISS](https://github.com/facebookresearch/faiss)

##### Distances
For an overview of the distances used to compare embeddings, see [this guide](https://osanseviero.github.io/hackerllama/blog/posts/sentence_embeddings/#distance-between-embeddings).
- **Cosine similarity** computes the similarity between two vectors as the cosine of the angle between them: it lets us compare vector directions regardless of their magnitude. Using it requires normalizing all vectors, i.e. rescaling them to unit norm.
- **Dot product** takes into account magnitude, with the sometimes undesirable effect that increasing a vector's length will make it more similar to all others.
- **Euclidean distance** is the distance between the ends of vectors.

In [5]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# Embedding model name; the same value is used to pick the chunking tokenizer.
EMBEDDING_MODEL_NAME = "thenlper/gte-base"

# Sentence-embedding wrapper used both to index the chunks and to embed queries.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    # model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  #  True for cosine similarity
)

# Build the FAISS index over the chunked documents using cosine distance.
# Requires `docs_processed` from the chunking cell to exist in the session,
# otherwise this raises NameError.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

NameError: name 'docs_processed' is not defined

In [None]:
KNOWLEDGE_VECTOR_DATABASE.save_local("faiss_index")

In [7]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Recreate the embedding model with the same settings used to build the index,
# so the saved index can be queried without rebuilding it.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    # model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  #  True for cosine similarity
)
# Load the index previously written with `save_local("faiss_index")`.
# NOTE(review): some langchain releases presumably require an explicit
# opt-in flag to deserialize a saved index - confirm against the installed version.
new_db = FAISS.load_local("faiss_index", embedding_model)
# Smoke test: fetch the 3 nearest chunks for a sample question.
# NOTE: this rebinds the module-level name `docs`, replacing the list returned
# by the DirectoryLoader cell with a list of plain strings.
docs = new_db.similarity_search("When is Andrew Carnegie' birthday?", k=3)
docs = [doc.page_content for doc in docs]
docs

['History -\n\nCMU - Carnegie Mellon University\n\nCarnegie Mellon University\n\n— — —\n\nAndrew Carnegie\n\nA self-educated "working boy" who loved books, Andrew Carnegie emigrated from Scotland in 1848 and settled in Pittsburgh, Pa. Attending night school and borrowing books, Carnegie went from factory worker in a textile mill to successful entrepreneur and industrialist. He rose to prominence by founding what became the world\'s largest steel producing company by the end of the 19th century.\n\nCarnegie Technical Schools\n\nAt one point the richest man in the world, Carnegie believed that "to die rich is to die disgraced." He turned his attention to writing, social activism and philanthropy, determined to establish educational opportunities for the general public where few existed.\n\nIn 1900, he donated $1 million for the creation of a technical institute for the city of Pittsburgh, envisioning a school where working-class men and women of Pittsburgh could learn practical skills, t

In [None]:
relevant_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query="When is Andrew Carnegie's birthday?", k=3)
relevant_docs = [doc.page_content for doc in relevant_docs]  # keep only the text

In [None]:
relevant_docs

# 2. Reader - LLM

### 2.1. Reader model


In [None]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import T5Tokenizer, T5ForConditionalGeneration
import os

In [None]:
import os

# Access tokens must never be committed to source: read the token from the
# environment instead. The value is None when the variable is not set.
# NOTE: the token that used to be hard-coded on this line has been exposed and
# should be revoked.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

In [None]:
# initialize the LLM and its tokenizer, we are using Flan T5 Large for this
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

# function to get the prediction and scores from the LLM, given a prompt
def get_prediction_and_scores(prompt):
    """Generate a continuation for `prompt` and score each generated token.

    Uses the module-level `tokenizer` and `model`. Generation is capped at
    `max_length=100`.

    Returns:
        A 3-tuple of
        - the decoded text of the first generated sequence (decoded without
          stripping special tokens),
        - the token ids of that sequence,
        - the per-token probabilities, i.e. the exponential of the normalized
          transition scores of that sequence.

    NOTE(review): for an encoder-decoder model the returned id sequence
    presumably starts with a decoder start token, which would make it one
    element longer than the probabilities - confirm before indexing one with
    the other.
    """
    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    generation = model.generate(
        encoded_prompt.input_ids,
        output_scores=True,
        return_dict_in_generate=True,
        max_length=100,
    )
    token_ids = generation.sequences[0]

    # Transition scores are log-probabilities; exponentiate to get probabilities.
    log_probs = model.compute_transition_scores(
        generation.sequences, generation.scores, normalize_logits=True
    )
    token_probs = torch.exp(log_probs[0])

    decoded_text = tokenizer.decode(token_ids)
    return decoded_text, token_ids, token_probs

In [None]:
# Google Gemma

# huggingfacehub_api_token = HUGGINGFACEHUB_API_TOKEN
# quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# model = AutoModelForCausalLM.from_pretrained("google/gemma-2b",
#                                              quantization_config=quantization_config,
#                                              token = huggingfacehub_api_token)
# tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", token= huggingfacehub_api_token)



In [None]:
# # smaller
# name = 'MBZUAI/LaMini-GPT-774M'
# model = AutoModelForCausalLM.from_pretrained(name)
# tokenizer = AutoTokenizer.from_pretrained(name)

In [None]:
# READER_LLM = pipeline(
#     model=model,
#     tokenizer=tokenizer,
#     task="text-generation",
#     do_sample=True,
#     temperature=0.2,
#     repetition_penalty=1.3,
#     return_full_text=False,
#     max_new_tokens=30,
# )

## Re-ranking Retriever

In [None]:
from ragatouille import RAGPretrainedModel

RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

### 2.2. Prompt

The RAG prompt template below is what we will feed to the Reader LLM: it is important to have it formatted in the Reader LLM's chat template.

We give it our context and the user's question.

In [None]:
# prompt_in_chat_format = '''
# <start_of_turn>user
# Instructions for you: Using the information contained in the context,
# give a comprehensive answer to the question.
# Respond only to the question asked, response should be concise and relevant to the question.
# Provide the number of the source document when relevant.
# If the answer cannot be deduced from the context, do not give an answer <end_of_turn>
# <start_of_turn>model
# sounds good!<end_of_turn>
# <start_of_turn>user
# Here is the context {context}
# and the Question: {question}<end_of_turn>
# '''

In [None]:
# prompt_in_chat_format = [
#     {
#         "role": "system",
#         "content": """Using the information contained in the context, give a comprehensive answer to the question.
# Respond only to the question asked, response should be concise and relevant to the question.
# Provide the number of the source document when relevant.
# Give very short answers..
# If the answer cannot be deduced from the context, do not give an answer.""",
#     },
#     {
#         "role": "user",
#         "content": """Context:
# {context}
# ---
# Here is the question you need to answer.
# Question: {question}""",
#     },
# ]
# from langchain.prompts import PromptTemplate
# # RAG_PROMPT_TEMPLATE = PromptTemplate(
# #  template=prompt_in_chat_format, input_variables=["context", "question"]
# # )

# RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
#     prompt_in_chat_format, tokenize=False, add_generation_prompt=True)
# print(RAG_PROMPT_TEMPLATE)

In [None]:
from transformers import Pipeline


def answer_with_rag_without_flare(
    question: str,
    # llm: Pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 5,
    num_docs_final: int = 3,
) -> Tuple[str, List[LangchainDocument]]:
    """Answer `question` with a single retrieve-then-generate pass.

    Args:
        question: The user question.
        knowledge_index: Vector index queried with `similarity_search`.
        reranker: Optional reranker; when given, the retrieved texts are
            reranked and the top `num_docs_final` are kept.
        num_retrieved_docs: Number of chunks fetched from the index.
        num_docs_final: Maximum number of chunks placed in the prompt.

    Returns:
        A pair `(answer, passages)`: `answer` is the question followed by a
        space and the generated text; `passages` is the list of chunk texts
        (plain strings) that were put into the prompt.
    """
    # Gather candidate passages from the retriever, keeping only their text.
    print("=> Retrieving documents...")
    hits = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    passages = [hit.page_content for hit in hits]

    # Optionally let the reranker reorder and trim the candidates.
    if reranker:
        print("=> Reranking documents...")
        reranked = reranker.rerank(question, passages, k=num_docs_final)
        passages = [entry["content"] for entry in reranked]

    passages = passages[:num_docs_final]

    # Assemble the context block: a header followed by the numbered passages.
    numbered = [f"Document {str(idx)}:::\n" + text for idx, text in enumerate(passages)]
    context = "\nExtracted documents:\n" + "".join(numbered)

    prompt = f"Keep your answers short and concise. Given the below context:\n{context}\n\n Answer the following \n{question}\n"

    # Generate the answer and prepend the original question to it.
    print("=> Generating answer...")
    generated_sequence, _, _ = get_prediction_and_scores(prompt)
    answer = f"{question} {generated_sequence}"
    return answer, passages

In [None]:
user_query = 'What are the masters programs in LTI?'
answer, relevant_docs = answer_with_rag_without_flare(
    user_query, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER
)

In [None]:

print("==================================Answer==================================")
print(len(relevant_docs))
print(f"{answer}")

In [None]:

print("==================================Source docs==================================")
for i, doc in enumerate(relevant_docs):
    print(f"Document {i}------------------------------------------------------------")
    print(doc)

In [None]:
user_query = 'What is the Buggy race schedule this year?'

In [None]:
answer, relevant_docs = answer_with_rag_without_flare(
    user_query, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER
)

In [None]:

print("==================================Answer==================================")
print(len(relevant_docs))
print(f"{answer}")

In [None]:

print("==================================Source docs==================================")
for i, doc in enumerate(relevant_docs):
    print(f"Document {i}------------------------------------------------------------")
    print(doc)

## Flare T5
- essentially an extra acceptance step

In [None]:
from transformers import Pipeline


def answer_with_rag_flare(
    input_text: str,
    # llm: Pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 5,
    num_docs_final: int = 3,
    threshold: float = .5,
    max_iterations: int = 10,
) -> Tuple[str, List[LangchainDocument]]:
    """Answer `input_text`, retrieving context only when the model is unsure.

    Each round first generates directly from the running text. If any
    generated token has probability below `threshold`, documents are
    retrieved (and optionally reranked) for the running text and the
    generation is redone with those documents as context. The generated text
    is appended to the running text, and the loop stops once it contains the
    end-of-sequence marker "</s>".

    Args:
        input_text: The user question; it is extended with generated text on
            every round.
        knowledge_index: Vector index queried with `similarity_search`.
        reranker: Optional reranker applied to the retrieved texts.
        num_retrieved_docs: Number of chunks fetched from the index.
        num_docs_final: Maximum number of chunks placed in the prompt.
        threshold: Minimum per-token probability considered confident.
        max_iterations: Upper bound on generation rounds, so the loop cannot
            run forever when "</s>" never shows up (for example when
            generation is cut off by its length limit).

    Returns:
        A pair `(answer, relevant_docs)`: `answer` is the question followed
        by the generated text; `relevant_docs` is the list of chunk texts
        used in the most recent retrieval, or an empty list when no retrieval
        was needed.

    Raises:
        ValueError: If `max_iterations` is smaller than 1.
    """
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    # An empty list (not None) keeps `len(relevant_docs)` safe for callers
    # when every round was confident enough to skip retrieval.
    relevant_docs = []
    for _ in range(max_iterations):
        generated_sequence, _, scores = get_prediction_and_scores(input_text)

        # Retrieval is triggered by the least confident generated token.
        needs_retrieval = scores.numel() > 0 and bool(torch.min(scores) < threshold)
        if needs_retrieval:
            # Gather documents with retriever. The whole running text is used
            # as the search query.
            print("=> Retrieving documents...")
            hits = knowledge_index.similarity_search(query=input_text, k=num_retrieved_docs)
            relevant_docs = [doc.page_content for doc in hits]  # keep only the text

            # Optionally rerank results
            if reranker:
                print("=> Reranking documents...")
                reranked = reranker.rerank(input_text, relevant_docs, k=num_docs_final)
                relevant_docs = [doc["content"] for doc in reranked]

            relevant_docs = relevant_docs[:num_docs_final]

            # Build the final prompt
            context = "\nExtracted documents:\n"
            context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

            new_input_text = f"Keep your answers short and concise. Given the below context:\n{context}\n\n Answer the following \n{input_text}\n"

            # Regenerate, this time grounded in the retrieved context.
            print("=> Generating answer...")
            generated_sequence, _, _ = get_prediction_and_scores(new_input_text)

        # Whether retrieved or already high confidence, extend the running
        # text and stop once the end-of-sequence marker is present.
        input_text = f"{input_text} {generated_sequence}"
        if "</s>" in input_text:
            break

    answer = input_text
    print(relevant_docs)
    return answer, relevant_docs

In [None]:
user_query = 'What is the Buggy race schedule this year?'
answer, relevant_docs = answer_with_rag_flare(
    user_query, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER
)

In [None]:

print("==================================Answer==================================")
print(len(relevant_docs))
print(f"{answer}")

In [None]:
user_query = 'What did the first doctorate graduate from CMU study?'
answer, relevant_docs = answer_with_rag_flare(
    user_query, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER
)

In [None]:
print("==================================Answer==================================")
# print(len(relevant_docs))
print(f"{answer}")

In [None]:
len(relevant_docs)

## Generate Answers for Webpage Questions

In [None]:
def generate_answer(question):
    """Answer one question with the FLARE pipeline and return the answer text.

    Uses the module-level `KNOWLEDGE_VECTOR_DATABASE` and `RERANKER`. The
    question and the supporting documents are printed for inspection; only
    the answer is returned.
    """
    flare_answer, supporting_docs = answer_with_rag_flare(
        question, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER
    )
    # Echo the question, a separator, and the supporting documents.
    for item in (question, '---- relevant docs -----', supporting_docs):
        print(item)
    return flare_answer

def generate_answers_all(qfile, afile):
    """Answer every question in `qfile` and write one answer per line to `afile`.

    Args:
        qfile: Path of a text file with one question per line.
        afile: Path of the output file; it is created or overwritten.

    Line `i` of the output holds the answer to line `i` of the input.
    """
    # Read all questions up front, then release the input file. The line
    # terminator is dropped so it is not carried into the prompt and, through
    # the echoed question, into the middle of the answer.
    with open(qfile, 'r') as questions_file:
        questions = [line.rstrip('\r\n') for line in questions_file]

    # `with` guarantees the output is flushed and closed even if answering a
    # question raises part-way through.
    with open(afile, 'w+') as ans_file:
        for question in questions:
            ans = generate_answer(question)
            # Collapse any line breaks inside the answer so the output keeps
            # exactly one line per question.
            ans_file.write(' '.join(ans.splitlines()) + '\n')


In [None]:
generate_answers_all('data/test/questions.txt', 'system_outputs/system_output1.txt')
