Source:
- https://huggingface.co/learn/cookbook/en/advanced_rag
- https://arc.net/l/quote/vntkseji

# Assumptions
- the faiss_index embeddings are up to date

In [1]:
import os
from dotenv import load_dotenv
# Load the local .env file, then read the Hugging Face token stored under HF_API.
# The token is only bound to Python names here; this cell does not write it
# back into os.environ.
load_dotenv('.env')
HUGGINGFACEHUB_API_TOKEN = hf_api = os.getenv('HF_API')

In [2]:
pip install transformers==4.38.0

Note: you may need to restart the kernel to use updated packages.


In [5]:
# !pip install -q torch accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl
!pip install -q torch accelerate bitsandbytes langchain sentence-transformers faiss-cpu openpyxl

In [6]:
!pip install unstructured ragatouille
# reranker
from ragatouille import RAGPretrainedModel



In [7]:
!pip3 install numpy
import numpy as np
np.__version__



'1.26.4'

In [8]:
# fix colab error: https://stackoverflow.com/questions/56081324/why-are-google-colab-shell-commands-not-working
import locale


def getpreferredencoding(do_setlocale=True):
    """Always report UTF-8 as the preferred encoding; `do_setlocale` is ignored."""
    return "UTF-8"


# Replace the stdlib lookup with the stub above so later callers get "UTF-8".
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [9]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt
# Lift the column-width cap so long text cells are shown in full.
pd.set_option("display.max_colwidth", None)

In [11]:
# Imports
import os
import pandas as pd

# langchain imports
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# hf imports
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import T5Tokenizer, T5ForConditionalGeneration

# reranking

from ragatouille import RAGPretrainedModel
from transformers import Pipeline



In [13]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [14]:
# %cd drive/MyDrive/ANLP/NLP-RAG/src-rag
# !ls

# Specify the models/versions

In [15]:
# give the paths
# Questions file passed to generate_answers_all in the evaluation section.
# NOTE(review): this path has no '../' prefix, while FAISS_FILE below and the
# reference-answers path in the evaluation cell both start with '../' — confirm
# which working directory the notebook is meant to run from.
QUESTIONS_FILE = 'data/test/questions_webpages.txt'
# OUTPUT_FILE = 'system_outputs/webpages.txt'

FAISS_FILE = '../faiss_index_author_papers_natural_language' # it's actually a folder but whatever
EMBEDDING_MODEL = "thenlper/gte-base" # make sure this matches whatever was used to create the doc embeddings
GENERATOR_MODEL = "google/flan-t5-large"
RERANKER_MODEL = "colbert-ir/colbertv2.0"

# Reranker instance loaded once here and reused by the retrieval calls below.
RERANKER = RAGPretrainedModel.from_pretrained(RERANKER_MODEL)

[Mar 11, 22:33:28] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




In [16]:
# initialize the LLM and its tokenizer, we are using Flan T5 Large for this
# Both are loaded from GENERATOR_MODEL and are read as module-level globals by
# get_prediction_and_scores.
tokenizer = T5Tokenizer.from_pretrained(GENERATOR_MODEL)
model = T5ForConditionalGeneration.from_pretrained(GENERATOR_MODEL)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Load the knowledge base

In [18]:
# Embedding wrapper built from EMBEDDING_MODEL; it is passed to FAISS.load_local
# when the saved index is loaded.
# NOTE(review): multi_process=True presumably makes the wrapper start worker
# processes for encoding — confirm the overhead is acceptable for single-query
# lookups, and that these settings match how the index embeddings were created.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    multi_process=True,
    # model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  #  True for cosine similarity
    )

In [19]:
# Load the prebuilt index stored at FAISS_FILE, paired with embedding_model.
# NOTE(review): load_local presumably deserializes pickled data, so FAISS_FILE
# should only point at a trusted index; newer langchain releases may also require
# allow_dangerous_deserialization=True — confirm against the installed version.
KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(FAISS_FILE, embedding_model)

In [20]:
# function to get the prediction and scores from the LLM, given a prompt
def get_prediction_and_scores(prompt):
    """Generate a completion for `prompt` and return it with per-token scores.

    Uses the module-level `tokenizer` and `model`.

    Args:
        prompt: Text passed to the tokenizer as-is.

    Returns:
        A 3-tuple ``(decoded_text, token_ids, token_scores)`` where
        ``decoded_text`` is the first generated sequence decoded without
        ``skip_special_tokens`` (so markers such as ``<pad>`` / ``</s>`` may be
        present), ``token_ids`` is that first sequence as returned by
        ``generate``, and ``token_scores`` is ``torch.exp`` of the first row of
        ``model.compute_transition_scores(..., normalize_logits=True)``.
    """
    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    generation = model.generate(
        encoded_prompt.input_ids,
        output_scores=True,
        return_dict_in_generate=True,
        max_length=100,
    )
    token_ids = generation.sequences[0]

    # Exponentiate the transition scores of the first sequence to get the
    # per-token probability scores.
    step_scores = model.compute_transition_scores(
        generation.sequences, generation.scores, normalize_logits=True
    )
    token_scores = torch.exp(step_scores[0])

    decoded_text = tokenizer.decode(token_ids)
    return decoded_text, token_ids, token_scores

## Retrieval and Answer Generation

In [21]:
def flanT5_without_threshold(
    question: str,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 3,
    num_docs_final: int = 2
    ):
    """Answer `question` from retrieved documents with a single generation pass.

    Steps: retrieve `num_retrieved_docs` documents from `knowledge_index`,
    optionally rerank them with `reranker`, keep at most `num_docs_final`
    document texts, build the prompt and generate with
    `get_prediction_and_scores`.

    Args:
        question: The user question; also used as the retrieval/rerank query.
        knowledge_index: Vector store queried via `similarity_search`.
        reranker: Optional reranker; when given (and retrieval returned
            something) its `rerank` output replaces the retrieval order.
        num_retrieved_docs: `k` passed to `similarity_search`.
        num_docs_final: Maximum number of document texts placed in the prompt.

    Returns:
        A tuple `(answer, relevant_docs_content)`: the generated text with
        `<pad>` / `</s>` markers and surrounding whitespace removed, and the
        list of document texts that were put into the prompt.
    """
    print("=> Retrieving documents...")
    relevant_docs_acquired = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs_content = [doc.page_content for doc in relevant_docs_acquired]

    # Skip the reranker when retrieval came back empty: there is nothing to
    # reorder, and the prompt is then built with an empty document list.
    if reranker and relevant_docs_content:
        print("=> Reranking documents...")
        reranked_docs = reranker.rerank(question, relevant_docs_content, k=num_docs_final)
        relevant_docs_content = [doc["content"] for doc in reranked_docs]

    relevant_docs_content = relevant_docs_content[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join(f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs_content))

    context_and_question = f"Keep your answers short and concise. If the text has date and time include the date, time both. If there are multiple right answers, include them all, but keep it short overall. \n Given the below context:\n{context}\n\n Answer the following \n{question}\n"

    print("=> Generating answer...")
    generated_sequence, _, _ = get_prediction_and_scores(context_and_question)

    # The decoded text still carries the special tokens; strip them and padding.
    answer = generated_sequence.replace("<pad>", "").replace("</s>", "").strip()

    return answer, relevant_docs_content

In [22]:
user_query = 'Who is the first author of the paper "Extracting training data from diffusion models"?'

In [23]:
answer, relevant_docs = flanT5_without_threshold(
    user_query, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER
)

=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 11.14it/s]


=> Generating answer...


In [24]:
# Header, number of source documents used, then the generated answer — one per line.
print(
    "==================================Answer==================================",
    len(relevant_docs),
    answer,
    sep="\n",
)

2
Nicholas Carlini


In [25]:
# Print every retained source document under its own separator line.
print("==================================Source docs==================================")
for source_doc in relevant_docs:
    print("Document ------------------------------------------------------------")
    print(source_doc)

Documet ------------------------------------------------------------
Question: Who is the first author of the paper 'Extracting Training Data from Diffusion Models'?
Answer: Nicholas Carlini
Notes: ##Title: Extracting Training Data from Diffusion Models
Documet ------------------------------------------------------------
Question: Who are the authors of the paper 'Extracting Training Data from Diffusion Models'?
Answer: Nicholas Carlini, Jamie Hayes, Milad Nasr, Matthew Jagielski, Vikash Sehwag, Florian Tramer, B. Balle, Daphne Ippolito, Eric Wallace
Notes: ##Title: Extracting Training Data from Diffusion Models


## Evaluation

In [None]:
def generate_answer(question):
    """Return only the answer text for `question`.

    Calls flanT5_without_threshold with the module-level
    KNOWLEDGE_VECTOR_DATABASE and RERANKER, and drops the source documents
    it returns alongside the answer.
    """
    answer_text, _source_docs = flanT5_without_threshold(
        question,
        KNOWLEDGE_VECTOR_DATABASE,
        reranker=RERANKER,
    )
    return answer_text

OUTPUT_FILE_WITHOUT_THRESHOLD= 'system_outputs/webpages_no_threshold.txt'

# note that this overwrites previously generated answers to the answer file
def generate_answers_all(qfile, afile):
    """Answer every question in `qfile`, writing exactly one line per question to `afile`.

    `afile` is overwritten. Output line N always corresponds to input line N so
    the result stays aligned with a line-based reference file.

    Args:
        qfile: Path to a UTF-8 text file with one question per line.
        afile: Path of the answers file; missing parent directories are created.

    Notes:
        - Questions are stripped of surrounding whitespace (including the
          trailing newline) before being passed to `generate_answer`.
        - A blank question line produces a blank answer line without calling
          the model.
        - Line breaks inside an answer are replaced by spaces so one answer
          never spans several lines.
    """
    import os  # local import keeps this cell runnable on its own

    # Read all questions up front so a missing input never truncates `afile`.
    with open(qfile, 'r', encoding='utf-8') as questions_file:
        questions = [line.strip() for line in questions_file]

    out_dir = os.path.dirname(afile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The context manager closes the file even if generation raises midway.
    with open(afile, 'w', encoding='utf-8') as ans_file:
        for question in questions:
            if not question:
                ans_file.write('\n')
                continue
            ans = generate_answer(question)
            ans_file.write(' '.join(ans.splitlines()) + '\n')

generate_answers_all(QUESTIONS_FILE, OUTPUT_FILE_WITHOUT_THRESHOLD)


In [None]:
from evaluation import total_score

print(total_score(OUTPUT_FILE_WITHOUT_THRESHOLD, '../data/test/reference_answers.txt'))

---
# Leave for now: the normal one works just fine

## Flare
- Source: https://ayushtues.medium.com/flare-advanced-rag-implemented-from-scratch-07ca75c89800
- essentially an extra acceptance step
- `</s>`
  - is the separation token: it marks the end of a sentence, and generation is stopped (we `break`) as soon as it appears
  - if you don't want this, remove the `if` statement where we `break`

In [None]:
# # function to get the prediction and scores from the LLM, given a prompt
# def get_prediction_and_score_flare(prompt):
#     input_ids = tokenizer(prompt, return_tensors="pt").input_ids
#     outputs =  model.generate(input_ids, output_scores=True, return_dict_in_generate=True, max_length=200)
#     generated_sequence = outputs.sequences[0]

#     # get the probability scores for each generated token
#     transition_scores = torch.exp(model.compute_transition_scores(
#         outputs.sequences, outputs.scores, normalize_logits=True
#     )[0])
#     return tokenizer.decode(generated_sequence), generated_sequence, transition_scores

In [None]:
# def flanT5_with_threshold(
#     input_text: str,
#     knowledge_index: FAISS,
#     reranker: Optional[RAGPretrainedModel] = None,
#     num_retrieved_docs: int = 5,
#     num_docs_final: int = 3,
#     threshold = .1
#     ):

#     relevant_docs = None

#     while True: # breaks when you have a separation token in the generated sequence

#         generated_sequence, tokens, scores = get_prediction_and_scores(input_text)

#         if torch.min(scores)< threshold:

#             # new query = high confidence tokens
#             confident_tokens = tokens[torch.where(scores>threshold)]
#             confident_query = tokenizer.decode(confident_tokens)

#             # Gather documents with retriever
#             relevant_docs_acquired = knowledge_index.similarity_search(query=confident_query, k=num_retrieved_docs)
#             # print(relevant_docs_acquired)
#             # print(relevant_docs_acquired)
#             if reranker:
#                 print("=> Reranking documents...")
#                 relevant_docs = [doc.page_content for doc in relevant_docs_acquired]

#                 relevant_docs = reranker.rerank(confident_query, relevant_docs, k=num_docs_final)
#                 # print(relevant_docs)

#                 relevant_docs_content = [doc["content"] for doc in relevant_docs]
#                 # relevant_doc_score = [doc["score"] for doc in relevant_docs]

#             else:
#                 relevant_docs_content = [doc.page_content for doc in relevant_docs_acquired]

#             relevant_docs_content = relevant_docs_content[:num_docs_final]
#             # relevant_doc_score = relevant_doc_score[:num_docs_final]

#             # Build the final prompt
#             context = "\nExtracted documents:\n"
#             context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs_content)])

#             # new_input_text = f"Answer the user's questions based on the below context. Keep your answers short and concise.\n------------<context>\n{context}\n</context>------------\n<question> Here is the question\n{input_text}\n</question>\n"

#             new_input_text = f"Keep your answers short and concise. If there are multiple right answers, include them all, but keep it short overall. \n Given the below context:\n{context}\n\n Answer the following \n{input_text}\n"

#             # Redact an answer
#             print("=> Generating answer...")
#             generated_sequence, seq, _ = get_prediction_and_scores(new_input_text)

#             if "</s>" in generated_sequence:
#                 input_text = tokenizer.decode(seq, skip_special_tokens=True)
#                 break

#         else: # tokens are already high confidence
#             if "</s>" in generated_sequence:
#                 input_text = tokenizer.decode(tokens, skip_special_tokens=True)
#                 break

#     # print(relevant_docs_content)
#     answer = input_text.replace("<pad>", "").replace("</s>", "").strip()

#     if relevant_docs is None:
#         return answer, 'docs not needed'
#         # 'id = None', 'index = None'
#     return answer, relevant_docs_content

In [None]:
# # user_query = 'What is the Buggy race schedule this year?'
# user_query = 'What is the safety gear required by all buggy drivers?'
# answer, relevant_docs = flanT5_with_threshold(
#     user_query, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER
# )

In [None]:
# print("==================================Answer==================================")
# print(answer)

hat and gloves


In [None]:
# # print("=========================Relevant Documents===========================")
# print(len(relevant_docs))
# for i in relevant_docs:
#     print(i)

3
This manuscript describes the work that has been completed for domain characterization as an early step toward developing standardized PRO measures to evaluate these important outcomes specific to upper extremity transplantation.

## AUTHORNAME

Lori S. Levin

## JOURNAL

{'volume': '13', 'name': 'Frontiers in Psychology'}

## FIELDSOFSTUDY

['Medicine']

## URL

https://www.semanticscholar.org/paper/52a97ad16605c18e23c9750a388a26a9cdf12200

## YEAR

2023

## TLDR

Qualitative work with experts, clinicians, and patients has identified several domains of QOL that are unique to individuals who have received upper extremity transplants, which are distinct from topics covered by existing PRO measures.

## VENUE

Frontiers in Psychology

on, grip strength, pain, and the various activities of

daily living that require manual motor function and dexterity to

complete. Thus, we believe that assessment of functional ability is

best left to existing measures, such as SCI-FI Fine Motor, SCI-F

## Generate Answers

In [None]:
# def generate_answer(question):
#     answer, _ = flanT5_with_threshold(
#         question, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER
#     )
#     return answer

# OUTPUT_FILE_THRESHOLD= 'system_outputs/webpages_threshold.txt'

In [None]:
# def generate_answer(question):
#     answer, _ = flanT5_with_threshold(
#         question, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER
#     )
#     return answer


# generate_answers_all(QUESTIONS_FILE, OUTPUT_FILE_THRESHOLD)
