Souce:
- https://huggingface.co/learn/cookbook/en/advanced_rag
- https://arc.net/l/quote/vntkseji

# Assumptions
- the faiss_index embeddings are up to date

In [2]:
import os
from dotenv import load_dotenv
load_dotenv('.env')
hf_api = os.getenv('HF_API')
HUGGINGFACEHUB_API_TOKEN = hf_api

In [1]:
!pip3 install numpy
import numpy as np
np.__version__



'1.26.3'

In [2]:
pip install transformers==4.38.0

Collecting transformers==4.38.0
  Downloading transformers-4.38.0-py3-none-any.whl.metadata (131 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers==4.38.0)
  Using cached huggingface_hub-0.21.4-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers==4.38.0)
  Using cached regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.38.0)
  Using cached tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers==4.38.0)
  Using cached safetensors-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.38.0-py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0

In [3]:
!pip install -q torch accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl

In [4]:
!pip install unstructured ragatouille
# reranker
from ragatouille import RAGPretrainedModel

Collecting unstructured
  Downloading unstructured-0.12.6-py3-none-any.whl.metadata (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.2/83.2 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ragatouille
  Downloading ragatouille-0.0.7.post10-py3-none-any.whl.metadata (15 kB)
Collecting backoff==2.2.1 (from unstructured)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting certifi==2024.2.2 (from unstructured)
  Using cached certifi-2024.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting chardet==5.2.0 (from unstructured)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting dataclasses-json-speakeasy==0.5.11 (from unstructured)
  Downloading dataclasses_json_speakeasy-0.5.11-py3-none-any.whl.metadata (25 kB)
Collecting emoji==2.10.1 (from unstructured)
  Downloading emoji-2.10.1-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting filetype==1.2.0 (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.w

In [73]:
# fix colab error: https://stackoverflow.com/questions/56081324/why-are-google-colab-shell-commands-not-working
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [74]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt
pd.set_option(
    "display.max_colwidth", None
)

In [75]:
# Imports
import os
import pandas as pd

# langchain imports
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# hf imports
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import T5Tokenizer, T5ForConditionalGeneration

# reranking

from ragatouille import RAGPretrainedModel
from transformers import Pipeline



# Specify the models/versions

In [105]:
# give the paths
QUESTIONS_FILE = 'questions.txt'
# OUTPUT_FILE = 'system_outputs/webpages.txt'

# FAISS_FILE = '../faiss_index_author_papers_natural_language' # it's actually a folder but whatever
FAISS_FILE = 'faiss_index_total_final_new' # it's actually a folder but whatever

EMBEDDING_MODEL = "thenlper/gte-base" # make sure this matches whatever was used to create the doc embeddings
GENERATOR_MODEL = "google/flan-t5-large"
RERANKER_MODEL = "colbert-ir/colbertv2.0"

RERANKER = RAGPretrainedModel.from_pretrained(RERANKER_MODEL)

In [77]:
!pip install sentencepiece



In [78]:
# initialize the LLM and its tokenizer, we are using Flan T5 Large for this
tokenizer = T5Tokenizer.from_pretrained(GENERATOR_MODEL)
model = T5ForConditionalGeneration.from_pretrained(GENERATOR_MODEL)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Load the knowledge base

In [79]:
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  #  True for cosine similarity
    )

In [80]:
KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(FAISS_FILE, embedding_model, allow_dangerous_deserialization=True)

In [81]:
# function to get the prediction and scores from the LLM, given a prompt
def get_prediction_and_scores(prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    outputs =  model.generate(input_ids, output_scores=True, return_dict_in_generate=True, max_length=100)
                            #   skip_special_tokens=True)
    generated_sequence = outputs.sequences[0]

    # get the probability scores for each generated token
    transition_scores = torch.exp(model.compute_transition_scores(
        outputs.sequences, outputs.scores, normalize_logits=True
        # , skip_special_tokens = True
    )[0])
    return tokenizer.decode(generated_sequence), generated_sequence, transition_scores

## Retrieval and Answer Generation

In [82]:
def flanT5_without_threshold(
    question: str,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 10,
    num_docs_final: int = 3
    ):

    print("=> Retrieving documents...")
    # Gather documents with retriever
    relevant_docs_acquired = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    # print(relevant_docs_acquired)
    # print(relevant_docs_acquired)
    if reranker:
        print("=> Reranking documents...")
        relevant_docs = [doc.page_content for doc in relevant_docs_acquired]
        # print(relevant_docs)
        relevant_docs = reranker.rerank(question, relevant_docs, k=num_docs_final)
        
        relevant_docs_content = [doc["content"] for doc in relevant_docs]
        relevant_doc_score = [doc["score"] for doc in relevant_docs]

    else:
        relevant_docs_content = [doc.page_content for doc in relevant_docs_acquired]

    relevant_docs_content = relevant_docs_content[:num_docs_final]
    # relevant_doc_id = relevant_doc_id[:num_docs_final]
    # relevant_doc_index = relevant_doc_index[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs_content)])

    # context_and_question = f"Keep your answers short and concise. If the text has date and time include the date, time both. If there are multiple right answers, include them all, but keep it short overall. If the question cannot be answered from the context, say 'I don't know'. \n Given the below context:\n{context}\n\n Answer the following \n{question}\n"
    context_and_question = f"Keep your answers short and concise. If the text has date and time include the date, time both. If there are multiple right answers, include them all, but keep it short overall. \n Given the below context:\n{context}\n\n Answer the following \n{question}\n"
    # context_and_question = """
    # Answer the user's questions based on the below context. Please keep your answers short and concise. Only provide the answer itself."
    # ------------
    # {context}
    # ------------
    # Question: {question}
    # Answer:
    # """

    # Redact an answer
    print("=> Generating answer...")
    generated_sequence, _, _ = get_prediction_and_scores(context_and_question)
    # answer = f"{question} {generated_sequence}"

    # removing the special tokens and padding
    answer = generated_sequence.replace("<pad>", "").replace("</s>", "").replace("\n", "").strip()

    return answer, relevant_docs_content

In [58]:
user_query = 'Who taught 11737 Multilingual Natural Language Processing in fall 2023?'
# user_query = 'Was carnegie the best man alive"?'

In [59]:
answer, relevant_docs = flanT5_without_threshold(
    user_query, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER
)

=> Retrieving documents...
=> Reranking documents...
['Question: Is the course 11737 Multilingual Natural Language Processing. offered in  Fall 2023?\nAnswer: Yes', '## TITLE\nLearning Performance-Improving Code Edits\nPhD thesis, Western University, 2022. Lewis Tunstall, Leandro V on Werra, and Thomas Wolf. Natural Language Processing with Transformers . "O Reilly Media, Inc.", 2022. Jason Wei, Yi Tay, Rishi Bommasani, Colin Raffel, Barret Zoph, Sebastian Borgeaud, Dani Yogatama, Maarten Bosma, Denny Zhou, Donald Metzler, Ed H. Chi, Tatsunori Hashimoto, Oriol Vinyals, Percy Liang, Jeff Dean, and William Fedus. Emergent Abilities of Large Language Models. arXiv preprint arXiv:2206.07682 , 2022a.\n\nJason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. Chain-of-thought prompting elicits reasoning in large language models. Advances in Neural Information Processing Systems , 35:24824 24837, 2022b.\n\nThomas Wolf, Lysandre Debut, Victor Sanh

100%|██████████| 1/1 [00:00<00:00, 12.21it/s]

=> Generating answer...





In [60]:
print("==================================Answer==================================")
print(len(relevant_docs))
print(f"{answer}")

3
Li


In [61]:
print("==================================Source docs==================================")
for  doc in (relevant_docs):
    print(f"Document ------------------------------------------------------------")
    print(f'{doc}')

Document ------------------------------------------------------------
Question: Is the course 11737 Multilingual Natural Language Processing. offered in  Fall 2023?
Answer: Yes
Document ------------------------------------------------------------
Question: Who are the instructors of the course 11737 Multilingual Natural Language Processing.?
Answer: Li
Document ------------------------------------------------------------
Question: What is the name of course number 11737?
Answer: Multilingual Natural Language Processing.


## Evaluation

In [83]:
# specify output csv 
input_file = 'qa_gold-t5-large.csv'
output_file = 'flan-t5-large-output-new.csv'
csv_input_dir = f'csv_qa_gold/{input_file}'
csv_output_dir = f'csv_qa_gold/{output_file}'

In [84]:
def initialize_model_answer_column(df):
    if 'ModelAnswer' not in df.columns:
        df['ModelAnswer'] = None
    return df

In [85]:
import glob
import os
import pandas as pd

directory_csv = 'csv_qa_gold'
csv_files = ['csv_qa_gold/test_combined.csv']

# read in the csv files in the directory and concatenate
df_total = pd.DataFrame()
for file in csv_files:
    df = pd.read_csv(file)
    print(df.shape)
    df_total = pd.concat([df_total, df], axis=0)
print(df_total.shape)
df_total = initialize_model_answer_column(df_total)
df_total.to_csv(csv_input_dir, index=False) 
# doing this so the original is untouched

(191, 3)
(191, 3)


In [86]:
df_total.iloc[10:20, :] # please check that excel does not fuck up for row 15,16 the November 2006 to Nov-06 or some other format

Unnamed: 0,Category,Question,Answer,ModelAnswer
10,webpages,Was the Scottish terrier a familiar figure at CMU before it officially became a mascot?,Yes,
11,webpages,Is there an official mascot costume of Scotty?,Yes,
12,webpages,Who led the effort to make Carnegie Mellon's mascot the Scottish Terrier?,Susan Bassett and Jennifer Church,
13,webpages,Where did Scotty debug as CMU's official mascot?,at a football game,
14,webpages,Who did CMU partner with to create the Scotty mascot graphics?,SME Branding,
15,webpages,What month and year was the Mascot Identity Task Force formed?,November 2006,
16,webpages,When did Scotty debut as CMU's official mascot?,November 2007,
17,webpages,What characteristics does the Scottish terrier represent?,"determined, thoughtful, strength, power, agility in a small package",
18,webpages,Where did the athletic director of CMU graduate from?,Brandeis,
19,webpages,When was Carnegie Technical Schools founded?,1900,


In [106]:
def generate_answer(question):
    answer, _ = flanT5_without_threshold(
        question, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER
    )
    return answer

In [107]:
# def generate_answer(question, return_relevant_docs = False, llm = llm_answer_gen):
#     answer, relevant_docs = answer_llama_new(question, KNOWLEDGE_VECTOR_DATABASE,
#                             reranker=RERANKER, llm = llm)
#     if return_relevant_docs:
#         return answer, relevant_docs
#     return answer
OUTPUT_FILE= 'flan-t5-large-output-new-embeddings.txt'
# note that this overwrites previously generated answers to the answer file
def generate_answers_all(qfile, afile):
    with open(qfile, 'r') as questions_file, open(afile, "a") as ans_file:
        questions = questions_file.readlines()
        for q in questions:
            ans = generate_answer(q.strip())  # Remove newline character
            ans_file.write(ans + '\n')
            ans_file.flush()  # Flush buffer to ensure content is written immediately

In [108]:
# Define the function to process the CSV and add model answers
# def generate_answers_and_save(csv_path):
#     df = pd.read_csv(csv_path)
#     if 'ModelAnswer' not in df.columns:
#         df['ModelAnswer'] = ''
#     for index, row in df.iterrows():
#         ModelAnswer = generate_answer(row['Question'])
#         df.at[index, 'ModelAnswer'] = ModelAnswer
#         df.to_csv(csv_output_dir, index=False)
#     return df

In [110]:
# df_with_model_answers = generate_answers_and_save(csv_path=csv_input_dir)
df_with_model_answers = generate_answers_all(QUESTIONS_FILE, OUTPUT_FILE)
print(df_with_model_answers.shape)
df_with_model_answers.head()

=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 25.72it/s]

=> Generating answer...





=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 20.54it/s]

=> Generating answer...





=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 72.42it/s]

=> Generating answer...





=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 74.00it/s]

=> Generating answer...





=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 51.29it/s]

=> Generating answer...





=> Retrieving documents...


Process SpawnProcess-210:
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py", line 514, in _encode_multi_process_worker
    embeddings = model.encode(
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py", line 350, in encode
    out_features = self.forward(features)
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/container.py", line 215, in forward
    input = module(input)
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/mo

KeyboardInterrupt: 

In [99]:
# with additional extraction prompt after generation without prompt
df = pd.read_csv('csv_qa_gold/llama-no-temp-output.csv')
type(df['ModelAnswer'][0]),type(df['Answer'][0])
df = df.astype(str)  # Convert columns to string type

from evaluation_csv import total_score_csv
exact_match, f1_score, recall_score = total_score_csv(df['ModelAnswer'], df['Answer'])
print(f'Exact match: {exact_match}')
print(f'F1 score: {f1_score}')
print(f'Recall score: {recall_score}')

Exact match: 0.04712041884816754
F1 score: 0.19903977874921847
Recall score: 0.45991834195224224


In [100]:
for category in df['Category'].unique():
    print(f'Category: {category}')
    df_category = df[df['Category'] == category]
    exact_match, f1_score, recall_score = total_score_csv(df_category['ModelAnswer'], df_category['Answer'])
    print(f'Exact match: {exact_match}')
    print(f'F1 score: {f1_score}')
    print(f'Recall score: {recall_score}')
    print('\n')

Category: webpages
Exact match: 0.09259259259259259
F1 score: 0.31618458605389715
Recall score: 0.6935956790123456


Category: tabular_webpages
Exact match: 0.0
F1 score: 0.14436440295815298
Recall score: 0.46610576923076924


Category: other_pdf
Exact match: 0.0
F1 score: 0.13257874211960932
Recall score: 0.2637853692201518


Category: papers_pdf
Exact match: 0.0
F1 score: 0.0574977817213842
Recall score: 0.22222222222222224


Category: schedule_pdf
Exact match: 0.0
F1 score: 0.015810276679841896
Recall score: 0.045454545454545456


Category: jsons
Exact match: 0.05714285714285714
F1 score: 0.2100834755820694
Recall score: 0.49030412753234165


Category: json_hard
Exact match: 0.0
F1 score: 0.030124777183600716
Recall score: 0.048863636363636366


