Source:
- https://huggingface.co/learn/cookbook/en/advanced_rag
- https://arc.net/l/quote/vntkseji

# Assumptions
- the faiss_index embeddings are up to date

In [2]:
import os
from dotenv import load_dotenv
# Load variables defined in the local .env file before reading them below.
load_dotenv('.env')
# Hugging Face API token; os.getenv returns None when HF_API is not set.
hf_api = os.getenv('HF_API')
# NOTE(review): this only binds a Python variable, it does not export the token
# to os.environ — confirm whether any later code actually reads it.
HUGGINGFACEHUB_API_TOKEN = hf_api

In [6]:
# Install numpy, then display the version that the kernel actually imports.
!pip3 install numpy
import numpy as np
np.__version__



'1.26.4'

In [7]:
# Pin transformers to 4.38.0; restart the kernel afterwards if pip asks for it.
pip install transformers==4.38.0

Note: you may need to restart the kernel to use updated packages.


In [8]:
# Remaining runtime dependencies for generation, embedding and vector search.
# NOTE(review): the recorded output shows pip finding no faiss-gpu distribution
# in this environment — confirm the other packages were still installed, or
# switch to a FAISS build that is available for this platform.
!pip install -q torch accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl

[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m
[0m

In [9]:
# RAGatouille supplies the reranker model class imported below.
# NOTE(review): `unstructured` is presumably needed by the LangChain document loaders — confirm.
!pip install unstructured ragatouille
# reranker
from ragatouille import RAGPretrainedModel



In [10]:
# Colab workaround so that shell commands keep working; background:
# https://stackoverflow.com/questions/56081324/why-are-google-colab-shell-commands-not-working
import locale


def getpreferredencoding(do_setlocale = True):
    """Stand-in for locale.getpreferredencoding that always reports UTF-8.

    The argument is accepted for signature compatibility and otherwise ignored.
    """
    return "UTF-8"


# Monkey-patch the locale module so every later lookup gets the stand-in.
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [11]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt
# Never truncate cell contents when displaying DataFrames (questions and contexts are long).
pd.set_option("display.max_colwidth", None)

In [12]:
# Imports
import os
import pandas as pd

# langchain imports
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# hf imports
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import T5Tokenizer, T5ForConditionalGeneration

# reranking

from ragatouille import RAGPretrainedModel
from transformers import Pipeline



# Specify the models/versions

In [14]:
# Paths and model identifiers used throughout the notebook.
# QUESTIONS_FILE = 'data/test/questions_webpages.txt'
# OUTPUT_FILE = 'system_outputs/webpages.txt'

# FAISS_FILE = '../faiss_index_author_papers_natural_language'  # alternative index
# Location of the saved FAISS index (a directory, despite the _FILE suffix).
FAISS_FILE = '../faiss_index_total_final'

EMBEDDING_MODEL = "thenlper/gte-base" # make sure this matches whatever was used to create the doc embeddings
GENERATOR_MODEL = "google/flan-t5-large"
RERANKER_MODEL = "colbert-ir/colbertv2.0"

# Load the pretrained reranker once; it is passed to the retrieval function as `reranker`.
RERANKER = RAGPretrainedModel.from_pretrained(RERANKER_MODEL)

[Mar 12, 02:55:17] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




In [15]:
# Initialize the generator LLM and its tokenizer from GENERATOR_MODEL (Flan-T5 Large).
# Both are module-level globals read by get_prediction_and_scores.
tokenizer = T5Tokenizer.from_pretrained(GENERATOR_MODEL)
model = T5ForConditionalGeneration.from_pretrained(GENERATOR_MODEL)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Load the knowledge base

In [16]:
# Embedding model used to encode queries against the saved index; it must be the
# same model that produced the stored document embeddings (see EMBEDDING_MODEL).
# NOTE(review): the device is hard-coded to "cuda" — confirm a GPU is always available.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  #  True for cosine similarity
    )

In [17]:
# Load the prebuilt FAISS index from disk; this notebook assumes its embeddings are up to date.
KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(FAISS_FILE, embedding_model)

In [18]:
# function to get the prediction and scores from the LLM, given a prompt
def get_prediction_and_scores(prompt, max_length=100):
    """Generate a completion for `prompt` with the global `model` / `tokenizer`.

    Args:
        prompt: Full text prompt handed to the tokenizer.
        max_length: Value forwarded to `model.generate` as `max_length`
            (defaults to 100, the value that was previously hard-coded).

    Returns:
        A 3-tuple `(decoded_text, generated_sequence, transition_scores)`:
        - decoded_text: the first generated sequence decoded WITHOUT stripping
          special tokens (callers remove `<pad>` / `</s>` themselves),
        - generated_sequence: the token ids of that first sequence,
        - transition_scores: per-token probabilities, i.e. the exponential of
          the normalized transition scores of the first sequence.
    """
    # Keep the inputs on the same device as the model so the function keeps
    # working if the model is moved to a GPU.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    # Pure inference: no gradients are needed for generation or scoring.
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            output_scores=True,
            return_dict_in_generate=True,
            max_length=max_length,
        )
        generated_sequence = outputs.sequences[0]

        # get the probability scores for each generated token
        transition_scores = torch.exp(
            model.compute_transition_scores(
                outputs.sequences, outputs.scores, normalize_logits=True
            )[0]
        )
    return tokenizer.decode(generated_sequence), generated_sequence, transition_scores

## Retrieval and Answer Generation

In [69]:
def flanT5_without_threshold(
    question: str,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 5,
    num_docs_final: int = 3
    ):
    """Answer `question` from the knowledge base: retrieve, optionally rerank, generate.

    Args:
        question: The user question.
        knowledge_index: Vector store queried with `similarity_search`.
        reranker: Optional reranker; when given (and something was retrieved),
            its `rerank` method picks the `num_docs_final` best passages.
        num_retrieved_docs: Number of passages fetched from the index.
        num_docs_final: Maximum number of passages placed in the prompt.

    Returns:
        `(answer, relevant_docs_content)` where `answer` is the generated text
        with `<pad>`, `</s>` and newlines removed, and `relevant_docs_content`
        is the list of passage strings that were put in the prompt.
    """
    print("=> Retrieving documents...")
    # Gather documents with retriever
    relevant_docs_acquired = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs_content = [doc.page_content for doc in relevant_docs_acquired]

    # Only rerank when there is something to rerank; an empty retrieval result
    # goes straight to generation with an empty context.
    if reranker and relevant_docs_content:
        print("=> Reranking documents...")
        reranked_docs = reranker.rerank(question, relevant_docs_content, k=num_docs_final)
        relevant_docs_content = [doc["content"] for doc in reranked_docs]

    relevant_docs_content = relevant_docs_content[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs_content)])

    context_and_question = f"Keep your answers short and concise. If the text has date and time include the date, time both. If there are multiple right answers, include them all, but keep it short overall. If the question cannot be answered from the context, say 'I don't know'. \n Given the below context:\n{context}\n\n Answer the following \n{question}\n"

    # Generate the answer
    print("=> Generating answer...")
    generated_sequence, _, _ = get_prediction_and_scores(context_and_question)

    # removing the special tokens and padding
    answer = generated_sequence.replace("<pad>", "").replace("</s>", "").replace("\n", "").strip()

    return answer, relevant_docs_content

In [71]:
# Sample question for a quick end-to-end check of the pipeline.
user_query = 'Who is the first author of the paper "Extracting training data from diffusion models"?'
# user_query = 'Was carnegie the best man alive"?'

In [72]:
# Run the sample query end to end; `relevant_docs` holds the passage texts that were put in the prompt.
answer, relevant_docs = flanT5_without_threshold(
    user_query, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER
)

=> Retrieving documents...




=> Reranking documents...
Your documents are roughly 262.4 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.52it/s]


=> Generating answer...


In [73]:
# Show how many passages were used as context, followed by the generated answer.
print("==================================Answer==================================")
print(len(relevant_docs))
print(f"{answer}")

3
Aditya Ramesh


In [74]:
# Dump every passage that was fed to the generator, each preceded by a separator line.
print("==================================Source docs==================================")
separator = "Document ------------------------------------------------------------"
for doc in relevant_docs:
    print(separator, doc, sep="\n")

Document ------------------------------------------------------------
## TITLE
Extracting Training Data from Diffusion Models
Milad ran the membership inference experiments on Vikash ran extraction experiments on pretrained Daphne and Florian improved  gure clarity and presentation. Daphne, Borja, and Eric edited the paper and contributed Nicholas organized the project and wrote the initial paper draft.

Acknowledgements and Con icts of Interest The authors are grateful to Tom Goldstein, Olivia Wiles, Katherine Lee, Austin Tarango, Ian Wilbur, Jeff Dean, Andreas Terzis, Robin Rombach, and Andreas Blattmann for comments on early drafts of this paper.

Nicholas, Milad, Matthew, and Daphne are employed at Google, and Jamie and Borja are employed at DeepMind, companies that both train large machine learning models (including diffusion models) on both public and Eric Wallace is supported by the Apple Scholars in [1] Mart  n Abadi, Andy Chu, Ian Goodfellow, H Brendan McMahan, Ilya Mironov, K

## Evaluation

In [76]:
# Working files for the evaluation run.
# NOTE: despite the *_dir names, both variables below hold file paths inside csv_qa_gold/.
input_file = 'qa_gold-t5-large.csv'
output_file = 'flan-t5-large-output.csv'
# Working copy of the gold questions (written by the CSV-loading cell).
csv_input_dir = f'csv_qa_gold/{input_file}'
# Destination for the questions plus generated answers.
csv_output_dir = f'csv_qa_gold/{output_file}'

In [77]:
def initialize_model_answer_column(df):
    """Ensure `df` has a 'ModelAnswer' column and return the same frame.

    A missing column is added in place and filled with None; an existing
    column is left untouched.
    """
    column = 'ModelAnswer'
    if column in df.columns:
        return df
    df[column] = None
    return df

In [78]:
import glob
import os
import pandas as pd

directory_csv = 'csv_qa_gold'
csv_files = [f'{directory_csv}/test_combined.csv']

# Read every CSV once, then concatenate in a single call: growing a DataFrame
# with concat inside the loop copies all previous rows on each iteration.
frames = []
for file in csv_files:
    df = pd.read_csv(file)
    print(df.shape)
    frames.append(df)

# ignore_index gives the combined frame a clean 0..n-1 index even when several
# files are merged (each file would otherwise restart its index at 0).
df_total = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()
print(df_total.shape)
df_total = initialize_model_answer_column(df_total)

# Write the combined frame to a separate working file so the original CSVs stay untouched.
df_total.to_csv(csv_input_dir, index=False)

(191, 3)
(191, 3)


In [79]:
# Spot-check rows 10-19: confirm that Excel has not reformatted the dates in
# rows 15 and 16 (e.g. "November 2006" turned into "Nov-06" or another format).
df_total.iloc[10:20, :]

Unnamed: 0,Category,Question,Answer,ModelAnswer
10,webpages,Was the Scottish terrier a familiar figure at CMU before it officially became a mascot?,Yes,
11,webpages,Is there an official mascot costume of Scotty?,Yes,
12,webpages,Who led the effort to make Carnegie Mellon's mascot the Scottish Terrier?,Susan Bassett and Jennifer Church,
13,webpages,Where did Scotty debug as CMU's official mascot?,at a football game,
14,webpages,Who did CMU partner with to create the Scotty mascot graphics?,SME Branding,
15,webpages,What month and year was the Mascot Identity Task Force formed?,November 2006,
16,webpages,When did Scotty debut as CMU's official mascot?,November 2007,
17,webpages,What characteristics does the Scottish terrier represent?,"determined, thoughtful, strength, power, agility in a small package",
18,webpages,Where did the athletic director of CMU graduate from?,Brandeis,
19,webpages,When was Carnegie Technical Schools founded?,1900,


In [49]:
def generate_answer(question):
    """Return only the generated answer for `question`.

    Runs the retrieval pipeline against the global knowledge base with the
    global reranker and discards the retrieved passages.
    """
    model_answer, _source_passages = flanT5_without_threshold(
        question,
        KNOWLEDGE_VECTOR_DATABASE,
        reranker=RERANKER,
    )
    return model_answer

In [53]:
# Define the function to process the CSV and add model answers
def generate_answers_and_save(csv_path, output_path=None):
    """Generate an answer for every question in a CSV and checkpoint the results.

    Args:
        csv_path: CSV file with at least a 'Question' column.
        output_path: Where the annotated CSV is written. Defaults to the
            notebook-level `csv_output_dir`, as before.

    Returns:
        The DataFrame read from `csv_path` with the 'ModelAnswer' column filled in.
    """
    if output_path is None:
        output_path = csv_output_dir

    df = pd.read_csv(csv_path)
    if 'ModelAnswer' not in df.columns:
        df['ModelAnswer'] = ''
    # An all-empty column is read back from CSV as float (NaN). Writing strings
    # into it triggers pandas' incompatible-dtype warning (the recorded run echoes
    # this assignment as a warning), so make the column object-typed up front.
    df['ModelAnswer'] = df['ModelAnswer'].astype(object)

    for index, row in df.iterrows():
        df.at[index, 'ModelAnswer'] = generate_answer(row['Question'])
        # Save after every row on purpose: generation is slow, and an interrupted
        # run should still leave the answers produced so far on disk.
        df.to_csv(output_path, index=False)
    return df

In [54]:
# Run the whole question set through the pipeline (slow: one retrieval, rerank
# and generation per row), then preview the result.
df_with_model_answers = generate_answers_and_save(csv_path=csv_input_dir)
print(df_with_model_answers.shape)
df_with_model_answers.head()

=> Retrieving documents...




=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00,  2.78it/s]


=> Generating answer...
=> Retrieving documents...


  df.at[index, 'ModelAnswer'] = ModelAnswer


=> Reranking documents...
Your documents are roughly 236.6 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  2.01it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 236.6 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  2.02it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 290.4 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.35it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 236.6 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  2.01it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 327.8 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.37it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 271.2 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.54it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 271.2 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.51it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 236.6 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  2.03it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 271.2 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.16it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 283.0 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.70it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 309.2 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.17it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 324.2 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.23it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 296.0 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.37it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 258.6 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.72it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 283.0 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.74it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 299.6 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.37it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 312.8 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.38it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 294.8 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.25it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 359.8 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.38it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 320.2 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.46it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 283.0 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.73it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 313.6 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.34it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 289.6 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.40it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 301.2 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.38it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 313.8 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.51it/s]


=> Generating answer...
=> Retrieving documents...
=> Reranking documents...
Your documents are roughly 307.2 tokens long at the 90th percentile! This is quite long and might slow down reranking!
 Provide fewer documents, build smaller chunks or run on GPU if it takes too long for your needs!


100%|██████████| 1/1 [00:00<00:00,  1.46it/s]


=> Generating answer...


KeyboardInterrupt: 

In [None]:
# with additional extraction prompt after generation without prompt
from evaluation_csv import total_score_csv

# Score the frame returned by generate_answers_and_save. The notebook-level `df`
# left over from the CSV-loading loop has no 'ModelAnswer' column, so using it
# here fails on a fresh kernel.
df = df_with_model_answers.astype(str)  # Convert columns to string type

# Overall metrics across all categories.
exact_match, f1_score, recall_score = total_score_csv(df['ModelAnswer'], df['Answer'])
print(f'Exact match: {exact_match}')
print(f'F1 score: {f1_score}')
print(f'Recall score: {recall_score}')

In [None]:
# Same metrics as the overall score, reported separately for each question category.
for category in df['Category'].unique():
    print(f'Category: {category}')
    category_mask = df['Category'] == category
    df_category = df[category_mask]
    exact_match, f1_score, recall_score = total_score_csv(
        df_category['ModelAnswer'],
        df_category['Answer'],
    )
    print(f'Exact match: {exact_match}')
    print(f'F1 score: {f1_score}')
    print(f'Recall score: {recall_score}')
    # print('\n') emits a blank gap (two newlines) between categories.
    print('\n')