## Retrieval Augmented Generation

## Setup the functions for prompting

In [1]:
import openai
import os, bibtexparser, pypdf, logging
import chromadb
import json
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

from bert_score import BERTScorer
scorer = BERTScorer(model_type='bert-base-uncased')

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
def prompt_model(prompt):
    completion = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        store=True,
        messages=[
            {"role": "system", 'content': "You are a helpful assistant"},
            {"role": "user", 'content': prompt}
        ])

    model_response = completion.choices[0].message['content']
    tokens_used = completion['usage']['total_tokens']

    return model_response, tokens_used

## Parse data from source

In [23]:
logging.getLogger("pypdf").setLevel(logging.CRITICAL)

data_path = 'data/'
data = {}

files = os.listdir(data_path)
print('Reading %i files:' % len(files))
for f in files:
    path = os.path.join(data_path, f)

    # each datum will have at least these attributes
    d = {'filepath': None, 'title': None, 'text': None}

    # parse bibtex file, if exists
    if path.endswith('.bib'):
        if path[:-4] in data:
            d = data[path[:-4]]

        bib = bibtexparser.load(open(path, 'r'))
        if 'title' in bib.entries[0]:
            d['title'] = bib.entries[0]['title']
            data[path[:-4]] = d

    # parse pdf text, if exists
    if path.endswith('.pdf'):
        if path[:-4] in data:
            d = data[path[:-4]]

        print('  File: %s' % f)
        text = ''
        reader = pypdf.PdfReader(path)
        for page in reader.pages:
            text += page.extract_text()
        d['filepath'] = path
        d['text'] = text
        data[path[:-4]] = d

data = [d for d in data.values()]

Reading 78 files:
  File: 2023.findings-emnlp.620.pdf
  File: 29728-Article Text-33782-1-2-20240324-3.pdf
  File: 2024.acl-long.642.pdf
  File: 2021.findings-emnlp.320.pdf
  File: 2020.coling-main.207.pdf
  File: 2202.01110v2.pdf
  File: 2212.14024v2.pdf
  File: 2024.emnlp-industry.66.pdf
  File: 8917_Retrieval_meets_Long_Cont.pdf
  File: NeurIPS-2023-lift-yourself-up-retrieval-augmented-text-generation-with-self-memory.pdf
  File: NeurIPS-2023-leandojo-theorem-proving-with-retrieval-augmented-language-models.pdf
  File: NeurIPS-2020-retrieval-augmented-generation-for-knowledge-intensive-nlp-tasks.pdf
  File: 2023.acl-long.557.pdf
  File: tacl_a_00605.pdf
  File: 3637870.pdf
  File: 2023.emnlp-main.495.pdf
  File: 3626772.3657834.pdf
  File: 2402.19473v6.pdf
  File: 3626772.3657957.pdf
  File: 2024.eacl-demo.16.pdf
  File: 967_generate_rather_than_retrieve_.pdf
  File: 23-0037.pdf
  File: 2022.naacl-main.191.pdf
  File: 2312.10997v5.pdf
  File: 947_Augmented_Language_Models_.pdf


## Pre-process the data

In [24]:
# # Saving each paper's extracted text to a text file for manual cleaning
# for d in data:
#     text_filename = f"{d['filepath'][:-4]}.txt" 
#     with open(text_filename, 'w') as f:
#         f.write(d['text'])
#     print(f"Saved extracted text for {d['filepath']} to {text_filename}")

In [3]:
data_path = 'data/'
data = {}

files = os.listdir(data_path)
print(f'Reading {len(files)} files:')

for f in files:
    path = os.path.join(data_path, f)

    d = {'filepath': None, 'title': None, 'text': None}

    # Only process '.txt' files
    if path.endswith('.txt'):
        print(f'  File: {f}')
        
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read()

        d['filepath'] = path
        d['text'] = text
        d['title'] = f[:-4] 

        data[path[:-4]] = d

data = [d for d in data.values()]

print(f'Loaded {len(data)} text files.')

Reading 78 files:
  File: 2212.14024v2.txt
  File: 2024.emnlp-industry.66.txt
  File: 8917_Retrieval_meets_Long_Cont.txt
  File: 2202.01110v2.txt
  File: 2020.coling-main.207.txt
  File: NeurIPS-2020-retrieval-augmented-generation-for-knowledge-intensive-nlp-tasks.txt
  File: NeurIPS-2023-lift-yourself-up-retrieval-augmented-text-generation-with-self-memory.txt
  File: NeurIPS-2023-leandojo-theorem-proving-with-retrieval-augmented-language-models.txt
  File: 29728-Article Text-33782-1-2-20240324-3.txt
  File: 2023.findings-emnlp.620.txt
  File: 2024.acl-long.642.txt
  File: 2021.findings-emnlp.320.txt
  File: 23-0037.txt
  File: 2022.naacl-main.191.txt
  File: 2312.10997v5.txt
  File: 967_generate_rather_than_retrieve_.txt
  File: 947_Augmented_Language_Models_.txt
  File: 2023.emnlp-main.495.txt
  File: 2023.acl-long.557.txt
  File: tacl_a_00605.txt
  File: 3637870.txt
  File: 3626772.3657957.txt
  File: 2024.eacl-demo.16.txt
  File: 3626772.3657834.txt
  File: 2402.19473v6.txt
Loaded

## Chunk data and generate indices

In [4]:
def generate_questions_for_chunk(chunk):
    prompt = f"Based on the following text, generate meaningful questions that can be answered using the content of the text:\n\n{chunk}\n\nPlease generate questions related to the content of the text."
    questions, tokens_used = prompt_model(prompt)
    return questions, tokens_used

In [27]:
chunk_question_mapping = {}
total_tokens = 0

for entry in data:
    text_chunk = entry['text'][:1000]
    questions, tokens_used = generate_questions_for_chunk(text_chunk)
    chunk_id = f"{entry['title']}_chunk_{len(chunk_question_mapping)}"
    
    chunk_question_mapping[chunk_id] = {
        'chunk': text_chunk,
        'questions': questions.split('\n'),  
    }
    print(f"Questions for {entry['title']}: {questions}")
    total_tokens += tokens_used    

with open('chunk_question_mapping.json', 'w') as f:
    json.dump(chunk_question_mapping, f, indent=4)

Questions for 2212.14024v2: 1. What is the main focus of the research presented in the abstract?
2. How does the proposed DSP framework differ from existing retrieval-augmented in-context learning methods?
3. What components are involved in the DSP framework?
4. What tasks does the DSP framework aim to address?
5. What are the advantages of using the DEMONSTRATE – SEARCH – PREDICT (DSP) framework for knowledge-intensive tasks?
6. How does the DSP framework break down problems for the language model and retrieval model?
7. What types of questions have the novel DSP programs been written to answer?
8. What is the significance of the early evaluations mentioned in the abstract?
9. What are the key features of "retrieve-then-read" pipelines as referenced in the text?
10. What does the text imply about the relationship between language models and retrieval models in the context of the DSP framework?
Questions for 2024.emnlp-industry.66: 1. What is Retrieval Augmented Generation (RAG) and ho

## Build the vector database

In [5]:
client = chromadb.Client(Settings(persist_directory="./chromadb"))
model = SentenceTransformer('all-MiniLM-L6-v2')
collection = client.get_or_create_collection("questions_data")

with open('chunk_question_mapping.json', 'r') as f:
    chunk_question_mapping = json.load(f)

questions = []
question_ids = []
metadatas = []

for chunk_id, data in chunk_question_mapping.items():
    for i, question in enumerate(data['questions']):
        question_id = f"{chunk_id}_question_{i}"
        
        questions.append(question)
        question_ids.append(question_id)
        metadatas.append({"chunk_id": chunk_id})

question_embeddings = model.encode(questions)

collection.add(
    documents=questions,
    embeddings=question_embeddings,
    metadatas=metadatas,
    ids=question_ids
)

print("Indexing complete.")

Indexing complete.


In [11]:
def query_vector_db(query, collection, model, chunk_question_mapping, top_k=3):
    query_embedding = model.encode(query)
    
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )
    
    retrieved_chunks = []
    for metadata in results['metadatas'][0]:
        chunk_id = metadata['chunk_id']
        
        if chunk_id in chunk_question_mapping:
            retrieved_chunks.append(chunk_question_mapping[chunk_id]['chunk'])
        else:
            print(f"Warning: {chunk_id} not found in chunk_question_mapping.")
    
    return retrieved_chunks

In [32]:
def generate_answer(query, chunks):
    context = "\n\n".join(chunks)
    prompt = f"Answer the following query based on the provided context. Use exact sentences from the context. If the answer is not in the context, respond with 'IDK'.\n\nQuery: {query}\n\nContext: {context}"
    answer, tokens_used = prompt_model(prompt)
    return answer, tokens_used

## Conduct experiments to evaluate user queries

In [41]:
with open('test-queries.json', 'r') as f:
    test_questions = json.load(f)

test_queries = [q['query'] for q in test_questions]
test_references = [q['answer'] for q in test_questions]

In [34]:
with open('dev-questions.json', 'r') as f:
    dev_questions = json.load(f)

dev_queries = [q['query'] for q in dev_questions]
dev_references = [q['answer'] for q in dev_questions]

In [35]:
test_rag_answers = []

for query in test_queries:
    retrieved_chunks = query_vector_db(query, collection, model, chunk_question_mapping)
    answer, tokens_used = generate_answer(query, retrieved_chunks)
    
    test_rag_answers.append(answer)

In [36]:
dev_rag_answers = []

for query in dev_queries:
    retrieved_chunks = query_vector_db(query, collection, model, chunk_question_mapping)
    answer, tokens_used = generate_answer(query, retrieved_chunks)
    
    dev_rag_answers.append(answer)

In [37]:
test_rag_answers

['LeanDojo is "an open-source Lean playground consisting of toolkits, data, models, and benchmarks."',
 'The primary contribution of the paper on Retrieval-Augmented Language Modeling (RALM) is that it "considers a simple alternative, which we dub In-Context RALM: leaving the LM architecture unchanged and prepending grounding documents to the input, without any further training of the LM."',
 'AIGC still faces hurdles such as updating knowledge, handling long-tail data, mitigating data leakage, and managing high training and inference costs.',
 'LeanDojo helps with theorem proving by "removing these barriers by introducing LeanDojo: an open-source Lean playground consisting of toolkits, data, models, and benchmarks." It "extracts data from Lean and enables interaction with the proof environment programmatically." LeanDojo "contains fine-grained annotations of premises in proofs, providing valuable data for premise selection—a key bottleneck in theorem proving." Using this data, "we dev

In [38]:
dev_rag_answers

['Retrieval-Augmented Generation (RAG) has recently emerged as a method to extend beyond the pre-trained knowledge of Large Language Models by augmenting the original prompt with relevant passages or documents retrieved by an Information Retrieval (IR) system. RAG has become increasingly important for Generative AI solutions, especially in enterprise settings or in any domain in which knowledge is constantly refreshed and cannot be memorized in the LLM.',
 'IDK',
 'The different metrics for evaluating a RAG system include "the ability of the retrieval system to identify relevant and focused context passages, the ability of the LLM to exploit such passages in a faithful way, and the quality of the generation itself." Additionally, with RAGA S, "we put forward a suite of metrics which can be used to evaluate these different dimensions without having to rely on ground truth human annotations."',
 'RAG has become increasingly important for Generative AI solutions, especially in enterprise 

In [30]:
# Test Queries

P, R, F1 = scorer.score(test_rag_answers, test_references)

print(f"Precision: {P.mean():.2f}")
print(f"Recall: {R.mean():.2f}")
print(f"F1 Score: {F1.mean():.2f}")

Precision: 0.84
Recall: 0.92
F1 Score: 0.87


In [40]:
# Dev Queries

P, R, F1 = scorer.score(dev_rag_answers, dev_references)

print(f"Precision: {P.mean():.2f}")
print(f"Recall: {R.mean():.2f}")
print(f"F1 Score: {F1.mean():.2f}")

Precision: 0.73
Recall: 0.75
F1 Score: 0.74
