In [11]:
"""
This block of code is responsible for building the index for our text corpus. We use BERT for our embeddings model and tokenizer, 
and we use FAISS cosine similarity to index our normalized vectors 
"""
import os
# import sys

# from src.processing import mean_pooling, mean_pooling_embedding_with_normalization

# sys.path.append('.')

import argparse
import json
import numpy as np

import faiss
import torch
# from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

#either musique dataset or 2wikimultihopqa dataset
# if __name__ == '__main__':
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--dataset',type = str)

# args = parser.parse_args()

# ---- Indexing configuration -------------------------------------------------
# Width of every embedding row and of the FAISS index built from them.
# NOTE(review): presumably the hidden size of the encoder loaded further down - confirm if the model changes.
dim = 768
#normalize embeddings before building index using inner product. Note that maximal inner product with normalized embeddings is equivalent to cosine similarity 
norm = True
#CHANGE THIS ('musique' or '2wikimultihopqa')
dataset = 'musique'
model_label = 'facebook_contriever'
vector_path = f'data/{dataset}/{dataset}_{model_label}_proposition_vectors_norm.npy'
index_path = f'data/{dataset}/{dataset}_{model_label}_proposition_ip_norm.index'

# Cached vectors (if any) are loaded by the "load or embed" step further down,
# so nothing is read from vector_path here.

# Resolve the proposition corpus for the selected dataset. An unknown name fails
# right away instead of surfacing later as a NameError on `corpus`.
if dataset == 'musique':
    proposition_corpus_path = 'data/musique_proposition_corpus.json'
elif dataset == '2wikimultihopqa':
    proposition_corpus_path = 'data/2wikimultihopqa_proposition_corpus.json'
else:
    raise NotImplementedError(f'Dataset {dataset} not implemented')

# Context manager so the file handle is closed as soon as the JSON is parsed.
with open(proposition_corpus_path, 'r', encoding='utf-8') as corpus_file:
    corpus = json.load(corpus_file)

# One string per corpus entry: title, newline, propositions.
corpus_contents = [item['title'] + '\n' + item['propositions'] for item in corpus]
print('corpus size: {}'.format(len(corpus_contents)))

#create sentence-level embeddings using mean-pooling and normalize to prepare for cosine similarity indexing
#note: UPDATE TO USE distributedDataParallel

def mean_pooling(tokenEmbeddings, paddingInfo):
    """Average token embeddings over the sequence axis, ignoring masked-out positions.

    Args:
        tokenEmbeddings: tensor indexed as (batch, token, feature).
        paddingInfo: per-token mask indexed as (batch, token); zero entries are
            excluded from the average, non-zero entries are kept.

    Returns:
        Tensor with one pooled row per batch element.
    """
    # Positions whose mask entry is zero contribute nothing to the sum.
    is_padding = paddingInfo.unsqueeze(-1).bool().logical_not()
    summed = tokenEmbeddings.masked_fill(is_padding, 0).sum(dim=1)
    # Divide by the per-row mask total; the trailing axis lets it broadcast over features.
    token_counts = paddingInfo.sum(dim=1).unsqueeze(-1)
    return summed / token_counts

def mean_pooling_embedding_with_normalization(batch_str, tokenizer, model):
    """Embed a batch of strings as mean-pooled, L2-normalized sentence vectors.

    Args:
        batch_str: the strings to embed; passed unchanged to ``tokenizer``.
        tokenizer: callable invoked with ``padding=True, truncation=True,
            return_tensors='pt'``. Its result must support ``.to(device)`` and
            expose ``'input_ids'`` and ``'attention_mask'``.
        model: callable invoked as ``model(input_ids, attention_mask=...)``;
            element 0 of its output is used as the per-token embeddings.

    Returns:
        Tensor with one row per input string, each row divided by its L2 norm.
    """
    # Follow the device the model weights live on rather than assuming Apple MPS.
    # With a hard-coded "mps" device the call fails on machines without MPS (or
    # when the model was left on CPU).
    parameters = model.parameters() if hasattr(model, 'parameters') else iter(())
    first_parameter = next(iter(parameters), None)
    device = first_parameter.device if first_parameter is not None else torch.device('cpu')

    # A single .to(device) on the tokenizer output covers both tensors used below.
    encoding = tokenizer(batch_str, padding=True, truncation=True, return_tensors='pt').to(device)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    outputs = model(input_ids, attention_mask=attention_mask)
    sentenceEmbeddings = mean_pooling(outputs[0], attention_mask)
    # keepdim=True keeps a trailing axis so the norm broadcasts across features.
    sentenceEmbeddingsNorm = sentenceEmbeddings.divide(
        torch.linalg.norm(sentenceEmbeddings, dim=1, keepdim=True)
    )
    return sentenceEmbeddingsNorm

# ---- Stage 1: obtain the embedding matrix (reuse the cached file when present) ----
if os.path.isfile(vector_path):
    print('Loading existing vectors:', vector_path)
    vectors = np.load(vector_path)
    print('Vectors loaded:', len(vectors))

else:
    # load model
    tokenizer = AutoTokenizer.from_pretrained('facebook/contriever')
    model = AutoModel.from_pretrained('facebook/contriever')
    # Use the Apple MPS backend when it is usable; otherwise report why and leave
    # the model where from_pretrained put it.
    if not torch.backends.mps.is_available():
        if not torch.backends.mps.is_built():
            print("MPS not available because the current PyTorch install was not "
                "built with MPS enabled.")
        else:
            print("MPS not available because the current MacOS version is not 12.3+ "
                "and/or you do not have an MPS-enabled device on this machine.")
    else:
        mps_device = torch.device("mps")
        model.to(mps_device)
        # NOTE(review): DataParallel targets multi-GPU setups; presumably it is a
        # pass-through on an MPS-only machine - confirm before relying on it.
        model = torch.nn.DataParallel(model)
    #test batch size = 16 and batch size = 32 
    batch_size = 16
    vectors = np.zeros((len(corpus_contents), dim))
    batch_starts = range(0, len(corpus_contents), batch_size)
    failed_starts = []
    #get batch_size number of entries from corpus_contents, tokenize and embed them in `dim`-dimensional space
    for idx in batch_starts:
        end_idx = min(idx + batch_size, len(corpus_contents))
        seqs = corpus_contents[idx:end_idx]
        try:
            # Inference only: skip autograd bookkeeping for the forward pass.
            with torch.no_grad():
                batch_embeddings = mean_pooling_embedding_with_normalization(seqs, tokenizer, model)
        except Exception as e:
            # Best effort: a failed batch is stored as zero rows, and recorded so
            # the failure is summarised after the loop.
            batch_embeddings = torch.zeros((len(seqs), dim))
            failed_starts.append(idx)
            print(f'Error at {idx}:', e)
        vectors[idx:end_idx] = batch_embeddings.detach().to('cpu').numpy()

    if failed_starts:
        if len(failed_starts) == len(batch_starts):
            # Nothing was embedded: refuse to cache an all-zero matrix that the
            # next run would silently load from vector_path.
            raise RuntimeError(
                'Every embedding batch failed; not writing {}'.format(vector_path)
            )
        print('WARNING: {} of {} batches failed and are stored as zero vectors (start offsets: {})'.format(
            len(failed_starts), len(batch_starts), failed_starts))

    print("Type of vectors is {}".format(type(vectors)))
    os.makedirs(os.path.dirname(vector_path) or '.', exist_ok=True)
    with open(vector_path, 'wb') as fp:
        np.save(fp, vectors)
    print('vectors saved to {}'.format(vector_path))

# faiss takes float32 input; np.zeros / np.load above yield float64.
vectors = np.ascontiguousarray(vectors, dtype='float32')

# ---- Stage 2: obtain the FAISS index ----
# This runs whether the vectors were just computed or loaded from cache, and it
# always binds `index`, so later cells can search it.
#using FAISS on CPU (GPU support unavailable for mac)
if os.path.isfile(index_path):
    print('index file already exists:', index_path)
    index = faiss.read_index(index_path)
    print('index size: {}'.format(index.ntotal))
    if index.ntotal != len(vectors):
        print('WARNING: index holds {} vectors but {} were loaded; delete {} to rebuild it'.format(
            index.ntotal, len(vectors), index_path))
else:
    print('Building index...')
    index = faiss.IndexFlatIP(dim)
    index.add(vectors)

    # save faiss index to file
    os.makedirs(os.path.dirname(index_path) or '.', exist_ok=True)
    faiss.write_index(index, index_path)
    print('index saved to {}'.format(index_path))
    print('index size: {}'.format(index.ntotal))

corpus size: 11656
Type of vectors is <class 'numpy.ndarray'>
vectors saved to data/musique/musique_facebook_contriever_proposition_vectors_norm.npy
Building index...
index saved to data/musique/musique_facebook_contriever_proposition_ip_norm.index
index size: 11656


Sanity Check that Indexing Worked

In [12]:
# Sanity check: search the index with the first five corpus vectors, 4 neighbours each.
# I holds the neighbour ids and D the inner-product scores.
# Cast the query explicitly: faiss takes contiguous float32, while vectors that came
# from np.load / np.zeros are float64.
sanity_queries = np.ascontiguousarray(vectors[:5], dtype='float32')
D, I = index.search(sanity_queries, 4)
print(I)
print(D)

[[ 0  9 19  1]
 [ 1  5  2  7]
 [ 2  1  7 10]
 [ 3 10 17  7]
 [ 4 15 13 18]]
[[1.         0.5485658  0.47989315 0.45996296]
 [1.0000001  0.5450722  0.5391499  0.5123867 ]
 [0.9999999  0.5391499  0.4831297  0.46856448]
 [1.         0.8503537  0.83556557 0.7047421 ]
 [1.0000001  0.5722819  0.4969578  0.49316454]]


Run the following block if you want statistics on the approximate token count (estimated as characters / 4) of each entry in the corpus: maximum, minimum, and mean

In [8]:
# total_len = 0
# max_len = 0
# min_len = 1000000
# for line in corpus_contents:
#     total_len += len(line)
#     if len(line) > max_len:
#         max_len = len(line)
#     if len(line) < min_len:
#         min_len = len(line)
# print(max_len / 4)
# print(min_len / 4)
# print((total_len / len(corpus_contents)) / 4)    

525.25
31.25
149.56335792724778


This block of code is responsible for evaluating our RAG system's retrieval on our two corpora

In [23]:
# ---- Retrieval-evaluation configuration ----
llm_model = 'gpt-3.5-turbo-1106'
llm = 'openai'
#how does the number of steps play a role?
max_steps = 1
#What does the number of documents in the demonstration mean?
num_demo = 0
#retrieving 8 documents at each step
top_k = 8
#what is parallel processing? how does the number of threads play a role?
thread = 6

#load dataset
# Resolve the per-dataset file locations first; the files are read once, below.
if dataset == 'musique':
    eval_data_path = 'data/musique.json'
    eval_corpus_path = 'data/musique_corpus.json'
    prompt_path = 'data/ircot_prompts/musique/gold_with_3_distractors_context_cot_qa_codex.txt'
    # Per-dataset fallback, used only when max_steps is None.
    max_steps = max_steps if max_steps is not None else 4
elif dataset == '2wikimultihopqa':
    eval_data_path = 'data/2wikimultihopqa.json'
    eval_corpus_path = 'data/2wikimultihopqa_corpus.json'
    prompt_path = 'data/ircot_prompts/2wikimultihopqa/gold_with_3_distractors_context_cot_qa_codex.txt'
    max_steps = max_steps if max_steps is not None else 2
else:
    raise NotImplementedError(f'Dataset {dataset} not implemented')

# Context managers close both files as soon as the JSON is parsed.
with open(eval_data_path, 'r', encoding='utf-8') as data_file:
    data = json.load(data_file)
# NOTE: this rebinds `corpus` to the contents of the *_corpus.json file.
with open(eval_corpus_path, 'r', encoding='utf-8') as corpus_file:
    corpus = json.load(corpus_file)


In [20]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
# Read a local .env file (if any) into the process environment before the key lookup.
load_dotenv()

# Chat client used for generation: temperature 0.0, up to 5 retries, 60 s timeout per request.
client = ChatOpenAI(
    model=llm_model,
    api_key=os.environ.get("OPENAI_API_KEY"),
    temperature=0.0,
    timeout=60,
    max_retries=5,
)

In [24]:
def parse_prompt(file_path: str, has_context=True):
    """Parse a few-shot prompt file into a list of example dicts.

    The file is split on the literal ``'# METADATA: '``; whatever precedes the
    first marker is ignored. For each remaining chunk, the first line is decoded
    as JSON metadata and the rest is read in one of two layouts:

    * ``has_context=True``: document text, then ``'\\n\\nQ: '`` + question,
      then ``'\\nA: '`` + reasoning.
    * ``has_context=False``: first line is the question and second line the
      reasoning, each with its first three characters dropped.

    The reasoning is split on ``'So the answer is: '`` into thought and answer.

    Args:
        file_path: path of the prompt file (read as UTF-8).
        has_context: selects the layout described above.

    Returns:
        A list of dicts with keys ``metadata``, ``question``,
        ``thought_and_answer``, ``thought``, ``answer``, plus ``document`` when
        ``has_context`` is true.
    """
    answer_marker = 'So the answer is: '

    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    # Element 0 is the text before the first metadata marker, so it is dropped.
    chunks = raw_text.split('# METADATA: ')[1:]

    examples = []
    for chunk in chunks:
        header_line, body = chunk.split('\n', 1)
        example = {'metadata': json.loads(header_line)}

        if has_context:
            sections = body.strip().split('\n\nQ: ')
            example['document'] = sections[0].strip()
            question_and_reasoning = sections[1].split('\nA: ')
            example['question'] = question_and_reasoning[0].strip()
            reasoning = question_and_reasoning[1].strip()
        else:
            body_lines = body.split('\n')
            # [3:] drops the three-character line prefix (e.g. "Q: " / "A: ").
            example['question'] = body_lines[0][3:].strip()
            reasoning = body_lines[1][3:].strip()

        thought_then_answer = reasoning.split(answer_marker)
        example['thought_and_answer'] = reasoning
        example['thought'] = thought_then_answer[0].strip()
        example['answer'] = thought_then_answer[1].strip()
        examples.append(example)

    return examples
# Parse every demonstration in the prompt file, then keep only the first `num_demo` of them.
few_shot_samples = parse_prompt(prompt_path)[:num_demo]
print('num of demo:', len(few_shot_samples))

num of demo: 0
