In [1]:
import os

import argparse
import json
import numpy as np
import backoff


import faiss
import torch
# from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate

from abc import abstractmethod
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

import openai
import tiktoken
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from langchain_core.documents import Document
from typing_extensions import List, TypedDict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class State(TypedDict):
    """Typed dictionary carrying the values exchanged between pipeline steps.

    Only ``input`` is read by code visible in this notebook (``retrieve`` uses
    it as the query text). How ``context`` and ``output`` get populated is not
    shown here.
    """
    context: str  # presumably the retrieved passage text -- TODO confirm against the step that writes it
    input: str  # query text; read by retrieve() as state["input"]
    output: str  # presumably the generated answer -- TODO confirm

In [2]:
def mean_pooling(tokenEmbeddings, paddingInfo):
    """Average token embeddings over the positions marked as real tokens.

    Args:
        tokenEmbeddings: Tensor indexed as (batch, token, feature).
        paddingInfo: Tensor indexed as (batch, token); non-zero marks a real
            token, zero marks padding.

    Returns:
        Tensor indexed as (batch, feature): for each row, the sum of the
        unmasked token embeddings divided by the number of unmasked tokens.
    """
    # Broadcast the mask across the feature axis and zero out padded positions.
    keep = paddingInfo.unsqueeze(-1).bool()
    summed = tokenEmbeddings.masked_fill(~keep, 0).sum(dim=1)
    # Number of real tokens per row, shaped to broadcast against `summed`.
    token_counts = paddingInfo.sum(dim=1).unsqueeze(-1)
    return summed / token_counts

def mean_pooling_embedding_with_normalization(batch_str, tokenizer, model):
    """Embed text as mean-pooled sentence vectors scaled to unit L2 norm.

    Args:
        batch_str: Text (a string or a list of strings) handed to ``tokenizer``.
        tokenizer: Callable that, given ``padding=True, truncation=True,
            return_tensors='pt'``, returns a mapping containing ``input_ids``
            and ``attention_mask`` tensors.
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose first output holds the per-token embeddings.

    Returns:
        Tensor with one row per input sequence; each row is the masked mean of
        the token embeddings divided by its L2 norm.
    """
    encoding = tokenizer(batch_str, padding=True, truncation=True, return_tensors='pt')

    # Send the inputs to wherever the model's weights live. Picking the device
    # from torch.cuda.is_available() alone breaks when CUDA exists but the
    # model was left on the CPU (device-mismatch error in the forward pass).
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        # Model exposes no parameters: fall back to the availability check.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask)
    sentenceEmbeddings = mean_pooling(outputs[0], attention_mask)
    # Divide every row by its own L2 norm so inner product equals cosine similarity.
    rowNorms = torch.linalg.norm(sentenceEmbeddings, dim=1)[..., None]
    return sentenceEmbeddings.divide(rowNorms)

In [3]:
# tiktoken encoding for gpt-3.5-turbo; num_tokens_by_tiktoken() uses it to count tokens.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
# client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# ircot_reason_instruction = 'You serve as an intelligent assistant, adept at facilitating users through complex, multi-hop reasoning across multiple documents. This task is illustrated through a demonstration consisting of a document set paired with a relevant question and its multi-hop reasoning thoughts, delineated by the string "Thought". Your task is to generate one thought for current step, DON\'T generate all thoughts at once! If you reach what you believe to be the final step, start with "So the answer is:".'

In [5]:
# def parse_prompt(file_path: str, has_context=True):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         content = file.read()

#     # Split the content by the metadata pattern
#     parts = content.split('# METADATA: ')
#     parsed_data = []
#     if has_context:
#         for part in parts[1:]:  # Skip the first split as it will be empty
#             metadata_section, rest_of_data = part.split('\n', 1)
#             metadata = json.loads(metadata_section)
#             document_sections = rest_of_data.strip().split('\n\nQ: ')
#             document_text = document_sections[0].strip()
#             qa_pair = document_sections[1].split('\nA: ')
#             question = qa_pair[0].strip()
#             thought_and_answer = qa_pair[1].strip().split('So the answer is: ')
#             thought = thought_and_answer[0].strip()
#             answer = thought_and_answer[1].strip()

#             parsed_data.append({
#                 'metadata': metadata,
#                 'document': document_text,
#                 'question': question,
#                 'thought_and_answer': qa_pair[1].strip(),
#                 'thought': thought,
#                 'answer': answer
#             })
#     else:
#         for part in parts[1:]:
#             metadata_section, rest_of_data = part.split('\n', 1)
#             metadata = json.loads(metadata_section)
#             s = rest_of_data.split('\n')
#             question = s[0][3:].strip()
#             thought_and_answer = s[1][3:].strip().split('So the answer is: ')
#             thought = thought_and_answer[0].strip()
#             answer = thought_and_answer[1].strip()

#             parsed_data.append({
#                 'metadata': metadata,
#                 'question': question,
#                 'thought_and_answer': s[1][3:].strip(),
#                 'thought': thought,
#                 'answer': answer
#             })

#     return parsed_data


def num_tokens_by_tiktoken(text: str):
    """Return how many tokens the module-level ``enc`` encoding produces for ``text``."""
    token_ids = enc.encode(text)
    return len(token_ids)

In [None]:
# TODO: what is the retriever in this case?
def retrieve_and_generate(idx, sample, dataset, top_k, k_list,max_steps, few_shot_samples, corpus, retriever, client, processed_ids):
    """Run retrieval (and generation) for one sample -- not implemented yet.

    The definition previously had no body, which made the cell a syntax error.
    It now fails loudly at call time until the logic is written.

    Expected result, as consumed by the evaluation loop in this notebook: a
    5-tuple ``(idx, recall, retrieved_passages, thoughts, it)`` where
    ``recall`` is indexable by every ``k`` in ``k_list`` and
    ``retrieved_passages`` supports ``len()``.

    Raises:
        NotImplementedError: Always, until the function is implemented.
    """
    raise NotImplementedError('retrieve_and_generate is not implemented yet')

In [None]:
def retrieve(state: State, tokenizer: AutoTokenizer, model: AutoModel):
    """Embed the query stored in ``state["input"]``.

    Args:
        state: Pipeline state; only ``state["input"]`` is read.
        tokenizer: Tokenizer forwarded to the embedding helper.
        model: Embedding model forwarded to the embedding helper.

    Returns:
        The normalised, mean-pooled embedding of the query text, computed
        without gradient tracking.
        NOTE(review): the similarity search over the index is not written yet;
        returning the embedding keeps the computed value reachable meanwhile.
    """
    with torch.no_grad():
        # The helper needs the tokenizer and model; calling it with the text
        # alone raised a TypeError for the missing positional arguments.
        query_embedding = mean_pooling_embedding_with_normalization(state["input"], tokenizer, model)
    return query_embedding

In [None]:
# Width of every embedding vector; sizes the vectors array and the FAISS index.
dim = 768
# Embeddings are normalised before the inner-product index is built. Maximal
# inner product over normalised embeddings is equivalent to cosine similarity.
norm = True

# User-inputted args
unit = "hippo"
dataset = 'musique'
max_steps = 1
num_demo = 0


model_label = 'facebook_contriever'

vector_path = f'data/{dataset}/{dataset}_{model_label}_{unit}_vectors_norm.npy'
index_path = f'data/{dataset}/{dataset}_{model_label}_{unit}_ip_norm.index'
# The cached vectors are loaded further down, where the cache check lives;
# loading them here as well only read the same file twice.

if dataset == 'musique':
    corpus_file = 'data/musique_corpus.json'
elif dataset == '2wikimultihopqa':
    corpus_file = 'data/2wikimultihopqa_corpus.json'
else:
    # Fail here instead of hitting a NameError on `corpus` a few lines later.
    raise NotImplementedError(f'Dataset {dataset} not implemented')

# Context manager so the file handle is closed once the JSON is parsed.
with open(corpus_file, 'r') as corpus_fp:
    corpus = json.load(corpus_fp)

# typing's List cannot be instantiated (List() raises TypeError), so build a
# plain list. Document.metadata must be a mapping, so the title is wrapped in
# a dict rather than passed as a bare string.
corpus_contents = [
    Document(page_content=item["text"], metadata={"title": item["title"]})
    for item in corpus
]
print('corpus size: {}'.format(len(corpus_contents)))

# Create one embedding per corpus entry using mean pooling, normalised to
# prepare for cosine-similarity (inner-product) indexing.
if os.path.isfile(vector_path):
    print('Loading existing vectors:', vector_path)
    vectors = np.load(vector_path)
    print('Vectors loaded:', len(vectors))

else:
    # load model
    tokenizer = AutoTokenizer.from_pretrained('facebook/contriever')
    model = AutoModel.from_pretrained('facebook/contriever')
    # CHANGE THIS FOR SERVER RUN: only a single device is used here.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # test batch size = 16 and batch size = 32
    batch_size = 16
    vectors = np.zeros((len(corpus_contents), dim))
    # Take batch_size entries at a time, tokenize them and embed them in
    # dim-dimensional space.
    for idx in range(0, len(corpus_contents), batch_size):
        end_idx = min(idx + batch_size, len(corpus_contents))
        # The tokenizer needs raw strings. Passing Document objects made every
        # batch raise, and the except branch below then silently stored zeros.
        seqs = [doc.page_content for doc in corpus_contents[idx:end_idx]]
        try:
            # Inference only: skip autograd bookkeeping to keep memory flat.
            with torch.no_grad():
                batch_embeddings = mean_pooling_embedding_with_normalization(seqs, tokenizer, model)
        except Exception as e:
            # Best effort: a failed batch is logged and stored as zero vectors.
            batch_embeddings = torch.zeros((len(seqs), dim))
            print(f'Error at {idx}:', e)
        vectors[idx:end_idx] = batch_embeddings.detach().to('cpu').numpy()
    print("Type of vectors is {}".format(type(vectors)))
    vector_dir = os.path.dirname(vector_path)
    if vector_dir:
        os.makedirs(vector_dir, exist_ok=True)
    with open(vector_path, 'wb') as fp:
        np.save(fp, vectors)
    print('vectors saved to {}'.format(vector_path))

# using FAISS on CPU (GPU support unavailable for mac)
# This check sits outside the branch above so that an index is also built when
# the vectors came from the cache but the index file is missing.
if os.path.isfile(index_path):
    print('index file already exists:', index_path)
    print('index size: {}'.format(faiss.read_index(index_path).ntotal))
else:
    print('Building index...')
    index = faiss.IndexFlatIP(dim)
    vectors = vectors.astype('float32')
    index.add(vectors)

    # save faiss index to file; write_index opens the path itself, so no
    # separate open() call is needed (the old one was never closed).
    faiss.write_index(index, index_path)
    print('index saved to {}'.format(index_path))
    print('index size: {}'.format(index.ntotal))

llm_model = 'gpt-3.5-turbo-1106'
llm = 'openai'

# User-inputted args
#
# For 2wikimultihopqa, change max_steps to 1, num_demo to 1 to perform multistep retrieval.
# For musique, change max_steps to 3, num_demo 1 to perform multistep retrieval.
max_steps = 1
num_demo = 1
top_k = 8
#load dataset
if dataset == 'musique':
    with open('data/musique.json', 'r') as data_fp:
        data = json.load(data_fp)
    with open('data/musique_corpus.json', 'r') as corpus_fp:
        corpus = json.load(corpus_fp)
    prompt_path = 'data/ircot_prompts/musique/gold_with_3_distractors_context_cot_qa_codex.txt'
    max_steps = max_steps if max_steps is not None else 4
elif dataset == '2wikimultihopqa':
    with open('data/2wikimultihopqa.json', 'r') as data_fp:
        data = json.load(data_fp)
    with open('data/2wikimultihopqa_corpus.json', 'r') as corpus_fp:
        corpus = json.load(corpus_fp)
    prompt_path = 'data/ircot_prompts/2wikimultihopqa/gold_with_3_distractors_context_cot_qa_codex.txt'
    max_steps = max_steps if max_steps is not None else 2
else:
    raise NotImplementedError(f'Dataset {dataset} not implemented')

# doc_ensemble = ''
# doc_ensemble_str = 'doc_ensemble' if doc_ensemble else 'no_ensemble'
doc_ensemble_str = ''
alt_model_label = 'facebook_contriever'
if max_steps > 1:
    k_list = [1, 2, 5, 8]
    output_path = f'output/ircot/{dataset}_{alt_model_label}_demo_{num_demo}_{llm_model}_{doc_ensemble_str}_step_{max_steps}_top_{top_k}.json'
else:  # only one step
    top_k = 100
    output_path = f'output/proposition_{dataset}_{alt_model_label}_{doc_ensemble_str}.json'
    k_list = [1, 2, 5, 10, 15, 20, 30, 50, 100]

# NOTE(review): these paths use the "proposition" unit while `unit` is set to
# "hippo" earlier in the notebook -- confirm this is the index that should be
# searched.
if dataset == 'musique':
    faiss_index = faiss.read_index('data/musique/musique_facebook_contriever_proposition_ip_norm.index')
else:
    faiss_index = faiss.read_index('data/2wikimultihopqa/2wikimultihopqa_facebook_contriever_proposition_ip_norm.index')

# From here on model_label is the Hugging Face model id, not the file-name label.
model_label = 'facebook/contriever'

results = data

load_dotenv('.env')
# print(os.getenv("OPENAI_API_KEY"))
#Create OpenAI Client
# client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

#examples for model
# few_shot_samples = parse_prompt(prompt_path)
# few_shot_samples = few_shot_samples[:num_demo]
# print('num of demo:', len(few_shot_samples))

# NOTE(review): this unconditionally replaces the k_list chosen in the
# max_steps branch above (including the single-step list that goes up to 100).
# Kept as-is to preserve the reported metrics; confirm which list is intended.
k_list = [1, 2, 5, 8]

# Initialised once, after k_list has its final value.
total_recall = {k: 0 for k in k_list}
processed_ids = set()

#requests per minute: 500
#tokens per minute: 10,000 

#few-shot prompting
# llm = ChatOpenAI(model="gpt-4o-mini", temperature = 0.0)
# examples = [{"input":f'{few_shot_samples["document"]}\n\nQuestion: {few_shot_samples["question"]}'}, {"output":f'Answer: {few_shot_samples["thought_and_answer"]}'}]


# Application Logic - multistep retrieval
# example_prompt = ChatPromptTemplate.from_messages(
#     [
#         ("human", "{input}"),
#         ("ai", "{output}"),
#     ]
# )
# few_shot_prompt = FewShotChatMessagePromptTemplate(
#     example_prompt=example_prompt,
#     examples=examples,
# )

# final_prompt = ChatPromptTemplate.from_messages(
#     [
#         ("system", "You are an assistant for question-answering tasks, adept at facilitating users through complex, multi-hop reasoning across multiple pieces of information. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise."),
#         few_shot_prompt,
#         ("human", "{input}"),
#     ]
# )

#application logic - single step retrieval


"""how to use:
chain = final_prompt | model
chain.invoke({"input": "What is 2 🦜 9?"})
"""

#tokenizer and embedding model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Tokenizers are not torch modules and have no .to() method, so only the
# model is moved to the device.
tokenizer = AutoTokenizer.from_pretrained(model_label)
model = AutoModel.from_pretrained(model_label).to(device)


# Evaluate every sample, keep running recall totals and attach the retrieval
# results to the matching entry of `results`.
for idx in range(len(data)):
    # NOTE(review): few_shot_samples, retriever and client are not defined by
    # any live code in this notebook (their setup is commented out), so this
    # call raises NameError until they are provided.
    #change this logic
    idx, recall, retrieved_passages, thoughts, it = retrieve_and_generate(idx, data[idx], dataset, top_k, k_list,max_steps, few_shot_samples, corpus, retriever, client, processed_ids) 
    # print metrics: running average of recall@k over the samples seen so far
    for k in k_list:
        total_recall[k] += recall[k]
        print(f'R@{k}: {total_recall[k] / (idx + 1):.4f} ', end='')
    print()
    if max_steps > 1:
        print('[ITERATION]', it, '[PASSAGE]', len(retrieved_passages), '[THOUGHT]', thoughts)
    
    # record results
    results[idx]['retrieved'] = retrieved_passages
    results[idx]['recall'] = recall
    results[idx]['thoughts'] = thoughts
        
    # if idx % 50 == 0:
    #     f = open(output_path, 'w')
    #     json.dump(results, f)
    #     f.close()

# save results
# Create the output directory if needed so the run does not fail at the very
# end, and use a context manager so the file is closed even if dumping fails.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(output_path, 'w') as f:
    json.dump(results, f)
print(f'Saved results to {output_path}')
for k in k_list:
    #average recall (across 1,000 questions for musique)
    print(f'R@{k}: {total_recall[k] / len(data):.4f} ', end='')