In [1]:
import getpass
import os

# Make sure the OpenAI key is available to the langchain_openai clients below.
# An already-exported key is reused; otherwise it is requested interactively so
# the secret never has to be written into the notebook itself.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [3]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [4]:
from langchain_openai import ChatOpenAI

# Chat model used as the answering step of the RAG chain built later in this notebook.
llm = ChatOpenAI(model="gpt-4-turbo")

In [5]:
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_text_splitters import CharacterTextSplitter
import json

In [6]:
from langchain_core.prompts import PromptTemplate

# Prompt text for the multiple-choice task. `{context}` receives the retrieved
# novel snippets and `{question}` receives the question together with its
# lettered options; the model is told to reply with the option letter only.
template = (
    "You are a literature professor. I will provide you with snippets from a novel "
    "along with a question and corresponding choices pertaining to it. "
    "Please thoroughly analyze the content to accurately respond to the question."
    "\n\n"
    "Relevant snippets from the novel: \n\n"
    "{context}\n\n"
    "Question: \n\n"
    "{question}\n\n"
    "Only respond with the index of the correct answer (e.g., choose between A, B, C, and D). "
    "Your output should not contain anything else."
)

# Wrap the raw template string in a PromptTemplate so it can be piped into the
# chain; its placeholders are `context` and `question`.
custom_rag_prompt = PromptTemplate.from_template(template)

In [None]:
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in order, separated by one blank line;
        an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

# Answer every NovelQA multiple-choice question with retrieval-augmented
# generation: each novel is chunked and embedded into a temporary Chroma
# collection, the top-10 chunks are retrieved per question, and the first
# character of the LLM reply is recorded as the chosen option. Results are
# written to res_mc_gpt4.json as {novel title: [answer letters]}.
folder_path = "./NovelQA/Raw_Novels"
question_dir = "./NovelQA/Data"
final_output = {}


def _build_question_text(question, title):
    """Render one question record as the text sent to the retriever and the LLM.

    Args:
        question: Mapping with a 'Question' string and an 'Options' sequence
            of answer strings.
        title: Novel title (the file's base name) placed before the question.

    Returns:
        The title line, the question, and the options labelled A, B, C, ...
        in that order, separated by newlines.
    """
    labelled_options = '\n'.join(
        f'{chr(65 + option_idx)}. {option}'
        for option_idx, option in enumerate(question['Options'])
    )
    title_line = '\n The title of the novel is: ' + title + '\n'
    return title_line + question['Question'] + '\n' + labelled_options


def _extract_choice(response):
    """Return the first non-whitespace character of the reply.

    An empty or whitespace-only reply yields '' instead of raising IndexError,
    so the answer list stays aligned with the question list.
    """
    return response.strip()[:1]


# Loop-invariant objects are created once instead of once per novel.
embedding = OpenAIEmbeddings(model="text-embedding-3-large")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0, add_start_index=True)

# Sorted so the processing order does not depend on the filesystem; the
# progress counter below counts only the .txt novels.
novel_filenames = sorted(name for name in os.listdir(folder_path) if name.endswith(".txt"))

for novel_idx, filename in enumerate(novel_filenames):
    print(novel_idx, len(novel_filenames), filename)

    # The question file shares the novel's base name, with a .json extension.
    base_name = os.path.splitext(filename)[0]
    question_path = os.path.join(question_dir, base_name + '.json')

    # Read the questions before embedding so that a missing or malformed
    # question file fails before any embedding calls are made, and so the
    # file handle is not held open during the LLM calls.
    with open(question_path) as file:
        questions = json.load(file)

    file_path = os.path.join(folder_path, filename)
    documents = TextLoader(file_path).load()

    # split it into chunks
    splits = text_splitter.split_documents(documents)
    vectorstore = Chroma.from_documents(documents=splits, embedding=embedding)
    try:
        retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})
        # The chain depends only on the retriever, so build it once per novel.
        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | custom_rag_prompt
            | llm
            | StrOutputParser()
        )

        answers = []
        for question_idx, question in enumerate(questions):
            print(question_idx, len(questions))
            response = rag_chain.invoke(_build_question_text(question, base_name))
            answers.append(_extract_choice(response))
    finally:
        # Always drop the collection, even when a call above fails.
        # NOTE(review): no collection_name is passed, so each store presumably
        # lands in the same default collection; a leftover one would mix this
        # novel's chunks into the next novel's retrieval — confirm.
        vectorstore.delete_collection()

    final_output[base_name] = answers
    print(base_name, answers)

with open('res_mc_gpt4.json', 'w') as json_file:
    json.dump(final_output, json_file, indent=4)

In [9]:
# Manual cleanup: drop the collection behind the current `vectorstore`.
# NOTE(review): the processing loop already calls delete_collection() after
# each novel, so this cell looks necessary only when that loop was interrupted
# part-way through a novel — confirm before running it after a clean run.
vectorstore.delete_collection()