# Create a Research Agent


This is a concept agent inspired by my own process of researching a subject. <br>
I also took some inspiration from the BabyAGI (without tools) implementation.


In [6]:
from dotenv import load_dotenv

load_dotenv("./.env")
import uuid
import os
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from yachalk import chalk

In [7]:
# Name of the OpenAI embedding model handed to the embeddings client.
text_embedding_model = "text-embedding-ada-002"
# Shared embeddings client; the vector stores defined later in this file
# all receive this same object.
embeddings = OpenAIEmbeddings(model=text_embedding_model)

## Set up the vector store


In [8]:
# Toggle: True -> local pgvector Postgres, False -> the Supabase-hosted one.
use_localdb = True

# Credentials for the hosted database. All three variables must be set in the
# environment, even when the local database is the one selected.
SUPABASE_PASSWORD = os.environ["SUPABASE_PASSWORD"]
SUPABASE_DBUSER = os.environ["SUPABASE_DBUSER"]
SUPABASE_DATABASE = os.environ["SUPABASE_DATABASE"]
supabasedb_string = (
    f"postgresql://{SUPABASE_DBUSER}:{SUPABASE_PASSWORD}"
    f"@db.doxggeyqopdnxfhseufq.supabase.co:5432/{SUPABASE_DATABASE}"
)

# Credentials for the local database; likewise always required.
PGVECTOR_USER = os.environ["PGVECTOR_USER"]
PGVECTOR_PASSWORD = os.environ["PGVECTOR_PASSWORD"]
PGVECTOR_DATABASE = os.environ["PGVECTOR_DATABASE"]
localdb_string = (
    f"postgresql://{PGVECTOR_USER}:{PGVECTOR_PASSWORD}"
    f"@localhost:5432/{PGVECTOR_DATABASE}"
)

# The connection string the vector stores will actually use.
if use_localdb:
    connection_string = localdb_string
else:
    connection_string = supabasedb_string

In [9]:
## Law store
from langchain.vectorstores import PGVector

# Store over the "law_compilation" collection, reached through the selected
# `connection_string` and embedded with the shared `embeddings` client.
law_compilation_store = PGVector(
    collection_name="law_compilation",
    connection_string=connection_string,
    embedding_function=embeddings,
)

## Mahabharata Store
# Same connection and embeddings, pointed at a different collection.
# NOTE(review): the collection name is spelled "mahabharat_..." (no trailing
# "a"); presumably this matches the existing collection - verify before renaming.
mahabharata_store = PGVector(
    collection_name="mahabharat_combined_text",
    connection_string=connection_string,
    embedding_function=embeddings,
)

### Supabase vector store for storing runs

The supabase client here is not used as a vector store.
I am only using it to save the runs data.
You can remove it if you don't need it.


In [10]:
from supabase.client import Client, create_client
from langchain.vectorstores import SupabaseVectorStore

# `os.environ.get` returns None for an unset variable, so a missing
# SUPABASE_URL / SUPABASE_SERVICE_KEY is passed on to `create_client` as None
# instead of raising a KeyError on these lines.
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# Store bound to the "runs" table, using the "match_runs" query name and the
# shared `embeddings` client.
runs_store = SupabaseVectorStore(
    embedding=embeddings, client=supabase, table_name="runs", query_name="match_runs"
)

# ## Testing store
# run_id = str(uuid.uuid4())
# runs_store.add_texts(texts=["testing the store"], metadatas=[{"key": "value"}], ids=[run_id])
# matched_docs = runs_store.similarity_search_with_relevance_scores("testing store", 1)
# matched_docs

### Import Chains


#### Question creator

Generate new questions based on

-   `question` - Original question. This is important so that the pertinence to the original question is always maintained. Or else the context can diverge quickly into impertinent results.
-   `previous_questions`: The questions already on the list, so that the new questions do not overlap with the old ones.
-   `context`: The answer to the last question.
-   `num_questions`: Configurable hyper parameter.
-   `start_id`: This is passed so that the ids of the newly generated questions do not overlap with those already in the current list.

---

#### Most pertinent Question chain

Pick the most pertinent question out of the given list of questions. <br>
I am not using any additional context other than the `original_question` for deciding the pertinence.

---

#### Retrieval QA

This chain is used to answer the intermediate questions. The idea is to generate succinct answers which can be used as notes to finally answer the original question

---

#### Result Analyser

Not using this right now. I am not able to get this piece working well. So currently I will just run the agent for a fixed number of iterations and then compile the answer.


In [11]:
## Import all the chains.
from chains_v2.create_questions import QuestionCreationChain
from chains_v2.most_pertinent_question import MostPertinentQuestion
from chains_v2.retrieval_qa import retrieval_qa
from chains_v2.research_compiler import research_compiler
from chains_v2.question_atomizer import QuestionAtomizer
from chains_v2.refine_answer import RefineAnswer

## Model with parameters
gpt3t = "gpt-3.5-turbo"


def language_model(
    model_name: str = gpt3t, temperature: float = 0, verbose: bool = False
):
    """Build a ``ChatOpenAI`` model with the given name, temperature and verbosity."""
    return ChatOpenAI(
        model_name=model_name,
        temperature=temperature,
        verbose=verbose,
    )

### Importing Question helpers


In [17]:
from helpers.response_helpers import result2QuestionsList
from helpers.response_helpers import qStr2Dict
from helpers.questions_helper import getAnsweredQuestions
from helpers.questions_helper import getUnansweredQuestions
from helpers.questions_helper import getSubQuestions
from helpers.questions_helper import getHopQuestions
from helpers.questions_helper import getLastQuestionId
from helpers.questions_helper import markAnswered
from helpers.questions_helper import getQuestionById

In [18]:
## Define log printers


def print_iteration(current_iteration):
    """Print a highlighted banner announcing the given iteration number."""
    banner = f"\n   Iteration - {current_iteration}  ▷▶  \n"
    print(chalk.bg_yellow_bright.black.bold(banner))


def print_unanswered_questions(unanswered):
    """Print a heading followed by each unanswered question as ``'id. question'``."""
    heading = chalk.cyan_bright("** Unanswered Questions **")
    listing = "".join(f"\n'{item['id']}. {item['question']}'" for item in unanswered)
    print(heading, chalk.cyan(listing))


def print_next_question(current_question_id, current_question):
    """Print the id and text of the question chosen for the current iteration."""
    heading = chalk.magenta.bold("** 🤔 Next Questions I must ask: **\n")
    shown_id = chalk.magenta(current_question_id)
    shown_text = chalk.magenta(current_question["question"])
    print(heading, shown_id, shown_text)


def print_answer(current_question):
    """Print the ``answer`` field of the given question under an "Answer" heading."""
    heading = chalk.yellow_bright.bold("** Answer **\n")
    body = chalk.yellow_bright(current_question["answer"])
    print(heading, body)


def print_final_answer(answerpad):
    """Print the most recent entry (last element) of ``answerpad``."""
    heading = chalk.white("** Answer **\n")
    latest = chalk.white(answerpad[-1])
    print(heading, latest)


def print_max_iterations():
    """Print the banner shown when the iteration limit has been reached."""
    notice = "\n ✔✔  Max Iterations Reached. Compiling the results ...\n"
    print(chalk.bg_yellow_bright.black.bold(notice))


def print_result(result):
    """Print the ``text`` field of ``result`` in italic bright white."""
    styled = chalk.italic.white_bright(result["text"])
    print(styled)


def print_sub_question(q):
    """Print a sub-question together with its answer."""
    block = f"** Sub Question **\n{q['question']}\n{q['answer']}\n"
    print(chalk.magenta.bold(block))

# The Research Agent


Question Data Schema

```
  Question: {
    id: int,
    question: string,
    type: 'subquestion' | 'hop',
    status: 'answered' | 'unanswered',
    answer: string,
    documents: []
  }
```


In [34]:
## ---- The researcher ----- ##


class Agent:
    """Iterative research agent that answers a question from a vector store.

    The agent first splits the original question into sub-questions and
    answers each of them. It then loops: generate follow-up questions, pick
    the most pertinent unanswered one, answer it from the store, and refine
    the running final answer with that new information.

    Args:
        agent_settings: Mapping of hyper-parameters. Keys read by this class:
            ``question_creation_temperature``, ``question_atomizer_temperature``,
            ``refine_answer_temperature``, ``qa_temperature``,
            ``num_atomistic_questions``, ``num_questions_per_iteration``,
            ``intermediate_answers_length``, ``max_iterations`` and, optionally,
            ``question_prioritisation_temperature``.
        scratchpad: Mutable dict with the list-valued keys ``"questions"`` and
            ``"answerpad"``. It is modified in place while the agent runs.
        store: Vector store; only its ``as_retriever`` method is used here.
        verbose: Forwarded to the chains and language models. Defaults to
            ``False`` so the agent can be built without specifying it.
    """

    ## Create chains
    def __init__(self, agent_settings, scratchpad, store, verbose=False):
        self.store = store
        self.scratchpad = scratchpad
        self.agent_settings = agent_settings
        self.verbose = verbose
        self.question_creation_chain = QuestionCreationChain.from_llm(
            language_model(
                temperature=self.agent_settings["question_creation_temperature"]
            ),
            verbose=self.verbose,
        )
        self.question_atomizer = QuestionAtomizer.from_llm(
            llm=language_model(
                temperature=self.agent_settings["question_atomizer_temperature"]
            ),
            verbose=self.verbose,
        )
        # Prioritisation has its own temperature setting. Fall back to the
        # question-creation temperature (the value used previously) so that
        # settings dicts without the dedicated key keep working.
        self.most_pertinent_question = MostPertinentQuestion.from_llm(
            language_model(
                temperature=self.agent_settings.get(
                    "question_prioritisation_temperature",
                    self.agent_settings["question_creation_temperature"],
                )
            ),
            verbose=self.verbose,
        )
        self.refine_answer = RefineAnswer.from_llm(
            language_model(
                temperature=self.agent_settings["refine_answer_temperature"]
            ),
            verbose=self.verbose,
        )

    def _answer_question(self, question_text):
        """Answer one question from the store via ``retrieval_qa``.

        Returns the result of ``retrieval_qa`` unchanged; callers unpack it
        as ``(answer, documents)``.
        """
        return retrieval_qa(
            llm=language_model(
                temperature=self.agent_settings["qa_temperature"],
                verbose=self.verbose,
            ),
            retriever=self.store.as_retriever(
                search_type="mmr", search_kwargs={"k": 5, "fetch_k": 10}
            ),
            question=question_text,
            answer_length=self.agent_settings["intermediate_answers_length"],
            verbose=self.verbose,
        )

    def run(self, question):
        """Research ``question`` for ``max_iterations`` rounds.

        Results are written to the scratchpad: every generated question (with
        its answer and documents) is appended to ``scratchpad["questions"]``
        and each refined answer to ``scratchpad["answerpad"]``. Progress is
        printed along the way. Nothing is returned.
        """
        ## Step 0. Prepare the initial set of questions
        atomized_questions_response = self.question_atomizer.run(
            question=question,
            num_questions=self.agent_settings["num_atomistic_questions"],
        )

        self.scratchpad["questions"] += result2QuestionsList(
            question_response=atomized_questions_response,
            type="subquestion",
            status="unanswered",
        )

        # Answer everything currently on the scratchpad up front.
        for q in self.scratchpad["questions"]:
            q["answer"], q["documents"] = self._answer_question(q["question"])
            q["status"] = "answered"
            print_sub_question(q)

        # Initial context: all questions answered so far, with their answers.
        current_context = "".join(
            f"\n{q['id']}. {q['question']}\n{q['answer']}\n"
            for q in self.scratchpad["questions"]
        )

        current_iteration = 0

        while True:
            current_iteration += 1
            print_iteration(current_iteration)

            # STEP 1: create questions
            start_id = getLastQuestionId(self.scratchpad["questions"]) + 1
            questions_response = self.question_creation_chain.run(
                question=question,
                context=current_context,
                # A list of newline-prefixed question strings.
                previous_questions=[
                    f"\n{q['question']}" for q in self.scratchpad["questions"]
                ],
                num_questions=self.agent_settings["num_questions_per_iteration"],
                start_id=start_id,
            )
            self.scratchpad["questions"] += result2QuestionsList(
                question_response=questions_response,
                type="hop",
                status="unanswered",
            )

            # STEP 2: Choose question for current iteration
            unanswered = getUnansweredQuestions(self.scratchpad["questions"])
            unanswered_questions_prompt = self.unanswered_questions_prompt(unanswered)
            print_unanswered_questions(unanswered)
            response = self.most_pertinent_question.run(
                original_question=question,
                unanswered_questions=unanswered_questions_prompt,
            )
            current_question_dict = qStr2Dict(question=response)
            current_question_id = current_question_dict["id"]
            current_question = getQuestionById(
                self.scratchpad["questions"], current_question_id
            )
            print_next_question(current_question_id, current_question)

            # STEP 3: Answer the question
            (
                current_question["answer"],
                current_question["documents"],
            ) = self._answer_question(current_question["question"])
            markAnswered(self.scratchpad["questions"], current_question_id)
            print_answer(current_question)
            # From here on, the context is only the latest answer.
            current_context = current_question["answer"]

            ## STEP 4: refine the answer
            refinement_context = current_question["question"] + current_context
            refine_answer = self.refine_answer.run(
                question=question,
                context=refinement_context,
                answer=self.get_latest_answer(),
            )
            self.scratchpad["answerpad"] += [refine_answer]
            print_final_answer(self.scratchpad["answerpad"])

            # `>=` stops after exactly `max_iterations` rounds; the counter is
            # incremented at the top of the loop, so `>` would run one extra.
            if current_iteration >= self.agent_settings["max_iterations"]:
                print_max_iterations()
                break

    def unanswered_questions_prompt(self, unanswered):
        """Format questions as ``[\\n<id>. <question>...]`` for the prompt."""
        return (
            "[" + "".join([f"\n{q['id']}. {q['question']}" for q in unanswered]) + "]"
        )

    def notes_prompt(self, answered_questions):
        """Concatenate ``{ Question: ..., Answer: ... }`` notes for each question."""
        return "".join(
            [
                f"{{ Question: {q['question']}, Answer: {q['answer']} }}"
                for q in answered_questions
            ]
        )

    def get_latest_answer(self):
        """Return the last entry of the answerpad, or ``""`` when it is empty."""
        answers = self.scratchpad["answerpad"]
        return answers[-1] if answers else ""

In [26]:
run_id = str(uuid.uuid4())

# Working memory handed to the agent, which appends to both lists in place.
scratchpad = {
    "questions": [],  # list of type Question
    "answerpad": [],
}

store = law_compilation_store

agent_settings = {
    "max_iterations": 3,
    "num_atomistic_questions": 2,
    "num_questions_per_iteration": 4,
    "question_atomizer_temperature": 0,
    "question_creation_temperature": 0.4,
    "question_prioritisation_temperature": 0,
    "refine_answer_temperature": 0,
    "qa_temperature": 0,
    "analyser_temperature": 0,
    "intermediate_answers_length": 200,
    "answer_length": 500,
}

# `Agent.__init__` takes a `verbose` argument; pass it explicitly so the
# constructor call is complete.
agent = Agent(agent_settings, scratchpad, store, verbose=False)

In [None]:
question = "What is a private placement?"

# `Agent.__init__` takes a `verbose` argument; pass it explicitly so the
# constructor call is complete.
# Note: `scratchpad` is reused as-is, so re-running this cell keeps whatever
# questions and answers an earlier run left on it.
agent = Agent(agent_settings, scratchpad, store, verbose=False)
agent.run(question)