In [1]:
# import library
import os
from itertools import islice

import pandas as pd
from dotenv import load_dotenv
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_upstage import ChatUpstage, UpstageEmbeddings
from pinecone import Pinecone

In [2]:
# Load variables from a local .env file into the process environment so the
# API keys read below can be found.
load_dotenv()

# Name of the Pinecone index this notebook reads documents from.
index_name = "quickstart"
# os.environ.get returns None for an unset variable, so a missing key is not
# reported on these lines; any failure surfaces later inside the client.
client = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"), source_tag="langchain")
llm_upstage = ChatUpstage(api_key=os.environ.get("UPSTAGE_API_KEY"), temperature=0)
embeddings_query = UpstageEmbeddings(model="embedding-query")  # presumably 4096-dimensional vectors — TODO confirm

In [3]:
# Prompt for question generation: given one document chunk as {context}, ask the
# model for {num_questions_per_chunk} quiz questions restricted to that chunk.
# - The "\n\nHuman:" / "\n\nAssistant:" markers are literal text in the prompt.
# - A trailing backslash inside the string joins that line with the next one,
#   so the "You are a Professor..." paragraph is partly folded into longer lines.
# - The prompt asks for questions that start with "-"; the code that parses the
#   model output relies on that prefix to separate the questions.
retriever_prompt_template = """
\n\nHuman: Here is the context information, inside <context></context> XML tags.

<context>
{context}
</context>

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. The questions should not contain options, start with "-"
Restrict the questions to the context information provided.

\n\nAssistant:"""

# Template object with the two placeholders used above.
PROMPT_RETRIEVER = PromptTemplate(
    template=retriever_prompt_template, input_variables=["context", "num_questions_per_chunk"]
)

In [4]:
# Prompt for answer generation: answer {question} using only the document text
# supplied as {context}, or reply with the fixed "Could not find answer in given
# contexts." sentence when the context does not contain the answer.
# NOTE(review): the rules "Write as much as you can" and "Use three sentences
# maximum and keep the answer concise." contradict each other — decide which one
# is intended and drop the other.
generation_prompt_template = """
Here is the context, inside <context></context> XML tags.

<context>
{context}
</context>

Only using the context as above, answer the following question with the rules as below:
    - Don't insert XML tag such as <context> and </context> when answering.
    - Write as much as you can
    - Be courteous and polite
    - Only answer the question if you can find the answer in the context with certainty.
    - Skip the preamble
    - Use three sentences maximum and keep the answer concise.
    - If the answer is not in the context, just say "Could not find answer in given contexts."

Question:
{question}

Answer:"""

# Template object with the two placeholders used above.
PROMPT_GENERATION = PromptTemplate(
    template=generation_prompt_template, input_variables=["context", "question"]
)

In [5]:
# Open a handle to the index named by index_name; it is used below to list
# document IDs and fetch the stored document text.
index = client.Index(index_name)

In [14]:
def _parse_questions(raw_output):
    """Extract the individual questions from the question-generation output.

    Every line that starts with "-" is one question (the dash is removed), so
    both "\\n- " and "\\n\\n- " separated lists work and any preamble line is
    ignored. If the output contains no dash-prefixed line at all, each
    non-empty line is treated as a question instead. Empty entries are dropped.
    """
    lines = [line.strip() for line in raw_output.splitlines()]
    if any(line.startswith("-") for line in lines):
        candidates = [line.lstrip("-").strip() for line in lines if line.startswith("-")]
    else:
        candidates = lines
    return [question for question in candidates if question]


def GTGenerator(index, llm_retriever, llm_generation, prompt_retriever, prompt_generation, batch_size=50, max_batch=2, num_questions_per_chunk=1):
    """Build ground-truth (question, answer) rows from documents stored in an index.

    For every document, one chain generates questions from the document text
    and a second chain answers each question using that same text as context.

    Args:
        index: Index handle. ``index.list(limit=...)`` must yield batches
            (lists) of document IDs, and ``index.fetch(ids=...)`` must return
            an object whose ``vectors`` maps each ID to a record exposing
            ``["metadata"]["text"]``.
        llm_retriever: LLM used to generate the questions.
        llm_generation: LLM used to answer the questions.
        prompt_retriever: Prompt taking ``context`` and
            ``num_questions_per_chunk``; it should ask for questions that
            start with "-".
        prompt_generation: Prompt taking ``context`` and ``question``.
        batch_size: Passed as ``limit`` to ``index.list``.
        max_batch: Maximum number of ID batches to process. Values below zero
            are treated as zero.
        num_questions_per_chunk: Number of questions requested per document.
            Responses with more questions than this, or with none, are
            reported on stdout and skipped; responses with fewer are kept.

    Returns:
        A list of ``[question, answer, doc_id, doc_text]`` rows.
    """
    llm_chain_retriever = LLMChain(llm=llm_retriever, prompt=prompt_retriever)
    llm_chain_generation = LLMChain(llm=llm_generation, prompt=prompt_generation)
    gt = []  # rows of [question, answer, doc_id, doc_text]

    # Take only the first max_batch batches. When index.list() is a lazy,
    # paginated generator this avoids listing the whole index just to discard
    # everything after the first few pages.
    id_batches = islice(index.list(limit=batch_size), max(max_batch, 0))

    # Fetch the documents one batch of IDs at a time.
    for batch_ids in id_batches:
        if not batch_ids:
            continue
        fetched_docs = index.fetch(ids=batch_ids).vectors

        for doc_id in batch_ids:
            doc_text = fetched_docs[doc_id]["metadata"]["text"]

            raw_questions = llm_chain_retriever.predict(
                context=doc_text, num_questions_per_chunk=str(num_questions_per_chunk)
            )
            questions = _parse_questions(raw_questions)

            # Skip responses that did not follow the requested format and show
            # the raw output so it can be inspected.
            if not questions or len(questions) > num_questions_per_chunk:
                print ("err")
                print (raw_questions)
                continue

            for q in questions:
                answer = llm_chain_generation.predict(question=q, context=doc_text)
                gt.append([q, answer.strip(), doc_id, doc_text])

    return gt

In [17]:
# Generate the ground-truth rows. The same Upstage chat model is used both to
# write the questions and to answer them. Every processed document costs one
# question-generation call plus one answering call per question, so this cell
# is slow and consumes API quota.
gt = GTGenerator(
    index = index,
    llm_retriever=llm_upstage,
    llm_generation=llm_upstage,
    prompt_retriever=PROMPT_RETRIEVER,
    prompt_generation=PROMPT_GENERATION,
    batch_size=50,  # passed as `limit` to index.list
    max_batch=3,  # process at most three batches of IDs
    num_questions_per_chunk=1
)

In [18]:
# Persist the generated ground truth. The column order mirrors the rows built
# by GTGenerator: [question, answer, doc_id, doc_text]. pandas is imported in
# the notebook's import cell.
eval_dataset_retriever = pd.DataFrame(gt, columns=["question", "answer", "doc_id", "doc"])
eval_dataset_retriever.to_csv("eval_dataset.csv", index=False)