In [None]:
import sys
# Add the backend directory to the import search path; later cells import
# `app_config` and `modules.*` (presumably located there — verify layout).
# The path is relative, so it resolves against the kernel's working directory.
sys.path.append('../backend')

In [None]:
from dotenv import load_dotenv, find_dotenv
# Load environment variables from the "demo.env" file located by find_dotenv.
# The result is assigned to `_` so the cell does not display a value.
_ = load_dotenv(find_dotenv(filename="demo.env"))

In [None]:
from langchain.chains import QAGenerationChain
from app_config import config

In [None]:
# Print the loaded configuration for a quick sanity check. Values whose setting
# name looks like a credential are masked so that secrets (e.g. the Azure OpenAI
# key) are not written into the notebook's saved output.
SENSITIVE_NAME_PARTS = ("key", "secret", "token", "password")
for key, value in config.__dict__.items():
    looks_secret = any(part in key.lower() for part in SENSITIVE_NAME_PARTS)
    # Keep empty/None values visible so a missing credential is still noticeable.
    shown = "***" if looks_secret and value else value
    print(f'{key}={shown}')

In [None]:
import openai

# Set the module-level OpenAI settings for an Azure OpenAI resource.
# Key and endpoint come from the project config; API type and version are fixed
# here. A later cell reads openai.api_base / openai.api_version back when it
# builds the chat model.
openai.api_key = config.azure_openai_key
openai.api_base = config.api_endpoint
openai.api_type = "azure"
openai.api_version = "2024-08-01-preview"

In [None]:
from modules.document.utils.DocumentReader import DocumentReader
from modules.document.utils.DocumentReaderProviders import Providers

# Read the regulation PDF (a single file, not a directory) through the
# LangChain-backed provider of the project's DocumentReader.
regulation_pdf_path = "../regulations/CRSD_CELEX_32022L2464_EN_TXT.pdf"
document_reader = DocumentReader(
    provider=Providers.LANG_CHAIN,
    file_path=regulation_pdf_path,
)
documents = document_reader.read(is_directory=False)

In [None]:
# Number of documents returned by the reader (presumably one per PDF page — confirm).
len(documents)

In [None]:
# Show the text of the second loaded document (index 1).
documents[1].page_content

In [None]:
# Calculate the number of tokens in the regulation and the average number of tokens per page.
import tiktoken

# Tokenizer shared by the token-counting helpers defined in this cell.
# NOTE(review): "cl100k_base" should match the tokenizer of the deployed model — confirm.
token_encoding = tiktoken.get_encoding("cl100k_base")

def calculate_avg_tokens_per_page(documents, encoding=None):
    """Return the mean number of tokens per document (page).

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.
        encoding: Optional tokenizer with an ``encode(text)`` method. When
            omitted, the notebook-level ``token_encoding`` is used.

    Returns:
        The average token count per document as a float, or ``0.0`` when
        ``documents`` is empty (instead of raising ``ZeroDivisionError``).
    """
    tokenizer = token_encoding if encoding is None else encoding
    # Materialise the per-page counts once so any iterable (not only lists) works.
    tokens_per_page = [len(tokenizer.encode(doc.page_content)) for doc in documents]
    if not tokens_per_page:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    return sum(tokens_per_page) / len(tokens_per_page)

def calculate_total_document_tokens(documents, encoding=None):
    """Return the total number of tokens across all documents.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.
        encoding: Optional tokenizer with an ``encode(text)`` method. When
            omitted, the notebook-level ``token_encoding`` is used.

    Returns:
        The summed token count as an int; ``0`` for an empty iterable.
    """
    tokenizer = token_encoding if encoding is None else encoding
    return sum(len(tokenizer.encode(doc.page_content)) for doc in documents)

In [None]:
# Report the token statistics for the loaded regulation.
avg_tokens_per_page = calculate_avg_tokens_per_page(documents)
print(f'avg tokens per page is {avg_tokens_per_page}')
total_document_tokens = calculate_total_document_tokens(documents)
print(f'total tokens in document is {total_document_tokens}')

In [None]:
from langchain.chat_models import AzureChatOpenAI
from langchain.evaluation.qa import QAGenerateChain

# Chat model used for question generation. The deployment name comes from the
# project config; endpoint and API version reuse the values set on the `openai`
# module in an earlier cell. Temperature is pinned to 0.0.
# NOTE(review): no API key is passed here — presumably it is picked up from the
# environment loaded from demo.env; confirm.
chatOpenAI = AzureChatOpenAI(
    temperature=0.0,
    deployment_name=config.azure_gpt4_deployment_name,
    openai_api_base=openai.api_base,
    openai_api_version=openai.api_version,
)

In [None]:
# Prompt template asking for a single question/answer pair about one document.
# `{doc}` is the only placeholder. Every line ends with a backslash, which joins
# it to the next source line, so line breaks in the final prompt come only from
# the explicit "\n" escapes.
# NOTE(review): not referenced by the visible cells, which use `qa_prompt_multi`.
qa_prompt = """You are a legal expert and a teacher coming up with one question to ask on a quiz about ESG and CSR regulations. \
\nGiven the following document, please generate a question and answer based on that document. \
\n The answer must be composed of an exact excerpt from the document. \
\n\nExample Format: \
\n<Begin Document>\n...\n<End Document> \
\nQUESTION: question here \
\nANSWER: answer here \
\n\nThese question should be detailed and be based explicitly on information in the document. Begin! \
\n\n<Begin Document>\n{doc}\n<End Document>"""

In [None]:
# Prompt template asking for several question/answer pairs about one document.
# Placeholders: `{num_questions}` (how many pairs to request) and `{doc}` (the
# document to quiz on). The model is asked to return the pairs as JSON.
# Lines ending in a backslash are joined to the next source line; the remaining
# lines contribute a real newline in addition to any "\n" escape.
qa_prompt_multi = """You are a legal expert and a teacher coming up with questions to ask on a quiz about ESG and CSR regulations. \
\nGiven the following document, please generate {num_questions} questions and corresponding answers based on that document. \
\nGenerate only short questions without compound sentences. Generate a variety of questions that cover different aspects about the document. \
\n Here are some topic ideas for the questions: 
    1. requirements for compliance
    2. penalties for non-compliance
    3. applicability of the regulation
\nMake sure they are complete questions, and that they can be answered by extracting excerpts from the document. \
\nEach answer must be composed of an exact excerpt from the document. \
\nGive the questions and answers in a json format.
\n\nExample Format: \
\n<Begin Document>\n...\n<End Document> \
\nRESULTS:
\n<begin json>...<end json>
\n\nThese questions should be detailed and be based explicitly on information in the document. Begin! \
\n\n<Begin Document>\n{doc}\n<End Document>"""

In [None]:
# qa_gen_chain = QAGenerateChain.from_string(chatOpenAI, qa_prompt_multi)

In [None]:
from langchain_core.output_parsers import StrOutputParser

# Build the generation chain from the chat model and the multi-question prompt.
qa_gen_chain = QAGenerateChain.from_string(chatOpenAI, qa_prompt_multi)
# Replace the chain's output parser with a plain string parser.
# NOTE(review): presumably the chain's default parser expects the
# QUESTION:/ANSWER: layout rather than the JSON requested here — confirm.
qa_gen_chain.output_parser = StrOutputParser()

In [None]:
# List the attribute names stored on the chain's prompt object.
prompt_attributes = vars(qa_gen_chain.prompt)
for attribute_name in prompt_attributes:
    print(attribute_name)

In [None]:
# Display the prompt object held by the chain.
qa_gen_chain.prompt

In [None]:
# Number of question/answer pairs to request per document; passed to the chain
# as the `num_questions` input, which fills {num_questions} in the prompt.
num_questions = 3

In [None]:
# Size of the documents[1:2] slice (at most one element).
len(documents[1:2])

In [None]:
# Generate question/answer pairs for the first 10 documents, one chain call each.
# Only the page text is placed in the prompt: formatting a Document object into
# the template would insert its string form (a "page_content=..." wrapper plus
# metadata) rather than the text itself, which works against the "exact excerpt"
# instruction. The original Document is put back under "doc" in each result so
# the results keep the same keys and value types as before.
generated_qas = [
    {
        **qa_gen_chain.invoke(
            {"doc": page.page_content, "num_questions": num_questions}
        ),
        "doc": page,
    }
    for page in documents[0:10]
]

In [None]:
# Display the results collected from the chain, one entry per processed document.
generated_qas

In [None]:
# Show the second document's text again (presumably to compare it with the
# generated questions and answers — confirm intent).
documents[1].page_content