# Question-Answer Generator



Automatically generate question-answer pairs for paragraphs taken from documents, and evaluate an LLM on the resulting synthetic QA pairs.

Based on https://github.com/langchain-ai/auto-evaluator/tree/main licensed under Elastic License 2.0 (ELv2)

In [None]:
!python --version

In [None]:
!pip install --upgrade sagemaker --quiet

In [None]:
import sagemaker
import boto3
import botocore

# Temporary session, used here only to look up the default bucket; it is
# replaced further down by a session bound to the chosen bucket.
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    # fall back to the default bucket when no bucket name is given
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; when get_execution_role() raises ValueError,
# look the role up by its name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

# Session used by the rest of the notebook, bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

## Load documents

In [None]:
# S3 resource created from the SageMaker session's boto session.
s3 = sess.boto_session.resource("s3")
# Placeholders: replace with the real input/output bucket names before running.
bucket_name = "<YOUR_S3_INPUT_BUCKET_NAME>"
s3_key = "crawlers/admin-ch/admin_ch_press_releases.json"
output_bucket = "<YOUR_S3_OUTPUT_BUCKET_NAME>"
s3_output_dataset_key = "admin_ch_dataset/train"
# Fetch the input object and decode its bytes as UTF-8 text.
obj1 = s3.Object(bucket_name, s3_key)
jsonfile_content_str = obj1.get()["Body"].read().decode("utf-8")

In [None]:
import json

# Parse the downloaded JSON text into Python objects.
question_answers = json.loads(jsonfile_content_str)

In [None]:
import pandas as pd

# Load the parsed JSON into a DataFrame.
df_origin = pd.DataFrame.from_dict(question_answers)

In [None]:
# Preview the first row of the loaded data.
df_origin.iloc[:1]

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is
# made), so columns later added through `df` also appear on `df_origin`.
df = df_origin

In [None]:
# Name of the DataFrame column whose text is sent to the LLM.
# Set this to "paragraphs" instead to generate QA pairs from that column.
dataset_key_to_generate_qa_for = "textContent"

## Use Falcon

In [None]:
import boto3
import json

# Smoke test: send one fixed prompt to the deployed endpoint and print the
# raw response body.

# Create a low-level client representing Amazon SageMaker Runtime
# sagemaker_runtime = boto3.client("sagemaker-runtime", region_name="eu-west-1")

# The name of the endpoint. The name must be unique within an AWS Region in your AWS account.
endpoint_name = "falcon-40b-instruct-48xl-5"

# After you deploy a model into production using SageMaker hosting
# services, your client applications use this API to get inferences
# from the model hosted at the specified endpoint.
# response = sagemaker_runtime.invoke_endpoint(
#                             EndpointName=endpoint_name,
#                             Body=bytes('{"features": ["This is great!"]}', 'utf-8') # Replace with your own data.
#                             )

# Optional - Print the response body and decode it so it is human-readable.
# print(response['Body'].read().decode('utf-8'))


# Runtime client reused by the later cells to invoke the endpoint.
client = boto3.client("sagemaker-runtime")
# Request payload: the prompt goes under "inputs", options under "parameters".
request = {
    "inputs": "The first paragraph of the book 'Alice in Wonderland' by Lewis Carroll reads as:",
    "parameters": {"truncation": True},
}

response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Accept="application/json",
    Body=json.dumps(request),
)
# The body is a stream: read it once and decode the bytes for display.
print(response["Body"].read().decode())

## Create Prompt to generate question answer pairs

In [None]:
# How many question/answer pairs the prompt asks the model to produce per text.
number_of_questions_to_generate = 5

In [None]:
# falcon_prompt = """
# >>INTRODUCTION<<
# {}

# Generate a list of """+ str(number_of_questions_to_generate) +""" question and answer pairs for the text.

# When coming up with this question/answer pair, you must respond in the following format:
# ```
# [\n"""+(" {{\n  \"question\": \"...\",\n  \"answer\": \"...\"\n }},\n"*number_of_questions_to_generate)[:-2]+"""\n]
# ```
# Everything between the ``` must be valid json.


# Assistant: [
# """

In [None]:
# falcon_prompt

In [None]:
# prompt_response_format = """
# The JSON result should be inside <format></format> XML tag.
# <format>
# [
#  {{
#   \"question\": \"...\",
#   \"answer\": \"...\"
#  }},
#  {{
#   \"question\": \"...\",
#   \"answer\": \"...\"
#  }},
#  ...
# ]

# </format>

# Text:
# {}


# Assistant: Here is a list of """ + str(number_of_questions_to_generate) +""" question and answer pairs extracted from the text: ["""

# prompts = [

#     """Human: Generate a list of """ + str(number_of_questions_to_generate) +""" question and answer pairs for the following text.""" + prompt_response_format,
#     """Human: You are a smart assistant designed to help high school teachers come up with reading comprehension questions.
# Given a piece of text, you must come up with a question and answer pair that can be used to test a student's reading comprehension abilities.
# Generate a list of """ + str(number_of_questions_to_generate) +""" question and answer pairs for the following text.""" + prompt_response_format
# ]

In [None]:
# Generation settings that are merged into every request body sent to the
# endpoint (see the "parameters" key of the request payload).
falcon_kwargs = {
    "parameters": dict(
        do_sample=True,
        top_p=0.95,
        temperature=0.1,
        top_k=50,
        max_new_tokens=1000,
        repetition_penalty=1.03,
        # "]" ends generation as soon as the model closes the JSON list.
        stop=["<|end|>", "<|endoftext|>", "]"],
    )
}

In [None]:
import json


def run_llm(text):
    """Ask the Falcon endpoint for question/answer pairs about ``text``.

    Builds an instruction prompt that embeds ``text`` together with an example
    of the expected JSON list (one entry per pair, as set by the module-level
    ``number_of_questions_to_generate``), merges in ``falcon_kwargs`` and
    sends the request to the SageMaker endpoint ``endpoint_name`` through the
    module-level ``client``.

    Args:
        text: The passage to generate question/answer pairs for.

    Returns:
        The endpoint's response body decoded to ``str``. Callers parse it
        with ``json.loads``.
    """
    # Example of the expected answer shape, one entry per requested pair.
    # Single braces on purpose: the prompt is sent exactly as built here and
    # is never passed through str.format, so doubled braces would reach the
    # model literally and contradict the "must be valid json" instruction.
    example_entry = ' {\n  "question": "...",\n  "answer": "..."\n }'
    example_entries = ",\n".join([example_entry] * number_of_questions_to_generate)

    # `text` is interpolated directly and the result is not formatted again,
    # so braces inside the document can neither raise nor be rewritten.
    falcon_prompt = (
        "\n"
        "    >>INTRODUCTION<<\n"
        f"    {text}\n"
        "\n"
        f"    Generate a list of {number_of_questions_to_generate}"
        " question and answer pairs for the text.\n"
        "\n"
        "    When coming up with this question/answer pair, you must respond"
        " in the following format:\n"
        "    ```\n"
        "    [\n"
        f"{example_entries}\n"
        "]\n"
        "    ```\n"
        "    Everything between the ``` must be valid json.\n"
        "\n"
        "\n"
        "\n"
        # The prompt ends by opening the JSON list so the model continues it.
        "    Assistant: [\n"
        "    "
    )

    body = json.dumps({"inputs": falcon_prompt, **falcon_kwargs})
    response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Accept="application/json",
        Body=body,
    )
    return response["Body"].read().decode()


# Try the prompt on a single document (the row with index label 4) and print
# the parsed endpoint response.
res = run_llm(df[dataset_key_to_generate_qa_for][4])
print(json.loads(res))

In [None]:
# The parsed response is a list; show the generated text of its first entry.
print(json.loads(res)[0]["generated_text"])

In [None]:
# Initialise the column that will hold each row's list of QA pairs.
# None marks a row as "not generated yet".
df["qa_pairs"] = None

In [None]:
import time
from json import JSONDecodeError
import tqdm
import json

# Fill df["qa_pairs"] row by row. Rows that already hold a non-empty list are
# skipped, so re-running this cell only retries rows that are still None or [].
qa_pairs_col = df.columns.get_loc("qa_pairs")

for i in tqdm.tqdm(range(df.shape[0])):
    # Positional access through .iat rather than writing into
    # df["qa_pairs"].values, which only works while .values happens to be a
    # writable view of the column.
    current_res = df.iat[i, qa_pairs_col]
    if current_res is not None and len(current_res) > 0:
        print(f"Skipping row: {str(i)}")
        continue

    paragraph = df.iloc[i][dataset_key_to_generate_qa_for]
    try:
        # The endpoint answers with [{"generated_text": "..."}]; the QA pairs
        # are JSON *inside* that string, not the response object itself.
        generated_text = json.loads(run_llm(paragraph))[0]["generated_text"]

        # The prompt ends with "Assistant: [" and "]" is a stop sequence, so
        # the completion is the inside of a JSON list. Whether the endpoint
        # echoes the prompt or includes the closing "]" is not visible here
        # (TODO confirm), so handle both: drop an echoed prompt, then restore
        # whichever bracket is missing.
        completion = generated_text.rpartition("Assistant: [")[2].strip()
        if not completion.startswith("["):
            completion = "[" + completion
        if not completion.endswith("]"):
            completion = completion.rstrip(", \n") + "]"

        list_pairs = json.loads(completion)
        if not isinstance(list_pairs, list):
            raise ValueError(f"Expected a JSON list of QA pairs, got {type(list_pairs).__name__}")
    except Exception as e:
        # Best effort: report the failure and leave the row empty so that a
        # later re-run of this cell retries it.
        print("------------------------")
        print(e)
        print("------------------------")
        print("Sleeping")
        list_pairs = []

    df.iat[i, qa_pairs_col] = list_pairs
    # Pause between requests, after successes and failures alike.
    time.sleep(1)


print(len(df["qa_pairs"]))

In [None]:
# List the rows that still have no QA pairs (value is None or an empty list).
df.loc[df["qa_pairs"].apply(lambda pairs: pairs is None or len(pairs) == 0)]

In [None]:
# Inspect the generated QA pairs column.
df["qa_pairs"]

In [None]:
# Preview the first four rows, including the new qa_pairs column.
df.iloc[:4]

In [None]:
# Flatten to one row per QA pair: explode the lists, discard rows without a
# pair, renumber the index, then expand each pair's keys into columns.
df_clean = df.explode("qa_pairs")
df_clean = df_clean.loc[df_clean["qa_pairs"].notna()].reset_index(drop=True)
df_clean = df_clean.join(pd.json_normalize(df_clean["qa_pairs"])).drop(
    columns="qa_pairs"
)

In [None]:
# Preview the first flattened row.
df_clean.iloc[:1]

In [None]:
# Continue with the flattened frame under the name `df`.
df = df_clean

## Prepare input

In [None]:
import numpy as np

In [None]:
# Constant text columns, one entry per row of df, used to assemble the
# conversation-style training text.
intro, human_tag, ai_tag = (
    pd.Series(np.full(len(df), fill_value=constant_text))
    for constant_text in (
        "The conversation between human and AI assistant.",
        "\n[|Human|] ",
        "\n[|AI|] ",
    )
)

In [None]:
# Preview the first five rows before building the training text.
df.head(5)

In [None]:
# Assemble the training text per row:
# intro, human tag + question, AI tag + answer, then a trailing human tag.
df["input"] = intro + human_tag + df["question"] + ai_tag + df["answer"] + human_tag

In [None]:
# Inspect the assembled training text of the row with index label 1.
df["input"][1]

In [None]:
# Number of rows before duplicates are removed.
len(df)

In [None]:
# Drop rows whose assembled "input" text duplicates that of another row.
df = df.drop_duplicates(subset=["input"])

In [None]:
# Number of rows after duplicates were removed.
len(df)

## Store dataset

!pip install datasets[s3]

In [None]:
from datasets import Dataset

# Convert the DataFrame into a `datasets.Dataset`.
dataset = Dataset.from_pandas(df)

# Hold out 10% of the rows as the test split, using a fixed seed.
dataset = dataset.train_test_split(test_size=0.1, seed=2303)
dataset

In [None]:
from datasets.filesystems import S3FileSystem

# Create an S3 filesystem object from a botocore session.
# NOTE: this rebinds `s3`, which earlier in the notebook held the boto3 S3
# resource used to download the input file.
session = botocore.session.get_session()
s3 = S3FileSystem(session=session)

In [None]:
# Write the train/test splits to the output bucket under the dataset key,
# passing along the storage options of the S3 filesystem object.
dataset.save_to_disk(
    f"s3://{output_bucket}/{s3_output_dataset_key}", storage_options=s3.storage_options
)