# Question-Answer Generator



Automatically generate question-answer (QA) pairs for paragraphs from documents, then evaluate an LLM based on these synthetic QA pairs.

Based on https://github.com/langchain-ai/auto-evaluator/tree/main licensed under Elastic License 2.0 (ELv2)

In [2]:
!python --version

Python 3.10.8


In [3]:
!pip install --upgrade sagemaker --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.27.132 requires botocore==1.29.132, but you have botocore 1.31.21 which is incompatible.
awscli 1.27.132 requires PyYAML<5.5,>=3.10, but you have pyyaml 6.0.1 which is incompatible.[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import sagemaker
import boto3
import botocore

# Bucket used by SageMaker for uploading data, models and logs.
# Leave it as None to fall back to the session's default bucket
# (SageMaker creates that bucket automatically if it does not exist).
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role. When get_execution_role() raises ValueError,
# look the role up by its name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

# Re-create the session so that it is bound to the selected bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved configuration.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::843197046435:role/service-role/AmazonSageMaker-ExecutionRole-20230626T095793
sagemaker bucket: sagemaker-eu-west-1-843197046435
sagemaker session region: eu-west-1


## Load documents

In [5]:
# Location of the JSON file that is downloaded below.
bucket_name = "gen-ai-foundation"
s3_key = "crawlers/admin-ch/admin_ch_press_releases.json"

# Output location; presumably where the generated dataset is written later
# (not used in this cell) - TODO confirm.
output_bucket = "sagemaker-eu-west-1-843197046435"
s3_output_dataset_key = "admin_ch_dataset/train"

# Fetch the object through the session's boto3 S3 resource and decode the
# raw bytes into a UTF-8 string.
s3 = sess.boto_session.resource("s3")
obj1 = s3.Object(bucket_name, s3_key)
jsonfile_body = obj1.get()["Body"]
jsonfile_content_str = jsonfile_body.read().decode("utf-8")

In [6]:
import json

# Parse the JSON text downloaded from S3 into Python objects.
# NOTE(review): despite its name, this holds the parsed source records that the
# DataFrame is built from, not generated question/answer pairs.
question_answers = json.loads(jsonfile_content_str)

In [7]:
import pandas as pd

# Build a DataFrame from the parsed JSON records.
df_origin = pd.DataFrame.from_dict(question_answers)

In [8]:
# Preview the first record.
df_origin.head(1)

Unnamed: 0,title,byline,dir,lang,content,textContent,length,excerpt,siteName,source,target_urls,link_text
0,Press releases from the Federal Administration,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\t\t\t\n\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t11.0...,7660,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",root


In [9]:
# Work on a copy: later cells add and fill a "qa_pairs" column on `df`, and a
# plain alias would silently apply those mutations to `df_origin` as well.
df = df_origin.copy()

In [10]:
# Name of the DataFrame column whose text is sent to the LLM.
# (The earlier value "paragraphs" was immediately overwritten and has been dropped.)
dataset_key_to_generate_qa_for = "textContent"

## Use Falcon

In [15]:
import boto3
import json

# Name of the deployed SageMaker endpoint. The name must be unique within an
# AWS Region in your AWS account.
endpoint_name = "falcon-40b-instruct-48xl-5"

# Low-level SageMaker Runtime client: after a model is deployed with SageMaker
# hosting services, client applications use its invoke_endpoint API to get
# inferences from the model hosted at the specified endpoint.
client = boto3.client("sagemaker-runtime")

# Smoke test: send one short prompt and print the decoded response body.
request = dict(
    inputs="The first paragraph of the book 'Alice in Wonderland' by Lewis Carroll reads as:",
    parameters=dict(truncation=True),
)
request_body = json.dumps(request)

response = client.invoke_endpoint(
    Body=request_body,
    EndpointName=endpoint_name,
    Accept="application/json",
    ContentType="application/json",
)
response_text = response["Body"].read().decode()
print(response_text)

[{"generated_text":"The first paragraph of the book 'Alice in Wonderland' by Lewis Carroll reads as:\n'Alice was beginning to get very tired of sitting by her sister on the bank, and of"}]


## Create Prompt to generate question answer pairs

In [61]:
# Number of question/answer pairs requested from the model for each text;
# it is interpolated into the prompt and sets how many example objects the
# response-format template contains.
number_of_questions_to_generate = 5

In [70]:
# falcon_prompt = """
# >>INTRODUCTION<<
# {}

# Generate a list of """+ str(number_of_questions_to_generate) +""" question and answer pairs for the text.

# When coming up with this question/answer pair, you must respond in the following format:
# ```
# [\n"""+(" {{\n  \"question\": \"...\",\n  \"answer\": \"...\"\n }},\n"*number_of_questions_to_generate)[:-2]+"""\n]
# ```
# Everything between the ``` must be valid json.


# Assistant: [
# """

In [82]:
# falcon_prompt

In [83]:
# prompt_response_format = """
# The JSON result should be inside <format></format> XML tag.
# <format>
# [
#  {{
#   \"question\": \"...\",
#   \"answer\": \"...\"
#  }},
#  {{
#   \"question\": \"...\",
#   \"answer\": \"...\"
#  }},
#  ...
# ]

# </format>

# Text:
# {}


# Assistant: Here is a list of """ + str(number_of_questions_to_generate) +""" question and answer pairs extracted from the text: ["""

# prompts = [

#     """Human: Generate a list of """ + str(number_of_questions_to_generate) +""" question and answer pairs for the following text.""" + prompt_response_format,
#     """Human: You are a smart assistant designed to help high school teachers come up with reading comprehension questions.
# Given a piece of text, you must come up with a question and answer pair that can be used to test a student's reading comprehension abilities.
# Generate a list of """ + str(number_of_questions_to_generate) +""" question and answer pairs for the following text.""" + prompt_response_format
# ]

In [84]:
# Generation settings merged into the body of every request sent by run_llm.
falcon_kwargs = dict(
    parameters=dict(
        do_sample=True,
        top_p=0.95,
        temperature=0.1,
        top_k=50,
        # NOTE(review): the endpoint rejects requests whose input tokens plus
        # max_new_tokens exceed 2048 (see the 422 errors in the generation loop
        # output), so texts longer than roughly 1048 tokens fail with this value.
        max_new_tokens=1000,
        repetition_penalty=1.03,
        # "]" is listed as a stop sequence; the prompt ends with an opening "["
        # for the JSON list.
        stop=["<|end|>", "<|endoftext|>", "]"],
    )
)

In [85]:
import json


def _build_qa_prompt(text, num_questions):
    """Assemble the Falcon prompt asking for ``num_questions`` QA pairs about ``text``."""
    # One example JSON object per requested pair, comma-separated. Single braces
    # are used because this string is never passed through str.format, so the
    # example shown to the model is itself valid JSON.
    example_pairs = ",\n".join(
        [' {\n  "question": "...",\n  "answer": "..."\n }'] * num_questions
    )
    # `text` is interpolated directly by the f-string and no further formatting
    # pass is applied, so braces inside the document are passed through as-is.
    # The indentation inside the template is part of the prompt and is kept
    # unchanged so the layout sent to the model stays the same.
    return (
        f"""
    >>INTRODUCTION<<
    {text}

    Generate a list of {num_questions} question and answer pairs for the text.

    When coming up with this question/answer pair, you must respond in the following format:
    ```
    [
"""
        + example_pairs
        + """
]
    ```
    Everything between the ``` must be valid json.



    Assistant: [
    """
    )


def run_llm(text):
    """Ask the Falcon endpoint to generate question/answer pairs for ``text``.

    The prompt requests ``number_of_questions_to_generate`` pairs and is sent
    together with the generation settings in ``falcon_kwargs`` to the endpoint
    named by ``endpoint_name`` through the module-level ``client``.

    Args:
        text: The document text to generate question/answer pairs for.

    Returns:
        The decoded response body as a string. It is returned exactly as the
        endpoint sent it; no parsing is done here.

    Raises:
        Any exception raised by ``client.invoke_endpoint`` propagates to the
        caller (nothing is caught here).
    """
    prompt_data = _build_qa_prompt(text, number_of_questions_to_generate)
    body = json.dumps({"inputs": prompt_data, **falcon_kwargs})

    response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Accept="application/json",
        Body=body,
    )
    return response["Body"].read().decode()


# Try the prompt on a single entry (label 4 of the selected text column) and
# print the JSON-decoded endpoint response.
res = run_llm(df[dataset_key_to_generate_qa_for][4])
print(json.loads(res))

[{'generated_text': '\n    >>INTRODUCTION<<\n    \n\n    Federal Statistical Office\n                \n                    Neuchâtel,\xa013.07.2023\xa0-\xa0In 2022, the 11 major border regions outside Switzerland recorded lower activity rates than the Swiss regions bordering them, but higher rates than their respective national rates. Unemployment rates based on ILO definition were lower than the national averages in the countries concerned and in some cases lower than those of their Swiss neighbours. This illustrates the positive impact of the Swiss labour market on these regions, in particular the employment of 373 000 foreign cross-border commuters. These are some of the results from the publication “L’offre de travail en Suisse et dans les régions limitrophes en 2022”, available in French or German, from the Federal Statistical Office (FSO).\n                \n            \n\n            \n            \n                This press release and further information on this topic can be

In [89]:
# Show only the "generated_text" field of the first response element.
print(json.loads(res)[0]["generated_text"])


    >>INTRODUCTION<<
    

    Federal Statistical Office
                
                    Neuchâtel, 13.07.2023 - In 2022, the 11 major border regions outside Switzerland recorded lower activity rates than the Swiss regions bordering them, but higher rates than their respective national rates. Unemployment rates based on ILO definition were lower than the national averages in the countries concerned and in some cases lower than those of their Swiss neighbours. This illustrates the positive impact of the Swiss labour market on these regions, in particular the employment of 373 000 foreign cross-border commuters. These are some of the results from the publication “L’offre de travail en Suisse et dans les régions limitrophes en 2022”, available in French or German, from the Federal Statistical Office (FSO).
                
            

            
            
                This press release and further information on this topic can be found on the FSO website (see link below)

In [90]:
# Initialise the column in which the list of QA pairs for each row is stored.
df["qa_pairs"] = None

In [None]:
import time
from json import JSONDecodeError
import tqdm
import json

# Column position of the result column. Cells are read and written positionally
# through DataFrame.iat instead of through `df["qa_pairs"].values[i]`, which
# only works while `.values` happens to be a writable view of the column.
qa_pairs_col = df.columns.get_loc("qa_pairs")

for i in tqdm.tqdm(range(df.shape[0])):
    current_res = df.iat[i, qa_pairs_col]
    # Rows that already hold a non-empty result are left untouched, so this
    # cell can be re-run to retry only the rows that are still empty.
    if current_res is not None and len(current_res) > 0:
        print(f"Skipping row: {str(i)}")
        continue

    row = df.iloc[i]
    paragraph = row[dataset_key_to_generate_qa_for]

    try:
        qa_pairs = run_llm(paragraph)
        # NOTE(review): this is the JSON-decoded endpoint response (in the
        # sample call above, a list with one {"generated_text": ...} dict),
        # not yet the individual question/answer pairs.
        list_pairs = json.loads(qa_pairs)
    except Exception as e:
        # Best effort: report the failure and store an empty list so the row
        # is retried on the next run of this cell.
        print("------------------------")
        print(e)
        print("------------------------")
        print("Sleeping")
        list_pairs = []

    df.iat[i, qa_pairs_col] = list_pairs
    # Short pause between requests.
    time.sleep(1)


# Number of rows in the result column (counts failed/empty rows as well).
print(len(df["qa_pairs"]))

  0%|          | 0/210 [00:00<?, ?it/s]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1849 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


  0%|          | 1/210 [00:01<03:45,  1.08s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 2138 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


  2%|▏         | 5/210 [00:42<36:07, 10.58s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1300 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


  7%|▋         | 15/210 [02:19<36:03, 11.09s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1197 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


  8%|▊         | 16/210 [02:20<26:04,  8.06s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1900 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


  9%|▊         | 18/210 [02:38<29:00,  9.06s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1347 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 10%|█         | 22/210 [03:26<39:30, 12.61s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1914 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 17%|█▋        | 35/210 [06:04<39:43, 13.62s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1153 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 17%|█▋        | 36/210 [06:05<28:32,  9.84s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 2119 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 18%|█▊        | 38/210 [06:19<25:41,  8.96s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1380 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 20%|██        | 43/210 [07:13<31:54, 11.47s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1903 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 21%|██        | 44/210 [07:14<23:03,  8.33s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 2246 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 21%|██▏       | 45/210 [07:15<16:53,  6.14s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1437 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 22%|██▏       | 46/210 [07:16<12:35,  4.61s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1093 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 28%|██▊       | 59/210 [09:44<32:52, 13.06s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1397 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 29%|██▉       | 61/210 [09:58<25:54, 10.43s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1509 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 31%|███▏      | 66/210 [10:51<27:29, 11.45s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1678 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 32%|███▏      | 68/210 [11:05<23:09,  9.78s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1049 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 35%|███▍      | 73/210 [11:56<26:14, 11.50s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 2655 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 37%|███▋      | 77/210 [13:00<41:20, 18.65s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1053 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 39%|███▉      | 82/210 [13:58<28:54, 13.55s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1530 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 40%|████      | 84/210 [14:07<19:27,  9.27s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1485 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 40%|████      | 85/210 [14:08<14:09,  6.79s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1868 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 41%|████      | 86/210 [14:09<10:27,  5.06s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 2325 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 44%|████▍     | 93/210 [15:31<24:57, 12.80s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1133 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 47%|████▋     | 98/210 [16:18<21:20, 11.44s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1135 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 48%|████▊     | 101/210 [16:39<16:45,  9.23s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1241 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 49%|████▉     | 103/210 [16:51<14:01,  7.86s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1123 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 50%|█████     | 106/210 [17:10<12:29,  7.20s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1125 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 51%|█████     | 107/210 [17:11<09:10,  5.35s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1316 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 54%|█████▍    | 113/210 [18:26<20:27, 12.65s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1585 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 54%|█████▍    | 114/210 [18:27<14:39,  9.16s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1176 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 55%|█████▍    | 115/210 [18:28<10:38,  6.72s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1266 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 57%|█████▋    | 120/210 [19:30<20:24, 13.60s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1370 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 60%|██████    | 126/210 [20:24<14:19, 10.23s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1699 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 60%|██████    | 127/210 [20:25<10:19,  7.47s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 2250 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 61%|██████▏   | 129/210 [20:35<08:58,  6.65s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1111 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 63%|██████▎   | 133/210 [21:07<10:36,  8.26s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1516 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 66%|██████▌   | 138/210 [22:07<14:41, 12.24s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1176 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 68%|██████▊   | 143/210 [22:58<12:49, 11.48s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1545 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 72%|███████▏  | 151/210 [24:29<12:37, 12.84s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1326 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 72%|███████▏  | 152/210 [24:30<08:59,  9.30s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1065 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 78%|███████▊  | 163/210 [26:11<07:49,  9.99s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1490 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 80%|████████  | 168/210 [27:34<14:35, 20.84s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 2268 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 80%|████████  | 169/210 [27:35<10:10, 14.90s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1886 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 87%|████████▋ | 183/210 [30:38<05:17, 11.75s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1514 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 93%|█████████▎| 195/210 [32:56<02:43, 10.87s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1282 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 94%|█████████▍| 197/210 [33:17<02:28, 11.44s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 5476 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 94%|█████████▍| 198/210 [33:18<01:39,  8.32s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1290 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 95%|█████████▌| 200/210 [33:26<01:06,  6.60s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1084 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 96%|█████████▌| 202/210 [33:46<01:11,  8.90s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1268 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


 97%|█████████▋| 203/210 [33:47<00:45,  6.53s/it]

------------------------
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (422) from primary with message "{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 1747 `inputs` tokens and 1000 `max_new_tokens`","error_type":"validation"}". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/falcon-40b-instruct-48xl-5 in account 843197046435 for more information.
------------------------
Sleeping


100%|██████████| 210/210 [35:07<00:00, 10.04s/it]

210





In [93]:
# List the documents that ended up without any generated QA pair
# (their `qa_pairs` entry is either None or an empty collection).
df[df["qa_pairs"].apply(lambda pairs: pairs is None or len(pairs) == 0)]

Unnamed: 0,title,byline,dir,lang,content,textContent,length,excerpt,siteName,source,target_urls,link_text,qa_pairs
0,Press releases from the Federal Administration,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\t\t\t\n\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t11.0...,7660,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",root,[]
1,Press releases from the Federal Administration,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\t\t\n\t\t\t\t...,9030,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",root,[]
5,co-operate: model for climate-friendly constru...,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\n Federal Laboratory for Materials Testi...,5907,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tco-operate: model for climate-...,[]
15,"On 25th anniversary of Rome Statute, Switzerla...",The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\n \n\n\n\n\t\n\n\t\n\n\t\n\t\t\n\n\t\n\t...,5504,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tOn 25th anniversary of Rome St...,[]
16,Earlier detection of breast cancer,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\n Paul Scherrer Institut\n ...,8514,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tEarlier detection of breast ca...,[]
18,New EU policy measures based on Empa research:...,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\n Federal Laboratory for Materials Testi...,6580,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tNew EU policy measures based o...,[]
22,Quantum technology: Masters of defects,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\n Federal Laboratory for Materials Testi...,8762,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tQuantum technology: Masters of...,[]
35,First report on cantonal measures relating to ...,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\n Federal Department of Finance\n ...,5268,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tFirst report on cantonal measu...,[]
36,Until the chemistry is just right,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\n Federal Laboratory for Materials Testi...,9186,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tUntil the chemistry is just right,[]
38,"First recovery, then stagnation: The state of ...",The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\n Eawag: Swiss Federal Institute of Aqua...,7649,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...","\n\t\t\t\t\t\t\tFirst recovery, then stagnatio...",[]


In [108]:
# NOTE(review): the recorded run raised KeyError: 'qa_pairs' here. The execution
# count (108) is later than the `df = df_clean` cell (In [99]), and `df_clean`
# has its `qa_pairs` column dropped, so this lookup presumably ran against the
# reassigned frame — confirm, and remove the cell if it is a leftover.
df["qa_pairs"]

KeyError: 'qa_pairs'

In [96]:
# Peek at the first four documents together with their `qa_pairs` entries.
df.head(4)

Unnamed: 0,title,byline,dir,lang,content,textContent,length,excerpt,siteName,source,target_urls,link_text,qa_pairs
0,Press releases from the Federal Administration,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\t\t\t\n\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t11.0...,7660,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",root,[]
1,Press releases from the Federal Administration,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\t\t\n\t\t\t\t...,9030,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",root,[]
2,State Secretary Hirayama attends informal EU m...,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...","\n\n State Secretariat for Education, Resea...",3230,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tState Secretary Hirayama atten...,[{'generated_text': '  >>INTRODUCTION<<  ...
3,Issue result for Confederation bonds,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\n Federal Finance Administration\n ...,1602,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tIssue result for Confederation...,[{'generated_text': '  >>INTRODUCTION<<  ...


In [97]:
# One row per generated QA pair: explode the per-document lists, discard the
# rows that carry no pair, and renumber the rows 0..n-1 so the join below
# lines up with the freshly normalized frame.
df_clean = df.explode("qa_pairs").dropna(subset=["qa_pairs"]).reset_index(drop=True)

# Expand every pair dict into its own columns, then retire the raw dict column.
df_clean = df_clean.join(pd.json_normalize(df_clean["qa_pairs"])).drop(columns="qa_pairs")

In [116]:
# Show the first exploded row as a sanity check.
df_clean.head(1)

0    \n    >>INTRODUCTION<<\n    \n\n    State Secr...
Name: generated_text, dtype: object


In [99]:
# Rebind `df` to the exploded frame: cells below that use `df` therefore see one
# row per `qa_pairs` element, with the `qa_pairs` column replaced by its
# normalized fields. The original per-document frame is no longer reachable as `df`.
df = df_clean

## Prepare input

In [100]:
import numpy as np

In [104]:
# Constant prompt fragments, one copy per row of `df`.
# They are built on `df.index` so that they stay label-aligned with `df`'s
# columns when concatenated; a default 0..n-1 index would only line up while
# `df` happens to carry a fresh RangeIndex (e.g. not after rows were filtered).
intro = pd.Series("The conversation between human and AI assistant.", index=df.index)
human_tag = pd.Series("\n[|Human|] ", index=df.index)
ai_tag = pd.Series("\n[|AI|] ", index=df.index)

In [106]:
# Preview the first five rows (the default size of `head`).
df.head()

Unnamed: 0,title,byline,dir,lang,content,textContent,length,excerpt,siteName,source,target_urls,link_text,generated_text
0,State Secretary Hirayama attends informal EU m...,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...","\n\n State Secretariat for Education, Resea...",3230,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tState Secretary Hirayama atten...,\n >>INTRODUCTION<<\n \n\n State Secr...
1,Issue result for Confederation bonds,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\n Federal Finance Administration\n ...,1602,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tIssue result for Confederation...,\n >>INTRODUCTION<<\n \n\n Federal Fi...
2,Switzerland's border regions: higher activity ...,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\n Federal Statistical Office\n ...,1723,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tSwitzerland's border regions: ...,\n >>INTRODUCTION<<\n \n\n Federal St...
3,Producer and Import Price Index remained stabl...,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\n Federal Statistical Office\n ...,1499,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tProducer and Import Price Inde...,\n >>INTRODUCTION<<\n \n\n Federal St...
4,Extension of investigation activities to Xplai...,The Federal Council,,en,"<div id=""readability-page-1"" class=""page""><div...",\n\n Federal Data Protection and Informatio...,1529,Current information from the Federal Administr...,,https://www.admin.ch/gov/en/start/documentatio...,"[{'text': 'Homepage', 'url': 'https://www.admi...",\n\t\t\t\t\t\t\tExtension of investigation act...,\n >>INTRODUCTION<<\n \n\n Federal Da...


In [102]:
# Assemble the per-row prompt text: intro, human tag + question, AI tag + answer,
# followed by a trailing human tag.
# NOTE(review): the recorded run raised KeyError: 'question'. The frame shown by
# the preceding `df.head(5)` cell has a `generated_text` column but no
# `question` / `answer` columns — presumably the generated text still has to be
# parsed into those two columns before this cell can run; confirm against the
# QA generation step.
df["input"] = intro + human_tag + df["question"] + ai_tag + df["answer"] + human_tag

KeyError: 'question'

In [None]:
# Inspect the assembled prompt of the row labelled 1.
df.loc[1, "input"]

In [103]:
# Number of rows before de-duplication.
df.shape[0]

158

In [None]:
# Keep only the first occurrence of every distinct `input` text.
df = df[~df.duplicated(subset=["input"])]

In [None]:
# Number of rows left after de-duplication.
df.shape[0]

## Store dataset

Install the Hugging Face `datasets` library with S3 support before running the cells below: `pip install "datasets[s3]"`

In [None]:
from datasets import Dataset

# `df` no longer has a contiguous 0..n-1 index once duplicate rows were dropped.
# Without `preserve_index=False` that leftover index would be stored in the
# dataset as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df, preserve_index=False)

# Hold out 10% of the rows as the test split; the fixed seed keeps the split
# reproducible across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=2303)
dataset

In [None]:
from datasets.filesystems import S3FileSystem

# Build the S3 filesystem handle from a botocore session; its `storage_options`
# are what the following cell passes to `dataset.save_to_disk`.
# NOTE(review): `datasets.filesystems.S3FileSystem` appears to be deprecated in
# recent `datasets` releases — verify against the installed version.
# NOTE(review): presumably the session picks up the notebook's default AWS
# credentials — confirm.
session = botocore.session.get_session()
s3 = S3FileSystem(session=session)

In [None]:
# Write the train/test splits to the S3 location built from the bucket and key.
# NOTE(review): `output_bucket` and `s3_output_dataset_key` are not defined in
# the cells shown here — presumably they are set earlier in the notebook; confirm
# before running on a fresh kernel.
dataset.save_to_disk(
    f"s3://{output_bucket}/{s3_output_dataset_key}", storage_options=s3.storage_options
)