## Distributed Golden Dataset Creation

### Environment Setup

In [None]:
import os
import ray

# os.environ['OPENAI_API_KEY'] = 'sk_...'
os.environ['RAY_ADDRESS'] = 'ray://localhost:10001'

runtime_env = {
    'pip': [
        'llama-index==0.10.27',
        'boto3==1.34.79',
        'botocore==1.34.79',
        'ipython==8.18.1',
        'pandas==2.2.1',
        'ragas==0.1.7',
        'pypdf2==3.0.1',
        'boto3==1.34.79',
        'langchain==0.1.14',
        'unstructured==0.13.2'
        
    ],
    "env_vars": {
        'AWS_ACCESS_KEY_ID': os.environ['AWS_ACCESS_KEY_ID'],
        'AWS_SECRET_ACCESS_KEY': os.environ['AWS_SECRET_ACCESS_KEY'],
        'HUGGINGFACE_API_TOKEN': os.environ['HUGGINGFACE_API_TOKEN'],
        'OPENAI_API_KEY': os.environ['OPENAI_API_KEY'],
    }

}

ray.init(runtime_env=runtime_env, include_dashboard=True, log_to_driver=False)

### Distributed Testset Generation (Ragas + Ray)

In [20]:
BUCKET_NAME = 'unstructured-data'

In [21]:
import boto3

s3 = boto3.client(
        's3',
        endpoint_url='http://localhost:9000',
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY']
    )

def list_files_in_bucket(bucket_name):

    file_paths = []
    paginator = s3.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket_name)

    for page in page_iterator:
        if "Contents" in page:
            for obj in page['Contents']:
                file_paths.append(obj['Key'])

    return file_paths

In [26]:
@ray.remote
def create_golden_dataset(bucket_name, file_name):
    from llama_index.core import SimpleDirectoryReader
    import os
    import boto3
    from ragas.testset.generator import TestsetGenerator
    from langchain_openai import ChatOpenAI, OpenAIEmbeddings
    import fitz  # PyMuPDF
    from llama_index.core import SimpleDirectoryReader
    import shutil
    import time
    import random
    
    DIRECTORY_NAME = os.path.join(os.getcwd(), 'data')
    LOCAL_FILE_PATH = os.path.join(DIRECTORY_NAME, file_name)
    
    s3_client = boto3.client(
        's3',
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
        endpoint_url='http://minio.minio.svc:9000'
    )
    
    bucket_name = bucket_name
    object_key = file_name
    
    # Fetch the PDF file from S3
    pdf_file = s3_client.get_object(Bucket=bucket_name, Key=object_key)
    pdf_content = pdf_file['Body'].read()
    
    if not os.path.exists(DIRECTORY_NAME):
        os.makedirs(DIRECTORY_NAME)
        
    with open(LOCAL_FILE_PATH, 'wb') as f:
        f.write(pdf_content) 


    reader = SimpleDirectoryReader(input_dir=DIRECTORY_NAME)
    documents = reader.load_data()
    
    # generator with openai models
    generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
    critic_llm = ChatOpenAI(model="gpt-3.5-turbo")
    embeddings = OpenAIEmbeddings()
    
    generator = TestsetGenerator.from_langchain(
        generator_llm,
        critic_llm,
        embeddings
    )
    
    # generate testset
    try:
        # to get past rate limits....sad :'(
        time.sleep(random.uniform(30, 60))
        testset = generator.generate_with_llamaindex_docs(documents, test_size=3)
    except:
        print("Uh oh!")
    
    if os.path.exists(DIRECTORY_NAME):
        shutil.rmtree(DIRECTORY_NAME)
        print(f"Directory {DIRECTORY_NAME} and its contents have been deleted.")
        
    return testset

In [27]:
pdfs_on_s3 = list_files_in_bucket(BUCKET_NAME)
futures = []
for pdf in pdfs_on_s3[:3]:
    futures.append(create_golden_dataset.remote(BUCKET_NAME, pdf))

# Retrieve and print the results
golden_dfs = ray.get(futures)

In [31]:
import pandas as pd
combined_golden_df = pd.DataFrame()
for testset in golden_dfs:
    combined_golden_df = pd.concat([combined_golden_df, testset.to_pandas()], ignore_index=True)

combined_golden_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What were the radiographic findings in the lun...,[ the morning but decreased to 87% by mid-afte...,The radiographic findings in the lungs of the ...,simple,"[{'page_label': '785', 'file_name': 'Acute res...",True
1,What is the purpose of using Protein A conjuga...,[Bentancor et al - perfringens epsilon -toxin ...,Protein A conjugated to horse radish peroxidas...,simple,"[{'page_label': '2', 'file_name': 'Antibody re...",True
2,What clinical criteria were used to diagnose a...,[ the morning but decreased to 87% by mid-afte...,The clinical criteria used to diagnose acute r...,reasoning,"[{'page_label': '785', 'file_name': 'Acute res...",True
3,What was the purpose of including unvaccinated...,[10] . In \nthis report we describe the charac...,The purpose of including unvaccinated controls...,multi_context,"[{'page_label': '1', 'file_name': 'Antibody re...",True
4,What is the composition of herbaceous species ...,[25 \n20 \n1s e \ni \nIi \n'0 j \n5 \n0 \nFig....,The composition of herbaceous species in the h...,simple,"[{'page_label': '2', 'file_name': 'Alpaca live...",True
5,How did the liveweight of the alpaca flock cha...,[f \nFig. 3. Liveweight variation in alpaca fl...,The liveweight of the alpaca flock increased f...,simple,"[{'page_label': '3', 'file_name': 'Alpaca live...",True
6,How was the annual fiber production analyzed i...,[25 \n20 \n1s e \ni \nIi \n'0 j \n5 \n0 \nFig....,The annual fiber production was analyzed throu...,reasoning,"[{'page_label': '2', 'file_name': 'Alpaca live...",True
7,How were liveweight changes calculated and wha...,[25 \n20 \n1s e \ni \nIi \n'0 j \n5 \n0 \nFig....,Liveweight changes were calculated using the f...,multi_context,"[{'page_label': '2', 'file_name': 'Alpaca live...",True
8,What is the relationship between the number of...,"[,12 ± 0,01 0,13 ± 0,02 0,16 ± 0,05 0,14 ± ...",The relationship between the number of vaccine...,simple,"[{'page_label': '3', 'file_name': 'Antibody re...",True
9,How do llama antibody responses differ from th...,[Bentancor et al - perfringens epsilon -toxin ...,Llama antibody responses differ from those of ...,simple,"[{'page_label': '4', 'file_name': 'Antibody re...",True


In [33]:
import re
pattern = r"[^a-zA-Z0-9.,? ]"

# Define a function to replace special characters in a string
def remove_special_chars(s):
    return re.sub(pattern, '', str(s))

# Apply the function to each cell in the DataFrame
combined_golden_df = combined_golden_df.applymap(remove_special_chars)
combined_golden_df

  combined_golden_df = combined_golden_df.applymap(remove_special_chars)


Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What were the radiographic findings in the lun...,the morning but decreased to 87 by midafterno...,The radiographic findings in the lungs of the ...,simple,"pagelabel 785, filename Acute respiratory dist...",True
1,What is the purpose of using Protein A conjuga...,Bentancor et al perfringens epsilon toxin in ...,Protein A conjugated to horse radish peroxidas...,simple,"pagelabel 2, filename Antibody response to the...",True
2,What clinical criteria were used to diagnose a...,the morning but decreased to 87 by midafterno...,The clinical criteria used to diagnose acute r...,reasoning,"pagelabel 785, filename Acute respiratory dist...",True
3,What was the purpose of including unvaccinated...,10 . In nthis report we describe the character...,The purpose of including unvaccinated controls...,multicontext,"pagelabel 1, filename Antibody response to the...",True
4,What is the composition of herbaceous species ...,25 n20 n1s e ni nIi n0 j n5 n0 nFig. 1. Averag...,The composition of herbaceous species in the h...,simple,"pagelabel 2, filename Alpaca liveweight variat...",True
5,How did the liveweight of the alpaca flock cha...,f nFig. 3. Liveweight variation in alpaca floc...,The liveweight of the alpaca flock increased f...,simple,"pagelabel 3, filename Alpaca liveweight variat...",True
6,How was the annual fiber production analyzed i...,25 n20 n1s e ni nIi n0 j n5 n0 nFig. 1. Averag...,The annual fiber production was analyzed throu...,reasoning,"pagelabel 2, filename Alpaca liveweight variat...",True
7,How were liveweight changes calculated and wha...,25 n20 n1s e ni nIi n0 j n5 n0 nFig. 1. Averag...,Liveweight changes were calculated using the f...,multicontext,"pagelabel 2, filename Alpaca liveweight variat...",True
8,What is the relationship between the number of...,",12 0,01 0,13 0,02 0,16 0,05 0,14 0,04 ...",The relationship between the number of vaccine...,simple,"pagelabel 3, filename Antibody response to the...",True
9,How do llama antibody responses differ from th...,Bentancor et al perfringens epsilon toxin in ...,Llama antibody responses differ from those of ...,simple,"pagelabel 4, filename Antibody response to the...",True


### Save golden dataset to S3

In [35]:
import io
csv_buffer = io.StringIO()
combined_golden_df.to_csv(csv_buffer, index=False)

s3.put_object(Bucket=BUCKET_NAME, Body=csv_buffer.getvalue(), Key='golden_dataset.csv')

{'ResponseMetadata': {'RequestId': '17C4EE3C788A15DE',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"faa2946ff0a9a10f7298b834bb1a3faf"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '17C4EE3C788A15DE',
   'x-content-type-options': 'nosniff',
   'x-xss-protection': '1; mode=block',
   'date': 'Wed, 10 Apr 2024 13:28:21 GMT'},
  'RetryAttempts': 0},
 'ETag': '"faa2946ff0a9a10f7298b834bb1a3faf"'}

![](assets/golden_dataset.png)