## Distributed Golden Dataset Creation

### Environment Setup

In [1]:
import os
import ray

# os.environ['OPENAI_API_KEY'] = 'sk_...'
os.environ['RAY_ADDRESS'] = 'ray://localhost:10001'

runtime_env = {
    'pip': [
        'llama-index==0.10.27',
        'boto3==1.34.79',
        'botocore==1.34.79',
        'ipython==8.18.1',
        'pandas==2.2.1',
        'ragas==0.1.7',
        'pypdf2==3.0.1',
        'boto3==1.34.79',
        'langchain==0.1.14',
        'unstructured==0.13.2'
        
    ],
    "env_vars": {
        'AWS_ACCESS_KEY_ID': os.environ['AWS_ACCESS_KEY_ID'],
        'AWS_SECRET_ACCESS_KEY': os.environ['AWS_SECRET_ACCESS_KEY'],
        'HUGGINGFACE_API_TOKEN': os.environ['HUGGINGFACE_API_TOKEN'],
        'OPENAI_API_KEY': os.environ['OPENAI_API_KEY'],
    }

}

ray.init(runtime_env=runtime_env, include_dashboard=True, log_to_driver=False)

2024-04-10 15:30:35,548	INFO worker.py:1432 -- Using address ray://localhost:10001 set in the environment variable RAY_ADDRESS
2024-04-10 15:30:35,578	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: include_dashboard, log_to_driver
SIGTERM handler is not set because current thread is not the main thread.


0,1
Python version:,3.9.18
Ray version:,2.10.0
Dashboard:,http://10.10.255.231:8265


Log channel is reconnecting. Logs produced while the connection was down can be found on the head node of the cluster in `ray_client_server_[port].out`


### Distributed Testset Generation (Ragas + Ray)

In [2]:
BUCKET_NAME = 'unstructured-data'

In [3]:
import boto3
import os

s3 = boto3.client(
        's3',
        endpoint_url='http://localhost:9000',
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY']
    )

def list_files_in_bucket(bucket_name):
    file_paths = []
    paginator = s3.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket_name)

    for page in page_iterator:
        if "Contents" in page:
            for obj in page['Contents']:
                if obj['Key'].lower().endswith('.pdf'):  # Check if the file is a PDF
                    file_paths.append(obj['Key'])

    return file_paths

In [4]:
@ray.remote
def create_golden_dataset(bucket_name, file_names):
    import os
    import boto3
    import fitz  # PyMuPDF
    import shutil
    import time
    import random
    from llama_index.core import SimpleDirectoryReader
    from ragas.testset.generator import TestsetGenerator
    from langchain_openai import ChatOpenAI, OpenAIEmbeddings

    DIRECTORY_NAME = os.path.join(os.getcwd(), 'data')
    s3_client = boto3.client(
        's3',
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
        endpoint_url='http://minio.minio.svc:9000'
    )
    
    if not os.path.exists(DIRECTORY_NAME):
        os.makedirs(DIRECTORY_NAME)
    
    for file_name in file_names:
        LOCAL_FILE_PATH = os.path.join(DIRECTORY_NAME, file_name)
        
        # Fetch the PDF file from S3
        pdf_file = s3_client.get_object(Bucket=bucket_name, Key=file_name)
        pdf_content = pdf_file['Body'].read()
        
        with open(LOCAL_FILE_PATH, 'wb') as f:
            f.write(pdf_content)

    reader = SimpleDirectoryReader(input_dir=DIRECTORY_NAME)
    documents = reader.load_data()
    
    # Generator with openai models
    generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
    critic_llm = ChatOpenAI(model="gpt-3.5-turbo")
    embeddings = OpenAIEmbeddings()
    
    generator = TestsetGenerator.from_langchain(
        generator_llm,
        critic_llm,
        embeddings
    )
    
    # Generate testset
    try:
        # To get past rate limits
        time.sleep(random.uniform(30, 60))
        testset = generator.generate_with_llamaindex_docs(documents, test_size=3)
    except:
        testset = None
        print("Uh oh!")
    
    shutil.rmtree(DIRECTORY_NAME)
    print(f"Directory {DIRECTORY_NAME} and its contents have been deleted.")
        
    return testset

In [5]:
pdfs_on_s3 = list_files_in_bucket(BUCKET_NAME)
futures = []
# Process up to 3 PDFs at a time
for i in range(0, len(pdfs_on_s3), 3):
    batch = pdfs_on_s3[i:i+3]
    future = create_golden_dataset.remote(BUCKET_NAME, batch)
    futures.append(future)

combined_golden_testsets = ray.get(futures)

In [6]:
import pandas as pd
combined_golden_df = pd.DataFrame()
for testset in combined_golden_testsets:
    combined_golden_df = pd.concat([combined_golden_df, testset.to_pandas()], ignore_index=True)

combined_golden_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What method was used to determine the antitoxi...,[Bentancor et al - perfringens epsilon -toxin ...,Indirect ELISA was used to determine if indire...,simple,"[{'page_label': '2', 'file_name': 'Antibody re...",True
1,How were liveweight changes calculated and mon...,[25 \n20 \n1s e \ni \nIi \n'0 j \n5 \n0 \nFig....,Liveweight changes were calculated monthly usi...,simple,"[{'page_label': '2', 'file_name': 'Alpaca live...",True
2,What changes occurred in the cria's respirator...,[ the morning but decreased to 87% by mid-afte...,The cria's respiratory condition deteriorated ...,reasoning,"[{'page_label': '785', 'file_name': 'Acute res...",True
3,What are the mean monthly temperatures in June...,[e hasta alcanzar valores de 75 g dim-1 a 10s ...,June is the coldest month with a mean monthly ...,multi_context,"[{'page_label': '1', 'file_name': 'Alpaca live...",True
4,What factors contributed to the variation in v...,[TABLE I. VITAMIN E CONCENTRATIONS (MG/ KG) O...,The variation in vitamin E concentrations in t...,simple,"[{'page_label': '3', 'file_name': 'Evaluation ...",True
5,How was the rupture of the gastrocnemius muscl...,[Conservativemanagementofaruptured\ngastrocnem...,The rupture of the gastrocnemius muscle in a m...,simple,"[{'page_label': '1', 'file_name': 'Conservativ...",True
6,How did cholesterol levels in female alpacas c...,[ \nconcentrations were different within the m...,Cholesterol concentrations were found to be di...,multi_context,"[{'page_label': '2', 'file_name': 'Evaluation ...",True
7,What are some potential treatment options for ...,[treatment did not appear to result in direct ...,Surgical plication of the diaphragm is a poten...,simple,"[{'page_label': '385', 'file_name': 'Neurologi...",True
8,What is the publication date of the article on...,[oup.com/af/article/12/4/78/6663962 by guest o...,,simple,"[{'page_label': '80', 'file_name': 'Influence ...",True
9,What pH range indicates respiratory acidosis i...,[rit (24%). There\nwere no other signs indicat...,Respiratory acidosis in alpacas is indicated b...,reasoning,"[{'page_label': '381', 'file_name': 'Neurologi...",True


In [7]:
import re
pattern = r"[^a-zA-Z0-9.,? ]"

# Define a function to replace special characters in a string
def remove_special_chars(s):
    return re.sub(pattern, '', str(s))

# Apply the function to each cell in the DataFrame
combined_golden_df = combined_golden_df.map(remove_special_chars)
combined_golden_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What method was used to determine the antitoxi...,Bentancor et al perfringens epsilon toxin in ...,Indirect ELISA was used to determine if indire...,simple,"pagelabel 2, filename Antibody response to the...",True
1,How were liveweight changes calculated and mon...,25 n20 n1s e ni nIi n0 j n5 n0 nFig. 1. Averag...,Liveweight changes were calculated monthly usi...,simple,"pagelabel 2, filename Alpaca liveweight variat...",True
2,What changes occurred in the crias respiratory...,the morning but decreased to 87 by midafterno...,The crias respiratory condition deteriorated f...,reasoning,"pagelabel 785, filename Acute respiratory dist...",True
3,What are the mean monthly temperatures in June...,e hasta alcanzar valores de 75 g dim1 a 10s 8....,June is the coldest month with a mean monthly ...,multicontext,"pagelabel 1, filename Alpaca liveweight variat...",True
4,What factors contributed to the variation in v...,TABLE I. VITAMIN E CONCENTRATIONS MG KG ON A ...,The variation in vitamin E concentrations in t...,simple,"pagelabel 3, filename Evaluation of cholestero...",True
5,How was the rupture of the gastrocnemius muscl...,Conservativemanagementofarupturedngastrocnemiu...,The rupture of the gastrocnemius muscle in a m...,simple,"pagelabel 1, filename Conservative management ...",True
6,How did cholesterol levels in female alpacas c...,nconcentrations were different within the mal...,Cholesterol concentrations were found to be di...,multicontext,"pagelabel 2, filename Evaluation of cholestero...",True
7,What are some potential treatment options for ...,treatment did not appear to result in direct i...,Surgical plication of the diaphragm is a poten...,simple,"pagelabel 385, filename Neurological Causes of...",True
8,What is the publication date of the article on...,oup.comafarticle124786663962 by guest on 09 Oc...,,simple,"pagelabel 80, filename Influence of effects on...",True
9,What pH range indicates respiratory acidosis i...,rit 24. Therenwere no other signs indicative o...,Respiratory acidosis in alpacas is indicated b...,reasoning,"pagelabel 381, filename Neurological Causes of...",True


### Save golden dataset to S3

In [8]:
import io
csv_buffer = io.StringIO()
combined_golden_df.to_csv(csv_buffer, index=False)

s3.put_object(Bucket=BUCKET_NAME, Body=csv_buffer.getvalue(), Key='golden_dataset.csv')

{'ResponseMetadata': {'RequestId': '17C4F5777F822675',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"083f923f435ca42369d80875a11b7f6a"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '17C4F5777F822675',
   'x-content-type-options': 'nosniff',
   'x-xss-protection': '1; mode=block',
   'date': 'Wed, 10 Apr 2024 15:40:51 GMT'},
  'RetryAttempts': 0},
 'ETag': '"083f923f435ca42369d80875a11b7f6a"'}

![](assets/golden_dataset.png)