### Setup Environment

In [8]:
import os
import ray

def init_ray():
    """Restart the Ray connection with the runtime environment the experiments need.

    Any existing Ray session is shut down, ``RAY_ADDRESS`` is pointed at
    ``ray://localhost:10001`` and ``ray.init`` is called with a runtime
    environment made of pinned pip packages plus credentials copied from the
    driver's environment variables.

    Raises:
        KeyError: if one or more of the required credential environment
            variables is not set on the driver. All missing names are listed.
    """
    # Credentials are always read from the driver's environment; never hardcode
    # them in the notebook (e.g. export OPENAI_API_KEY before starting Jupyter).
    required_env_vars = (
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'HUGGINGFACE_API_TOKEN',
        'OPENAI_API_KEY',
    )
    # Validate before touching the current session so that a missing variable
    # does not leave the notebook with Ray shut down and nothing re-initialised.
    missing = [name for name in required_env_vars if name not in os.environ]
    if missing:
        raise KeyError(
            'Missing required environment variable(s): ' + ', '.join(missing)
        )

    ray.shutdown()
    os.environ['RAY_ADDRESS'] = 'ray://localhost:10001'

    runtime_env = {
        # Each requirement is pinned exactly once.
        'pip': [
            'llama-index==0.10.27',
            'boto3==1.34.79',
            'botocore==1.34.79',
            'ipython==8.18.1',
            'pandas==2.2.1',
            'ragas==0.1.7',
            'llama-index-embeddings-huggingface==0.2.0',
            'llama-index-embeddings-openai==0.1.7',
            'llama-index-llms-openai==0.1.14',
        ],
        # Forward the driver's credentials into the runtime environment.
        'env_vars': {name: os.environ[name] for name in required_env_vars},
    }

    ray.init(runtime_env=runtime_env, include_dashboard=True, log_to_driver=False)

# init_ray()

In [8]:
from ray import tune, train

def train_model(config):
    """Toy trainable: report a synthetic ``accuracy`` for ten iterations.

    Args:
        config: mapping providing numeric ``'alpha'`` and ``'beta'`` entries.
    """
    total_iterations = 10  # number of simulated training steps
    step = 0
    while step < total_iterations:
        # Hand the metric for this step back to Tune.
        train.report({'accuracy': (step + config['alpha']) * config['beta']})
        step += 1

# Search space for the toy experiment. `tune.choice` samples one entry from the
# given list each time the space is sampled.
config = {
    'alpha': tune.choice([0.1, 0.2, 0.3]),  # one of three candidate alphas
    'beta': tune.choice([0.5, 2.0]),  # one of the two listed values (0.5 or 2.0), not a continuous range
}

# Run the experiment
analysis = tune.run(
    train_model,
    config=config,
    num_samples=1,  # Number of times to sample from the hyperparameter space
    verbose=1  # Verbosity level
)

# Get the best hyperparameters (the config of the trial with the highest reported accuracy)
best_config = analysis.get_best_config(metric="accuracy", mode="max")
print("Best config: ", best_config)

# You can also access a dataframe with the results:
df = analysis.results_df
df

0,1
Current time:,2024-04-14 13:33:43
Running for:,00:00:02.32
Memory:,12.3/31.3 GiB

Trial name,status,loc,alpha,beta,iter,total time (s),accuracy
train_model_9b919_00000,TERMINATED,139.59.90.44:3668643,0.1,0.5,10,0.00365186,4.55


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2024-04-14 13:33:43,720	INFO tune.py:1016 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/train_model_2024-04-14_13-33-41' in 0.0085s.
2024-04-14 13:33:43,731	INFO tune.py:1048 -- Total run time: 2.37 seconds (2.31 seconds for the tuning loop).


Best config:  {'alpha': 0.1, 'beta': 0.5}


Unnamed: 0_level_0,accuracy,timestamp,checkpoint_dir_name,done,training_iteration,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore,experiment_tag,config/alpha,config/beta
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9b919_00000,4.55,1713101623,,True,10,2024-04-14_13-33-43,0.000222,0.003652,3668643,goku,139.59.90.44,0.003652,10,"0_alpha=0.1000,beta=0.5000",0.1,0.5


### Helper Functions

In [9]:
from llama_index.core.node_parser import HierarchicalNodeParser, SimpleNodeParser, SentenceWindowNodeParser
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core import Settings, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import SimpleDirectoryReader
import pandas as pd
import boto3
import io
from ragas.metrics import (
    faithfulness, 
    answer_relevancy, 
    context_precision, 
    context_recall, 
    answer_similarity, 
    answer_correctness
)
from datasets import Dataset
from ragas import evaluate as ragas_evaluate
import time
import random



def list_files_in_bucket(s3_client, bucket_name):
    """Return the keys of every PDF object in ``bucket_name``.

    Walks all pages of the ``list_objects_v2`` paginator and keeps the keys
    whose name ends in ``.pdf`` (case-insensitive).

    Args:
        s3_client: client object exposing ``get_paginator``.
        bucket_name: name of the bucket to list.

    Returns:
        list[str]: matching object keys, in listing order.
    """
    pages = s3_client.get_paginator('list_objects_v2').paginate(Bucket=bucket_name)
    return [
        entry['Key']
        for page in pages
        if "Contents" in page  # pages without this entry contribute nothing
        for entry in page['Contents']
        if entry['Key'].lower().endswith('.pdf')
    ]


def load_gds(s3_client, bucket_name, gds_csv):
    """Fetch a CSV object from S3 and parse it into a DataFrame.

    Args:
        s3_client: client object exposing ``get_object``.
        bucket_name: bucket that holds the CSV.
        gds_csv: object key of the CSV file.

    Returns:
        pandas.DataFrame: the parsed CSV contents.
    """
    response = s3_client.get_object(Bucket=bucket_name, Key=gds_csv)
    raw_bytes = response['Body'].read()
    return pd.read_csv(io.BytesIO(raw_bytes))


def query_engine_picker(query_engine_identifier, data_dir, embed_model, llm):
    """Build a LlamaIndex query engine over the documents found in ``data_dir``.

    The global ``Settings`` object is updated with the given LLM and embedding
    model and a chunk size of 256 before any index is built.

    Args:
        query_engine_identifier: ``'hierarchical'`` or ``'sentence_window'``;
            any other value selects the basic retrieval pipeline.
        data_dir: directory handed to ``SimpleDirectoryReader``.
        embed_model: embedding model assigned to ``Settings.embed_model``.
        llm: LLM assigned to ``Settings.llm``.

    Returns:
        The query engine for the selected retrieval strategy.
    """
    documents = SimpleDirectoryReader(data_dir).load_data()

    Settings.llm = llm
    Settings.embed_model = embed_model
    Settings.chunk_size = 256

    use_hierarchical = query_engine_identifier == 'hierarchical'
    use_sentence_window = query_engine_identifier == 'sentence_window'

    # Step 1: choose the node parser for the requested strategy.
    if use_hierarchical:
        print('Initialising Hierarchical Retrieval...')
        parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[256, 512, 1024])
    elif use_sentence_window:
        print('Initialising Sentence Window Retrieval...')
        parser = SentenceWindowNodeParser.from_defaults(
            window_size=3,
            window_metadata_key="window",
            original_text_metadata_key="original_text",
        )
    else:  # basic
        print('Initialising Basic Retrieval...')
        parser = SimpleNodeParser.from_defaults()

    # Step 2: every strategy parses the documents and indexes the resulting nodes.
    parsed_nodes = parser.get_nodes_from_documents(documents)
    print('Creating Vector Store Index...')
    vector_index = VectorStoreIndex(parsed_nodes)

    # Step 3: wrap the index in the strategy-specific query engine.
    if use_hierarchical:
        merging_retriever = AutoMergingRetriever(
            vector_index.as_retriever(similarity_top_k=3),
            storage_context=vector_index.storage_context,
            verbose=True,
        )
        print('Creating Query Engine...')
        return RetrieverQueryEngine.from_args(merging_retriever)

    if use_sentence_window:
        # The post-processor reads the same "window" metadata key the parser wrote.
        window_postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")
        return vector_index.as_query_engine(
            similarity_top_k=3,
            node_postprocessors=[window_postprocessor],
        )

    return vector_index.as_query_engine(similarity_top_k=3)


def embed_model_picker(embed_model_identifier):
    """Create the OpenAI embedding model named by ``embed_model_identifier``."""
    print('Initialising embedding model...')
    embedding_model = OpenAIEmbedding(model=embed_model_identifier)
    return embedding_model


def llm_picker(llm_identifier):
    """Create the OpenAI LLM named by ``llm_identifier`` with temperature 0.1."""
    print('Initialising LLM...')
    language_model = OpenAI(model=llm_identifier, temperature=0.1)
    return language_model

def evaluator(gds_df, query_engine, max_questions=10, cooldown_choices=(60, 120, 180)):
    """Run the query engine on golden-dataset questions and score it with ragas.

    Args:
        gds_df: DataFrame with ``'question'`` and ``'ground_truth'`` columns.
        query_engine: object whose ``query(str)`` result exposes ``.response``
            and ``.source_nodes`` (each with ``.node.get_content()``).
        max_questions: number of leading rows to evaluate; ``None`` evaluates
            every row. Defaults to 10.
        cooldown_choices: candidate pause lengths in seconds; one is picked at
            random and slept for before scoring starts. An empty sequence
            disables the pause. Defaults to ``(60, 120, 180)``.

    Returns:
        Whatever ``ragas_evaluate`` returns for the assembled dataset.
    """
    # List of evaluation metrics functions to be used.
    metrics = [
        faithfulness,           # Evaluates faithfulness of the response to the source material.
        answer_relevancy,       # Assesses relevance of the response to the query.
        context_precision,      # Measures precision of the context in the response.
        context_recall,         # Measures recall of the context in the response.
        answer_correctness,     # Checks correctness of the answer.
        answer_similarity,      # Evaluates similarity of the answer to a reference answer.
    ]

    # Slice once so questions and ground truths always come from the same rows.
    sample_df = gds_df if max_questions is None else gds_df.head(max_questions)
    test_questions = sample_df['question'].fillna('').astype(str).tolist()
    test_answers = sample_df['ground_truth'].fillna('').astype(str).tolist()

    answers = []
    contexts = []
    for question in test_questions:
        response = query_engine.query(question)
        answers.append(response.response)
        contexts.append([source.node.get_content() for source in response.source_nodes])

    dataset_dict = {
        'question': test_questions,
        'answer': answers,
        'contexts': contexts,
        'ground_truth': test_answers
    }

    ds = Dataset.from_dict(dataset_dict)

    # Random pause before scoring. NOTE(review): presumably staggers concurrent
    # trials to stay under API rate limits -- confirm with the author.
    if cooldown_choices:
        time.sleep(random.choice(list(cooldown_choices)))

    result = ragas_evaluate(ds, metrics)

    return result


In [10]:
def experiment(config):
    """Ray Tune trainable: build one RAG pipeline variant and report its scores.

    Loads the golden dataset and all PDFs from S3, builds the LlamaIndex
    pipeline selected by ``config``, runs one smoke-test query, evaluates the
    pipeline and reports the metrics to Tune.

    Args:
        config: mapping with the keys ``'s3_endpoint'``, ``'bucket_name'``,
            ``'gds_csv'``, ``'embed_model_identifier'``, ``'llm_identifier'``
            and ``'query_engine_identifier'``.
    """
    # load gds csv from s3
    print('Initialising S3 client...')
    s3_client = boto3.client(
        's3',
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
        endpoint_url=config['s3_endpoint']
    )
    print('Loading Golden Dataset...')
    gds_df = load_gds(s3_client, config['bucket_name'], config['gds_csv'])

    # load pdfs from s3
    print('Download pdfs...')
    DIRECTORY_NAME = os.path.join(os.getcwd(), 'data')
    os.makedirs(DIRECTORY_NAME, exist_ok=True)
    for pdf in list_files_in_bucket(s3_client, config['bucket_name']):
        # Keys may contain '/' (S3 "folders"). Flatten them into a single file
        # name so open() does not fail on a missing sub-directory and every PDF
        # sits directly inside the directory handed to the document reader.
        LOCAL_FILE_PATH = os.path.join(DIRECTORY_NAME, pdf.replace('/', '_'))

        # Fetch the PDF file from S3
        pdf_file = s3_client.get_object(Bucket=config['bucket_name'], Key=pdf)
        with open(LOCAL_FILE_PATH, 'wb') as f:
            f.write(pdf_file['Body'].read())

    # assemble llamaindex pipeline
    print('Initialising LlamaIndex pipeline...')
    embed_model = embed_model_picker(config['embed_model_identifier'])
    llm = llm_picker(config['llm_identifier'])
    query_engine = query_engine_picker(config['query_engine_identifier'], DIRECTORY_NAME, embed_model, llm)

    # Smoke test: one query to confirm the engine answers before the full evaluation.
    print(query_engine.query('How do camelid genetics influence wool quality?').response)

    print('Beginning evaluation...')
    try:
        evaluation_result = evaluator(gds_df, query_engine)
    except Exception as e:
        # Deliberate best-effort: a failed evaluation reports an empty result
        # instead of crashing the trial (mostly for debugging).
        print(f'Evaluation failed: {e!r}')
        evaluation_result = {}

    train.report(evaluation_result)
    # TODO: log results in mlflow


In [35]:
from ray import tune, train

init_ray() #helps to create a new job for each experiment instead of re-using the same session

# Search space for the RAG experiment: the three `tune.grid_search` entries
# span 3 query engines x 2 embedding models x 2 LLMs = 12 combinations; the
# S3 settings are fixed values passed through unchanged to `experiment`.
config = {
    's3_endpoint': 'http://minio.minio.svc:9000',
    'bucket_name': 'unstructured-data',
    'gds_csv': 'golden_dataset.csv',
    'query_engine_identifier': tune.grid_search(['hierarchical', 'sentence_window', 'basic']),
    'embed_model_identifier': tune.grid_search(['text-embedding-3-small', 'text-embedding-3-large']),
    'llm_identifier': tune.grid_search(['gpt-4', 'gpt-3.5-turbo'])
    
}


# Launch the trials; `analysis` is read by the following cells.
analysis = tune.run(
    experiment,
    config=config,
    num_samples=1,
    verbose=1              
)

![](assets/ray_tasks.png)

In [36]:
# Display the results DataFrame collected by Tune for the run above.
analysis.results_df

Unnamed: 0_level_0,faithfulness,answer_relevancy,context_precision,context_recall,answer_correctness,answer_similarity,timestamp,checkpoint_dir_name,done,training_iteration,...,node_ip,time_since_restore,iterations_since_restore,experiment_tag,config/s3_endpoint,config/bucket_name,config/gds_csv,config/query_engine_identifier,config/embed_model_identifier,config/llm_identifier
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27630_00000,0.9,0.87134,0.9,0.633333,0.650026,0.935107,1713077614,,True,1,...,10.10.255.234,229.04018,1,0_embed_model_identifier=text-embedding-3-smal...,http://minio.minio.svc:9000,unstructured-data,golden_dataset.csv,hierarchical,text-embedding-3-small,gpt-4
27630_00001,0.9,0.873009,0.9,0.633333,0.644999,0.929998,1713077673,,True,1,...,10.10.255.245,288.323461,1,1_embed_model_identifier=text-embedding-3-larg...,http://minio.minio.svc:9000,unstructured-data,golden_dataset.csv,hierarchical,text-embedding-3-large,gpt-4
27630_00002,0.9,0.97345,0.9,0.6,0.660225,0.934467,1713077741,,True,1,...,10.10.255.207,356.069403,1,2_embed_model_identifier=text-embedding-3-smal...,http://minio.minio.svc:9000,unstructured-data,golden_dataset.csv,hierarchical,text-embedding-3-small,gpt-3.5-turbo
27630_00003,0.977778,0.97369,0.9,0.6,0.647201,0.93094,1713077630,,True,1,...,10.10.255.247,244.456134,1,3_embed_model_identifier=text-embedding-3-larg...,http://minio.minio.svc:9000,unstructured-data,golden_dataset.csv,hierarchical,text-embedding-3-large,gpt-3.5-turbo
27630_00004,0.833333,0.872803,0.866667,0.816667,0.654322,0.933445,1713077630,,True,1,...,10.10.255.247,243.432015,1,4_embed_model_identifier=text-embedding-3-smal...,http://minio.minio.svc:9000,unstructured-data,golden_dataset.csv,sentence_window,text-embedding-3-small,gpt-4
27630_00005,0.972222,0.957777,0.9,0.816667,0.662781,0.930075,1713077974,,True,1,...,10.10.255.234,349.402413,1,5_embed_model_identifier=text-embedding-3-larg...,http://minio.minio.svc:9000,unstructured-data,golden_dataset.csv,sentence_window,text-embedding-3-large,gpt-4
27630_00006,0.922222,0.864686,0.916667,0.866667,0.687165,0.935326,1713078099,,True,1,...,10.10.255.247,454.76579,1,6_embed_model_identifier=text-embedding-3-smal...,http://minio.minio.svc:9000,unstructured-data,golden_dataset.csv,sentence_window,text-embedding-3-small,gpt-3.5-turbo
27630_00007,0.944444,0.862193,0.966667,0.916667,0.697976,0.935144,1713078081,,True,1,...,10.10.255.247,435.467988,1,7_embed_model_identifier=text-embedding-3-larg...,http://minio.minio.svc:9000,unstructured-data,golden_dataset.csv,sentence_window,text-embedding-3-large,gpt-3.5-turbo
27630_00008,0.916667,0.864794,0.933333,0.933333,0.673057,0.932944,1713077864,,True,1,...,10.10.255.245,180.303094,1,8_embed_model_identifier=text-embedding-3-smal...,http://minio.minio.svc:9000,unstructured-data,golden_dataset.csv,basic,text-embedding-3-small,gpt-4
27630_00009,0.927778,0.871978,0.933333,0.883333,0.7175,0.932856,1713077931,,True,1,...,10.10.255.207,178.146051,1,9_embed_model_identifier=text-embedding-3-larg...,http://minio.minio.svc:9000,unstructured-data,golden_dataset.csv,basic,text-embedding-3-large,gpt-4


In [38]:
# Persist the results so the MLflow logging section can run without re-running the trials.
analysis.results_df.to_csv('eval_results.csv') # checkpointing for later :)

## Logging distributed experiment results to MLFlow

In [5]:
import pandas as pd

# Reload the saved trial results and list the available columns.
results_df = pd.read_csv('eval_results.csv')
results_df.columns

Index(['trial_id', 'faithfulness', 'answer_relevancy', 'context_precision',
       'context_recall', 'answer_correctness', 'answer_similarity',
       'timestamp', 'checkpoint_dir_name', 'done', 'training_iteration',
       'date', 'time_this_iter_s', 'time_total_s', 'pid', 'hostname',
       'node_ip', 'time_since_restore', 'iterations_since_restore',
       'experiment_tag', 'config/s3_endpoint', 'config/bucket_name',
       'config/gds_csv', 'config/query_engine_identifier',
       'config/embed_model_identifier', 'config/llm_identifier'],
      dtype='object')

In [6]:
# minor cleanup: drop the leading path from "config/..." column names so the
# hyperparameter columns can be addressed by their bare names
results_df.columns = [
    column_name.rsplit('/', 1)[-1] if 'config/' in column_name else column_name
    for column_name in results_df.columns
]

In [10]:
import mlflow
from mlflow.data import from_pandas

# Point the MLflow client at the tracking server used by this notebook.
mlflow.set_tracking_uri('http://localhost:5000')

def log_experiments(
    df,
    experiment_name="distributed-rag-experiment",
    bucket_name='unstructured-data',
    gds_csv='golden_dataset.csv',
    s3_endpoint='http://localhost:9000',
    dataset_source='http://minio.minio.svc:9000/unstructured-data/golden_dataset.csv',
):
    """Log every row of ``df`` as one MLflow run in ``experiment_name``.

    Each run records the golden dataset as an input (for data lineage), the
    six evaluation metrics, the three pipeline hyperparameters and the bucket
    name.

    Args:
        df: results DataFrame with one row per trial. It must contain the
            ``query_engine_identifier``, ``embed_model_identifier`` and
            ``llm_identifier`` columns; metric columns that are absent or NaN
            for a row are skipped for that run.
        experiment_name: MLflow experiment that receives the runs.
        bucket_name: S3 bucket holding the golden dataset.
        gds_csv: object key of the golden dataset CSV.
        s3_endpoint: S3 endpoint URL reachable from the notebook.
        dataset_source: source URI recorded on the MLflow dataset.
    """
    mlflow.set_experiment(experiment_name)

    s3_client = boto3.client(
        's3',
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
        endpoint_url=s3_endpoint
    )

    gds_df = load_gds(s3_client, bucket_name, gds_csv)

    dataset = from_pandas(gds_df, source=dataset_source, name="Golden Dataset")

    # Loop-invariant column lists, built once rather than once per row.
    metrics = ['faithfulness', 'answer_relevancy', 'context_precision',
               'context_recall', 'answer_correctness', 'answer_similarity']
    parameters = ['query_engine_identifier', 'embed_model_identifier', 'llm_identifier']

    for _, row in df.iterrows():
        # The context manager ends the run on exit, so no explicit end_run().
        with mlflow.start_run():
            # log dataset for data lineage
            mlflow.log_input(dataset, context="gds_rag_eval")

            # log metrics; a trial whose evaluation failed has no (or NaN)
            # metric values, which are skipped instead of being logged
            for metric in metrics:
                if metric in row.index and pd.notna(row[metric]):
                    mlflow.log_metric(metric, float(row[metric]))

            # Log parameters
            for param in parameters:
                mlflow.log_param(param, row[param])

            # Log S3 bucket information as a parameter
            mlflow.log_param('s3_bucket', bucket_name)

# Log every trial in results_df (loaded from eval_results.csv above) to MLflow.
log_experiments(results_df)


  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]


![](assets/mlflow_4.png)
![](assets/mlflow_1.png)
![](assets/mlflow_2.png)
![](assets/mlflow_3.png)