In [13]:
# Prompty file of the evaluator prompt that seeds the search.
seed_prompt_path = "evaluator.prompty"

# JSONL data set that is evaluated and that also carries the human labels.
# Raw string: the backslashes are literal path separators, and "\i" / "\e"
# are not valid escape sequences in a normal string literal.
input_data = r"data\input_data\evaluator_alignment_data.jsonl"

# Prompty file used to generate improved variants of a prompt.
self_improver_prompt = "self_improver.prompty"

In [14]:
from dotenv import load_dotenv
load_dotenv()

from evaluator import evaluator
from azure.ai.evaluation import evaluate
import pandas as pd
import json
import os
import prompty.azure
from sklearn.metrics import roc_auc_score


In [15]:
# execute the prompt
def get_eval(prompt_path, data_path):
    """Run the evaluator over a data file and return the per-row results.

    Args:
        prompt_path: Value handed to the evaluator as its ``prompt_path``
            argument.
        data_path: Data file passed to ``evaluate``; its ``question``,
            ``answer`` and ``context`` fields are mapped to the evaluator
            inputs of the same names.

    Returns:
        A DataFrame built from ``evaluate_results["rows"]`` with two extra
        columns, ``'chain of thought'`` and ``'following guidelines'``,
        extracted from the JSON text in ``'outputs.eval.output'``.

    Raises:
        json.JSONDecodeError: If an evaluator output is not valid JSON.
        KeyError: If the output column or one of the two JSON keys is missing.
    """
    evaluate_results = evaluate(
        data=data_path,
        evaluators={
            "eval": evaluator
        },
        evaluator_config={
            "default": {
                "question": "${data.question}",
                "answer": "${data.answer}",
                "context": "${data.context}",
                "prompt_path": prompt_path,
            },
        },
    )

    eval_result = pd.DataFrame(evaluate_results["rows"])

    # Decode every JSON output exactly once and reuse it for both columns.
    parsed_outputs = [json.loads(raw) for raw in eval_result['outputs.eval.output']]

    # Extract 'chain of thought' from the decoded outputs
    eval_result['chain of thought'] = [parsed['chain of thought'] for parsed in parsed_outputs]

    # Extract 'following guidelines'
    eval_result['following guidelines'] = [parsed['following guidelines'] for parsed in parsed_outputs]

    return eval_result


In [16]:
def sample_results(results, number_of_correct_samples=5, number_of_incorrect_samples=5):
    """Draw random rows from the correctly and the incorrectly evaluated groups.

    Args:
        results: DataFrame with a boolean ``'correct evaluated'`` column.
        number_of_correct_samples: Maximum number of rows to draw where the
            column is True.
        number_of_incorrect_samples: Maximum number of rows to draw where the
            column is False.

    Returns:
        A DataFrame with the sampled correct rows followed by the sampled
        incorrect rows. When a group holds fewer rows than requested, all of
        its rows are returned instead of raising ``ValueError``.
    """
    def _draw(group, requested):
        # DataFrame.sample() refuses to draw more rows than exist (without
        # replacement), so cap the request at the group size.
        count = min(requested, len(group))
        if count == 0:
            return group.iloc[0:0]
        return group.sample(count)

    correct_samples = _draw(results[results['correct evaluated'] == True], number_of_correct_samples)
    incorrect_samples = _draw(results[results['correct evaluated'] == False], number_of_incorrect_samples)

    return pd.concat([correct_samples, incorrect_samples])

In [17]:
# Configuration passed to prompty.execute(). Only the API key is set; the
# endpoint and API version entries are currently disabled.
model_config = {
    # "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    # "api_version": os.environ["AZURE_OPENAI_API_VERSION"],
    "api_key": os.environ["AZURE_OPENAI_KEY"],
}

# Construct the absolute path of the self-improver prompty file, relative to
# the current working directory. The file name comes from the notebook-level
# `self_improver_prompt` setting so it is defined in one place only.
prompty_file_path = os.path.join(os.getcwd(), self_improver_prompt)

def sample_new_prompts(prompt_to_expand, results, number=2):
    """Generate ``number`` new prompt candidates from an existing prompt.

    For each candidate the prompty file at ``prompty_file_path`` is executed
    with the prompt to expand and a fresh sample of one correct and one
    incorrect result row. The response is decoded as JSON and its
    ``'new prompt'`` entry is collected.

    Args:
        prompt_to_expand: Passed to the prompty file as the ``prompt`` input.
        results: Benchmarked results handed to ``sample_results``.
        number: How many candidates to generate.

    Returns:
        A list with the ``'new prompt'`` value of each response.
    """
    def _generate_candidate():
        # execute the prompty file
        raw_response = prompty.execute(
            prompty_file_path,
            inputs={
                "prompt": prompt_to_expand,
                "example_results": sample_results(results, 1, 1),
            },
            configuration=model_config,
        )
        return json.loads(raw_response)['new prompt']

    return [_generate_candidate() for _ in range(number)]

In [18]:
def compare_to_human_labels(results, human_labels):
    """Attach the human labels to ``results`` and flag agreeing rows.

    The frame is modified in place: ``'human_label'`` receives
    ``human_labels`` and ``'correct evaluated'`` is True where
    ``'following guidelines'`` equals the human label.

    Returns:
        The same ``results`` object that was passed in.
    """
    results['human_label'] = human_labels
    agrees_with_human = results['following guidelines'].eq(results['human_label'])
    results['correct evaluated'] = agrees_with_human
    return results

In [19]:
def get_auc(results):
    """Return ``roc_auc_score`` of the evaluator verdicts against the human labels.

    ``'human_label'`` is passed as the first argument and
    ``'following guidelines'`` as the second.
    """
    human_labels = results['human_label']
    evaluator_verdicts = results['following guidelines']
    return roc_auc_score(human_labels, evaluator_verdicts)

In [20]:
# 0. Evaluate seed prompt

# 1. Calculate AUC of seed prompt

# 2. Generate new prompts

# 3. Evaluate new prompts

# 4. Calculate UCT

# 5. Select prompt to expand

# 6. Repeat steps 2-5 until the stopping criterion is met

In [21]:
def get_human_labels(input_data):
    """Read the ``human_label`` of every record in a JSON Lines file.

    Args:
        input_data: Path of a JSONL file; every record must contain a
            ``"human_label"`` entry.

    Returns:
        A list of booleans, one per non-blank line, in file order. A JSON
        boolean is used as is; any other value is converted to text and is
        True when, stripped and lower-cased, it equals ``'true'``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"human_label"`` entry.
    """
    human_labels = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(input_data, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            raw_label = json.loads(line)["human_label"]
            if isinstance(raw_label, bool):
                human_labels.append(raw_label)
            else:
                human_labels.append(str(raw_label).strip().lower() == 'true')
    return human_labels

In [22]:
import math

class MCTSNode:
    """One node of the prompt search tree.

    Attributes are plain data: the prompt, its score (``value``), the
    benchmarked results it was scored on, a link to the parent node, the list
    of child nodes and a visit counter that starts at zero.
    """

    def __init__(self, prompt, value, results_benchmarked, parent=None):
        """Store the node payload and start with no children and no visits."""
        # Payload
        self.prompt = prompt
        self.value = value
        self.results_benchmarked = results_benchmarked
        # Tree links
        self.parent = parent
        self.children = []
        # Search bookkeeping
        self.visits = 0

    def add_child(self, child):
        """Append ``child`` to this node's list of children."""
        self.children.append(child)


class MCTSTree:
    def __init__(self, input_data, root_prompt, value, results_benchmarked, max_expansions=2):
        self.root = MCTSNode(prompt=root_prompt, value=value, results_benchmarked=results_benchmarked)
        self.input_data = input_data
        self.human_labels = get_human_labels(input_data)
        self.max_expansions = max_expansions
        self.expansions= 0

    def add_child(self, parent_node, child_prompt, value):
        child_node = MCTSNode(child_prompt, parent=parent_node, value=value)
        parent_node.add_child(child_node)
        return child_node
    
    def select_node(self):
        current_node = self.root
        nodes_traversed = [current_node]
        # traverse down until a leaf node
        while current_node.children:
            current_node = self._uct_select(current_node)
            nodes_traversed.append(current_node)
        # Add a visit to all parent nodes to the selected leaf node
        for node in nodes_traversed:
            node.visits += 1
        return current_node
    
    def _uct_select(node, c=1.41):
        best_child = None
        best_uct_value = -float('inf')
        for child in node.children:
            # If child not explored yet, pick it immediately
            if child.visits == 0:
                return child
            exploitation = child.value / child.visits
            exploration = c * math.sqrt(math.log(node.visits + 1) / (child.visits + 1))
            uct_value = exploitation + exploration
            if uct_value > best_uct_value:
                best_uct_value = uct_value
                best_child = child
        return best_child
    
    def expand_node(self, node):
        # Generate new prompts
        new_prompts = sample_new_prompts(node.prompt, node.results_benchmarked, number=2)
        for prompt in new_prompts:
            
            print(prompt)

            result = get_eval(prompt['new prompt'], input_data)

            result_benchmarked = compare_to_human_labels(result, self.human_labels)

            auc = get_auc(result_benchmarked)

            leaf_node = MCTSNode(prompt, parent=node, value=auc, results_benchmarked=result_benchmarked)
            
            self.add_child(node, leaf_node)
    
    def expand_tree(self):
        node = self.select_node()
        self.expand_node(node)

    def _find_best_node(self):
        best_node = self.root
        queue = [self.root]
        while queue:
            current = queue.pop(0)
            if current.value > best_node.value:
                best_node = current
            queue.extend(current.children)
        return best_node
    
    def run_mcts(self):
        while self.expansions < self.max_expansions:
            self.expand_tree()
            self.expansions += 1
        
        return self._find_best_node()


In [23]:
# Evaluate the seed prompt on the input data.
result = get_eval(seed_prompt_path, input_data)

# Load the human labels from the same data file.
human_labels = get_human_labels(input_data)

# Attach the labels and flag the rows where the evaluator agrees with them.
result_benchmarked = compare_to_human_labels(result, human_labels)

# Score of the seed prompt; becomes the value of the tree's root node.
auc = get_auc(result_benchmarked)

# Build the search tree with the seed prompt as its root.
mctsTree = MCTSTree(
                input_data=input_data,
                root_prompt=seed_prompt_path,
                value=auc,
                results_benchmarked=result_benchmarked,
                max_expansions=2
            )

[2024-12-22 15:50:29 +0100][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run evaluator_evaluator_b0n0q2c8_20241222_155029_314444, log path: C:\Users\albinlnnflt\.promptflow\.runs\evaluator_evaluator_b0n0q2c8_20241222_155029_314444\logs.txt


Prompt flow service has started...
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=evaluator_evaluator_b0n0q2c8_20241222_155029_314444
2024-12-22 15:50:29 +0100   33264 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2024-12-22 15:50:29 +0100   33264 execution.bulk     INFO     The timeout for the batch run is 3600 seconds.
2024-12-22 15:50:29 +0100   33264 execution.bulk     INFO     Current system's available memory is 10275.921875MB, memory consumption of current process is 821.296875MB, estimated available worker count is 10275.921875/821.296875 = 12
2024-12-22 15:50:29 +0100   33264 execution.bulk     INFO     Set process count to 4 by taking the minimum value among the factors of {'default_worker_count': 4, 'row_count': 8, 'estimated_worker_count_based_on_memory_usage': 12}.
2024-12-22 15:50:33 +0100   33264 execution.bulk     INFO     Process name(SpawnProcess-7)-Process id(26652)-L

In [24]:
# run_mcts() returns the highest-valued tree node, not the prompt text;
# the prompt is stored in the node's `.prompt` attribute.
best_prompt = mctsTree.run_mcts()

evaluator.prompty 

 Ensure that the chatbot's response is polite, clear, and informative. Focus specifically on maintaining a neutral tone while providing factual and accurate information. Responses should be concise and directly address the user's question. Double-check for any compliance with communication guidelines to bolster user confidence in the chatbot's responses.


TypeError: string indices must be integers, not 'str'