In [None]:
import sys
import requests
import json 
from datetime import datetime, timedelta
import pandas as pd
from os.path import abspath, join

# Make the sibling ``../utils`` directory importable so ``shield_utils`` can be
# imported below. The path is resolved relative to the current working directory,
# and the membership check keeps re-running this cell from appending duplicates.
utils_path = abspath(join('..', 'utils'))
if utils_path not in sys.path:
    sys.path.append(utils_path)

from shield_utils import setup_env, get_token_usage, query_inferences

# Show full cell contents and every row when pandas objects are displayed
# (None disables the truncation limits).
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# Replace the placeholders with the real endpoint and key before running.
# NOTE(review): presumably this configures the connection used by the other
# shield_utils helpers — confirm against shield_utils. Avoid committing a real
# API key in the notebook; consider reading it from an environment variable.
setup_env(base_url="<URL>", api_key="<API_KEY>")

Local Helper Functions 

In [40]:
def bucket_inferences_by_day(inferences):
    """Group inference records by the calendar day of their ``created_at`` value.

    Args:
        inferences: iterable of mappings, each with a ``"created_at"`` entry
            holding an epoch timestamp in milliseconds.

    Returns:
        dict mapping ``datetime.date`` -> list of the records created on that
        day, in the order they were encountered.
    """
    by_day = {}
    for record in inferences:
        # Milliseconds -> seconds; fromtimestamp() without a tz argument
        # converts to the machine's local time, so day boundaries are local.
        created = datetime.fromtimestamp(record["created_at"] / 1000.0)
        by_day.setdefault(created.date(), []).append(record)
    return by_day

def get_prompt_and_response_inf_per_day(task_inferences_map_by_day):
    """Split day-bucketed inferences into per-day prompt and response lists.

    Args:
        task_inferences_map_by_day: dict mapping a day key -> list of inference
            mappings, each providing ``"inference_prompt"`` and
            ``"inference_response"`` entries.

    Returns:
        Tuple ``(prompt_inf_day_map, response_inf_day_map)``. Each maps a day
        to the list of truthy prompt (resp. response) entries for that day.
        A day that has at least one inference appears in both maps, possibly
        with an empty list; a day whose inference list is empty is omitted.

    Raises:
        KeyError: if an inference lacks either of the two expected keys.
    """
    prompts_by_day = {}
    responses_by_day = {}

    for day, day_inferences in task_inferences_map_by_day.items():
        for record in day_inferences:
            # Create both per-day lists on the first record of the day.
            day_prompts = prompts_by_day.setdefault(day, [])
            day_responses = responses_by_day.setdefault(day, [])

            prompt = record["inference_prompt"]
            response = record["inference_response"]

            # Skip missing/empty sides of the inference.
            if prompt:
                day_prompts.append(prompt)
            if response:
                day_responses.append(response)

    return prompts_by_day, responses_by_day

Set the following query parameters in the next cell:

- Start Time 
- End Time 
- Task Ids 
- Rule Types 

In [None]:
# Query window: `end` is one day after the current local time and `start`
# is seven days before `end`.
end = datetime.now() + timedelta(days=1)
start = end - timedelta(days=7)

# Placeholders — replace with the ids of the tasks to report on.
task_ids = ["<TASK1>", "<TASK2>"]
# NOTE(review): rule_types is not referenced by any other cell visible in this
# notebook — confirm whether it was meant to be passed as a filter.
rule_types = ["ModelHallucinationRuleV2", "ModelSensitiveDataRule", "PIIDataRule", "ToxicityRule", "RegexRule", "KeywordRule", "PromptInjectionRule"]

# The later cells treat the result as an iterable of dict-like inference
# records (they read "task_id", "created_at", "inference_prompt" and
# "inference_response").
inferences = query_inferences(start=start, end=end, task_ids=task_ids)

# Dumps the raw query result; this can be very long for a large window.
print(inferences)

In [34]:
from itertools import groupby

def custom_sort(item):
    """Return the grouping/sort key for an inference: its ``task_id``.

    Inferences whose ``task_id`` is missing or ``None`` get the sentinel key
    ``"N/A"`` so they can be ordered together with string ids.
    """
    key = item.get('task_id')
    return "N/A" if key is None else key

# itertools.groupby only merges *consecutive* items that share a key, so the
# inferences must first be sorted with the same key function.
sorted_inferences = sorted(inferences, key=custom_sort)

# Map each task id (or "N/A" when the id is absent) to the list of its inferences.
tasks_inferences_map = {task_id: list(objects) for task_id, objects in groupby(
    sorted_inferences, key=custom_sort)}

In [None]:
# Per-task summary: print each task id with its inference count and build the
# day-bucketed prompt/response maps.
# The per-task list is bound to `task_inferences` rather than `inferences` so
# the full query result held in `inferences` is not overwritten; otherwise
# re-running the sort/group cell would silently group only the last task.
for task, task_inferences in tasks_inferences_map.items():
    print(task)
    print(f"Total num of inferences {len(task_inferences)}")
    inferences_by_day = bucket_inferences_by_day(task_inferences)

    # After the loop these hold the maps of the last task iterated.
    prompt_inf_day_map, response_inf_day_map = get_prompt_and_response_inf_per_day(inferences_by_day)


In [None]:
# NOTE(review): this cell performs the same per-task summary as the preceding
# cell — consider removing one of them.
# The per-task list is bound to `task_inferences` rather than `inferences` so
# the full query result held in `inferences` is not overwritten; otherwise
# re-running the sort/group cell would silently group only the last task.
for task, task_inferences in tasks_inferences_map.items():
    print(task)
    print(f"Total num of inferences {len(task_inferences)}")
    inferences_by_day = bucket_inferences_by_day(task_inferences)

    # After the loop these hold the maps of the last task iterated.
    prompt_inf_day_map, response_inf_day_map = get_prompt_and_response_inf_per_day(inferences_by_day)

In [None]:
# Token usage information
# Builds usage_per_task_map as {task_id: {rule_type: count}} from the usage
# records returned for the same start/end window.
usage = get_token_usage(start=start, end=end, groupby_rule_type=True, groupby_task=True)

usage_per_task_map = {}

for usage_metric in usage:
    task_id = usage_metric["task_id"]
    rule_type = usage_metric["rule_type"]

    # Reuse the task's existing rule map, or start a fresh one.
    task_usage_data = usage_per_task_map.get(task_id, {})

    # Only the first non-None count seen for a (task, rule type) pair is kept;
    # later records for the same pair are ignored.
    if task_usage_data.get(rule_type) is None:
        task_usage_data[rule_type] = usage_metric["count"]

    usage_per_task_map[task_id] = task_usage_data

print(usage_per_task_map)


In [None]:
import matplotlib.pyplot as plt


print(f'''
*********************************************************************************************
      
Usage metrics per task. Start: {start} to End: {end}
      
*********************************************************************************************
''')


def _count_results_per_day(inf_day_map, days, outcome):
    """For each day in ``days``, count entries of ``inf_day_map[day]`` whose ``'result'`` equals ``outcome``."""
    return [sum(1 for item in inf_day_map[day] if item['result'] == outcome) for day in days]


def _plot_daily_counts(day_labels, totals, failed, passed, total_label, title):
    """Draw and show one line chart of total, failed and passed counts per day."""
    plt.figure(figsize=(8, 6))
    plt.plot(day_labels, totals, marker='o', color='black', linestyle='-', label=total_label)
    plt.plot(day_labels, failed, marker='o', color='red', linestyle='--', label='Failures')
    plt.plot(day_labels, passed, marker='o', color='green', linestyle='--', label='Success')
    plt.ylabel('Inferences')
    plt.title(title)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.legend()
    plt.show()


# One report per task: daily prompt chart, daily response chart, token usage.
# The per-task list is bound to `task_inferences` rather than `inferences` so
# the full query result held in `inferences` is not overwritten; otherwise
# re-running the sort/group cell would silently group only the last task.
for task, task_inferences in tasks_inferences_map.items():
    print("*********************************************************************************************")
    print(task)
    print(f"Total num of inferences {len(task_inferences)}")
    inferences_by_day = bucket_inferences_by_day(task_inferences)

    prompt_inf_day_map, response_inf_day_map = get_prompt_and_response_inf_per_day(inferences_by_day)

    # Chronological x-axis.
    sorted_days = sorted(inferences_by_day)

    # Total counts
    total_inf_counts = [len(inferences_by_day[day]) for day in sorted_days]
    prompt_inf_counts = [len(prompt_inf_day_map[day]) for day in sorted_days]
    response_inf_counts = [len(response_inf_day_map[day]) for day in sorted_days]
    sorted_days_str = [str(day) for day in sorted_days]

    # Specific counts
    response_inf_counts_failed = _count_results_per_day(response_inf_day_map, sorted_days, "Fail")
    response_inf_counts_pass = _count_results_per_day(response_inf_day_map, sorted_days, "Pass")

    prompt_inf_counts_failed = _count_results_per_day(prompt_inf_day_map, sorted_days, "Fail")
    prompt_inf_counts_pass = _count_results_per_day(prompt_inf_day_map, sorted_days, "Pass")

    # Plotting the prompt data
    _plot_daily_counts(
        sorted_days_str,
        prompt_inf_counts,
        prompt_inf_counts_failed,
        prompt_inf_counts_pass,
        total_label='Prompt Inf Count',
        title=f"{task} prompt inferences",
    )

    # Plotting the response data
    _plot_daily_counts(
        sorted_days_str,
        response_inf_counts,
        response_inf_counts_failed,
        response_inf_counts_pass,
        total_label='Response Inf Count',
        title=f"{task} response inferences",
    )

    # Token Usage Data
    print("Token usage information")
    print('''
    User input (raw input by user) - metric produced by Arthur
    Prompt (raw input by user + Shield prompt augmentation) - metric produced by Azure
    Completion (raw input by user + Shield prompt augmentation) - metric produced by Azure
    ''')
    # Prints "null" when no usage was recorded for this task.
    print(json.dumps(usage_per_task_map.get(task), indent=4))

