In [36]:
import pickle
import re
from statistics import mean
import pandas as pd
from eval_util import compute_footprint_fitness, compute_footprint_matrix, compute_footprint_matrix_pairs, generate_traces_from_tree

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
task = "pt"        # selects the evaluation branch below: "dfg" or "pt"
model = "mistral"  # used in the result file name and in the "Approach" column

runs = [0, 1, 2]   # one pickle file is read per run index

# When True, every sample's label / prediction / fitness is printed (the
# original behaviour). Set to False to keep the cell output small.
VERBOSE = True

# Filled by the loop below and read by later cells.
result_records = []      # one summary dict per successfully evaluated run
individual_results = []  # one dict per scored sample

SUPPORTED_TASKS = ("dfg", "pt")

# Non-greedy match of everything between two single quotes.
# NOTE(review): an activity name that itself contains an apostrophe would be
# split by this pattern -- confirm this cannot occur in the data.
ACTIVITY_PATTERN = re.compile(r"'(.*?)'")


def parse_arrow_pairs(text, verbose=True):
    """Turn newline-separated ``a -> b`` lines into ``(a, b)`` tuples.

    Each line is split on ``'->'``; the first two fields, stripped of
    surrounding whitespace, form one pair (any further fields are ignored).
    Lines without a ``'->'`` are skipped instead of raising; non-blank
    skipped lines are printed when ``verbose`` is true.

    Args:
        text: the multi-line string to parse.
        verbose: whether to print non-blank lines that could not be parsed.

    Returns:
        A list of ``(source, target)`` string tuples in input order.
    """
    pairs = []
    for line in text.split('\n'):
        parts = line.split('->')
        if len(parts) < 2:
            if verbose and line.strip():
                print(line)
            continue
        pairs.append((parts[0].strip(), parts[1].strip()))
    return pairs


def extract_quoted_activities(tree_text):
    """Return the lower-cased set of single-quoted substrings in ``tree_text``."""
    return {name.lower() for name in ACTIVITY_PATTERN.findall(tree_text)}


def prepare_sample(task, labels, preds, verbose=True):
    """Build the two footprint matrices for one (label, prediction) pair.

    Args:
        task: ``"dfg"`` or ``"pt"``.
        labels: ground-truth string for the sample.
        preds: predicted string for the sample.
        verbose: forwarded to ``parse_arrow_pairs`` for the ``"dfg"`` task.

    Returns:
        ``(true_matrix, pred_matrix, unique_activities, labels, preds)`` where
        ``labels`` / ``preds`` are the lower-cased strings that were actually
        scored, or ``None`` when the sample is skipped (``"pt"`` samples that
        contain ``"*"`` in either string).

    Raises:
        ValueError: if ``task`` is not one of ``SUPPORTED_TASKS``.
    """
    if task == "dfg":
        preds = preds.lower()
        labels = labels.lower()
        # The activity universe is taken from the ground truth only, so both
        # matrices are built over the same activities.
        label_pairs = parse_arrow_pairs(labels, verbose)
        unique_activities = {activity for pair in label_pairs for activity in pair}
        pred_pairs = parse_arrow_pairs(preds, verbose)

        true_matrix = compute_footprint_matrix_pairs(label_pairs, unique_activities)
        pred_matrix = compute_footprint_matrix_pairs(pred_pairs, unique_activities)
    elif task == "pt":
        # Samples containing "*" in either string are excluded from scoring.
        if "*" in labels:
            print("loop in gt")
            return None
        if "*" in preds:
            print("loop in pred")
            return None

        # Activities are the quoted names found in the ground truth.
        unique_activities = extract_quoted_activities(labels)
        preds = preds.lower()
        labels = labels.lower()

        true_str_traces = generate_traces_from_tree(labels, unique_activities)
        true_matrix = compute_footprint_matrix(true_str_traces, unique_activities)
        str_traces = generate_traces_from_tree(preds, unique_activities)
        pred_matrix = compute_footprint_matrix(str_traces, unique_activities)
    else:
        raise ValueError(f"unsupported task {task!r}; expected one of {SUPPORTED_TASKS}")
    return true_matrix, pred_matrix, unique_activities, labels, preds


# Fail loudly on a mistyped task instead of hitting a NameError inside the
# loop (or, in a live kernel, silently scoring stale matrices left over from
# an earlier execution).
if task not in SUPPORTED_TASKS:
    raise ValueError(f"unsupported task {task!r}; expected one of {SUPPORTED_TASKS}")

for run in runs:
    try:
        with open(f'eval/generation-ft-results/{task}_{model}_test_activity_{run}.pkl', 'rb') as f:
            # SECURITY: pickle.load can execute arbitrary code; only load
            # result files that were produced by a trusted pipeline.
            data = pickle.load(f)
        print(len(data["labels"]))

        fitness = []
        # Collected per run and merged only on success, so a run that fails
        # midway does not leave partial rows in `individual_results`.
        run_results = []
        for labels, preds in zip(data['labels'], data['preds']):
            if VERBOSE:
                # Output the raw sample
                print("----------------")
                print(labels)
                print("----------")
                print(preds)
                print("----------------")

            sample = prepare_sample(task, labels, preds, VERBOSE)
            if sample is None:
                continue
            true_matrix, pred_matrix, unique_activities, labels, preds = sample

            current_fitness = compute_footprint_fitness(true_matrix, pred_matrix)
            if VERBOSE:
                print(current_fitness)
            fitness.append(current_fitness)
            run_results.append({
                "sample_size": 'max',
                "run": run,
                "fitness": current_fitness,
                "true": labels,
                "pred": preds,
                "unique_activities": unique_activities
            })

        if not fitness:
            # statistics.mean raises on an empty list; report the reason
            # explicitly and leave this run out of the summary.
            print(f"run {run}: no scorable samples, skipping")
            continue

        rec = {
            "Task": task,
            "Approach": model,
            "sample_size": 'max',
            "run": run,
            "avg_fitness": mean(fitness),
        }
        print(len(fitness))
        if VERBOSE:
            print(fitness)
    except Exception as e:
        # Deliberate best-effort: a missing or broken result file skips that
        # run only. The message names the run and the exception type so the
        # omission is visible in the output.
        print(f"run {run} failed: {type(e).__name__}: {e}")
        continue

    individual_results.extend(run_results)
    result_records.append(rec)
    print(result_records)


1528
----------------
->( 'Navigate to Goods Receipt screen', 'Enter Goods movement info.', 'Select Adopt+ Details"', 'Select Delivery completed flag', 'Save Goods receipt' )
----------
->( 'Navigate to Goods Receipt screen', 'Enter goods movement info.', 'Select Adopt+ Details"', 'Select delivery completed flag', 'Save Goods Receipt' )
----------------
1.0
----------------
->( 'Invalid passport or visa application processed by an agent', 'TA can no longer guarantee the trip', 'Inform the client' )
----------
->( 'Invalid passport or visa application processed by an agent', 'Ta can no longer guarantee the trip', 'Inform the client' )
----------------
1.0
----------------
->( 'Measure customer satisfaction with warranty handling and resolution', 'Monitor and report on warranty management metrics', 'Identify improvement opportunities', 'Identify opportunities to eliminate warranty waste', 'Investigate fraudulent claims' )
----------
->( 'Monitor and report on warranty management metrics'

In [49]:
# Per-sample table built from the records collected by the evaluation loop.
df = pd.DataFrame(individual_results)

# Number of activities recorded for each sample.
activity_counts = df['unique_activities'].map(len)

# Attach the count and its bucket. Counts are floored to a multiple of 5
# (0-4 -> 0, 5-9 -> 5, ...). No rows are filtered out by size.
df = df.assign(
    unique_activities_size=activity_counts,
    unique_activities_size_range=(activity_counts // 5) * 5,
)
df


Unnamed: 0,sample_size,run,fitness,true,pred,unique_activities,unique_activities_size,unique_activities_size_range
0,max,0,1.000000,"->( 'navigate to goods receipt screen', 'enter...","->( 'navigate to goods receipt screen', 'enter...","{navigate to goods receipt screen, select deli...",5,5
1,max,0,1.000000,->( 'invalid passport or visa application proc...,->( 'invalid passport or visa application proc...,"{ta can no longer guarantee the trip, invalid ...",3,0
2,max,0,0.440000,->( 'measure customer satisfaction with warran...,->( 'monitor and report on warranty management...,"{investigate fraudulent claims, monitor and re...",5,5
3,max,0,1.000000,->( 'user navigates to create your first app` ...,->( 'user navigates to create your first app` ...,"{api call: post `<acnapi_host>/apps/v, fills u...",6,5
4,max,0,1.000000,"->( 'search for book', 'enter book number', 'r...","->( 'search for book', 'enter book number', 'r...","{confirm reservation, enter book number, save ...",5,5
...,...,...,...,...,...,...,...,...
4489,max,2,0.600000,"->( 'halaman my course', 'archive', 'halaman l...","->( 'halaman my course', 'detail course finish...","{klik course inished, halaman my course, halam...",5,5
4490,max,2,0.802469,+( ->( 'define process on data maintenance (ad...,->( 'define process on data maintenance (adi-'...,{prepare environments and their deployment (ad...,9,5
4491,max,2,0.836735,"->( 'weight mannualy', 'fill in weighing map f...","->( 'deliver sales map to the vessel', 'weight...","{weight mannualy, place batch in buyers locati...",7,5
4492,max,2,0.591837,"->( 'previous activities', 'discard applicatio...","->( 'discard applications', 'convert gpa score...","{previous activities, appoint letter score, di...",7,5


In [None]:
# Mean, standard deviation and row count of `fitness` for every distinct
# (ground-truth string, activity count) combination.
group_keys = ['true', 'unique_activities_size']
fitness_stats = {'fitness': ['mean', 'std', 'size']}

df_grouped = (
    df.groupby(group_keys)
    .agg(fitness_stats)
    .reset_index()
)

df_grouped

Unnamed: 0_level_0,true,unique_activities_size,fitness,fitness,fitness
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,size
0,"+( 'alert their subscribers', 'send flyers to ...",3,1.000000,0.000000,3
1,"+( 'approve the claim', 'send notification' )",2,0.500000,0.000000,3
2,"+( 'arrange company', 'arrange title', 'arrang...",3,1.000000,0.000000,3
3,"+( 'arrange travel advance', 'book tickets and...",2,0.500000,0.000000,3
4,"+( 'assess vaccinee booking status', 'assess v...",2,0.666667,0.288675,3
...,...,...,...,...,...
1472,"->( x( 'request market report', 'request branc...",7,0.850340,0.023565,3
1473,"->( x( 'resolve questions', 'seek senior clini...",6,0.777778,0.096225,3
1474,"->( x( 'stage material', 'issue raw material' ...",3,0.555556,0.000000,3
1475,->( x( ->( 'arrange in clinical assistant queu...,7,0.850340,0.131206,3


In [39]:



# Run-level summary table built from `result_records`.
# NOTE(review): this rebinds `df`, which an earlier cell uses for the
# per-sample table; re-running that earlier cell afterwards changes what the
# cells below operate on. Consider a distinct name -- confirm with the author.
df = pd.DataFrame(result_records)

In [40]:
# Display the run-level summary table built from `result_records`.
df

Unnamed: 0,Task,Approach,sample_size,run,avg_fitness
0,pt,mistral,max,0,0.84059
1,pt,mistral,max,1,0.837693
2,pt,mistral,max,2,0.837613


In [41]:
# Aggregate the per-run averages: mean and standard deviation of
# `avg_fitness` for each (Task, Approach) combination.
grouped = df.groupby(['Task', 'Approach']).agg({
    'avg_fitness': ['mean', 'std']
})

# Round for presentation: the mean to two decimals, the standard deviation
# to three.
decimals_by_stat = {'std': 3, 'mean': 2}
for stat_name, n_decimals in decimals_by_stat.items():
    column = ('avg_fitness', stat_name)
    grouped[column] = grouped[column].round(n_decimals)

In [42]:
# Display the aggregated table held in `grouped`.
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_fitness,avg_fitness
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
Task,Approach,Unnamed: 2_level_2,Unnamed: 3_level_2
pt,mistral,0.84,0.002
