### Imports


In [1]:
from prettytable import PrettyTable

import json
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [2]:
# Jaccard Similarities on sets of keys, factoring in values
def jaccard_keyval_dict_similarity(A, B):
    keys_A, keys_B = set(A.keys()), set(B.keys())
    
    # Compute key intersection and union
    key_intersection = keys_A & keys_B
    key_union = keys_A | keys_B
    
    # Count matching key-value pairs
    value_match_count = sum(1 for k in key_intersection if A[k] == B[k])
    
    # Compute Jaccard similarity considering values
    jaccard_score = value_match_count / len(key_union) if key_union else 0.0  # Handle empty dictionaries
    return jaccard_score

def mean_jaccard_keyval_dict_similarity(predicted, reference):
    similarities = [jaccard_keyval_dict_similarity(A, B) for A, B in zip(predicted, reference)]
    return sum(similarities) / len(similarities) if similarities else 0.0

# Jaccard Similarities on sets of keys
def jaccard_key_dict_similarity(A, B):
    keys_A, keys_B = set(A.keys()), set(B.keys())
    
    # Compute key intersection and union
    key_intersection = keys_A & keys_B
    key_union = keys_A | keys_B
    
    # Compute Jaccard similarity considering values
    jaccard_score = len(key_intersection) / len(key_union) if key_union else 0.0  # Handle empty dictionaries
    return jaccard_score

def mean_jaccard_key_dict_similarity(predicted, reference):
    similarities = [jaccard_key_dict_similarity(A, B) for A, B in zip(predicted, reference)]
    return sum(similarities) / len(similarities) if similarities else 0.0

In [3]:
INTENT_LABELS = [
    "SetReminder", "SetAlarm", "CreateCalendarEvent", "SendMessage", "SendEmail",
    "MakeCall", "OpenApp", "SearchWeb", "SetTimer", "CheckWeather",
    "TurnOnDevice", "TurnOffDevice", "AdjustBrightness", "AdjustTemperature",
    "LockDoor", "UnlockDoor", "StartVacuum", "StopVacuum", "CheckSecurityCamera",
    "SetScene", "PlayMusic", "PauseMusic", "SkipTrack", "PlayPodcast", "PlayVideo",
    "AdjustVolume", "SetPlaybackSpeed", "SearchMovie", "ShowTVGuide",
    "GetDirections", "CheckTraffic", "FindNearbyPlace", "EstimateArrivalTime",
    "StartNavigation", "StopNavigation", "SendTextMessage", "MakePhoneCall",
    "StartVideoCall", "CheckVoicemail", "ReadMessage", "ReplyToMessage",
    "SendGroupMessage", "AnswerGeneralQuestion", "DefineWord", "ConvertUnits",
    "GetSportsScores", "CheckStockPrice", "GetFact", "TranslateText",
    "MathCalculation", "FindPersonInfo", "GetNewsUpdate"
]

intent_mapping = {
    "AnswerGeneralQuestion": "AnswerGeneralQuestion",
    "SearchWeb": "SearchWeb",
    "FindNearbyPlace": "FindNearbyPlace",
    "GetDirections": "GetDirections",
    "SetReminder": "SetReminder",
    "SendMessage": "SendMessage",
    "SendEmail": "SendEmail",
    "SendTextMessage": "SendTextMessage",
    "OpenApp": "OpenApp",
    "CreateCalendarEvent": "CreateCalendarEvent",
    "FindPersonInfo": "FindPersonInfo",
    "SetAlarm": "SetAlarm",
    "StartNavigation": "StartNavigation",
    "MakePhoneCall": "MakePhoneCall",
    "MakeCall": "MakePhoneCall",
    "SetTimer": "Others", "CheckWeather": "Others","TurnOnDevice": "Others", "TurnOffDevice": "Others", 
    "AdjustBrightness": "Others", "AdjustTemperature": "Others", "LockDoor": "Others", "UnlockDoor": "Others",
    "StartVacuum": "Others", "StopVacuum": "Others", "CheckSecurityCamera": "Others", "SetScene": "Others",
    "PlayMusic": "Others", "PauseMusic": "Others", "SkipTrack": "Others", "PlayPodcast": "Others", "PlayVideo": "Others",
    "AdjustVolume": "Others", "SetPlaybackSpeed": "Others", "SearchMovie": "Others", "ShowTVGuide": "Others",
    "CheckTraffic": "Others", "EstimateArrivalTime": "Others", "StopNavigation": "Others",
    "StartVideoCall": "Others", "CheckVoicemail": "Others", "ReadMessage": "Others", "ReplyToMessage": "Others",
    "SendGroupMessage": "Others", "DefineWord": "Others", "ConvertUnits": "Others", "GetSportsScores": "Others",
    "CheckStockPrice": "Others", "GetFact": "Others", "TranslateText": "Others", "MathCalculation": "Others",
    "GetNewsUpdate": "Others"
}
for v in sorted(list(set(intent_mapping.keys()))):
    print(v, end=", ")

AdjustBrightness, AdjustTemperature, AdjustVolume, AnswerGeneralQuestion, CheckSecurityCamera, CheckStockPrice, CheckTraffic, CheckVoicemail, CheckWeather, ConvertUnits, CreateCalendarEvent, DefineWord, EstimateArrivalTime, FindNearbyPlace, FindPersonInfo, GetDirections, GetFact, GetNewsUpdate, GetSportsScores, LockDoor, MakeCall, MakePhoneCall, MathCalculation, OpenApp, PauseMusic, PlayMusic, PlayPodcast, PlayVideo, ReadMessage, ReplyToMessage, SearchMovie, SearchWeb, SendEmail, SendGroupMessage, SendMessage, SendTextMessage, SetAlarm, SetPlaybackSpeed, SetReminder, SetScene, SetTimer, ShowTVGuide, SkipTrack, StartNavigation, StartVacuum, StartVideoCall, StopNavigation, StopVacuum, TranslateText, TurnOffDevice, TurnOnDevice, UnlockDoor, 

In [None]:
ARGUMENT_LABELS = {
    "ReminderContent", "DateTime", "AlarmTime", "EventTitle", "EventLocation",
    "EventDateTime", "RecipientName", "MessageContent", "EmailSubject",
    "EmailBody", "AppName", "QueryText", "TimerDuration", "WeatherLocation",
    "WeatherDate", "DeviceName", "BrightnessLevel", "TemperatureValue",
    "SceneName", "LockState", "CameraLocation", "SongName", "ArtistName",
    "PodcastTitle", "EpisodeTitle", "VolumeLevel", "PlaybackSpeed", "MovieName",
    "TVChannel", "Destination", "CurrentLocation", "PlaceCategory", "ETA",
    "RouteType", "Recipient", "MessageBody", "ContactName", "VoicemailSender",
    "QuestionText", "WordToDefine", "UnitToConvert", "StockSymbol", "SportEvent",
    "PersonName", "LanguagePair", "MathExpression", "NewsTopic"
}
argument_mapping = {
    "AlarmTime": "AlarmTime",
    "AppName": "AppName",
    "ArtistName": "ArtistName",
    "BrightnessLevel": "BrightnessLevel",
    "CameraLocation": "CameraLocation",
    "ContactName": "ContactName",
    "CurrentLocation": "CurrentLocation",
    "DateTime": "DateTime",
    "Destination": "CurrentLocation",
    "DeviceName": "DeviceName",
    "ETA": "ETA",
    "EmailBody": "EmailBody",
    "EmailSubject": "EmailBody",
    "EpisodeTitle": "EpisodeTitle",
    "EventDateTime": "DateTime",
    "EventLocation": "CurrentLocation",
    "EventTitle": "EventTitle",
    "LanguagePair": "LanguagePair",
    "LockState": "LockState",
    "MathExpression": "MathExpression",
    "MessageBody": "EmailBody",
    "MessageContent": "EmailBody",
    "MovieName": "MovieName",
    "NewsTopic": "NewsTopic",
    "PersonName": "ContactName",
    "PlaceCategory": "PlaceCategory",
    "PlaybackSpeed": "PlaybackSpeed",
    "PodcastTitle": "PodcastTitle",
    "QueryText": "QueryText",
    "QuestionText": "QueryText",
    "Recipient": "ContactName",
    "RecipientName": "ContactName",
    "ReminderContent": "ReminderContent",
    "RouteType": "RouteType",
    "SceneName": "SceneName",
    "SongName": "SongName",
    "SportEvent": "SportEvent",
    "StockSymbol": "StockSymbol",
    "TVChannel": "TVChannel",
    "TemperatureValue": "TemperatureValue",
    "TimerDuration": "AlarmTime",
    "UnitToConvert": "UnitToConvert",
    "VoicemailSender": "VoicemailSender",
    "VolumeLevel": "VolumeLevel",
    "WeatherDate": "DateTime",
    "WeatherLocation": "CurrentLocation",
    "WordToDefine": "QueryText"
}
for v in sorted(list(set(argument_mapping.keys()))):
    print(v, end=", ")

AlarmTime, AppName, ArtistName, BrightnessLevel, CameraLocation, ContactName, CurrentLocation, DateTime, Destination, DeviceName, ETA, EmailBody, EmailSubject, EpisodeTitle, EventDateTime, EventLocation, EventTitle, LanguagePair, LockState, MathExpression, MessageBody, MessageContent, MovieName, NewsTopic, PersonName, PlaceCategory, PlaybackSpeed, PodcastTitle, QueryText, QuestionText, Recipient, RecipientName, ReminderContent, RouteType, SceneName, SongName, SportEvent, StockSymbol, TVChannel, TemperatureValue, TimerDuration, UnitToConvert, VoicemailSender, VolumeLevel, WeatherDate, WeatherLocation, WordToDefine, 

In [5]:
def compute_intent_metrics(pure_data, weak_data):
    """
    Computes Precision, Recall, and F-score for intent classification.

    Parameters:
        pure_data (list): List of ground truth strings.
        weak_data (list): List of predicted strings.

    Returns:
        dict: Precision, Recall, and F1-score for intent classification.
    """
    assert len(pure_data) == len(weak_data), "Datasets must have the same number of samples."
    accuracy = accuracy_score(pure_data, weak_data)
    precision, recall, f1, support = precision_recall_fscore_support(pure_data, weak_data, labels=INTENT_LABELS, average='macro', zero_division=0)

    print(classification_report(pure_data, weak_data, zero_division=0, digits=4))
    return {
        "Intent Accuracy": accuracy,
        "Intent Precision": precision,
        "Intent Recall": recall,
        "Intent F1": f1
    }

def compute_argument_metrics(pure_data, weak_data):
    """
    Parameters:
        pure_data (list): List of ground truth dictionaries.
        weak_data (list): List of predicted dictionaries.
    """
    assert len(pure_data) == len(weak_data), "Datasets must have the same number of samples."
    return {
        "Accuracy (MJS, Keys Only)": mean_jaccard_key_dict_similarity(pure_data, weak_data),
        "Accuracy (MJS, Keys & Values)": mean_jaccard_keyval_dict_similarity(pure_data, weak_data)
    }

def compute_metrics(pure_df, weak_df):
    """
    Computes Precision, Recall, and F-score for both intent classification and argument extraction.

    Parameters:
        pure_data (list): List of ground truth JSON objects (System A).
        weak_data (list): List of predicted JSON objects (System B).

    Returns:
        dict: Dictionary with Intent and Argument precision, recall, and F1-score.
    """
    remap_intents = lambda x: intent_mapping[x]
    intent_metrics = compute_intent_metrics(pure_df["Intent"].apply(remap_intents).to_list(), 
                                            weak_df["Intent"].apply(remap_intents).to_list())
    
    clean = lambda x: {k: v for k, v in x.items() if v is not None and v.lower() != "null"}
    def remap_args(d: dict):
        new_d = dict()
        for k in sorted(d.keys(), reverse=True):
            new_d[argument_mapping[k]] = d[k]
        return new_d

    pure_args = pure_df["Arguments"].apply(json.loads).apply(clean).apply(remap_args).to_list()
    weak_args = weak_df["Arguments"].apply(json.loads).apply(clean).apply(remap_args).to_list()

    argument_metrics = compute_argument_metrics(pure_args, weak_args)

    table = PrettyTable()
    table.field_names = ["Metric", "Score (%)"]
    
    for k, v in intent_metrics.items():
        table.add_row([k, v*100])
    for k, v in argument_metrics.items():
        table.add_row([k, v*100])

    return table

In [6]:
def read_files_and_run(prediction_file, reference_file):
    prediction_df = pd.read_csv(prediction_file, sep="\t", keep_default_na=False)
    reference_df = pd.read_csv(reference_file, sep="\t", keep_default_na=False)

    print(compute_metrics(prediction_df, reference_df))

## All Metrics

### With Respect To qa_reference (the reference training rewritten prompts)

In [7]:
read_files_and_run(prediction_file="qa_baseline.tsv", reference_file="qa_reference.tsv")

                       precision    recall  f1-score   support

AnswerGeneralQuestion     0.8538    0.1433    0.2453      1794
  CreateCalendarEvent     0.7957    0.5103    0.6218       290
      FindNearbyPlace     0.6263    0.7939    0.7002      1218
       FindPersonInfo     0.3682    0.7183    0.4869       142
        GetDirections     0.8414    0.6420    0.7283       405
        MakePhoneCall     0.9200    0.8519    0.8846       108
              OpenApp     0.7500    0.8627    0.8024       233
               Others     0.2682    0.5644    0.3636       932
            SearchWeb     0.5627    0.5561    0.5594      1016
            SendEmail     0.9594    0.8015    0.8734       413
          SendMessage     0.2219    0.5547    0.3170       128
      SendTextMessage     0.9176    0.6903    0.7879       339
             SetAlarm     0.6800    0.6641    0.6719       128
          SetReminder     0.8164    0.8875    0.8505       471
      StartNavigation     0.5134    0.8138    0.6296  

In [8]:
read_files_and_run(prediction_file="qa_paligemma.tsv", reference_file="qa_reference.tsv")

                       precision    recall  f1-score   support

AnswerGeneralQuestion     0.7375    0.0694    0.1269      3199
  CreateCalendarEvent     0.0000    0.0000    0.0000        27
      FindNearbyPlace     0.0602    0.3536    0.1029       263
       FindPersonInfo     0.0289    0.0385    0.0330       208
        GetDirections     0.0647    0.3509    0.1093        57
        MakePhoneCall     0.0000    0.0000    0.0000         9
              OpenApp     0.0187    0.0331    0.0239       151
               Others     0.3361    0.3434    0.3397      1919
            SearchWeb     0.2510    0.1310    0.1721      1924
            SendEmail     0.0058    0.2500    0.0113         8
          SendMessage     0.0031    0.1429    0.0061         7
      SendTextMessage     0.0039    0.0714    0.0074        14
             SetAlarm     0.0160    0.1250    0.0284        16
          SetReminder     0.0000    0.0000    0.0000         2
      StartNavigation     0.0000    0.0000    0.0000  

In [9]:
read_files_and_run(prediction_file="qa_qwen.tsv", reference_file="qa_reference.tsv")

                       precision    recall  f1-score   support

AnswerGeneralQuestion     0.6744    0.1551    0.2522      1309
  CreateCalendarEvent     0.6935    0.7633    0.7268       169
      FindNearbyPlace     0.5706    0.6867    0.6233      1283
       FindPersonInfo     0.4874    0.6054    0.5400       223
        GetDirections     0.9094    0.4668    0.6169       602
        MakePhoneCall     0.3200    0.9143    0.4741        35
              OpenApp     0.3955    0.8480    0.5394       125
               Others     0.2708    0.6852    0.3882       775
            SearchWeb     0.7191    0.3235    0.4462      2232
            SendEmail     0.5159    0.8856    0.6520       201
          SendMessage     0.4813    0.6286    0.5451       245
      SendTextMessage     0.3059    0.9630    0.4643        81
             SetAlarm     0.6400    1.0000    0.7805        80
          SetReminder     0.7969    0.9315    0.8589       438
      StartNavigation     0.0168    0.7143    0.0328  

In [10]:
read_files_and_run(prediction_file="qa_paligemma_self_caption_easyocr.tsv", reference_file="qa_reference.tsv")

                       precision    recall  f1-score   support

AnswerGeneralQuestion     0.2691    0.0392    0.0685      2064
  CreateCalendarEvent     0.0000    0.0000    0.0000        24
      FindNearbyPlace     0.2351    0.3517    0.2818      1032
       FindPersonInfo     0.0397    0.1100    0.0584       100
        GetDirections     0.0129    0.0702    0.0219        57
        MakePhoneCall     0.0000    0.0000    0.0000        11
              OpenApp     0.0112    0.0545    0.0186        55
               Others     0.3519    0.3273    0.3391      2108
            SearchWeb     0.3327    0.1480    0.2049      2256
            SendEmail     0.0029    0.5000    0.0058         2
          SendMessage     0.0156    0.1429    0.0282        35
      SendTextMessage     0.0000    0.0000    0.0000         0
             SetAlarm     0.0000    0.0000    0.0000         0
          SetReminder     0.0156    0.2759    0.0296        29
      StartNavigation     0.0101    0.0938    0.0182  

In [11]:
read_files_and_run(prediction_file="qa_qwen_easyocr_self_caption.tsv", reference_file="qa_reference.tsv")

                       precision    recall  f1-score   support

AnswerGeneralQuestion     0.2625    0.3015    0.2806       262
  CreateCalendarEvent     0.6505    0.8403    0.7333       144
      FindNearbyPlace     0.6710    0.6451    0.6578      1606
       FindPersonInfo     0.3466    0.4776    0.4017       201
        GetDirections     0.8091    0.7184    0.7610       348
        MakePhoneCall     0.2600    0.6667    0.3741        39
              OpenApp     0.3284    0.8544    0.4744       103
               Others     0.8817    0.5444    0.6732      3176
            SearchWeb     0.4173    0.4901    0.4508       855
            SendEmail     0.5304    0.8841    0.6630       207
          SendMessage     0.3688    0.6211    0.4627       190
      SendTextMessage     0.2392    0.8841    0.3765        69
             SetAlarm     0.5440    1.0000    0.7047        68
          SetReminder     0.7207    0.9535    0.8209       387
      StartNavigation     0.4396    0.8733    0.5848  

In [12]:
read_files_and_run(prediction_file="qa_metadata.tsv", reference_file="qa_reference.tsv")

                       precision    recall  f1-score   support

AnswerGeneralQuestion     0.8538    0.1474    0.2513      1744
  CreateCalendarEvent     0.9032    0.6245    0.7385       269
      FindNearbyPlace     0.6483    0.8682    0.7423      1153
       FindPersonInfo     0.6101    0.5909    0.6004       286
        GetDirections     0.9288    0.7266    0.8153       395
        MakePhoneCall     0.9400    0.8545    0.8952       110
              OpenApp     0.9104    0.7771    0.8385       314
               Others     0.2800    0.6994    0.3999       785
            SearchWeb     0.5627    0.5998    0.5807       942
            SendEmail     0.9246    0.8740    0.8986       365
          SendMessage     0.5312    0.7424    0.6193       229
      SendTextMessage     0.9373    0.7113    0.8088       336
             SetAlarm     0.9360    0.9141    0.9249       128
          SetReminder     0.9199    0.9515    0.9355       495
      StartNavigation     0.7517    0.8819    0.8116  

In [13]:
read_files_and_run(prediction_file="qa_metadata_easyocr.tsv", reference_file="qa_reference.tsv")

                       precision    recall  f1-score   support

AnswerGeneralQuestion     0.7841    0.1404    0.2381      1681
  CreateCalendarEvent     0.8495    0.7281    0.7841       217
      FindNearbyPlace     0.6911    0.8640    0.7679      1235
       FindPersonInfo     0.6318    0.6387    0.6352       274
        GetDirections     0.9903    0.6402    0.7776       478
        MakePhoneCall     0.9500    0.9596    0.9548        99
              OpenApp     0.8321    0.8884    0.8593       251
               Others     0.1081    0.7518    0.1890       282
            SearchWeb     0.8317    0.5012    0.6255      1666
            SendEmail     0.9565    0.9192    0.9375       359
          SendMessage     0.2687    0.6825    0.3857       126
      SendTextMessage     0.9451    0.6086    0.7404       396
             SetAlarm     0.9680    0.9098    0.9380       133
          SetReminder     0.9512    0.9549    0.9530       510
      StartNavigation     0.3154    0.9592    0.4747  

In [14]:
read_files_and_run(prediction_file="qa_selfcaption_metadata.tsv", reference_file="qa_reference.tsv")

                       precision    recall  f1-score   support

AnswerGeneralQuestion     0.6678    0.1971    0.3043      1020
  CreateCalendarEvent     0.8548    0.5846    0.6943       272
      FindNearbyPlace     0.6049    0.8828    0.7179      1058
       FindPersonInfo     0.4838    0.8323    0.6119       161
        GetDirections     0.9159    0.8654    0.8899       327
        MakePhoneCall     0.9700    0.9604    0.9652       101
              OpenApp     0.9291    0.6570    0.7697       379
               Others     0.4717    0.7708    0.5853      1200
            SearchWeb     0.6952    0.4847    0.5712      1440
            SendEmail     0.9362    0.8972    0.9163       360
          SendMessage     0.4531    0.7214    0.5566       201
      SendTextMessage     0.8941    0.6369    0.7439       358
             SetAlarm     0.9520    0.9754    0.9636       122
          SetReminder     0.9395    0.9322    0.9358       516
      StartNavigation     0.8289    0.8517    0.8401  

In [15]:
read_files_and_run(prediction_file="qa_selfcaption_easyocr.tsv", reference_file="qa_reference.tsv")

                       precision    recall  f1-score   support

AnswerGeneralQuestion     0.5947    0.2380    0.3400       752
  CreateCalendarEvent     0.8871    0.6653    0.7604       248
      FindNearbyPlace     0.7027    0.9211    0.7972      1178
       FindPersonInfo     0.6101    0.7511    0.6733       225
        GetDirections     0.9515    0.7084    0.8122       415
        MakePhoneCall     0.9300    0.9394    0.9347        99
              OpenApp     0.9030    0.7562    0.8231       320
               Others     0.5156    0.7979    0.6264      1267
            SearchWeb     0.7620    0.5222    0.6197      1465
            SendEmail     0.9623    0.8177    0.8842       406
          SendMessage     0.6969    0.6335    0.6637       352
      SendTextMessage     0.9137    0.9472    0.9301       246
             SetAlarm     0.9520    0.9597    0.9558       124
          SetReminder     0.9492    0.9529    0.9511       510
      StartNavigation     0.6107    0.9192    0.7339  

In [16]:
read_files_and_run(prediction_file="qa_selfcaption_easyocr_8bit.tsv", reference_file="qa_reference.tsv")

                       precision    recall  f1-score   support

AnswerGeneralQuestion     0.6910    0.2213    0.3352       940
  CreateCalendarEvent     0.8763    0.7581    0.8130       215
      FindNearbyPlace     0.7345    0.8552    0.7902      1326
       FindPersonInfo     0.4585    0.7937    0.5812       160
        GetDirections     0.9515    0.8400    0.8923       350
        MakePhoneCall     0.9200    0.9293    0.9246        99
              OpenApp     0.8284    0.8162    0.8222       272
               Others     0.3983    0.8536    0.5431       915
            SearchWeb     0.8118    0.4730    0.5977      1723
            SendEmail     0.9217    0.9138    0.9177       348
          SendMessage     0.1031    0.6471    0.1779        51
      SendTextMessage     0.9529    0.4860    0.6437       500
             SetAlarm     0.9760    0.9606    0.9683       127
          SetReminder     0.9609    0.9425    0.9516       522
      StartNavigation     0.7785    0.9027    0.8360  