In [34]:
import pandas as pd
import ast

In [35]:
# Metrics Functions
def accuracy(list1, list2, zero_division=0.0):
    """Example-based accuracy (Jaccard index) between two label collections.

    Both inputs are reduced to sets before counting, so a label repeated in
    either input is counted once.

    Args:
        list1: Iterable of hashable labels.
        list2: Iterable of hashable labels.
        zero_division: Value returned when both inputs are empty and the
            ratio is therefore undefined (0/0). Defaults to 0.0.

    Returns:
        float: |list1 ∩ list2| / |list1 ∪ list2|, in the range [0, 1].
    """
    labels1, labels2 = set(list1), set(list2)
    union = labels1 | labels2
    if not union:
        # Nothing predicted and nothing expected: the ratio is undefined.
        return float(zero_division)
    # Using the real set union (rather than len(list1) + len(list2) - overlap)
    # keeps the denominator correct even when an input contains duplicates.
    return len(labels1 & labels2) / len(union)

def precision(list1, list2, zero_division=0.0):
    """Fraction of the distinct labels in ``list1`` that also appear in ``list2``.

    Args:
        list1: Iterable of hashable labels; its distinct labels form the
            denominator.
        list2: Iterable of hashable labels to match against.
        zero_division: Value returned when ``list1`` is empty and the ratio
            is therefore undefined. Defaults to 0.0.

    Returns:
        float: |list1 ∩ list2| / |list1|, in the range [0, 1].
    """
    labels1 = set(list1)
    if not labels1:
        # An empty first argument would otherwise raise ZeroDivisionError.
        return float(zero_division)
    # Both numerator and denominator count distinct labels, so duplicates in
    # list1 cannot deflate the score.
    return len(labels1.intersection(list2)) / len(labels1)

def recall(list1, list2, zero_division=0.0):
    """Fraction of the distinct labels in ``list2`` that also appear in ``list1``.

    Args:
        list1: Iterable of hashable labels to match against.
        list2: Iterable of hashable labels; its distinct labels form the
            denominator.
        zero_division: Value returned when ``list2`` is empty and the ratio
            is therefore undefined. Defaults to 0.0.

    Returns:
        float: |list1 ∩ list2| / |list2|, in the range [0, 1].
    """
    labels2 = set(list2)
    if not labels2:
        # An empty second argument would otherwise raise ZeroDivisionError.
        return float(zero_division)
    # Both numerator and denominator count distinct labels, so duplicates in
    # list2 cannot deflate the score.
    return len(labels2.intersection(list1)) / len(labels2)

def f1(list1, list2, zero_division=0.0):
    """F1 score (harmonic mean of precision and recall) for two label collections.

    With p = |A ∩ B| / |A| and r = |A ∩ B| / |B|, the harmonic mean
    2pr / (p + r) simplifies to 2|A ∩ B| / (|A| + |B|). That closed form is
    used here so no intermediate division can fail when one side is empty
    or when there is no overlap.

    Args:
        list1: Iterable of hashable labels.
        list2: Iterable of hashable labels.
        zero_division: Value returned when both inputs are empty and the
            score is therefore undefined. Defaults to 0.0.

    Returns:
        float: F1 score in the range [0, 1]; 0.0 when the inputs share no
        label.
    """
    labels1, labels2 = set(list1), set(list2)
    total = len(labels1) + len(labels2)
    if total == 0:
        # Nothing on either side: precision and recall are both undefined.
        return float(zero_division)
    return 2.0 * len(labels1 & labels2) / total

In [36]:
def _parse_labels(cell):
    """Return a label list from a cell holding either a list repr string or a list."""
    # Frames read back from CSV store each list as its repr string; frames
    # built in memory may already hold real sequences.
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return list(cell)


def calculate_metrics(df: pd.DataFrame) -> None:
    """Print the per-row averaged accuracy, precision, recall and F1 of a frame.

    Each row's "Predicted Labels" and "True Labels" cells are parsed into
    label lists and scored with ``accuracy``, ``precision``, ``recall`` and
    ``f1`` (predictions passed first). The per-row scores are then averaged
    over all rows and printed as percentages rounded to two decimals.

    Args:
        df: Frame with "True Labels" and "Predicted Labels" columns whose
            cells are list repr strings (as produced by a CSV round-trip)
            or actual label sequences.

    Returns:
        None. Results are written to stdout only.
    """
    if len(df) == 0:
        # Averaging over zero rows would divide by zero; report and stop.
        print("No rows to evaluate.")
        return

    # Insertion order fixes the order in which the averages are printed.
    scorers = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
    }
    scores = {name: [] for name in scorers}

    for true_cell, pred_cell in zip(df["True Labels"], df["Predicted Labels"]):
        true_labels = _parse_labels(true_cell)
        pred_labels = _parse_labels(pred_cell)
        for name, scorer in scorers.items():
            scores[name].append(scorer(pred_labels, true_labels))

    for name, values in scores.items():
        average = sum(values) / len(values)
        print(f"Average {name} on Test Data:", round(average * 100, 2), "%")

In [37]:
# Load the prediction results; calculate_metrics later parses the
# "True Labels" / "Predicted Labels" cells with ast.literal_eval.
df = pd.read_csv("predictedLabelsFewShotNoTuning.csv")

In [38]:
# Show the loaded frame as this cell's output.
df

Unnamed: 0,Power Plant,Sentence/Paragraph,Predicted Labels,True Labels
0,Diablo Canyon,Personnel error (cognitive) by a utility licen...,"['personal accountability', 'work processes']","['decision making', 'personal accountability',..."
1,Diablo Canyon,The cause of the events was human error enable...,"['continuous learning', 'personal accountabili...","['leadership safety values and actions', 'pers..."
2,Diablo Canyon,Degraded Wire Wire was abnormally degraded in ...,"['problem identification and resolution', 'wor...","['personal accountability', 'problem identific..."
3,South Texas,The root cause for this event is that manageme...,"['leadership safety values and actions', 'prob...","['effective safety communication', 'leadership..."
4,South Texas,The cause of the event was failure of the ECW ...,['problem identification and resolution'],['work processes']
...,...,...,...,...
65,South Texas,The root cause of the event was an inadequate ...,"['problem identification and resolution', 'wor...","['problem identification and resolution', 'wor..."
66,Diablo Canyon,The cause of the electrical fault could not be...,"['problem identification and resolution', 'wor...","['questioning attitude', 'work processes']"
67,Davis-Besse,The electrician checking the status of the loc...,"['personal accountability', 'work processes']","['decision making', 'personal accountability',..."
68,Davis-Besse,The cause of this event was determined to be i...,"['problem identification and resolution', 'wor...","['decision making', 'effective safety communic..."


<h1>Results on FewShotNoTuning</h1>

In [39]:
# Score every row of the loaded frame, all power plants pooled together.
print(f"Total test size: {df.shape[0]}")
calculate_metrics(df)

Total test size: 70
Average Accuracy on Test Data: 35.67 %
Average Precision on Test Data: 50.24 %
Average Recall on Test Data: 47.98 %
Average F1-Score on Test Data: 47.01 %


<h3>Specifically on Diablo Canyon</h3>

In [43]:
# Keep only the Diablo Canyon rows. Chaining reset_index returns a new frame
# with a fresh 0..n-1 index instead of mutating the filtered slice in place.
diablo_canyon_df = df.loc[df['Power Plant'] == 'Diablo Canyon'].reset_index(drop=True)
print(f"Total test size (ONLY DIABLO CANYON): {diablo_canyon_df.shape[0]}")
calculate_metrics(diablo_canyon_df)

Total test size (ONLY DIABLO CANYON): 31
Average Accuracy on Test Data: 29.03 %
Average Precision on Test Data: 42.47 %
Average Recall on Test Data: 40.86 %
Average F1-Score on Test Data: 39.68 %


In [44]:
# Keep only the South Texas rows. Chaining reset_index returns a new frame
# with a fresh 0..n-1 index instead of mutating the filtered slice in place.
south_texas_df = df.loc[df['Power Plant'] == 'South Texas'].reset_index(drop=True)
print(f"Total test size (ONLY SOUTH TEXAS): {south_texas_df.shape[0]}")
calculate_metrics(south_texas_df)

Total test size (ONLY SOUTH TEXAS): 27
Average Accuracy on Test Data: 42.35 %
Average Precision on Test Data: 55.56 %
Average Recall on Test Data: 54.32 %
Average F1-Score on Test Data: 53.23 %


In [45]:
# Keep only the Davis-Besse rows. Chaining reset_index returns a new frame
# with a fresh 0..n-1 index instead of mutating the filtered slice in place.
davis_df = df.loc[df['Power Plant'] == 'Davis-Besse'].reset_index(drop=True)
print(f"Total test size (ONLY DAVIS-BESSE): {davis_df.shape[0]}")
calculate_metrics(davis_df)

Total test size (ONLY DAVIS-BESSE): 12
Average Accuracy on Test Data: 37.78 %
Average Precision on Test Data: 58.33 %
Average Recall on Test Data: 52.08 %
Average F1-Score on Test Data: 51.94 %
