In [4]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import operator

In [5]:
# Metrics Functions
def accuracy(list1, list2):
    """Return the overlap ratio (intersection over union) of two label collections.

    Both inputs are reduced to their distinct labels first, so the score is
    ``len(A & B) / len(A | B)``. The result does not depend on argument order.

    Args:
        list1: Iterable of hashable labels.
        list2: Iterable of hashable labels.

    Returns:
        A float between 0.0 and 1.0. When both inputs are empty the ratio is
        undefined and 0.0 is returned, the same fallback ``f1`` uses.
    """
    labels_a, labels_b = set(list1), set(list2)
    # Build the union from sets: the intersection is set-based, so sizing the
    # union from the raw list lengths over-counts any repeated label.
    union = labels_a | labels_b
    if not union:
        # Nothing on either side; avoid dividing by zero.
        return 0.0
    return len(labels_a & labels_b) / len(union)

def precision(list1, list2):
    """Return the fraction of distinct labels in ``list1`` that also occur in ``list2``.

    In this notebook ``list1`` holds the predicted labels and ``list2`` the
    true labels.

    Args:
        list1: Iterable of hashable labels; its distinct labels form the denominator.
        list2: Iterable of hashable labels to compare against.

    Returns:
        A float between 0.0 and 1.0. Returns 0.0 when ``list1`` is empty,
        where the ratio is undefined.
    """
    predicted, expected = set(list1), set(list2)
    if not predicted:
        # No labels in list1: avoid dividing by zero.
        return 0.0
    # Count distinct labels on both sides so a repeated label in list1 cannot
    # deflate the score (the numerator is already set-based).
    return len(predicted & expected) / len(predicted)

def recall(list1, list2):
    """Return the fraction of distinct labels in ``list2`` that also occur in ``list1``.

    In this notebook ``list1`` holds the predicted labels and ``list2`` the
    true labels.

    Args:
        list1: Iterable of hashable labels to compare against.
        list2: Iterable of hashable labels; its distinct labels form the denominator.

    Returns:
        A float between 0.0 and 1.0. Returns 0.0 when ``list2`` is empty,
        where the ratio is undefined.
    """
    predicted, expected = set(list1), set(list2)
    if not expected:
        # No labels in list2: avoid dividing by zero.
        return 0.0
    # Count distinct labels on both sides so a repeated label in list2 cannot
    # deflate the score (the numerator is already set-based).
    return len(predicted & expected) / len(expected)

def f1(list1, list2):
    """Return the harmonic mean of ``precision`` and ``recall`` for two label collections.

    Args:
        list1: Iterable of hashable labels, passed as the first argument to
            ``precision`` and ``recall``.
        list2: Iterable of hashable labels, passed as the second argument.

    Returns:
        ``2 * p * r / (p + r)`` as a float, or 0.0 when the score is undefined
        (an input is empty, or precision and recall are both zero).
    """
    try:
        # Keep the helper calls inside the try: a helper may itself divide by
        # zero on an empty input, and that case must also fall back to 0.0.
        precision_score = precision(list1, list2)
        recall_score = recall(list1, list2)
        return float(2 * precision_score * recall_score) / (precision_score + recall_score)
    except ZeroDivisionError:
        # Catch only the expected failure so unrelated errors still surface.
        return 0.0

In [6]:
def calculate_metrics(df: pd.DataFrame):
    """Print the row-averaged accuracy, precision, recall and F1 for ``df``.

    For every row, the "True Labels" and "Predicted Labels" cells are parsed
    with ``ast.literal_eval`` and scored with ``accuracy``, ``precision``,
    ``recall`` and ``f1`` (predicted labels first, true labels second). Each
    metric is then averaged over the rows.

    Args:
        df: Frame with "True Labels" and "Predicted Labels" columns whose
            cells hold Python literals as text (e.g. "['a', 'b']").

    Returns:
        None. The averages are printed as percentages rounded to two decimals.
        For an empty frame a notice is printed instead.
    """
    if len(df) == 0:
        # Averaging over zero rows would divide by zero.
        print("No rows to evaluate.")
        return

    # Insertion order drives the order of the printed lines.
    scores = {"Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

    for true_raw, pred_raw in zip(df["True Labels"], df["Predicted Labels"]):
        true_labels = ast.literal_eval(true_raw)
        pred_labels = ast.literal_eval(pred_raw)

        scores["Accuracy"].append(accuracy(pred_labels, true_labels))
        scores["Precision"].append(precision(pred_labels, true_labels))
        scores["Recall"].append(recall(pred_labels, true_labels))
        scores["F1-Score"].append(f1(pred_labels, true_labels))

    for metric_name, values in scores.items():
        average = sum(values) / len(values)
        print(f"Average {metric_name} on Test Data:", round(average * 100, 2), "%")

In [7]:
# Label names used to seed the per-label counters in getTraitCounts, so a
# label that never occurs in the data is still reported with a count of 0.
# The list order is the order of the rows in the printed summary table.
labels = [
    "problem identification and resolution",
    "work processes",
    "questioning attitude",
    "continuous learning",
    "personal accountability",
    "respectful work environment",
    "decision making",
    "leadership safety values and actions",
    "effective safety communication",
    "environment for raising concerns"
]

def getTraitCounts(df: pd.DataFrame):
    """Tabulate and plot how often each label occurs in the true and predicted columns.

    The "True Labels" and "Predicted Labels" cells of every row are parsed
    with ``ast.literal_eval`` and counted per label. A summary table
    (true count, predicted count, predicted minus true) is printed and a
    horizontal plotly bar chart with one trace per count series is shown.

    Every entry of the module-level ``labels`` list is reported, with 0 if it
    never occurs. Labels found in the data but missing from ``labels`` are
    appended after them. Both count series use the same label order, so table
    rows and bars always line up.

    Args:
        df: Frame with "True Labels" and "Predicted Labels" columns whose
            cells hold Python literals as text (e.g. "['a', 'b']").

    Returns:
        None. The output is the printed table and the displayed figure.
    """
    # Seed both counters from the known labels so unused labels show up as 0.
    traitCountsTrue = dict.fromkeys(labels, 0)
    traitCountsPredicted = dict.fromkeys(labels, 0)
    for trueRaw, predRaw in zip(df["True Labels"], df["Predicted Labels"]):
        for trueLabel in ast.literal_eval(trueRaw):
            traitCountsTrue[trueLabel] = traitCountsTrue.get(trueLabel, 0) + 1
        for predLabel in ast.literal_eval(predRaw):
            traitCountsPredicted[predLabel] = traitCountsPredicted.get(predLabel, 0) + 1

    # Build one shared label axis. A label outside `labels` may have been
    # added to only one counter; reading both counters through this single
    # ordered list keeps the two series the same length and aligned.
    labelsToPlot = list(dict.fromkeys([*traitCountsTrue, *traitCountsPredicted]))
    trueCountsToPlot = [traitCountsTrue.get(label, 0) for label in labelsToPlot]
    predictedCountsToPlot = [traitCountsPredicted.get(label, 0) for label in labelsToPlot]

    # Materialise the differences as a list rather than handing a lazy
    # iterator to the DataFrame constructor.
    predictedMinusTrueCount = [
        predicted - true
        for predicted, true in zip(predictedCountsToPlot, trueCountsToPlot)
    ]

    summaryCountDf = pd.DataFrame(data={
        "Labels": labelsToPlot,
        "True Counts": trueCountsToPlot,
        "Predicted Counts": predictedCountsToPlot,
        "Difference Between Predicted & True": predictedMinusTrueCount
    })
    print(summaryCountDf.to_string(index=False))

    fig = go.Figure(data=[
        go.Bar(
            name='True Label Counts',
            x=trueCountsToPlot,
            y=labelsToPlot,
            orientation="h"
        ),
        go.Bar(
            name='Predicted Label Counts',
            x=predictedCountsToPlot,
            y=labelsToPlot,
            orientation="h"
        )
    ])

    # Optional: uncomment to reverse the y-axis so the first label is drawn at
    # the top. Left disabled because the chart looked cluttered that way.
    # fig.update_layout(yaxis=dict(autorange="reversed"))
    fig.update_layout(title_text="True/Predicted Label Counts")
    fig.show()

In [26]:
# Load the results file to analyse. calculate_metrics and getTraitCounts read
# its "True Labels" and "Predicted Labels" columns; the per-plant cells below
# filter on "Power Plant".
df = pd.read_csv("FewShot_Keywords_FewExamples.csv")

In [27]:
# Preview the loaded frame (the rendered output elides the middle rows).
df

Unnamed: 0,Power Plant,Sentence/Paragraph,Predicted Labels,True Labels
0,Davis-Besse,The cause of the cracks appears to be high cyc...,"['problem identification and resolution', 'wor...","['problem identification and resolution', 'que..."
1,Davis-Besse,"These conditions, apparently caused by design ...","['problem identification and resolution', 'que...","['questioning attitude', 'work processes']"
2,Davis-Besse,The apparent cause of the HPI pump debris tole...,"['problem identification and resolution', 'wor...","['continuous learning', 'questioning attitude'..."
3,Davis-Besse,The previous procedures used to calibrate the ...,"['problem identification and resolution', 'wor...",['work processes']
4,Davis-Besse,"Therefore, the cause of the loss of taper pins...","['personal accountability', 'problem identific...","['personal accountability', 'work processes']"
...,...,...,...,...
80,South Texas,The linkage mechanism that operates the breake...,"['problem identification and resolution', 'wor...","['personal accountability', 'problem identific..."
81,South Texas,The majority of the defective tubes detected i...,"['problem identification and resolution', 'que...","['decision making', 'problem identification an..."
82,South Texas,1. Personnel did not recognize the laptop comp...,"['problem identification and resolution', 'wor...","['personal accountability', 'work processes']"
83,South Texas,The root cause of this incident was a lack of ...,"['continuous learning', 'problem identificatio...","['continuous learning', 'problem identificatio..."


In [28]:
# Print the per-label true/predicted counts and show the comparison chart.
getTraitCounts(df)

                               Labels  True Counts  Predicted Counts  Difference Between Predicted & True
problem identification and resolution           35                71                                   36
                       work processes           53                70                                   17
                 questioning attitude           23                19                                   -4
                  continuous learning           16                19                                    3
              personal accountability           37                14                                  -23
          respectful work environment            1                 0                                   -1
                      decision making           16                11                                   -5
 leadership safety values and actions           12                 8                                   -4
       effective safety communication         

# Results on FewShotNoTuning

In [29]:
# Metrics over every row of the loaded frame (all power plants together).
print(f"Total test size: {len(df)}")
calculate_metrics(df)

Total test size: 85
Average Accuracy on Test Data: 51.27 %
Average Precision on Test Data: 63.04 %
Average Recall on Test Data: 68.82 %
Average F1-Score on Test Data: 63.56 %


<h3>Specifically on Diablo Canyon</h3>

In [30]:
# Keep only the Diablo Canyon rows, renumbered from 0, and score that subset.
diablo_canyon_df = df.loc[df['Power Plant'] == 'Diablo Canyon'].reset_index(drop=True)
print(f"Total test size (ONLY DIABLO CANYON): {len(diablo_canyon_df)}")
calculate_metrics(diablo_canyon_df)

Total test size (ONLY DIABLO CANYON): 34
Average Accuracy on Test Data: 45.49 %
Average Precision on Test Data: 59.8 %
Average Recall on Test Data: 62.25 %
Average F1-Score on Test Data: 58.17 %


In [31]:
# Keep only the South Texas rows, renumbered from 0, and score that subset.
south_texas_df = df.loc[df['Power Plant'] == 'South Texas'].reset_index(drop=True)
print(f"Total test size (ONLY SOUTH TEXAS): {len(south_texas_df)}")
calculate_metrics(south_texas_df)

Total test size (ONLY SOUTH TEXAS): 34
Average Accuracy on Test Data: 57.84 %
Average Precision on Test Data: 68.38 %
Average Recall on Test Data: 75.49 %
Average F1-Score on Test Data: 69.75 %


In [32]:
# Keep only the Davis-Besse rows, renumbered from 0, and score that subset.
davis_df = df.loc[df['Power Plant'] == 'Davis-Besse'].reset_index(drop=True)
print(f"Total test size (ONLY DAVIS-BESSE): {len(davis_df)}")
calculate_metrics(davis_df)

Total test size (ONLY DAVIS-BESSE): 17
Average Accuracy on Test Data: 49.71 %
Average Precision on Test Data: 58.82 %
Average Recall on Test Data: 68.63 %
Average F1-Score on Test Data: 61.96 %


### Evaluating on additional models

In [16]:
# Score the results stored in ZeroShot_withDescriptions.csv.
# NOTE(review): this rebinds the notebook-wide `df`. Any earlier cell that
# reads `df` will use this file's rows if it is re-run after this cell.
df = pd.read_csv("ZeroShot_withDescriptions.csv")
print(f"Total test size: {len(df)}")
calculate_metrics(df)

Total test size: 100
Average Accuracy on Test Data: 32.99 %
Average Precision on Test Data: 44.25 %
Average Recall on Test Data: 51.25 %
Average F1-Score on Test Data: 45.45 %


In [17]:
# Score the results stored in FewShot_withDescriptions.csv.
# NOTE(review): this rebinds the notebook-wide `df`. Any earlier cell that
# reads `df` will use this file's rows if it is re-run after this cell.
df = pd.read_csv("FewShot_withDescriptions.csv")
print(f"Total test size: {len(df)}")
calculate_metrics(df)

Total test size: 100
Average Accuracy on Test Data: 42.4 %
Average Precision on Test Data: 56.33 %
Average Recall on Test Data: 52.0 %
Average F1-Score on Test Data: 52.27 %
