University of Aberdeen\
Atanas Komsiyski
## Evaluating GPT-3.5-turbo for Action Item Extraction in Meeting Transcripts

This Jupyter notebook contains an approach modifying BERTscore's approach to use whole sentences pairing instead of token pairing for computing scores. However, the method proved unreliable and was therefore only mentioned in passing in the report. It is provided with the rest of the code we utilise only as an example of the attempted creation of a BERTScore inspired metric.

Inspired by: https://sbert.net/docs/usage/semantic_textual_similarity.html

#### Importing libraries

In [9]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import confusion_matrix
import numpy as np
import xml.etree.ElementTree as ET

#### Reading XML files into lists

In [10]:
def read_xml(file_path):
    meetings = {}
    tree = ET.parse(file_path)
    root = tree.getroot()
    for meeting in root.findall('Meeting'):
        meeting_name = meeting.get('Name')
        meetings[meeting_name] = {}
        for iteration in meeting.findall('Iteration'):
            iteration_number = int(iteration.get('Number'))
            items = [item.text for item in iteration.findall('Item')]
            meetings[meeting_name][iteration_number] = items
    return meetings


### Test version 1

In [11]:

def compute_action_item_metrics(gpt_meetings, human_meetings, threshold):
    results = []

    for meeting_name, gpt_iterations in gpt_meetings.items():
            human_iteration = human_meetings.get(meeting_name, {}).get(0, [])  # Get the single iteration from human (ground truth)
            for iteration_number, gpt_items in gpt_iterations.items():

                    # Load pre-trained BERT model
                    model = SentenceTransformer('all-MiniLM-L6-v2')

                    # Encode human and GPT items into embeddings
                    human_embeddings = model.encode(human_iteration, convert_to_tensor=True)
                    gpt_embeddings = model.encode(gpt_items, convert_to_tensor=True)

                    # Calculate cosine similarity between sentence embeddings
                    similarity_matrix = util.cos_sim(human_embeddings, gpt_embeddings).cpu().numpy()

                    # Find pairs with similarity above threshold
                    true_positives = 0
                    false_positives = 0

                    for i in range(len(human_iteration)):
                        matched = False
                        for j in range(len(gpt_iterations)):
                            if similarity_matrix[i][j] > threshold:
                                matched = True
                                break

                        if matched:
                            true_positives += 1  # All matched items are true positives
                        else:
                            false_positives += 1  # Items in human_iteration not matched by GPT are false negatives

                    precision = true_positives / (true_positives + false_positives) 
                    recall = true_positives / len(gpt_items)

                    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0


                    metrics = {
                        'Meeting Name': meeting_name,
                        "GPT Iteration Number": iteration_number,
                        'True Positives': true_positives,
                        'False Positives': false_positives,
                        'Precision': precision,
                        'Recall': recall,
                        'F1 Score': f1_score
                    }

                    results.append(metrics)

    return results



In [12]:
# loading xml files into nested lists
gpt_meetings = read_xml("GPT_action_items_v1.xml")
human_meetings = read_xml("Human_action_items.xml")

# compute metrics
metrics = compute_action_item_metrics(gpt_meetings, human_meetings, 0.5)
results_df = pd.DataFrame(metrics)
display(results_df)

Unnamed: 0,Meeting Name,GPT Iteration Number,True Positives,False Positives,Precision,Recall,F1 Score
0,Bed002.txt,0,2,6,0.250000,0.166667,0.200000
1,Bed002.txt,1,2,6,0.250000,0.200000,0.222222
2,Bed002.txt,2,2,6,0.250000,0.200000,0.222222
3,Bed003.txt,0,1,6,0.142857,0.111111,0.125000
4,Bed003.txt,1,2,5,0.285714,0.133333,0.181818
...,...,...,...,...,...,...,...
70,Bro011.txt,1,0,10,0.000000,0.000000,0.000000
71,Bro011.txt,2,0,10,0.000000,0.000000,0.000000
72,Bro012.txt,0,0,10,0.000000,0.000000,0.000000
73,Bro012.txt,1,0,10,0.000000,0.000000,0.000000


In [13]:
# Group by 'Meeting'
mean_results_df = results_df.groupby('Meeting Name').mean()

# Reset the index to keep 'Meeting' as a column
mean_results_df.reset_index(inplace=True)

# Drop the GPT Iteration Number as not relevant
mean_results_df.drop('GPT Iteration Number', axis=1, inplace=True)

# Display the resulting DataFrame
display(mean_results_df)

Unnamed: 0,Meeting Name,True Positives,False Positives,Precision,Recall,F1 Score
0,Bed002.txt,2.0,6.0,0.25,0.188889,0.214815
1,Bed003.txt,1.666667,5.333333,0.238095,0.142088,0.176347
2,Bed004.txt,2.333333,6.666667,0.259259,0.225524,0.239037
3,Bed005.txt,1.0,7.0,0.125,0.083333,0.1
4,Bed006.txt,0.0,10.0,0.0,0.0,0.0
5,Bed008.txt,1.0,0.0,1.0,0.08189,0.151282
6,Bed009.txt,2.0,8.0,0.2,0.25,0.222222
7,Bed010.txt,0.666667,7.333333,0.083333,0.097222,0.089286
8,Bmr001.txt,1.0,6.0,0.142857,0.183333,0.155377
9,Bmr002.txt,0.0,10.0,0.0,0.0,0.0


### Test version 2

In [14]:

def compute_action_item_metrics2(gpt_meetings, human_meetings, threshold):
    results = []

    for meeting_name, gpt_iterations in gpt_meetings.items():
        human_iteration = human_meetings.get(meeting_name, {}).get(0, [])  # Get the single iteration from human (ground truth)
        
        for iteration_number, gpt_items in gpt_iterations.items():
            # Create DataFrames from input sentences
            human_df = pd.DataFrame({'Sentence': human_iteration, 'IsActionItem': [1] * len(human_iteration)})
            gpt_df = pd.DataFrame({'Sentence': gpt_items})

            # Load pre-trained BERT model
            model = SentenceTransformer('all-MiniLM-L6-v2')

            # Encode sentences into embeddings
            human_embeddings = model.encode(human_df['Sentence'].tolist(), convert_to_tensor=True)
            gpt_embeddings = model.encode(gpt_df['Sentence'].tolist(), convert_to_tensor=True)

            # Calculate cosine similarity between sentence embeddings
            similarity_matrix = util.pytorch_cos_sim(human_embeddings, gpt_embeddings).cpu().numpy()

            # Find pairs with similarity above threshold
            true_positives = 0
            false_positives = 0
            false_negatives = 0

            for i in range(len(human_iteration)):
                matched = False
                for j in range(len(gpt_items)):
                    if similarity_matrix[i][j] > threshold:
                        matched = True
                        break

                if matched:
                    if human_df.loc[i, 'IsActionItem'] == 1:
                        true_positives += 1
                else:
                    if human_df.loc[i, 'IsActionItem'] == 1:
                        false_negatives += 1

            false_positives = len(gpt_items) - true_positives #- false_negatives

            precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
            recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

            # Calculate F1 score
            f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

            # Collect metrics into a dictionary
            metrics = {
                'Meeting Name': meeting_name,
                'GPT Iteration Number': iteration_number,
                'True Positives': true_positives,
                'False Positives': false_positives,
                'False Negatives': false_negatives,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1_score
            }

            # Append metrics to results list
            results.append(metrics)

    return results

In [15]:
#load XML files in lists
gpt_meetings = read_xml("GPT_action_items_v1.xml")
human_meetings = read_xml("Human_action_items.xml")

# compute metrics
metrics = compute_action_item_metrics2(gpt_meetings, human_meetings, 0.4)
results_df = pd.DataFrame(metrics)
display(results_df)


Unnamed: 0,Meeting Name,GPT Iteration Number,True Positives,False Positives,False Negatives,Precision,Recall,F1 Score
0,Bed002.txt,0,6,6,2,0.500000,0.750000,0.600000
1,Bed002.txt,1,6,4,2,0.600000,0.750000,0.666667
2,Bed002.txt,2,5,5,3,0.500000,0.625000,0.555556
3,Bed003.txt,0,6,3,1,0.666667,0.857143,0.750000
4,Bed003.txt,1,5,10,2,0.333333,0.714286,0.454545
...,...,...,...,...,...,...,...,...
70,Bro011.txt,1,0,12,10,0.000000,0.000000,0.000000
71,Bro011.txt,2,0,6,10,0.000000,0.000000,0.000000
72,Bro012.txt,0,0,9,10,0.000000,0.000000,0.000000
73,Bro012.txt,1,0,14,10,0.000000,0.000000,0.000000
