University of Aberdeen\
Atanas Komsiyski
## Evaluating GPT-3.5-turbo for Action Item Extraction in Meeting Transcripts

This is a Jupyter notebook containing our ROUGE score results


#### Libraries
To ensure all libraries are installed before executing the notebook, run `pip install -r requirements.txt`.

In [13]:
import pandas as pd
from rouge import Rouge
import xml.etree.ElementTree as ET

# Notebook-wide pandas display settings: never truncate rows or columns when a
# frame is displayed, and show five digits after the decimal point.
for option_name, option_value in (
    ('display.max_rows', None),      # unlimited rows
    ('display.max_columns', None),   # unlimited columns
    ("display.precision", 5),        # number precision after the point
):
    pd.set_option(option_name, option_value)

In [14]:
# function that reads the contents of our XML files and saves it in a nested dict for easier access
def read_xml(file_path):
    """Parse an action-item XML file into a nested dictionary.

    The root element is searched for ``Meeting`` children (identified by their
    ``Name`` attribute). Each meeting holds ``Iteration`` children (identified
    by their integer ``Number`` attribute), and each iteration holds ``Item``
    children whose text is one action item.

    Args:
        file_path: File name or file object, passed straight to ``ET.parse``.

    Returns:
        dict: ``{meeting_name: {iteration_number: [item_text, ...]}}``.
        ``Item`` elements without any text are skipped, so every list entry
        is a string.

    Raises:
        TypeError: If an ``Iteration`` has no ``Number`` attribute.
        ValueError: If a ``Number`` attribute is not an integer.
    """
    root = ET.parse(file_path).getroot()
    meetings = {}
    for meeting in root.findall('Meeting'):
        iterations = {}
        for iteration in meeting.findall('Iteration'):
            iteration_number = int(iteration.get('Number'))
            # An empty <Item/> has text None; keeping it would make the
            # ' '.join(...) performed on these lists fail with a TypeError.
            iterations[iteration_number] = [
                item.text
                for item in iteration.findall('Item')
                if item.text is not None
            ]
        meetings[meeting.get('Name')] = iterations
    return meetings


In [15]:
# function computing ROUGE scores matching n-grams between candidate and reference
def compute_rouge(gpt_meetings, human_meetings):
    """Score every GPT iteration of every meeting against the human reference.

    For each meeting, the action items of each GPT iteration are joined into
    one candidate string and compared with the joined human action items
    (taken from iteration ``0`` of the same meeting in ``human_meetings``).

    Args:
        gpt_meetings: ``{meeting_name: {iteration_number: [item_text, ...]}}``
            with the GPT-generated action items.
        human_meetings: Same structure with the human reference items; only
            iteration ``0`` of each meeting is used.

    Returns:
        list[dict]: One entry per (meeting, GPT iteration) with the keys
        ``"Meeting"``, ``"GPT Iteration Number"`` and ``"ROUGE Scores"``
        (the value returned by ``Rouge.get_scores``).

    Raises:
        ValueError: If the candidate or the reference text of a scored
            iteration is empty (e.g. the meeting has no human annotation).
    """
    rouge = Rouge()
    rouge_scores = []
    for meeting_name, gpt_iterations in gpt_meetings.items():              # for each meeting's gpt iterations
        human_iteration = human_meetings.get(meeting_name, {}).get(0, [])  # get the single iteration from human
        # The reference does not depend on the GPT iteration, so join it once.
        human_text = ' '.join(human_iteration)
        for iteration_number, gpt_items in gpt_iterations.items():
            gpt_text = ' '.join(gpt_items)
            # Empty text cannot be scored; fail here with a message that names
            # the meeting instead of failing inside the rouge package.
            if not human_text:
                raise ValueError(
                    f"No human reference action items found for meeting {meeting_name!r} (iteration 0)"
                )
            if not gpt_text:
                raise ValueError(
                    f"No GPT action items found for meeting {meeting_name!r}, iteration {iteration_number!r}"
                )
            scores = rouge.get_scores(gpt_text, human_text)  # compute ROUGE score
            rouge_scores.append({
                "Meeting": meeting_name,
                "GPT Iteration Number": iteration_number,
                "ROUGE Scores": scores
            })
    return rouge_scores

### Version 1: Base prompt
"Extract all action items from the following meeting transcript and display them in the form of a numbered list."

In [16]:
# Load the version-1 GPT action items and the human reference annotations
# (each as a nested dict: meeting -> iteration -> list of items)
gpt_meetings = read_xml("GPT_action_items_v1.xml")
human_meetings = read_xml("Human_action_items.xml")

# Score every GPT iteration against the human reference
rouge_scores = compute_rouge(gpt_meetings, human_meetings)

# Flatten into one row per (meeting, iteration); the nested score dicts
# become 'rouge-<n>.<stat>' columns, which are then put in a fixed order
df_v1 = pd.json_normalize(
    rouge_scores,
    record_path=['ROUGE Scores'],
    meta=['Meeting', 'GPT Iteration Number'],
).loc[:, [
    'Meeting', 'GPT Iteration Number',
    'rouge-1.r', 'rouge-1.p', 'rouge-1.f',
    'rouge-2.r', 'rouge-2.p', 'rouge-2.f',
    'rouge-l.r', 'rouge-l.p', 'rouge-l.f',
]]

display(df_v1)  # show the per-iteration table

# Table containing all meetings and their iterations with their respective scores

Unnamed: 0,Meeting,GPT Iteration Number,rouge-1.r,rouge-1.p,rouge-1.f,rouge-2.r,rouge-2.p,rouge-2.f,rouge-l.r,rouge-l.p,rouge-l.f
0,Bed002.txt,0,0.34722,0.25253,0.2924,0.0625,0.03947,0.04839,0.33333,0.24242,0.2807
1,Bed002.txt,1,0.34722,0.30864,0.3268,0.10417,0.08475,0.09346,0.29167,0.25926,0.27451
2,Bed002.txt,2,0.30556,0.26829,0.28571,0.07292,0.06034,0.06604,0.29167,0.2561,0.27273
3,Bed003.txt,0,0.41667,0.28409,0.33784,0.14493,0.0813,0.10417,0.4,0.27273,0.32432
4,Bed003.txt,1,0.45,0.20611,0.28272,0.17391,0.06122,0.09057,0.43333,0.19847,0.27225
5,Bed003.txt,2,0.48333,0.29897,0.36943,0.13043,0.06667,0.08824,0.41667,0.25773,0.31847
6,Bed004.txt,0,0.27711,0.23469,0.25414,0.04505,0.03378,0.03861,0.24096,0.20408,0.22099
7,Bed004.txt,1,0.27711,0.23,0.25137,0.05405,0.03822,0.04478,0.25301,0.21,0.22951
8,Bed004.txt,2,0.25301,0.2234,0.23729,0.04505,0.03623,0.04016,0.24096,0.21277,0.22599
9,Bed005.txt,0,0.27778,0.24096,0.25806,0.02083,0.01802,0.01932,0.25,0.21687,0.23226


In [17]:
# Average the scores of the GPT iterations of each meeting. 'Meeting' is
# turned back into a regular column, and the averaged iteration number is
# removed because it carries no meaning.
mean_df_v1 = (
    df_v1
    .groupby('Meeting')
    .mean()
    .reset_index()
    .drop('GPT Iteration Number', axis=1)
)

# show the per-meeting averages
display(mean_df_v1)

# Table containing the mean of scores from all iterations grouped by meeting

Unnamed: 0,Meeting,rouge-1.r,rouge-1.p,rouge-1.f,rouge-2.r,rouge-2.p,rouge-2.f,rouge-l.r,rouge-l.p,rouge-l.f
0,Bed002.txt,0.33333,0.27649,0.30164,0.07986,0.06152,0.06929,0.30556,0.25259,0.27598
1,Bed003.txt,0.45,0.26306,0.33,0.14976,0.06973,0.09432,0.41667,0.24298,0.30502
2,Bed004.txt,0.26908,0.22937,0.2476,0.04805,0.03608,0.04118,0.24498,0.20895,0.2255
3,Bed005.txt,0.31944,0.25382,0.28204,0.06597,0.04493,0.05321,0.28704,0.22794,0.25335
4,Bed006.txt,0.26984,0.16902,0.20758,0.08333,0.03808,0.05222,0.2328,0.14539,0.17877
5,Bed008.txt,0.67778,0.21273,0.32304,0.42342,0.10498,0.16786,0.66667,0.20885,0.31729
6,Bed009.txt,0.27143,0.23109,0.24938,0.05674,0.04461,0.04989,0.25238,0.21451,0.23167
7,Bed010.txt,0.32857,0.34192,0.33287,0.11828,0.11569,0.11575,0.31905,0.33087,0.32264
8,Bmr001.txt,0.14,0.11099,0.12295,0.00575,0.00463,0.00513,0.13333,0.10504,0.11666
9,Bmr002.txt,0.50222,0.38417,0.43325,0.23913,0.14873,0.1817,0.45778,0.34892,0.39412


In [18]:
# Average every metric across all meetings. 'Meeting' holds names rather than
# scores, so it is removed first. Reassigning with errors='ignore' (instead of
# an in-place drop) lets this cell be re-run without raising a KeyError.
mean_df_v1 = mean_df_v1.drop('Meeting', axis=1, errors='ignore')
display(mean_df_v1.mean()) # display the mean score for each metric

# Table displaying the mean of scores of all meetings

rouge-1.r    0.52067
rouge-1.p    0.30443
rouge-1.f    0.37322
rouge-2.r    0.26978
rouge-2.p    0.13304
rouge-2.f    0.17139
rouge-l.r    0.49947
rouge-l.p    0.29058
rouge-l.f    0.35688
dtype: float64

### Version 2: Base prompt and Examples
"Examples of action items: Arrange a meeting with Amy next Friday, Call Ben after the presentation, Allison to begin data collection today."

In [19]:
# Load the version-2 GPT action items and the human reference annotations
# (each as a nested dict: meeting -> iteration -> list of items)
gpt_meetings = read_xml("GPT_action_items_v2.xml")
human_meetings = read_xml("Human_action_items.xml")

# Score every GPT iteration against the human reference
rouge_scores = compute_rouge(gpt_meetings, human_meetings)

# Flatten into one row per (meeting, iteration); the nested score dicts
# become 'rouge-<n>.<stat>' columns, which are then put in a fixed order
df_v2 = pd.json_normalize(
    rouge_scores,
    record_path=['ROUGE Scores'],
    meta=['Meeting', 'GPT Iteration Number'],
).loc[:, [
    'Meeting', 'GPT Iteration Number',
    'rouge-1.r', 'rouge-1.p', 'rouge-1.f',
    'rouge-2.r', 'rouge-2.p', 'rouge-2.f',
    'rouge-l.r', 'rouge-l.p', 'rouge-l.f',
]]

display(df_v2)  # show the per-iteration table

# Table containing all meetings and their iterations with their respective scores

Unnamed: 0,Meeting,GPT Iteration Number,rouge-1.r,rouge-1.p,rouge-1.f,rouge-2.r,rouge-2.p,rouge-2.f,rouge-l.r,rouge-l.p,rouge-l.f
0,Bed002.txt,0,0.48611,0.39773,0.4375,0.20833,0.15152,0.17544,0.44444,0.36364,0.4
1,Bed002.txt,1,0.5,0.31579,0.3871,0.27083,0.1383,0.1831,0.45833,0.28947,0.35484
2,Bed002.txt,2,0.45833,0.48529,0.47143,0.26042,0.2381,0.24876,0.43056,0.45588,0.44286
3,Bed003.txt,0,0.35,0.28378,0.31343,0.13043,0.08257,0.10112,0.33333,0.27027,0.29851
4,Bed003.txt,1,0.35,0.28,0.31111,0.14493,0.09524,0.11494,0.33333,0.26667,0.2963
5,Bed003.txt,2,0.4,0.33803,0.36641,0.13043,0.09783,0.1118,0.35,0.29577,0.32061
6,Bed004.txt,0,0.26506,0.33333,0.2953,0.05405,0.06742,0.06,0.22892,0.28788,0.25503
7,Bed004.txt,1,0.20482,0.17526,0.18889,0.02703,0.02055,0.02335,0.18072,0.15464,0.16667
8,Bed004.txt,2,0.3012,0.32468,0.3125,0.06306,0.05932,0.06114,0.25301,0.27273,0.2625
9,Bed005.txt,0,0.41667,0.23077,0.29703,0.14583,0.06667,0.0915,0.375,0.20769,0.26733


In [20]:
# Average the scores of the GPT iterations of each meeting. 'Meeting' is
# turned back into a regular column, and the averaged iteration number is
# removed because it carries no meaning.
mean_df_v2 = (
    df_v2
    .groupby('Meeting')
    .mean()
    .reset_index()
    .drop('GPT Iteration Number', axis=1)
)

# show the per-meeting averages
display(mean_df_v2)

# Table containing the mean of scores from all iterations grouped by meeting

Unnamed: 0,Meeting,rouge-1.r,rouge-1.p,rouge-1.f,rouge-2.r,rouge-2.p,rouge-2.f,rouge-l.r,rouge-l.p,rouge-l.f
0,Bed002.txt,0.48148,0.3996,0.43201,0.24653,0.17597,0.20243,0.44444,0.36966,0.39923
1,Bed003.txt,0.36667,0.3006,0.33032,0.13527,0.09188,0.10929,0.33889,0.27757,0.30514
2,Bed004.txt,0.25703,0.27776,0.26556,0.04805,0.0491,0.04816,0.22088,0.23842,0.22807
3,Bed005.txt,0.37963,0.2236,0.28113,0.13542,0.06739,0.08988,0.34722,0.20471,0.25729
4,Bed006.txt,0.21693,0.20841,0.20483,0.0375,0.02751,0.03024,0.19577,0.18561,0.18393
5,Bed008.txt,0.61111,0.21843,0.32175,0.30631,0.08796,0.13666,0.61111,0.21843,0.32175
6,Bed009.txt,0.28571,0.20585,0.23903,0.05319,0.0336,0.04112,0.2619,0.18836,0.21888
7,Bed010.txt,0.24762,0.43009,0.30115,0.08244,0.13401,0.09631,0.24286,0.4156,0.29398
8,Bmr001.txt,0.15333,0.15106,0.14698,0.00575,0.00256,0.00355,0.14667,0.14293,0.13965
9,Bmr002.txt,0.33778,0.32584,0.30397,0.12681,0.0745,0.09357,0.32444,0.31606,0.29269


In [21]:
# Average every metric across all meetings. 'Meeting' holds names rather than
# scores, so it is removed first. Reassigning with errors='ignore' (instead of
# an in-place drop) lets this cell be re-run without raising a KeyError.
mean_df_v2 = mean_df_v2.drop('Meeting', axis=1, errors='ignore')
display(mean_df_v2.mean()) # display the mean score for each metric

# Table displaying the mean of scores of all meetings

rouge-1.r    0.41856
rouge-1.p    0.28151
rouge-1.f    0.31926
rouge-2.r    0.16885
rouge-2.p    0.09359
rouge-2.f    0.11401
rouge-l.r    0.39318
rouge-l.p    0.26335
rouge-l.f    0.29908
dtype: float64

### Version 3: Base prompt and Definition
"Action items must contain information on who needs to complete what action item and when if known."

In [22]:
# Load the version-3 GPT action items and the human reference annotations
# (each as a nested dict: meeting -> iteration -> list of items)
gpt_meetings = read_xml("GPT_action_items_v3.xml")
human_meetings = read_xml("Human_action_items.xml")

# Score every GPT iteration against the human reference
rouge_scores = compute_rouge(gpt_meetings, human_meetings)

# Flatten into one row per (meeting, iteration); the nested score dicts
# become 'rouge-<n>.<stat>' columns, which are then put in a fixed order
df_v3 = pd.json_normalize(
    rouge_scores,
    record_path=['ROUGE Scores'],
    meta=['Meeting', 'GPT Iteration Number'],
).loc[:, [
    'Meeting', 'GPT Iteration Number',
    'rouge-1.r', 'rouge-1.p', 'rouge-1.f',
    'rouge-2.r', 'rouge-2.p', 'rouge-2.f',
    'rouge-l.r', 'rouge-l.p', 'rouge-l.f',
]]

display(df_v3)  # show the per-iteration table

# Table containing all meetings and their iterations with their respective scores

Unnamed: 0,Meeting,GPT Iteration Number,rouge-1.r,rouge-1.p,rouge-1.f,rouge-2.r,rouge-2.p,rouge-2.f,rouge-l.r,rouge-l.p,rouge-l.f
0,Bed002.txt,0,0.41667,0.25862,0.31915,0.17708,0.08808,0.11765,0.375,0.23276,0.28723
1,Bed002.txt,1,0.40278,0.33333,0.36478,0.16667,0.11511,0.13617,0.38889,0.32184,0.3522
2,Bed002.txt,2,0.41667,0.32609,0.36585,0.14583,0.0979,0.11715,0.38889,0.30435,0.34146
3,Bed003.txt,0,0.41667,0.24272,0.30675,0.14493,0.06494,0.08969,0.41667,0.24272,0.30675
4,Bed003.txt,1,0.45,0.23276,0.30682,0.17391,0.06667,0.09639,0.41667,0.21552,0.28409
5,Bed003.txt,2,0.38333,0.23958,0.29487,0.10145,0.05263,0.06931,0.36667,0.22917,0.28205
6,Bed004.txt,0,0.26506,0.30556,0.28387,0.05405,0.05357,0.05381,0.26506,0.30556,0.28387
7,Bed004.txt,1,0.28916,0.2069,0.24121,0.07207,0.04598,0.05614,0.26506,0.18966,0.22111
8,Bed004.txt,2,0.22892,0.18447,0.2043,0.03604,0.02649,0.03053,0.18072,0.14563,0.16129
9,Bed005.txt,0,0.47222,0.25758,0.33333,0.16667,0.08163,0.10959,0.43056,0.23485,0.30392


In [23]:
# Average the scores of the GPT iterations of each meeting. 'Meeting' is
# turned back into a regular column, and the averaged iteration number is
# removed because it carries no meaning.
mean_df_v3 = (
    df_v3
    .groupby('Meeting')
    .mean()
    .reset_index()
    .drop('GPT Iteration Number', axis=1)
)

# show the per-meeting averages
display(mean_df_v3)

# Table containing the mean of scores from all iterations grouped by meeting

Unnamed: 0,Meeting,rouge-1.r,rouge-1.p,rouge-1.f,rouge-2.r,rouge-2.p,rouge-2.f,rouge-l.r,rouge-l.p,rouge-l.f
0,Bed002.txt,0.41204,0.30601,0.34993,0.16319,0.10036,0.12366,0.38426,0.28632,0.32697
1,Bed003.txt,0.41667,0.23835,0.30281,0.1401,0.06141,0.08513,0.4,0.22913,0.29096
2,Bed004.txt,0.26104,0.23231,0.24313,0.05405,0.04201,0.04683,0.23695,0.21361,0.22209
3,Bed005.txt,0.45833,0.24462,0.31897,0.17708,0.07908,0.10922,0.41667,0.22241,0.29
4,Bed006.txt,0.26455,0.18366,0.21587,0.05833,0.0329,0.04195,0.2381,0.16475,0.1939
5,Bed008.txt,0.56667,0.17814,0.27059,0.22523,0.0599,0.09436,0.54444,0.17137,0.26022
6,Bed009.txt,0.32381,0.20283,0.2494,0.07092,0.03815,0.04957,0.28095,0.17606,0.21645
7,Bed010.txt,0.28095,0.3868,0.3194,0.08602,0.1049,0.09159,0.2619,0.35571,0.29592
8,Bmr001.txt,0.18667,0.09359,0.12453,0.00575,0.00249,0.00347,0.16667,0.08361,0.11123
9,Bmr002.txt,0.44889,0.38341,0.40738,0.13406,0.10308,0.11434,0.39556,0.33883,0.35926


In [24]:
# Average every metric across all meetings. 'Meeting' holds names rather than
# scores, so it is removed first. Reassigning with errors='ignore' (instead of
# an in-place drop) lets this cell be re-run without raising a KeyError.
mean_df_v3 = mean_df_v3.drop('Meeting', axis=1, errors='ignore')
display(mean_df_v3.mean()) # display the mean score for each metric

# Table displaying the mean of scores of all meetings

rouge-1.r    0.45262
rouge-1.p    0.26569
rouge-1.f    0.32641
rouge-2.r    0.18268
rouge-2.p    0.08932
rouge-2.f    0.11629
rouge-l.r    0.41765
rouge-l.p    0.24447
rouge-l.f    0.30068
dtype: float64