University of Aberdeen\
Atanas Komsiyski
## Evaluating GPT-3.5-turbo for Action Item Extraction in Meeting Transcripts

This Jupyter notebook contains the pipeline that performs chunked linear segmentation of long meeting transcripts and feeds the resulting chunks into the GPT model.


#### Libraries
To ensure all libraries are installed before executing the notebook, run `pip install -r requirements.txt`.

In [73]:
# importing libraries
from openai import OpenAI
import tiktoken
import tiktoken_ext.openai_public
import os
import xml.etree.ElementTree as ET
import re


# Read the OpenAI API key from the OPENAI_API_KEY environment variable instead of
# hard-coding it, so the secret is never saved or shared together with the notebook.
# A missing variable raises KeyError here, before any API call is attempted.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
folder_path = "ICSI_original_transcripts/transcripts/chosen_transcripts"   # location of folder containing the meeting transcripts to process


In [74]:
# Look up the *name* of the tokenizer encoding used by our model.
# `encoding` is a string (printed below), not an encoding object; the chunking
# function passes it to num_tokens_from_string() to count tokens.
encoding = tiktoken.encoding_name_for_model("gpt-3.5-turbo-0125")
print(encoding)

cl100k_base


In [75]:
# helper that counts how many tokens a string occupies under a named encoding
def num_tokens_from_string(string, encoding_name: str):
    """Return the number of tokens `string` encodes to.

    Parameters
    ----------
    string : str
        The text to tokenize.
    encoding_name : str
        Name of a tiktoken encoding; the encoding is looked up by this name
        on every call.

    Returns
    -------
    int
        Length of the token sequence produced for `string`.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

In [76]:
# function performing the chunked linear segmentation
def split_into_chunks(text, max_chunk_tokens=16000, count_tokens=None):
    """Split a transcript into consecutive chunks that fit the model's context window.

    Lines are accumulated in order; when adding the next line would push the
    running chunk over `max_chunk_tokens`, the chunk is closed and a new one is
    started with that line. Lines are never split, so a single line longer than
    the limit still becomes its own (oversized) chunk.

    Parameters
    ----------
    text : iterable of str
        The transcript, one line per item (for example an open text file).
    max_chunk_tokens : int, optional
        Token budget per chunk. Defaults to 16000, leaving headroom below the
        16,385-token limit of gpt-3.5-turbo-0125 for the prompt and the reply.
    count_tokens : callable, optional
        Function mapping a string to its token count. Defaults to counting with
        `num_tokens_from_string` and the notebook-level `encoding` name.

    Returns
    -------
    list of str
        The chunks in transcript order. Every chunk except the last ends with
        the separator line "---\n". Empty input yields an empty list.
    """
    if count_tokens is None:
        # default tokenizer: resolved lazily so the notebook globals are only
        # required when no custom counter is supplied
        count_tokens = lambda s: num_tokens_from_string(s, encoding)

    chunks = []
    current_chunk = ''

    for line in text:
        # NOTE: the running chunk is re-tokenized for every line so that chunk
        # boundaries are decided on the exact token count of the chunk text.
        if count_tokens(current_chunk) + count_tokens(line) > max_chunk_tokens:
            # close the current chunk; skip it when it is still empty (the very
            # first line is already over the limit) so that no chunk consisting
            # only of the "---" separator is produced
            if current_chunk:
                chunks.append(current_chunk + "---" + "\n")
            # start a new chunk with the current line
            current_chunk = line
        else:
            # add the line to the current chunk
            current_chunk += line

    # append the last chunk
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


In [77]:
# function making the call to the OpenAI API
def extract_action_items(list_of_chunks, temperature, max_tokens, model="gpt-3.5-turbo", api_client=None):
    """Ask the chat model for the action items of every chunk and join the replies.

    One chat-completion request is sent per chunk, in order, each with the same
    system prompt and the chunk as the user message.

    Parameters
    ----------
    list_of_chunks : list of str
        Transcript chunks to send to the model.
    temperature : float
        Sampling temperature passed to the API.
    max_tokens : int
        Maximum number of tokens the model may generate per chunk.
    model : str, optional
        Chat model to query. Defaults to "gpt-3.5-turbo".
    api_client : optional
        Object exposing `chat.completions.create(...)`. Defaults to the
        notebook-level `client`.

    Returns
    -------
    str
        The text of the first choice of every completion, joined with newlines.
        An empty list of chunks yields an empty string.
    """
    if api_client is None:
        api_client = client  # notebook-level OpenAI client

    system_prompt = "Extract all action items from the following meeting transcript and display them in the form of a numbered list."

    # collect the reply text for each chunk
    replies = []
    for chunk in list_of_chunks:
        completion = api_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": chunk},
            ],
            temperature=temperature,
            max_tokens=max_tokens,
            n=1,
        )
        # message.content is nullable in the API response; fall back to an empty
        # string so the join below cannot fail with a TypeError
        replies.append(completion.choices[0].message.content or "")

    # concatenate all replies
    return "\n".join(replies)

In [79]:
# helper cleaning up a single line of GPT output
def _clean_action_item(line):
    """Return the cleaned action-item text of one output line, or None to skip it.

    Whitespace runs are collapsed to a single space and the line is trimmed;
    blank lines and the "Action items:" style header lines are skipped; a
    leading list number such as "1. " or "21. " is removed.
    """
    item = re.sub(r"\s+", " ", line).strip()  # collapse whitespace runs and trim the ends
    if not item:
        return None  # blank line: would otherwise become an empty <Item>
    if item.startswith(("Action items:", "Action Items:", "### Action Items:")):
        return None  # leading sentence before an action item list
    item = re.sub(r"^\d+\.\s*", "", item)  # remove the numbering in front of the action item
    return item or None


# function constructing the XML file and cleaning up the GPT output
def process_text_file(file_path, file_name, temperatures: list, max_tokens: int, n_iterations: int = 3):
    """Extract the action items of one transcript and attach them to the XML tree.

    A <Meeting Name=file_name> element is appended to the notebook-level `root`
    element. For every temperature the model is queried `n_iterations` times;
    each run adds an <Iteration Number=i> element holding one <Item> element
    per extracted action item.

    Parameters
    ----------
    file_path : str
        Path of the transcript text file.
    file_name : str
        Name stored in the "Name" attribute of the <Meeting> element.
    temperatures : list of float
        Sampling temperatures to run the extraction with.
    max_tokens : int
        Maximum number of tokens the model may generate per chunk.
    n_iterations : int, optional
        Number of repeated runs per temperature. Defaults to 3.

    Returns
    -------
    None
        The result is written into the XML tree under `root`.
    """
    # set name for each meeting
    file_node = ET.SubElement(root, "Meeting", attrib={"Name": file_name})

    # the transcript and its segmentation do not depend on the temperature or the
    # iteration, so read and split the file once instead of once per run
    with open(file_path, 'r') as text:
        text_chunks = split_into_chunks(text)  # returns a list of strings

    # NOTE(review): the temperature is not recorded on the <Iteration> element, so
    # runs at different temperatures cannot be told apart in the XML when more
    # than one temperature is supplied.
    for temp in temperatures:
        for i in range(n_iterations):
            action_items = extract_action_items(text_chunks, temp, max_tokens)  # returns a single string

            iteration_node = ET.SubElement(file_node, "Iteration", attrib={"Number": str(i)})  # number iterations
            for line in action_items.split('\n'):  # one candidate action item per output line
                item_text = _clean_action_item(line)
                if item_text is None:
                    continue

                item_node = ET.SubElement(iteration_node, "Item")  # create <Item> tag for each action item
                item_node.text = item_text



In [81]:
# running all functions to create the XML file containing the action items extracted by the model
temperatures = [0.5]
max_tokens = 300

root = ET.Element("Meetings") # root <Meetings> element that every <Meeting> element is attached to
tree = ET.ElementTree(root) # tree wrapper used to serialise the root element to file

# os.listdir returns entries in arbitrary order; sort them so the meetings are
# processed (and written) in the same order on every run
for file_name in sorted(os.listdir(folder_path)): # for each file in our folder
    file_path = os.path.join(folder_path, file_name)
    if not os.path.isfile(file_path):
        continue # skip sub-directories, which cannot be opened as transcripts
    print("Processing: ", file_name) # print progress
    process_text_file(file_path, file_name, temperatures, max_tokens) # process the file; results are attached to root

    # rewrite the XML file after every meeting, so it always contains all meetings processed so far
    tree.write("GPT_action_items.xml", encoding='utf-8', xml_declaration=True)


Processing:  Bed002.txt


Processing:  Bed003.txt
Processing:  Bed004.txt
Processing:  Bed005.txt
Processing:  Bed006.txt
Processing:  Bed008.txt
Processing:  Bed009.txt
Processing:  Bed010.txt
Processing:  Bmr001.txt
Processing:  Bmr002.txt
Processing:  Bmr003.txt
Processing:  Bmr005.txt
Processing:  Bmr006.txt
Processing:  Bmr007.txt
Processing:  Bmr008.txt
Processing:  Bmr009.txt
Processing:  Bmr010.txt
Processing:  Bro003.txt
Processing:  Bro004.txt
Processing:  Bro005.txt
Processing:  Bro007.txt
Processing:  Bro008.txt
Processing:  Bro010.txt
Processing:  Bro011.txt
Processing:  Bro012.txt
