University of Aberdeen\
Atanas Komsiyski
## Evaluating GPT-3.5-turbo for Action Item Extraction in Meeting Transcripts

This Jupyter notebook contains the preprocessing pipeline for meeting transcripts from the ICSI corpus.\
It processes the MRT files from the dataset into human-readable TXT files.


#### Libraries
To ensure all libraries are installed before executing the notebook, run `pip install -r requirements.txt`.

In [25]:
# importing libraries
import os
import xml.etree.ElementTree as ET
import re

In [26]:
# main function processing the MRT files to human-readable TXTs
def MRTtoTXT(xml_file):
    """Yield one "Participant: text" line per speaker turn of an MRT transcript.

    The MRT (XML-like) file is parsed and its Segment elements are visited in
    document order. Consecutive usable Segments from the same participant are
    merged into a single turn, their texts separated by one space.

    Segments that are skipped:
      * those whose DigitTask attribute equals "true" (case-insensitive);
      * those without a Participant attribute or without any text left
        after stripping surrounding whitespace.

    Args:
        xml_file: the MRT source, handed unchanged to ET.parse.

    Yields:
        str: a formatted turn, e.g. "Name: This is a test."
    """
    speaker = None  # participant of the turn currently being built
    pieces = []     # stripped Segment texts collected for that turn

    for seg in ET.parse(xml_file).getroot().findall('.//Segment'):
        # Segments that belong to the Digits Task are left out entirely
        if seg.get('DigitTask', '').lower() == 'true':
            continue

        who = seg.get('Participant', '')
        spoken = ''.join(seg.itertext()).strip()  # includes text of nested elements
        if not (who and spoken):
            continue

        if who == speaker:
            # same speaker keeps talking: extend the current turn
            pieces.append(spoken)
            continue

        # speaker changed (or this is the first usable Segment):
        # emit the finished turn, then start a new one
        if pieces:
            turn = " ".join(pieces)
            yield f"{speaker}: {turn}"
        speaker = who
        pieces = [spoken]

    # emit the turn that was still open when the Segments ran out
    if pieces:
        turn = " ".join(pieces)
        yield f"{speaker}: {turn}"



In [None]:

# function to iterate through folder and process using the MRT to TXT function
def process_folder(input_folder, output_folder):
    """Convert every .mrt file in input_folder into a .txt file in output_folder.

    Each file whose name ends in '.mrt' is run through MRTtoTXT; the yielded
    turns are joined with newlines, whitespace is normalised, underscores are
    removed, and the result is written to '<same base name>.txt'. Files that
    produce no text are not written.

    Args:
        input_folder: folder that is listed (non-recursively) for .mrt files.
        output_folder: folder receiving the .txt files; created, including
            missing parent folders, if it does not exist yet.
    """
    # exist_ok=True avoids the race between checking for the folder and creating it
    os.makedirs(output_folder, exist_ok=True)

    # process each file from the input folder
    for filename in os.listdir(input_folder):
        if not filename.endswith('.mrt'):
            continue

        input_file_path = os.path.join(input_folder, filename)
        # create a txt file with the same name as the original MRT file
        output_file_path = os.path.join(output_folder, os.path.splitext(filename)[0] + '.txt')

        # extract text from MRT file, one speaker turn per line
        text_content = "\n".join(MRTtoTXT(input_file_path))

        # collapse every run of whitespace other than newline (spaces, tabs, ...)
        # into a single space; newlines are kept so turns stay on separate lines
        text_content = re.sub(r"[^\S\n]+", " ", text_content)
        # remove the underscores used inside abbreviations in the text
        text_content = text_content.replace("_", "")

        # write text content to the new .txt file if there is any text
        if text_content:
            # explicit UTF-8 so the output does not depend on the platform's
            # default encoding and non-ASCII characters cannot fail to encode
            with open(output_file_path, 'w', encoding='utf-8') as txt_file:
                txt_file.write(text_content)

# path to the input folder containing MRT files
# (relative path: resolved against the current working directory)
input_folder_path = 'ICSI_original_transcripts/transcripts'

# path to the output folder where .txt files will be saved
# (a subfolder of the input folder; process_folder only picks up names ending in '.mrt',
# so this folder itself is not treated as input)
output_folder_path = 'ICSI_original_transcripts/transcripts/txt_transcripts'

# processing the input folder; this call runs as soon as the cell is executed
process_folder(input_folder_path, output_folder_path)

