In [1]:
import json

# Load the dataset: the file is read line by line, one JSON document
# (one meeting) per line.
# Please set `data_path` to the location of your data.
split = 'train'
data_path = '/path/to/data/ALL/jsonl/' + split + '.jsonl'
# Pass the encoding explicitly: without it, open() decodes with the platform's
# locale encoding, which can mis-read UTF-8 JSON on some systems.
with open(data_path, encoding='utf-8') as f:
    # Skip whitespace-only lines (e.g. a trailing empty line at end of file);
    # json.loads would raise on them.
    data = [json.loads(line) for line in f if line.strip()]
n_meetings = len(data)
print('Total {} meetings in the {} set.'.format(n_meetings, split))

Total 162 meetings in the train set.


In [2]:
from nltk import word_tokenize
# tokenize a sentence
def tokenize(sent):
    """Lowercase `sent`, split it with NLTK's `word_tokenize`, and return the
    tokens joined back into one string separated by single spaces."""
    lowered = sent.lower()
    return ' '.join(word_tokenize(lowered))

In [3]:
# filter some noise caused by speech recognition
def clean_data(text):
    """Return `text` with transcription markers removed and spelled-out
    acronyms collapsed.

    The substitutions are applied one after another, in the order listed.
    A marker is only removed together with the single space that follows it,
    so a marker at the very end of `text` is left untouched.
    """
    substitutions = (
        ('{ vocalsound } ', ''),
        ('{ disfmarker } ', ''),
        ('a_m_i_', 'ami'),
        ('l_c_d_', 'lcd'),
        ('p_m_s', 'pms'),
        ('t_v_', 'tv'),
        ('{ pause } ', ''),
        ('{ nonvocalsound } ', ''),
        ('{ gap } ', ''),
    )
    cleaned = text
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)
    return cleaned

In [4]:
# Build BART inputs where the source is the entire content of the meeting.
# Each query of a meeting (general queries first, then specific ones) is
# paired with that meeting's full tokenized transcript.
bart_data = []
for meeting in data:
    # flatten the meeting into space-joined "speaker: tokenized content" turns
    transcript = ' '.join(
        turn['speaker'].lower() + ': ' + tokenize(turn['content'])
        for turn in meeting['meeting_transcripts']
    )
    # both query lists are handled the same way; the tuple fixes their order
    for list_key in ('general_query_list', 'specific_query_list'):
        for qa in meeting[list_key]:
            question = tokenize(qa['query'])
            bart_data.append({
                'src': clean_data('<s> ' + question + ' </s> ' + transcript + ' </s>'),
                'tgt': tokenize(qa['answer']),
            })

print('Total {} query-summary pairs in the {} set'.format(len(bart_data), split))
# show one sample pair as a sanity check
print(bart_data[2])
# write one JSON object per line
with open('/path/to/data/bart_' + split + '.jsonl', 'w') as out_file:
    for record in bart_data:
        print(json.dumps(record), file=out_file)

Total 1257 query-summary pairs in the train set
{'src': "<s> what did user interface introduce about the detailed design of the prototype ? </s> project manager: welcome back . industrial designer: i 'm sorry to be late . project manager: welcome back everybody . user interface: yeah . thanks . project manager: so this meeting agenda will be the detailed design meeting . and uh opening and uh pms of the meet minutes , uh prototype presentation from uh christine and uh agnes . industrial designer: agnes , yes . project manager: yes and uh evaluation criteria . the finance , it 's uh from my side , from the management , and uh production evaluation . then uh closing . so we have forty minutes to discuss and uh finalise and close the product and project and to move further , okay , so okay , let 's talk about uh maybe first uh for the prototype . user interface: mm , okay . project manager: so i handle to user interface: i 've done a presentation , but it pretty much covers work that we '

In [5]:
# Build BART inputs where the source is the gold span corresponding to each
# query. General queries use the entire meeting as source; specific queries
# use only the turns listed in their 'relevant_text_span'.
bart_data_gold = []
for meeting in data:
    # Tokenize every turn exactly once per meeting. Both the full transcript
    # and the gold spans below are assembled from this list, so turns inside a
    # span are not tokenized again for every query that references them.
    turns = [
        turn['speaker'].lower() + ': ' + tokenize(turn['content'])
        for turn in meeting['meeting_transcripts']
    ]
    entire_src = ' '.join(turns)
    for qa in meeting['general_query_list']:
        query = tokenize(qa['query'])
        bart_data_gold.append({
            'src': clean_data('<s> ' + query + ' </s> ' + entire_src + ' </s>'),
            'tgt': tokenize(qa['answer']),
        })
    for qa in meeting['specific_query_list']:
        query = tokenize(qa['query'])
        # collect the turns covered by the gold spans, in span order
        span_turns = []
        for span in qa['relevant_text_span']:
            assert len(span) == 2
            st, ed = int(span[0]), int(span[1])
            # Span bounds are inclusive turn indices. Index turn by turn
            # (rather than slicing) so an out-of-range bound still raises
            # IndexError instead of being silently clipped.
            span_turns.extend(turns[k] for k in range(st, ed + 1))
        src = ' '.join(span_turns)
        bart_data_gold.append({
            'src': clean_data('<s> ' + query + ' </s> ' + src + ' </s>'),
            'tgt': tokenize(qa['answer']),
        })

print('Total {} query-summary pairs in the {} set'.format(len(bart_data_gold), split))
# show one sample pair as a sanity check
print(bart_data_gold[2])
# NOTE(review): this produces "bart_<split>._gold.jsonl"; the "." before
# "_gold" looks like a typo, but the path is kept unchanged in case other
# scripts read this exact file name -- confirm before renaming.
with open('/path/to/data/bart_' + split + '._gold.jsonl', 'w') as f:
    # write one JSON object per line
    for record in bart_data_gold:
        print(json.dumps(record), file=f)

Total 1257 query-summary pairs in the train set
{'src': "<s> what did user interface introduce about the detailed design of the prototype ? </s> industrial designer: i 'm sorry to be late . project manager: welcome back everybody . user interface: yeah . thanks . project manager: so this meeting agenda will be the detailed design meeting . and uh opening and uh pms of the meet minutes , uh prototype presentation from uh christine and uh agnes . industrial designer: agnes , yes . project manager: yes and uh evaluation criteria . the finance , it 's uh from my side , from the management , and uh production evaluation . then uh closing . so we have forty minutes to discuss and uh finalise and close the product and project and to move further , okay , so okay , let 's talk about uh maybe first uh for the prototype . user interface: mm , okay . project manager: so i handle to user interface: i 've done a presentation , but it pretty much covers work that we 've both done , so if i 'm missin