In [14]:
import os
import json
import re

from tqdm import tqdm

from keybert import KeyBERT
# Shared KeyBERT instance, constructed with no arguments; the processing
# cells in this notebook call `kw_model.extract_keywords` on it.
kw_model = KeyBERT()

Datasets: SuperSeg and TIAGE

In [38]:
# SuperSeg run configuration.
# Directory containing the segmentation_file_<split>.json inputs.
superseg_path = './datasets/superseg'
# Splits to process, in this order.
splits = ['train', 'validation', 'test']
# Number of keywords requested per topic segment.
top_n = 5
# Passed to KeyBERT as keyphrase_ngram_range.
ngram_range = (1, 1)


In [39]:
# Build one keyword-annotated record per topic segment of every SuperSeg
# dialogue and write each split to ./data/superseg_<split>.json.
#
# Reads the module-level names: kw_model, superseg_path, splits, top_n, ngram_range.
os.makedirs('./data', exist_ok=True)  # the dump below fails if the folder is missing

for split in tqdm(splits):
    with open(os.path.join(superseg_path, f'segmentation_file_{split}.json')) as f:
        data = json.load(f)
    dataset = {'data': []}
    for idx, current_dialog in enumerate(tqdm(data['dial_data']['superseg-v2'])):
        # Pass 1: group the turns into topic segments. A new segment starts
        # whenever a turn's topic_id differs from the running counter, and the
        # counter is then advanced by one.
        segments = [[]]
        current_topic_id = 0
        for item in current_dialog['turns']:
            if item['topic_id'] != current_topic_id:
                segments.append([])
                current_topic_id += 1
            segments[-1].append(item['utterance'].strip())

        # Pass 2: emit one record per segment. Iterating over the collected
        # segments (instead of flushing only when the *next* topic begins)
        # guarantees the last segment of the dialogue is written as well.
        for topic_id, current_topic_utterances in enumerate(segments):
            if not current_topic_utterances:
                # Nothing to extract keywords from (e.g. a dialogue without turns).
                continue
            keywords = kw_model.extract_keywords(' '.join(current_topic_utterances),
                                                 top_n=top_n,
                                                 keyphrase_ngram_range=ngram_range,
                                                 stop_words='english')
            # Keep only the first element of every returned entry.
            keywords = [word[0] for word in keywords]
            dataset['data'].append({"text": "\n".join(current_topic_utterances),
                                    "topic_id": topic_id,
                                    "keywords": ', '.join(keywords),
                                    "dialogue_id": idx})

    with open(f'./data/superseg_{split}.json', 'w') as f:
        json.dump(dataset, f)

100%|██████████| 6948/6948 [05:12<00:00, 22.25it/s]
100%|██████████| 1322/1322 [00:58<00:00, 22.73it/s]
100%|██████████| 1322/1322 [00:55<00:00, 23.73it/s]
100%|██████████| 3/3 [07:06<00:00, 142.16s/it]


In [36]:
# TIAGE run configuration.
ngram_range = (1, 1)   # passed to KeyBERT as keyphrase_ngram_range
top_n = 5              # number of keywords requested per topic segment
# NOTE: despite its name this points at the TIAGE data; the name is kept
# because the TIAGE processing cell reads `superseg_path`.
superseg_path = './datasets/tiage'
splits = ['train', 'validation', 'test']
# Characters removed from TIAGE utterances. The backslash is escaped
# explicitly: the previous "\|" was an invalid escape sequence, which newer
# Python versions warn about. The resulting list is unchanged.
bad_chars = list("<>:/\\|?!*")

In [37]:
# Build one keyword-annotated record per topic segment of every TIAGE
# dialogue and write each split to ./data/tiage_<split>.json.
#
# Reads the module-level names: kw_model, superseg_path (the TIAGE directory),
# splits, top_n, ngram_range, bad_chars.
os.makedirs('./data', exist_ok=True)  # the dump below fails if the folder is missing

for split in tqdm(splits):
    with open(os.path.join(superseg_path, f'segmentation_file_{split}.json')) as f:
        data = json.load(f)
    dataset = {'data': []}
    for idx, current_dialog in enumerate(tqdm(data['dial_data']['tiage'])):
        # Pass 1: group the turns into topic segments. A new segment starts
        # whenever a turn's topic_id differs from the running counter, and the
        # counter is then advanced by one.
        segments = [[]]
        current_topic_id = 0
        for item in current_dialog['turns']:
            if item['topic_id'] != current_topic_id:
                segments.append([])
                current_topic_id += 1
            # Every utterance is cleaned the same way; previously the first
            # utterance of each new topic skipped the bad_chars filter.
            text = ''.join(ch for ch in item['utterance'].strip() if ch not in bad_chars)
            segments[-1].append(text)

        # Pass 2: emit one record per segment. Iterating over the collected
        # segments (instead of flushing only when the *next* topic begins)
        # guarantees the last segment of the dialogue is written as well.
        for topic_id, current_topic_utterances in enumerate(segments):
            if not current_topic_utterances:
                # Nothing to extract keywords from (e.g. a dialogue without turns).
                continue
            keywords = kw_model.extract_keywords(' '.join(current_topic_utterances),
                                                 top_n=top_n,
                                                 keyphrase_ngram_range=ngram_range,
                                                 stop_words='english')
            # Keep only the first element of every returned entry.
            keywords = [word[0] for word in keywords]
            dataset['data'].append({"text": "\n".join(current_topic_utterances),
                                    "topic_id": topic_id,
                                    "keywords": ', '.join(keywords),
                                    "dialogue_id": idx})

    with open(f'./data/tiage_{split}.json', 'w') as f:
        json.dump(dataset, f)

100%|██████████| 300/300 [00:11<00:00, 26.56it/s]
100%|██████████| 100/100 [00:04<00:00, 24.87it/s]
100%|██████████| 100/100 [00:04<00:00, 24.79it/s]
100%|██████████| 3/3 [00:19<00:00,  6.46s/it]


In [8]:
# QMSum run configuration.
# Root directory of the QMSum data.
qmsum_path = './datasets/QMSum'
splits = ['train', 'validation', 'test']
# Number of keywords requested per topic segment.
top_n = 5
# Passed to KeyBERT as keyphrase_ngram_range.
ngram_range = (1, 1)
# Accumulator with one independent, initially empty list per split.
acadenic_ds = {split_name: [] for split_name in splits}

In [10]:
# Load a single meeting from QMSum/Academic/train for inspection.
academic_train_dir = os.path.join(qmsum_path, 'Academic', 'train')
# os.listdir returns entries in arbitrary order; sorting makes files[0]
# (and therefore the outputs of the cells below) the same on every run.
files = sorted(os.listdir(academic_train_dir))
with open(os.path.join(academic_train_dir, files[0])) as f:
    data = json.load(f)

In [12]:
# Display the 'topic_list' entry of the meeting loaded above.
data['topic_list']

[{'topic': 'Technical issues', 'relevant_text_span': [['0', '171']]},
 {'topic': 'Transcription pipeline', 'relevant_text_span': [['172', '372']]},
 {'topic': 'Options for carrying out transcription',
  'relevant_text_span': [['373', '877']]},
 {'topic': 'Transcription conventions and interfaces',
  'relevant_text_span': [['878', '1156']]},
 {'topic': 'Time cost of annotation and website',
  'relevant_text_span': [['1157', '1341']]},
 {'topic': 'Electronics', 'relevant_text_span': [['1342', '1744']]}]

In [28]:
# Split the loaded QMSum meeting into topic segments, extract keywords for each
# segment and store the result in acadenic_ds['train'].
#
# Reads the module-level names: data, kw_model, top_n, ngram_range, acadenic_ds.
# NOTE: re-running this cell appends the same meeting to acadenic_ds['train'] again.
text = [utter['content'] for utter in data['meeting_transcripts']]
new_dialog = []
for idx, topics in enumerate(data['topic_list']):
    # Span bounds are stored as strings and the end bound is inclusive, hence
    # the int() conversion and the +1 before slicing. Only the first span of
    # each topic is used.
    first_span = topics['relevant_text_span'][0]
    topic_start_idx, topic_end_idx = int(first_span[0]), int(first_span[1]) + 1
    segment = text[topic_start_idx:topic_end_idx]
    keywords = kw_model.extract_keywords(' '.join(segment).strip(),
                                         top_n=top_n,
                                         keyphrase_ngram_range=ngram_range,
                                         stop_words='english')
    # Keep only the first element of every returned entry.
    keywords = [word[0] for word in keywords]
    new_dialog.append({'text': segment,
                       'topic_id': idx,
                       'keywords': keywords})
    # A leftover debugging `break` used to end the loop here, so only the
    # first topic of the meeting was ever processed.

acadenic_ds['train'].append(new_dialog)

In [29]:
# Display everything accumulated in acadenic_ds so far.
acadenic_ds

{'train': [[{'text': ['OK , this is one channel . Can you uh , say your name and talk into your mike one at a time ?',
     'This is Eric on channel three , I believe .',
     "OK . Uh , I don't think it 's on there , Jane .",
     'Tasting one two three , tasting .',
     'OK , this is Jane on channel five .',
     "Uh , I still don't see you Jane .",
     'Oh , darn , what am I doing wrong ?',
     'Can you see me on channel four ? Really ?',
     'Yeah , I s',
     'My lucky day .',
     'Uh , screen no , {disfmarker} it is , oh , maybe it just warmed up ?',
     'No .',
     "Oh , darn , can you can't see channel five yet ?",
     "Uh , well , the mike isn't close enough to your mouth , so .",
     'Oh , this would be k OK , is that better ?',
     'S uh , try speaking loudly ,',
     'I like the high quality labelling .',
     'so ,',
     'Hello ,',
     'OK , good .',
     'David , can we borrow your labelling machine to improve the quality of the labelling a little bit here ?',