In [1]:
import json
from pathlib import Path

In [2]:
def flatten(list_of_list):
    """Concatenate the items of every sublist into a single flat list.

    Items keep their original order: all items of the first sublist,
    then all items of the second, and so on.
    """
    flat = []
    for sublist in list_of_list:
        flat.extend(sublist)
    return flat

# Directories that hold one "<transcription_id>.json" file per transcription.
path_to_training, path_to_test = Path("training"), Path("test")

## training and test sets of transcription ids

In [3]:
# Every meeting id is expanded into four transcription ids by appending 'a'..'d'.
session_suffixes = 'abcd'

training_meetings = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
# Transcription ids that are dropped from the training split.
removed_from_training = {'IS1002a', 'IS1005d', 'TS3012c'}
training_set = [
    t_id
    for t_id in flatten([[meeting + suffix for suffix in session_suffixes] for meeting in training_meetings])
    if t_id not in removed_from_training
]

test_meetings = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
test_set = flatten([[meeting + suffix for suffix in session_suffixes] for meeting in test_meetings])

## naive_baseline: all utterances are predicted important (label 1)

In [4]:
# Naive baseline: every utterance of every test transcription gets label 1.
test_labels = {}
for transcription_id in test_set:
    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        n_utterances = len(json.load(file))

    # One label per utterance in the transcription.
    test_labels[transcription_id] = [1] * n_utterances

# Persist the predictions, keyed by transcription id.
with open("test_labels_naive_baseline.json", "w") as file:
    file.write(json.dumps(test_labels, indent=4))

## text_baseline: utterances are embedded with SentenceTransformer, then a decision-tree classifier is trained on the embeddings.

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sentence_transformers import SentenceTransformer
# Sentence-embedding model; `bert.encode` is used below to turn each
# "speaker: text" utterance string into a feature vector for the classifier.
bert = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm
.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 304kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 50.5kB/s]
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 4.03MB/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 307kB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 153kB/s]
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 17.2MB/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:00<00:00, 107MB/s] 
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 9.79kB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 41.6kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.84MB/s]
tokenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 638kB/s]
train_script.py: 100%|██████████| 13.2k/13.2k [00:00<00:00, 6.36MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.42MB/s]
modules.json: 100%|██████████| 349/34

In [8]:
def _utterance_texts(json_path):
    """Read one transcription JSON file and return its utterances as "speaker: text" strings."""
    with open(json_path, "r") as handle:
        utterances = json.load(handle)
    return [utterance["speaker"] + ": " + utterance["text"] for utterance in utterances]


# Labels for the training transcriptions, keyed by transcription id.
with open("training_labels.json", "r") as handle:
    training_labels = json.load(handle)

# Collect the utterance strings and their labels in the same transcription order.
training_texts = []
y_training = []
for transcription_id in training_set:
    training_texts.extend(_utterance_texts(path_to_training / f"{transcription_id}.json"))
    y_training.extend(training_labels[transcription_id])

# Embed all training utterances in one call, then fit the classifier on the embeddings.
X_training = bert.encode(training_texts, show_progress_bar=True)

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_training, y_training)

# Predict one label per utterance for every test transcription.
test_labels = {}
for transcription_id in test_set:
    X_test = bert.encode(_utterance_texts(path_to_test / f"{transcription_id}.json"))
    y_test = clf.predict(X_test)
    test_labels[transcription_id] = y_test.tolist()

Batches: 100%|██████████| 2270/2270 [00:24<00:00, 94.44it/s] 


In [9]:
# Persist the text-baseline predictions, keyed by transcription id.
with open("test_labels_text_baseline.json", "w") as file:
    file.write(json.dumps(test_labels, indent=4))

## F1-Score

In [10]:
from sklearn.metrics import f1_score

In [14]:
# Predictions written by the text baseline.
with open("test_labels_text_baseline.json", "r") as pred_file:
    y_pred = json.load(pred_file)

# NOTE(review): this file contains the naive baseline's output (label 1 for every
# utterance), not ground-truth annotations. `y_true` is therefore a reference
# baseline rather than true labels — confirm this comparison is intended.
with open("test_labels_naive_baseline.json", "r") as ref_file:
    y_true = json.load(ref_file)

In [15]:
# Concatenate the per-transcription label lists into two flat, aligned lists.
# The key order of `y_true` drives both, so position i refers to the same utterance.
y_true_labels, y_pred_labels = [], []

for key, reference_labels in y_true.items():
    y_true_labels += reference_labels
    y_pred_labels += y_pred[key]

In [18]:
# F1 of the text-baseline labels against the labels loaded into `y_true`.
# NOTE(review): `y_true` is read from test_labels_naive_baseline.json (all 1s), so this
# score is relative to the all-ones baseline, not to ground truth — confirm intended.
f1_score(y_true_labels, y_pred_labels)

0.3121072817778745