In [1]:
from swda_github.swda import CorpusReader
# Reader over the corpus files at 'swda_github/swda'; every later cell pulls
# transcripts from it via corpus.iter_transcripts().
# NOTE(review): presumably this is the Switchboard Dialog Act data directory — confirm.
corpus = CorpusReader('swda_github/swda')

### First generate label ids (in order of frequency)

In [2]:
from collections import Counter

# Gather the DAMSL act tag of every utterance, one transcript at a time.
act_labels = []
for trans in corpus.iter_transcripts():
    act_labels.extend(utt.damsl_act_tag() for utt in trans.utterances)

# Rank the tags from most to least frequent; the counts themselves are discarded.
act_counter = Counter(act_labels)
act_set, _ = zip(*act_counter.most_common())

# A tag's integer id is its frequency rank (0 = most common tag).
act_dict = {tag: rank for rank, tag in enumerate(act_set)}

transcript 1155


### I've also hardcoded the mapping from raw label to id to description

In [3]:
from collections import defaultdict
from collections import Counter

def flatten(data):
    """Return one flat list holding every item of every sub-iterable in ``data``.

    Items keep their original order: all items of the first sub-iterable,
    then all items of the second, and so on. An empty ``data`` gives ``[]``.
    """
    return [utt for conv in data for utt in conv]

# (raw act tag, human-readable description) pairs, one per line.
# NOTE(review): the descriptions are copied verbatim into act_names, so their
# spelling is kept exactly as it was (e.g. 'declaritive', 'affermative').
act_pairs = [
    ('+','multi_segment'),
    ('sd','statement'),
    ('b','backchannel'),
    ('sv','opinion'),
    ('aa','agreement'),
    ('%','abandoned'),
    ('ba','appreciation'),
    ('qy','yes/no question'),
    ('x','non verbal'),
    ('ny','yes answers'),
    ('fc','closing remarks'),
    ('qw','wh-question'),
    ('nn','no answers'),
    ('bk','response acknowledgement'),
    ('h','hedge'),
    ('qy^d','declaritive yes/no question'),
    ('fo_o_fw_"_by_bc','other'),
    ('bh','back-channel question'),
    ('^q','quotation'),
    ('bf','summarize'),
    ('na','affermative yes/no answers'),
    ('ad','action-directive'),
    ('^2','collaborative Completion'),
    ('b^m', 'repeat phrase'),
    ('qo', 'open question'),
    ('qh', 'rhetorical question'),
    ('^h', 'hold before answer'),
    ('ar', 'reject'),
    ('ng', 'negative non-no answers'),
    ('br', 'signal non understanding'),
    ('no', 'other answers'),
    ('fp', 'conventional opening'),
    ('qrr', 'or clause'),
    ('arp_nd', 'dispreferred answers'),
    ('t3', '3rd-party-talk'),
    ('oo_co_cc', 'offers, options'),
    ('t1', 'self-talk'),
    ('bd', 'downplayer'),
    ('aap_am', 'maybe/accept-part'),
    ('^g', 'tag question'),
    ('qw^d', 'declarative wh-question'),
    ('fa', 'apology'),
    ('ft', 'thanking'),
]

# Re-key the descriptions by integer label id (looked up in act_dict) instead
# of by raw tag. A tag in act_pairs that is absent from act_dict raises KeyError.
act_names = {act_dict[code]: name for code, name in act_pairs}

# Peek at the first five (id, description) entries.
print(list(act_names.items())[:5])

[(3, 'multi_segment'), (0, 'statement'), (1, 'backchannel'), (2, 'opinion'), (5, 'agreement')]


### Then go through datasets and get all important data

In [4]:
import time

# One record per transcript: {'conv_id': 'sw<conversation_no>', 'turns': [...]},
# where each turn holds the disfluency-filtered text, the integer label id and
# the caller of a single utterance.
data = []
for trans in corpus.iter_transcripts():
    conv_id = 'sw' + str(trans.conversation_no)
    turns = [
        {
            'text': ' '.join(utt.text_words(filter_disfluency=True)),
            'label': act_dict[utt.damsl_act_tag()],
            'speaker': utt.caller,
        }
        for utt in trans.utterances
    ]
    data.append({'conv_id': conv_id, 'turns': turns})



transcript 1155


### Finally split the data into train/dev/test sets

In [5]:
def load_list(path: str) -> list:
    """Read the text file at ``path`` and return its lines as a list of strings.

    Only the newline character is removed from each line; any other
    whitespace is kept, and a blank line yields an empty string.
    """
    with open(path, 'r') as handle:
        return [line.rstrip('\n') for line in handle]

# Conversation ids belonging to each split, read one per line from id_splits/.
train_ids = load_list('id_splits/train_ids')
dev_ids   = load_list('id_splits/dev_ids')
test_ids  = load_list('id_splits/test_ids')


def _select_split(conversations, conv_ids):
    """Return the conversations whose 'conv_id' appears in ``conv_ids``.

    The ids are copied into a set first so each membership test is a hash
    lookup rather than a scan of the whole id list. The order of
    ``conversations`` is preserved in the result.
    """
    wanted = set(conv_ids)
    return [conv for conv in conversations if conv['conv_id'] in wanted]


# A conversation whose id is in none of the three lists is left out of every split.
train = _select_split(data, train_ids)
dev   = _select_split(data, dev_ids)
test  = _select_split(data, test_ids)

In [6]:
# Sanity check: number of conversations that ended up in the training split.
print(len(train))

1115


In [7]:
# Save the three splits and the label-id -> description mapping as JSON

In [8]:
import json

def save_json(path, conv_dict):
    """Serialise ``conv_dict`` as JSON into the file at ``path``.

    The file is opened in write mode, so existing content is replaced.
    ``conv_dict`` may be any JSON-serialisable object, not only a dict.
    """
    with open(path, "w") as sink:
        json.dump(conv_dict, sink)

# Write each split, plus the label-id -> description table, to its own file.
# Note: JSON object keys are strings, so the integer ids in act_names are
# stored as strings in labels.json.
for out_path, payload in (
    ('standard/train.json', train),
    ('standard/dev.json', dev),
    ('standard/test.json', test),
    ('standard/labels.json', act_names),
):
    save_json(out_path, payload)