In [1]:
%load_ext autoreload
%aimport -keras
%autoreload 2

In [2]:
import random
import os
import matplotlib.pyplot as plt
import argparse
import keras
from utils import *
from models.lstm_classifier import LSTMClassifier
from models.gru_classifier import GRUClassifier

Using TensorFlow backend.


In [3]:
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

## Train the model

In [4]:
train = load_data('data/utterances.train')
val = load_data('data/utterances.valid')

In [5]:
train = pd.concat([train, val]).sample(frac=1) # Merge train & val then shuffle
val_split = len(val) / len(train)
val_split

0.09237789951132092

In [6]:
test = load_data('data/utterances.test', train=False)

In [7]:
classes = train['dialog_act'].unique()
num_classes = len(classes)
max_len = train['utterance_t'].apply(len).max()

In [8]:
(X_train, y_train), (vocab, word_to_idx, idx_to_word) = process_data(train, max_len)

In [9]:
test_vocab = [word for row in test['utterance_t'] for word in row if word not in vocab]
for word in test_vocab:
    idx = len(word_to_idx)
    word_to_idx[word] = idx
    idx_to_word[idx] = word
    vocab.add(word)

In [10]:
weights = {}
with open('/projekte/slu/share/emb/glove.twitter.27B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        line = line.split()
        word = line[0]
        if word in vocab:
            weights[word] = np.array(line[1:], dtype='float32')

In [11]:
# embds_path = '/projekte/slu/share/GoogleNews-vectors-negative300.bin' 
# weights = load_bin_vec(embds_path, word_to_idx)

In [12]:
num_dims = weights[random.choice(list(weights.keys()))].shape[0]
add_unknown_words(weights, word_to_idx, k=num_dims, min_df=0)
W, word_idx = get_W(weights, k=num_dims)

In [None]:
#model = GRUClassifier(num_classes, max_len, W)
model = LSTMClassifier(num_classes, max_len, W)

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])
print(model.summary())

## Test the model

In [None]:
%%time
history = model.fit(X_train, y_train, validation_split=val_split * 2,
                    epochs=3, batch_size=64)

In [19]:
model = keras.models.load_model('gru_100_epochs.h5')

In [20]:
X_test = sequence.pad_sequences(test['utterance_t'].apply(process_seq, args=[word_idx]),
                                maxlen=max_len)

In [21]:
pred = model.predict_classes(X_test)
pred

array([4, 0, 0, ..., 0, 0, 2])

In [22]:
counts = np.unique(pred, return_counts=True)
counts

(array([ 0,  1,  2,  3,  4,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
 array([8768,  296, 8321,    3,  375,  357,    4,   73,  539,   61,    2,
          18,   49,   32,   58,   33,   19,    3,   73,  122,    7,    7,
           7,    8,    6,  555,    3,   59,  141,    1]))

In [23]:
counts = np.unique(pred, return_counts=True)
pred_out = pd.Series(dict([(classes[i], c / len(pred))
                           for i,c in list(zip(counts[0], counts[1]))]), name='dialog_act')
train_out = (train.groupby('dialog_act').size() / len(train))
merged = pd.DataFrame(data=dict(pred_out=pred_out, train_out=train_out)).dropna()
merged['% dif'] = (merged['pred_out'] - merged['train_out']) * 10
merged['rank dif'] = ((merged['pred_out'].rank() - merged['train_out']
                       .rank())
                       .abs()
                       .astype(int))

In [24]:
merged.sort_values(by='train_out', ascending=False)

Unnamed: 0,pred_out,train_out,% dif,rank dif
s,0.4384,0.497256,-0.588564,0
b,0.41605,0.195615,2.204352,0
aa,0.01785,0.074276,-0.564265,3
%,0.01875,0.042974,-0.242242,1
%--,0.0148,0.038933,-0.241327,2
ba,0.00365,0.024342,-0.206916,4
qy,0.0001,0.022028,-0.219275,22
x,0.02695,0.016836,0.101141,4
bk,0.00015,0.013224,-0.130739,18
qw,0.00305,0.010319,-0.072686,2


In [25]:
for p, c in zip(pred[:20], test['utterance_t'].iloc[:20]):
    print(classes[p], c)

% ['<START>', 'But', 'once', 'we', 'once', 'we', "'ve", 'done', 'the', 'intellectual', 'part', 'of', 'these', ',', 'uh', ',', 'we', 'can', 'just', 'knock', 'them', 'out', ',', 'right', '?', '<END>']
s ['<START>', 'And', 'the', ',', 'uh', ',', 'Aurora', 'HTK', ',', 'it', 'was', 'like', 'twenty', '.', '<END>']
s ['<START>', 'Mmm', '.', '<END>']
b ['<START>', 'So', 'y', 'the', 'example', 'is', ',', '"', 'That', 'would', 'be', 'hard', '"', '<END>']
%-- ['<START>', 'Well', ',', 'or', 'if', 'you', "'re", 'a', 'C', 'programmer', '.', '<END>']
b ['<START>', 'if', 'they', "'re", 'right', 'next', 'to', 'one', 'another', '?', '<END>']
s ['<START>', 'That', "'s", 'interesting', '.', '<END>']
b ['<START>', 'I', "'d", 'expect', 'it', 'to', 'be', 'a', 'minor', 'effect', ',', '<END>']
b ['<START>', 'Mmm', '?', '<END>']
s ['<START>', 'Mm', '-', 'hmm', '.', '<END>']
s ['<START>', 'Alright', '.', '<END>']
b ['<START>', 'It', 'was', 'bo', 'it', 'both', 'times', 'the', 'same', 'person', '.', '<END>']
b ['<

In [26]:
test['prediction'] = [classes[p] for p in pred]
test.head()

Unnamed: 0_level_0,utterance_t-3,utterance_t-2,utterance_t-1,utterance_t,prediction
utterance_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bmr003_2578,I have to do,"[<START>, that, 's, true, <END>]","[<START>, but, we, haven't, spent, that, ,, ri...","[<START>, But, once, we, once, we, 've, done, ...",%
Bro027_44,"Um ,","[<START>, but, one, of, the, differences, that...","[<START>, Yep, ., <END>]","[<START>, And, the, ,, uh, ,, Aurora, HTK, ,, ...",s
Bro015_42,Oh so take a real VAD but apply it to to to th...,"[<START>, Uh, g, <END>]","[<START>, Yeah, ,, to, the, clean, and, take, ...","[<START>, Mmm, ., <END>]",s
Btr002_235,"boy , it 's hard to ig uh , ignoi that ignore ...","[<START>, Uh, -, huh, ., <END>]","[<START>, Um, ,, <END>]","[<START>, So, y, the, example, is, ,, "", That,...",b
Bmr016_1398,"In Monty Python you say "" argh "" a lot .","[<START>, Oh, yeah, ?, <END>]","[<START>, So, ., <END>]","[<START>, Well, ,, or, if, you, 're, a, C, pro...",%--


In [None]:
with open('3372489_sorensen_topic1_result.txt', 'w') as fout:
    for idx, p in test['prediction'].iteritems():
        fout.write(f'{idx} {p}\n')