In [87]:
import numpy
import pandas as pd
import os
from collections import Counter
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [24]:
def preprocess(series):
    """Tokenize every utterance in ``series`` and wrap it in boundary markers.

    Each element is split on whitespace. An utterance with at least one token
    becomes ``['<START>', tok_1, ..., tok_n, '<END>']``; an utterance with no
    tokens (empty or whitespace-only string) becomes ``['<EMPTY>']``.

    Missing values (``None`` or float NaN, e.g. as pandas produces for blank
    CSV fields) are treated as empty utterances instead of failing with
    ``AttributeError`` on ``.split()``.

    Parameters
    ----------
    series : pandas.Series
        Raw utterance strings.

    Returns
    -------
    pandas.Series
        Same index as ``series``; every value is a list of tokens.
    """
    def is_missing(sent):
        # NaN is the only float that does not compare equal to itself.
        return sent is None or (isinstance(sent, float) and sent != sent)

    def delimit(sent):
        tokens = [] if is_missing(sent) else sent.split()
        if not tokens:
            return ['<EMPTY>']
        return ['<START>'] + tokens + ['<END>']

    return series.apply(delimit)

In [25]:
def process_seq(seq, mapping):
    """Translate a token sequence into the values stored in ``mapping``.

    Every element of ``seq`` is looked up with ``mapping[token]``, so a token
    that ``mapping`` cannot resolve propagates the lookup error (``KeyError``
    for a plain dict). Returns a new list with one entry per input token.
    """
    ids = []
    for token in seq:
        ids.append(mapping[token])
    return ids

In [26]:
def get_vocab(sents, top_words=None):
    """Build a vocabulary and word<->index lookup tables from tokenized sentences.

    Parameters
    ----------
    sents : iterable of iterable of str
        Tokenized sentences; every token occurrence is counted.
    top_words : int or None, optional
        When given, only the ``top_words`` most frequent tokens are kept.
        ``None`` (default) keeps every token.

    Returns
    -------
    vocab : set of str
        The retained tokens.
    word_to_idx : dict
        Maps each retained token to a contiguous id in ``0 .. len(vocab) - 1``,
        assigned in order of first appearance in ``sents``.
    idx_to_word : dict
        Inverse of ``word_to_idx``.

    Notes
    -----
    Tokens outside ``vocab`` get no id, so callers that encode text after
    limiting ``top_words`` must handle unknown tokens themselves. Ids start at
    0, meaning id 0 belongs to a real token; code that zero-pads sequences
    cannot tell padding apart from that token.
    """
    counts = Counter()
    for sent in sents:
        counts.update(sent)

    if top_words is not None:
        vocab = {word for word, _ in counts.most_common(top_words)}
    else:
        vocab = set(counts)

    # Index only the retained words so the lookup tables agree with `vocab`
    # (previously every counted word was indexed, which ignored `top_words`).
    # Iterating the Counter keeps first-seen order, so ids are unchanged when
    # `top_words` is None.
    kept = [word for word in counts if word in vocab]
    word_to_idx = {word: idx for idx, word in enumerate(kept)}
    idx_to_word = dict(enumerate(kept))

    return vocab, word_to_idx, idx_to_word

In [27]:
def load_data(path):
    """Read a dialog-act utterance file into a DataFrame of tokenized utterances.

    The file is parsed with a regex separator that splits on a tab or a
    semicolon. It has no header row; columns are named, in order,
    ``utterance_ID``, ``dialog_act``, ``utterance_t-3``, ``utterance_t-2``,
    ``utterance_t-1`` and ``utterance_t``. All fields are read as strings.

    Default NA detection is disabled: otherwise literal text such as ``NA``,
    ``null`` or ``nan`` (and blank fields) would be replaced by float NaN,
    corrupting labels and breaking tokenization. Cells that are still missing
    (e.g. rows with fewer fields than columns) are treated as empty text.

    Parameters
    ----------
    path : str
        Location of the utterance file.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``utterance_ID``; ``dialog_act`` holds the raw label string
        and the four utterance columns hold token lists from ``preprocess``.
    """
    # NOTE(review): every ';' is a field separator, so an utterance that
    # itself contains ';' would be split across columns - confirm the data
    # never does this.
    COLUMNS = ['utterance_ID', 'dialog_act', 'utterance_t-3',
               'utterance_t-2', 'utterance_t-1', 'utterance_t']
    utterance_cols = COLUMNS[2:]

    df = pd.read_csv(path, sep='\t|;', engine='python', names=COLUMNS,
                     dtype=str, keep_default_na=False).set_index('utterance_ID')
    # DataFrame.apply hands each utterance column to preprocess as a Series.
    df[utterance_cols] = df[utterance_cols].fillna('').apply(preprocess)
    return df

In [81]:
def process_data(data, max_len, top_words=None, word_to_idx=None, label_to_idx=None):
    """Turn a loaded utterance frame into padded id sequences and integer labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``utterance_t`` (token lists) and ``dialog_act`` (labels).
    max_len : int
        Target sequence length passed to ``pad_sequences`` as ``maxlen``.
    top_words : int or None, optional
        Forwarded to ``get_vocab`` when a vocabulary is built here.
    word_to_idx : dict or None, optional
        Existing token -> id table. When given, it is used as-is instead of
        building a new vocabulary from ``data``; pass the table obtained from
        the training split so other splits share the same ids.
    label_to_idx : dict or None, optional
        Existing label -> id table. When ``None`` the ids are assigned in
        order of first appearance in ``data['dialog_act']``.

    Returns
    -------
    (X, y), (vocab, word_to_idx, idx_to_word)
        ``X`` is the padded id matrix, ``y`` the label ids, followed by the
        vocabulary set and both lookup tables that were used.

    Raises
    ------
    ValueError
        If ``label_to_idx`` is supplied and ``data`` contains a label it does
        not cover.

    Notes
    -----
    Tokens without an id in ``word_to_idx`` are skipped (there is no dedicated
    unknown-token id). ``pad_sequences`` fills with 0 by default, which is
    also the id of a real token in tables produced by ``get_vocab``.
    """
    if word_to_idx is None:
        vocab, word_to_idx, idx_to_word = get_vocab(data['utterance_t'], top_words)
    else:
        vocab = set(word_to_idx)
        idx_to_word = {idx: word for word, idx in word_to_idx.items()}

    if label_to_idx is None:
        label_to_idx = {label: idx for idx, label in enumerate(data['dialog_act'].unique())}
    else:
        unseen = set(data['dialog_act'].unique()) - set(label_to_idx)
        if unseen:
            raise ValueError('labels missing from label_to_idx: %s'
                             % sorted(map(str, unseen)))

    def encode(seq):
        # Drop tokens that have no id rather than failing with KeyError.
        return process_seq([w for w in seq if w in word_to_idx], word_to_idx)

    X = sequence.pad_sequences(data['utterance_t'].apply(encode).tolist(),
                               maxlen=max_len)
    y = data['dialog_act'].map(label_to_idx).values

    return (X, y), (vocab, word_to_idx, idx_to_word)

In [29]:
# Read the training and validation splits (train first, then valid) and
# preview the first rows of the training frame.
train, val = map(load_data, ('da_tagging/utterances.train',
                             'da_tagging/utterances.valid'))
train.head()

Unnamed: 0_level_0,dialog_act,utterance_t-3,utterance_t-2,utterance_t-1,utterance_t
utterance_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2121_1,bc,[<EMPTY>],[<EMPTY>],[<EMPTY>],"[<START>, Okay, ,, uh, <END>]"
2121_2,qw,[<EMPTY>],[<EMPTY>],"[<START>, Okay, ,, uh, <END>]","[<START>, could, you, tell, me, what, you, thi..."
2121_3,h,[<EMPTY>],"[<START>, Okay, ,, uh, <END>]","[<START>, could, you, tell, me, what, you, thi...","[<START>, Well, ,, it, 's, hard, to, say, ., <..."
2121_4,s,"[<START>, Okay, ,, uh, <END>]","[<START>, could, you, tell, me, what, you, thi...","[<START>, Well, ,, it, 's, hard, to, say, ., <...","[<START>, I, mean, ,, while, it, 's, certainly..."
2121_5,qo,"[<START>, could, you, tell, me, what, you, thi...","[<START>, Well, ,, it, 's, hard, to, say, ., <...","[<START>, I, mean, ,, while, it, 's, certainly...","[<START>, What, do, you, think, ?, <END>]"


In [107]:
# Distinct dialog-act tags of the training split, in order of first
# appearance; a tag's position in this array is used as its integer id when
# predictions are decoded.
classes = train.dialog_act.unique()
num_classes = len(classes)
num_classes

31

In [59]:
# Length of the longest tokenized utterance in the training split
# (the <START>/<END> markers added by preprocess are counted).
max_len = train['utterance_t'].map(len).max()
max_len

106

In [83]:
# Encode the training split; this also builds the vocabulary lookup tables.
(X_train, y_train), (vocab, word_to_idx, idx_to_word) = process_data(train, max_len)

# Encode the validation split with the TRAINING lookups. Building a separate
# vocabulary / label table from `val` would give the same word or tag a
# different id than the model was trained on, making validation metrics
# meaningless.
# Label ids follow first appearance in the training labels, matching what
# process_data does for `train`.
label_to_idx = {label: idx for idx, label in enumerate(train['dialog_act'].unique())}
# Tokens never seen in training have no id and are skipped.
X_val = sequence.pad_sequences(
    [[word_to_idx[w] for w in sent if w in word_to_idx] for sent in val['utterance_t']],
    maxlen=max_len)
y_val = val['dialog_act'].map(label_to_idx).values
assert not pd.isnull(y_val).any(), 'validation split contains dialog acts unseen in training'

In [86]:
# Restrict the process to the first GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Width of each word vector. Kept equal to max_len to reproduce the recorded
# run, but it is independent of the sequence length.
# TODO: pick this as its own hyperparameter.
embedding_dim = max_len

model = Sequential()
# input_dim must cover every id in word_to_idx (ids run 0 .. len - 1);
# input_length is the padded sequence length, not the embedding width.
model.add(Embedding(len(word_to_idx), embedding_dim, input_length=max_len))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One output unit per dialog-act class (num_classes is computed from the
# training labels) instead of a hard-coded 31.
model.add(Dense(num_classes, activation='softmax'))
# Labels are integer ids, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()
history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                    epochs=3, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 106, 106)          2597530   
_________________________________________________________________
lstm_13 (LSTM)               (None, 100)               82800     
_________________________________________________________________
dense_14 (Dense)             (None, 31)                3131      
Total params: 2,683,461
Trainable params: 2,683,461
Non-trainable params: 0
_________________________________________________________________
None
Train on 196502 samples, validate on 20000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fe72c420550>

In [102]:
# Run the trained model over the training inputs; the softmax output layer
# yields one row of class scores per utterance.
predictions = model.predict(x=X_train)

In [114]:
# Decode each score row to the dialog-act tag with the highest score.
preds = []
for class_scores in predictions:
    preds.append(classes[class_scores.argmax()])