In [102]:
import keras
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix

In [52]:
def get_sentence(file):
    data = []
    temp = []
    with open(file) as f:
        for line in f:
            text = line.split('\t')
            word = text[1][text[1].index('#')+2:]
            if word == "BOS":
                label = text[2][text[2].index('#') +2:]
            elif word == 'EOS':
                if '+' in label or label =='abbreviation':
                    temp = []
                    continue
                else:
                    data.append([temp, label])
                    temp = []
                    continue
            else:
                temp.append(word)
        return data

In [53]:
train_data = get_sentence('./atis.train.ctf.txt')

In [54]:
test_data = get_sentence('./atis.test.ctf.txt')


In [55]:
print(train_data[:2])

[[['i', 'want', 'to', 'fly', 'from', 'boston', 'at', '838', 'am', 'and', 'arrive', 'in', 'denver', 'at', '1110', 'in', 'the', 'morning'], 'flight'], [['what', 'flights', 'are', 'available', 'from', 'pittsburgh', 'to', 'baltimore', 'on', 'thursday', 'morning'], 'flight']]


In [56]:
def get_unigram_dict(train, test):
    unique_words = set()
    unique_label = set()
    for dataset in [train, test]:
        for line in dataset:
            for word in line[0]:
                unique_words.add(word)
            unique_label.add(line[1])
    return (unique_words, unique_label)

In [57]:
words_set, labels_set = get_unigram_dict(train_data, test_data)
words_list = list(words_set)
labels_list = list(labels_set)

In [58]:
print(words_list[:10])
print(labels_list)

['usa', 'dca', 'please', 'daily', 'route', 'economy', 'houston', 'bound', 'landings', 'must']
['flight_no', 'city', 'distance', 'airfare', 'meal', 'cheapest', 'aircraft', 'ground_service', 'restriction', 'capacity', 'ground_fare', 'airline', 'day_name', 'flight', 'airport', 'quantity', 'flight_time']


In [59]:
print(len(words_list))

898


In [63]:
def preprocessing(data, words_list, labels_list):
    X=[]
    y = []
    for line in data:
        temp =[0]*len(words_list)
        for i, word in enumerate(line[0]):
            temp[words_list.index(word)]+=1
        X.append(temp)
        temp_y = [0]*len(labels_list)
        temp_y[labels_list.index(line[1])]+=1
        y.append(temp_y)

    return(X, y)

        

In [64]:
X_train, y_train = preprocessing(train_data, words_list, labels_list)

In [69]:
print(y_train[0])
print(len(y_train[0]))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
17


In [70]:
X_test, y_test = preprocessing(test_data, words_list, labels_list)

# training on model

In [85]:
model = Sequential()
model.add(Dense(64, input_dim = len(words_list), activation='relu'))
model.add(Dense(16, input_dim = 64, activation='relu'))
model.add(Dense(len(labels_list), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(np.array(X_train), np.array(y_train), epochs = 20, verbose =1 )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x227ce4d05f8>

In [86]:
pred_temp = model.predict(np.array(X_test))

In [87]:
pred =[]
for line in pred_temp:
    line = line.tolist()
    temp = [0]*len(labels_list)
    temp[line.index(max(line))]=1
    pred.append(temp)
    

In [88]:
print(accuracy_score(np.array(y_test), np.array(pred)))

0.9289940828402367


In [101]:
print(precision_recall_fscore_support(np.array(y_test), np.array(pred), average='macro')[:-1])

(0.6738049331290883, 0.5729309856298727, 0.5743886763225182)


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


# checking

In [97]:
print(test_data[:4])
print(pred[:4])
print(y_test[:4])

[[['i', 'would', 'like', 'to', 'find', 'a', 'flight', 'from', 'charlotte', 'to', 'las', 'vegas', 'that', 'makes', 'a', 'stop', 'in', 'st.', 'louis'], 'flight'], [['on', 'april', 'first', 'i', 'need', 'a', 'ticket', 'from', 'tacoma', 'to', 'san', 'jose', 'departing', 'before', '7', 'am'], 'airfare'], [['on', 'april', 'first', 'i', 'need', 'a', 'flight', 'going', 'from', 'phoenix', 'to', 'san', 'diego'], 'flight'], [['i', 'would', 'like', 'a', 'flight', 'traveling', 'one', 'way', 'from', 'phoenix', 'to', 'san', 'diego', 'on', 'april', 'first'], 'flight']]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]]
