In [1]:
import keras

Using TensorFlow backend.


In [143]:
from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, GRU, ConvLSTM2D, Bidirectional

In [32]:
from gensim.models import KeyedVectors
import numpy as np

In [19]:
training_path = "../data/fasttext/dataset.train"
validation_path = "../data/fasttext/dataset.valid"
test_path = "../data/fasttext/dataset.test"

In [103]:
max_len = 8
batch_size = 32

In [21]:
print("Loading embeddings...")
vectorizer = load_embeddings()
print("Loaded embeddings")

Loading embeddings...
Loaded embeddings


In [5]:
def load_embeddings(): 
    vectorizer = KeyedVectors.load_word2vec_format('crawl-300d-2M.vec')
    return vectorizer

In [34]:
def pad_to(input_list, max_len): 
    unk = np.zeros(300)
    if len(input_list) < max_len: 
        diff = max_len - len(input_list)
        input_list = input_list + ([unk] * diff)
    else: 
        input_list = input_list[:max_len]
    return input_list

In [30]:
def load_data(vectorizer, path): 
    print("Loading", path)
    # List of list of vectors
    # Final dimension = # samples, max_len, vector_length
    X = [] 
    Y = [] 
    with open(path, "r") as data_file: 
        for line in data_file.readlines(): 
            comps = line.split(" ")
        
            if "0" in comps[0]: 
                label = 0 
            else: 
                label = 1

            Y.append(label)

            comps = comps[1:]
            sentence = []
            for token in comps: 
                try: 
                    sentence.append(vectorizer.get_vector(token))
                except: 
                    pass
            sentence = pad_to(sentence, max_len)
            X.append(sentence)

    return np.asarray(X), np.asarray(Y)

In [150]:
def train(): 
    print('Loading data...')

    x_train, y_train = load_data(vectorizer, training_path)
    x_valid, y_valid = load_data(vectorizer, validation_path)
    x_test, y_test = load_data(vectorizer, test_path)

    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('x_train shape:', x_train.shape, y_train.shape)
    print('x_test shape:', x_test.shape, y_test.shape)

    print('Build model...')
    model = Sequential()
    
    model.add(Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.4), input_shape=(8, 300)))
    model.add(Dense(1, activation='sigmoid'))

    # try using different optimizers and different optimizer configs
    model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

    print('Train...')
    
    model.fit(x_train, y_train,
            batch_size=batch_size,
            epochs=15,
            validation_data=(x_test, y_test))
    
    from sklearn.metrics import classification_report
    predicted = model.predict(x_test)
    predicted_bin = [round(pred[0]) for pred in predicted]
    report = classification_report(y_test, predicted_bin)
    print(report)
        
    model.save("bidirectional_keras_lstm.h5")
    
    print("Saved model")

In [151]:
train()

Loading data...
Loading ../data/fasttext/dataset.train
Loading ../data/fasttext/dataset.valid
Loading ../data/fasttext/dataset.test
4010 train sequences
859 test sequences
x_train shape: (4010, 8, 300) (4010,)
x_test shape: (859, 8, 300) (859,)
Build model...
Train...
Train on 4010 samples, validate on 859 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
             precision    recall  f1-score   support

          0       0.94      0.91      0.92       764
          1       0.41      0.51      0.45        95

avg / total       0.88      0.86      0.87       859

[[694  70]
 [ 47  48]]
Saved model
