In [1]:
from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


In [2]:
# Root folder holding the corpus and the pre-trained embedding file.
BASE_DIR = '../Datasets/'
# os.path.join avoids the doubled separator that plain string concatenation
# ('../Datasets/' + '/mpqa535/') used to produce.
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'mpqa535/')
# Dimensionality of the word vectors.
EMBEDDING_DIM = 300
# Fraction of the samples intended for validation.
VALIDATION_SPLIT = 0.2

In [3]:
# Read the pre-trained word vectors into a dict: word -> float32 vector.
# Each line of the file is "<word> <v1> <v2> ...", whitespace separated.
embeddings_index = {}
# The with-block guarantees the handle is closed even if a line fails to parse.
with open(os.path.join(BASE_DIR, 'wiki_extvec')) as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline): nothing to index.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [4]:
import csv


def load_labeled_texts(data_dir):
    """Collect (text, label) pairs from every CSV file found under data_dir.

    The directory is walked recursively. In each file the first row is
    treated as a header and skipped; for every other row, column 0 is the
    text and column 1 the class marker ('f' -> 0, anything else -> 1).
    Rows with fewer than two columns (e.g. blank lines) are ignored.

    Returns:
        A tuple (texts, labels) of two equally long lists.
    """
    loaded_texts = []
    loaded_labels = []
    for root, _dirs, filenames in os.walk(data_dir):
        for filename in filenames:
            with open(os.path.join(root, filename)) as csvfile:
                reader = csv.reader(csvfile)
                # next(reader, None) works on Python 2 and 3 (reader.next()
                # is Python 2 only) and does not raise on an empty file.
                next(reader, None)
                for row in reader:
                    if len(row) < 2:
                        continue
                    loaded_texts.append(row[0])
                    loaded_labels.append(0 if row[1] == 'f' else 1)
    return loaded_texts, loaded_labels


texts, labels = load_labeled_texts('../Datasets/mpqa535/')

In [5]:
# Fit the vocabulary on the whole corpus, then encode every text as a list
# of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index  # word -> integer id
sequences = tokenizer.texts_to_sequences(texts)

In [6]:
# Labels as a NumPy array, and the id sequences padded into one 2-D array
# (pad_sequences is called with its default settings).
y = np.asarray(labels)
X = pad_sequences(sequences)

In [7]:
# Shuffle samples and labels with the same permutation.
# A dedicated, seeded generator is used so the row order (and therefore the
# contents of any later split) is the same on every fresh run; the original
# unseeded global shuffle produced a different order each time.
SHUFFLE_SEED = 7
indices = np.random.RandomState(SHUFFLE_SEED).permutation(X.shape[0])
X = X[indices]
y = y[indices]

In [8]:
# Build the (vocabulary size + 1) x EMBEDDING_DIM weight matrix.
# Row i holds the pre-trained vector of the word whose id is i; the extra
# row accounts for index 0, which word_index never assigns here.
num_words = len(word_index)
embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row stays all zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [9]:
# Embedding layer initialised from the pre-trained matrix and kept frozen
# (trainable=False), sized to the padded sequence length of X.
embedding_layer = Embedding(
    input_dim=num_words + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=X.shape[1],
    trainable=False,
)

In [10]:
from keras import backend as K

def precision(y_true, y_pred):
    """Precision of rounded predictions against the true labels.

    Values are clipped to [0, 1] and rounded before counting, so a
    prediction counts as positive once it rounds to 1. K.epsilon() in the
    denominator keeps the ratio finite when nothing is predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def recall(y_true, y_pred):
    """Recall of rounded predictions against the true labels.

    Values are clipped to [0, 1] and rounded before counting. K.epsilon()
    in the denominator keeps the ratio finite when there are no positive
    labels.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def fbeta_score(y_true, y_pred, beta=1):
    """F-beta score built from the precision and recall metrics.

    Args:
        y_true: true labels.
        y_pred: predicted scores.
        beta: non-negative weight of recall relative to precision
            (0 means precision only, 1 gives the F1 score).

    Returns:
        (1 + beta^2) * p * r / (beta^2 * p + r + epsilon).

    Raises:
        ValueError: if beta is negative.
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # No explicit "no positive labels" branch: comparing a backend tensor
    # with a Python int in an `if` is not a reliable runtime test, and the
    # early `return 0` handed back a plain int instead of a tensor. With no
    # positive labels recall is 0 (epsilon-guarded), so the numerator, and
    # therefore the score, is already 0.
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    return (1 + bb) * (p * r) / (bb * p + r + K.epsilon())

def fmeasure(y_true, y_pred):
    """F1 score: fbeta_score with beta fixed to 1."""
    f1 = fbeta_score(y_true, y_pred, beta=1)
    return f1

In [11]:
from keras.models import Sequential
from keras.layers import LSTM

# Stratified k-fold training/evaluation of an LSTM classifier stacked on the
# frozen embedding layer.
seed = 7
np.random.seed(seed)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# One list per reported figure; every evaluated fold appends one value.
scores, accuracies, precs, recalls, fms = [], [], [], [], []
batch_size = 32

for train, test in kfold.split(X, y):
    # A fresh model per fold; the embedding layer object itself is shared.
    model = Sequential([
        embedding_layer,
        LSTM(EMBEDDING_DIM, dropout=0.3, recurrent_dropout=0.3),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', precision, recall, fmeasure])

    model.fit(X[train], y[train], epochs=15, batch_size=batch_size)

    # Unpacked as: loss first, then the metrics in the order given to compile().
    fold_results = model.evaluate(X[test], y[test], batch_size=batch_size)
    score, acc, prec, rec, fmeas = fold_results

    fold_labels = ('score', 'accuracy', 'precision', 'recall', 'fmeasure')
    for label, value in zip(fold_labels, fold_results):
        print(label, value)

    fold_histories = (scores, accuracies, precs, recalls, fms)
    for history, value in zip(fold_histories, fold_results):
        history.append(value)

    # NOTE(review): this break stops after the first fold, so the "Mean"
    # figures printed below are the values of a single fold, not an average
    # over all five. Remove it to run the full cross-validation.
    break

mean_report = (
    ('Mean score:', scores),
    ('Mean accuracy:', accuracies),
    ('Mean precision:', precs),
    ('Mean recall:', recalls),
    ('Mean fmeasure:', fms),
)
for label, history in mean_report:
    print(label, np.mean(history))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
score 0.513919462984
accuracy 0.801530153123
precision 0.793170667306
recall 0.856103162042
fmeasure 0.820325159039
Mean score: 0.513919462984
Mean accuracy: 0.801530153123
Mean precision: 0.793170667306
Mean recall: 0.856103162042
Mean fmeasure: 0.820325159039


In [13]:
# Persist the model currently bound to `model` (the one built and trained in
# the most recently executed fold of the cross-validation loop) to
# 'lstm_dependency.h5' in the working directory.
model.save('lstm_dependency.h5')