In [106]:
import keras 
from keras import layers
from keras import regularizers
from keras.layers import Embedding, Dropout, Dense, LSTM, Bidirectional, Input, Dense, Flatten, Conv1D, GlobalMaxPooling1D, Permute, Lambda
from keras.models import Sequential, Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import precision_recall_fscore_support as score
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import backend as K 
import numpy as np 
from tqdm import tqdm_notebook as tqdm
import json
import pandas as pd 
import os

def data_loader(file):
    with open(file, 'r') as f:
        sents = []
        labels = []
        for line in f:
            line = line.strip().split()
            labels.append(int(line[2]))
            sents.append(' '.join(line[4:]))
    return sents, labels

In [38]:
train_sents, train_labels = data_loader("../../data/train.txt")
val_sents, val_labels = data_loader("../../data/validate.txt")
test_sents, test_labels = data_loader("../../data/test.txt")

# Add a sentence for no mention case
data_description = list(data_description)
data_description.insert(0, "There is no mention.")


In [39]:
maxlen = 100
mention_len = 50
emb_dim = 50
HIDDEN_DIM = 256
EPOCHS = 10  
NEG_RATIO = 3
BATCH_SIZE = 10
MODEL_NAME = "LSTM"
DATASET_CLASS = max(train_labels+val_labels+test_labels)+1

In [40]:
##only first 1330 datasets have mentions

In [41]:
datasets = pd.read_json('../../../train_test/data_sets.json', encoding='utf-8')
dataset_mention = datasets["mention_list"].values

In [42]:
mentions = dataset_mention[: DATASET_CLASS-1]

In [43]:
def choose_longest(mention):
    if len(mention)==0:
        return "no mention"
    else:   
        idx = np.argmax(np.asarray([len(a.strip().split()) for a in mention]))
        return mention[idx]

In [44]:
longest_mentions = [choose_longest(m) for m in mentions]
longest_mentions = ["no mention"] + longest_mentions

In [45]:
len(longest_mentions)

1331

In [46]:
##load glove
embedding_index = {}
f = open('../glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embedding_index[word] = coefs
f.close()

###NOT using dataset info anymore
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sents+val_sents+test_sents+longest_mentions)
X_train = tokenizer.texts_to_sequences(train_sents)
X_val = tokenizer.texts_to_sequences(val_sents)
X_test = tokenizer.texts_to_sequences(test_sents)
long_mentions = tokenizer.texts_to_sequences(longest_mentions)

word_index = tokenizer.word_index
vocab_size = len(word_index)+1
print ("vocab size: ", vocab_size)

embedding_matrix = np.zeros((vocab_size, emb_dim))
counter = 0
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        counter += 1
    else:
        embedding_matrix[i] = np.random.randn(emb_dim)
print ("{}/{} words covered in glove".format(counter, vocab_size))


vocab size:  50349
31230/50349 words covered in glove


In [47]:
max([len(m) for m in long_mentions])
## mention_len=10

75

In [48]:
X_train = pad_sequences(X_train, maxlen=maxlen)
X_val = pad_sequences(X_val, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)
Y_train = np.asarray(train_labels)
Y_val = np.asarray(val_labels)
Y_test = np.asarray(test_labels)
Y_train = keras.utils.to_categorical(Y_train, num_classes=DATASET_CLASS)
Y_val = keras.utils.to_categorical(Y_val, num_classes=DATASET_CLASS)
Y_test = keras.utils.to_categorical(Y_test, num_classes=DATASET_CLASS)

long_men = pad_sequences(long_mentions, maxlen=mention_len)

In [49]:
##randomly shuffle data and labels
##np.random.seed(0)
N = X_train.shape[0]
indices = np.arange(N)
np.random.shuffle(indices)
X_train = X_train[indices]
Y_train = Y_train[indices] 


In [97]:
long_men.shape

(1331, 50)

In [133]:
def build_model():
    embedding_layer = Embedding(vocab_size, emb_dim, weights=[embedding_matrix], input_length=maxlen, trainable=False)
    article_input = Input(shape=(maxlen,), dtype='int32')
    article_emb = embedding_layer(article_input)
    
    article_lstm = LSTM(HIDDEN_DIM, dropout=0.2, recurrent_dropout=0.3)
    article_vector = article_lstm(article_emb)
    #vector shape: (batch_size, hidden_dim)
    
    ##mention input: (DATASET_CLASS, mention_len)
    mentions = K.constant(long_men)
    mention_emb = embedding_layer(mentions)
    ##shape: (DATASET_CLASS, mention_len, emb_dim)
    
    mention_vec = Conv1D(emb_dim, 5)(mention_emb)
    mention_vec = GlobalMaxPooling1D()(mention_vec)
    ##shape: (DATASET_CLASS, emb_dim)
    
    article_vector = Dense(emb_dim, kernel_regularizer=regularizers.l2(0.01))(article_vector)
    ##shape: (batch_size, emb_dim)
    
    mention_vec = Lambda(lambda x: K.permute_dimensions(x, (1,0)))(mention_vec)
    ##shape: (emb_dim, DATASET_CLASS)
    
    def dot(inp):
        x = inp[0]
        y = inp[1]
        return K.dot(x, y)
    
    preds = Lambda(dot)([article_vector, mention_vec])
    ##shape: (batch_size, DATASET_CLASS)
    
    output = Dense(DATASET_CLASS, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01))(preds)
    ##Just to add a layer of sigmoid
    
    model = Model(article_input, output)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model 


In [134]:
model = build_model()

AttributeError: 'NoneType' object has no attribute '_inbound_nodes'