In [7]:
import keras 
from keras import layers
from keras.layers import Embedding, Dense, LSTM, Bidirectional, Input, Dense, Flatten
from keras.models import Sequential, Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import precision_recall_fscore_support as score
from keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np 
from tqdm import tqdm_notebook as tqdm
import json
import pandas as pd 
import os

In [8]:
data_set = pd.read_json('../../train_test/data_sets.json', encoding='utf-8')
#note: dataset_id = index + 1
data_description = data_set["description"].values

DIR = '../../data/golden_data'

X = []
Y = []

with open(DIR, 'r') as f:
	for line in f:
		line = line.strip().split()
		Y.append(int(line[0]))
		X.append(' '.join(line[1:]))

print (len(X), 'sampled loaded')

##X: strings of texts
##Y: dataset id mentioned in that string

27149 sampled loaded


In [9]:
# Add a sentence for no mention case
data_description = list(data_description)
data_description.insert(0, "There is no mention.")

#### Instead of adding a sentence for no mention, we can also train embedding vectors as a whole without uisng LSTM to run over the data description. But I think the data description provides some information and should have better performance.

In [10]:
len(data_description)

10349

In [11]:
maxlen = 200
vocab_size = 50000 ##more than 80K unique tokens
EMB_DIM = 50
HIDDEN_DIM = 256
EPOCHS = 5  ## train more epochs with GPU, it takes 1h per epoch on my CPU
NEG_RATIO = 3
BATCH_SIZE = 10
DATASET_CLASS = len(data_description) 
MODEL_NAME = "LSTM"

#actual batch size = BATCH_SIZE * (1 + NEG_RATIO)

In [12]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X+data_description)
X_seq = tokenizer.texts_to_sequences(X)
des_seq = tokenizer.texts_to_sequences(list(data_description))

word_index = tokenizer.word_index
print ("Found %s unique tokens."%len(word_index))

Found 81656 unique tokens.


In [13]:
data = pad_sequences(X_seq, maxlen=maxlen)
des = pad_sequences(des_seq, maxlen=maxlen)
labels = np.asarray(Y)

In [14]:
##randomly shuffle data and labels
##np.random.seed(0)
N = data.shape[0]
indices = np.arange(N)
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices] 

#### Note: if you want to reproduce the exact same result, you need to use the same dataset and the same random seed as mine.

In [15]:
## I set apart the last 2000 samples as test set
## 400 samples as validation
val_data = data[-1200 : -1000]
val_labels = labels[-1200 : -1000]
test_data = data[-1000 : ]
test_labels = labels[-1000 : ]
data = data[ : -1200]
labels = labels[ : -1200]

In [24]:
## I am not using Glove here, may get better results with Glove
def build_model():
    embedding_layer = Embedding(vocab_size, EMB_DIM, input_length=maxlen)
    article_input = Input(shape=(maxlen,), dtype='int32')
    article_emb = embedding_layer(article_input)
    
    dataset_input = Input(shape=(maxlen,), dtype='int32')
    dataset_emb = embedding_layer(dataset_input)
    
    article_lstm = LSTM(HIDDEN_DIM, dropout=0.2, recurrent_dropout=0.3)
    article_vector = article_lstm(article_emb)
    #vector shape: (batch_size, hidden_dim)
    
    dataset_lstm = LSTM(HIDDEN_DIM, dropout=0.2, recurrent_dropout=0.2)
    dataset_vector = dataset_lstm(dataset_emb)
    
    merged = layers.merge.dot([article_vector, dataset_vector], axes=1)
    #shape: (batch_size, 1)
    output = Dense(1, activation='sigmoid')(merged)
    
    model = Model([article_input, dataset_input], output)
#     dataset_lstm_model = Model(dataset_input, dataset_vector)
#     dataset_compare_model = Model([article_input, dataset_vector], output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model, dataset_lstm_model, dataset_compare_model
    return model 


In [17]:
##the batch for each x_sample contains one correct match,
##NEG_RATIO wrong matches, (negative sampling)
##the batch_size here means how many different x_samples in one batch
def generate_batch(x_samples, y_samples, datasets, batch_size, neg_ratio=3):
    total_size = batch_size*(1+NEG_RATIO)
    num_batches = len(x_samples) // batch_size
    while True:
        for batchIdx in range(0, num_batches):
            start = batchIdx * batch_size
            end = (batchIdx + 1) * batch_size
            article_batch = np.zeros(shape=(total_size, maxlen))
            dataset_batch = np.zeros(shape=(total_size, maxlen))
            outputs = np.zeros(shape=(total_size,))
            lineIdx = 0 ##index in the batch 
            
            ## fill in one batch
            for line in range(start, end):
                #each x is used (1+neg_ratio) times
                for i in range(1+NEG_RATIO):
                    if i == 0:
                        ## Add one correct match
                        article_batch[lineIdx] = x_samples[line]
                        dataset_idx = y_samples[line]
                        dataset_batch[lineIdx] = datasets[dataset_idx]
                        outputs[lineIdx] = 1
                        lineIdx += 1
                    else:
                        dataset_idx = np.random.randint(0, DATASET_CLASS)
                        while dataset_idx == y_samples[line]:
                            dataset_idx = np.random.randint(0, DATASET_CLASS)
                        article_batch[lineIdx] = x_samples[line]
                        dataset_batch[lineIdx] = datasets[dataset_idx]
                        outputs[lineIdx] = 0
                        lineIdx += 1
            
            ##can shuffle the batch here as well
            yield [article_batch, dataset_batch], outputs
    
    
    

In [18]:
def load_weights(model, weight_file_path):
    if os.path.exists(weight_file_path):
        model.load_weights(weight_file_path)

def get_weight_path(model_dir_path):
    if not os.path.exists(model_dir_path):
        os.makedirs(model_dir_path)
    return model_dir_path + '/' + MODEL_NAME + '-weights.h5'

In [19]:
##maybe the ratio I used is a bit big
def fit(model, epochs=EPOCHS, batch_size=BATCH_SIZE, neg_ratio=NEG_RATIO, model_dir_path=None):  
    if model_dir_path is None:
        model_dir_path = '../../models'
    weight_file_path = get_weight_path(model_dir_path)
    checkpoint = ModelCheckpoint(weight_file_path, save_best_only=True)
    earlystopping = EarlyStopping(monitor='val_loss', patience=2)
    train_gen = generate_batch(data, labels, des, batch_size, neg_ratio)
    val_gen = generate_batch(val_data, val_labels, des, batch_size, neg_ratio)
    train_num_batches = len(data) // batch_size
    val_num_batches = len(val_data) // batch_size
    history = model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches,
                                  epochs=epochs, verbose=1, callbacks=[checkpoint, earlystopping],
                                  validation_data=val_gen, validation_steps=val_num_batches)
    model.save_weights(weight_file_path)
    return history
    

In [20]:
#return the predicted mention and the confidence
#label 0 means no mention
def inference(test_data, datasets):
    scores = []
    labels = []
    for x in test_data:
        max_score = 0
        max_index = 0
        for i in tqdm(range(len(datasets))):
            ##batch_size here is 1
            s = model.predict([[x], [data[i]]])
            if s > max_score:
                max_score = s
                max_index = i
        scores.append(max_score)
        labels.append(max_index)
    return scores, labels

In [21]:
#to get all dataset vectors
def get_dataset_vectors(dataset_lstm_model, datasets):
    vectors = []
    for data in datasets:
        vectors.append(dataset_lstm_model.predict(dataset))
    return vectors

In [None]:
def quick_inference(dataset_compare_model, test_data, data_vectors):
    scores = []
    labels = []
    for data in test_data:
        max_score = 0
        max_index = 0
        for i in range(len(datasets)):
            s = dataset_compare_model.predict()

In [None]:
def evaluate(outputs, targets):
    precision, recall, fscore, support = score(targets, output)
    
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(fscore))

    return precision, recall, fscore

In [25]:
model = build_model()

In [None]:
fit(model)

In [47]:
#model.load_weights(get_weight_path('../../models'))
## The prediction is very slow because it computes dot product for every dataset
## it tales 10mins for one prediction, so i have to run it on GPU to evaluate
scores, output_labels = inference(test_data, des)




  0%|          | 0/10349 [00:00<?, ?it/s][A[A[A


  0%|          | 2/10349 [00:00<10:42, 16.11it/s][A[A[A


  0%|          | 4/10349 [00:00<10:08, 17.01it/s][A[A[A


  0%|          | 6/10349 [00:00<09:42, 17.76it/s][A[A[A


  0%|          | 9/10349 [00:00<08:57, 19.23it/s][A[A[A


  0%|          | 12/10349 [00:00<08:26, 20.42it/s][A[A[A


  0%|          | 14/10349 [00:00<08:39, 19.90it/s][A[A[A


  0%|          | 16/10349 [00:00<10:14, 16.82it/s][A[A[A


  0%|          | 18/10349 [00:01<10:50, 15.88it/s][A[A[A


  0%|          | 20/10349 [00:01<10:43, 16.06it/s][A[A[A


  0%|          | 22/10349 [00:01<10:56, 15.73it/s][A[A[A


  0%|          | 24/10349 [00:01<10:45, 15.99it/s][A[A[A


  0%|          | 27/10349 [00:01<10:27, 16.44it/s][A[A[A


  0%|          | 29/10349 [00:01<10:34, 16.28it/s][A[A[A


  0%|          | 31/10349 [00:01<10:37, 16.18it/s][A[A[A


  0%|          | 34/10349 [00:02<10:20, 16.62it/s][A[A[A


  0%|          | 3

KeyboardInterrupt: 




  4%|▍         | 410/10349 [00:38<15:40, 10.57it/s][A[A[A

In [None]:
evaluate(output_labels, test_labels)