In [112]:
from __future__ import print_function

from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, TimeDistributed, Conv2D, Lambda
from keras.layers import add, dot, concatenate
from keras.layers import LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.initializers import Constant
import numpy as np

from dataproc_utils import *
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [43]:
from keras import backend

In [124]:
batch_size = 64
epochs = 25
random_state = 42
body_size = 15 # paragraph max length
claim_size = 15 # claim max length
embedding_dim = 25
output_size = 4  # size of the output vector
TEST_SIZE = .2

In [125]:
# open saved wordvecs from file and make id dicts
w2v = load_wordvecs('twitter_glo_vecs\\wordvecs25d.txt')
print(len(w2v), 'pretrained embeddings')

17355 pretrained embeddings


In [126]:
# load data and labels
data = load_proc_data('train_bodies.txt', 'train_claims.txt', split_pars=True)
labels = [label for body, claim, label in data]
y = np.array(labels)

# print('First input tuple (body, claim, stance):\n', data[0])

In [127]:
# train/validation split
train_data, val_data, train_labels, val_labels = train_test_split(data, y, test_size=TEST_SIZE, random_state=random_state)

In [None]:
# create a vocabulary dict from train data
word2freq = make_word_freq_V(train_data)
word2index = word2idx(word2freq, pretrained=w2v)

vocab_size = len(word2index)
print('Vocab size:', vocab_size, 'unique words in the train set')

In [87]:
# vectorize input words (turn each word into its index from the word2index dict)
# for new words in test set that don't appear in train set, use index of <unknown>
train_body, train_claim = vocab_vectorizer(train_data, word2index, max_par_len=body_size, max_claim_len=claim_size)
val_body, val_claim = vocab_vectorizer(val_data, word2index, max_par_len=body_size, max_claim_len=claim_size)

In [88]:
train_claim.shape

(39977, 12)

In [89]:
train_body.shape

(39977, 9, 20)

In [90]:
# prepare embedding matrix
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
for w, i in word2index.items():
    #print(i, w2v[w])
    embedding_matrix[i] = w2v[w]

# d = (X, W, E)

In [101]:
# load pre-trained word vectors into embedding layers
# we set trainable to false to keep the embeddings fixed
embedding_body = Embedding(vocab_size + 1,
                            embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=(9, body_size,),
                            trainable=False)

In [102]:
embedding_claim = Embedding(vocab_size + 1,
                            embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=claim_size,
                            trainable=False)

In [103]:
# initialize placeholders and embed pre-trained word vectors
input_body = Input(shape=(9, body_size,), dtype='int32')
input_claim = Input(shape=(claim_size,), dtype='int32')

In [104]:
embedded_body = embedding_body(input_body)
embedded_claim = embedding_claim(input_claim)

# CNN

In [115]:
CNN_FILTERS = 100
CNN_FILTER_SIZE = 5

## TimeDistributed for paragraphs

In [116]:
cnn_body = TimeDistributed(
  Conv1D(CNN_FILTERS, CNN_FILTER_SIZE, padding='same', activation='relu')
)(embedded_body)
cnn_body = Lambda(lambda x: backend.max(x, axis=None, keepdims=True))(cnn_body)

## Normal for claims

In [119]:
cnn_claim = Conv1D(100, 5, padding='same', activation='relu')(embedded_claim)
cnn_claim = MaxPooling1D(5, padding='same')(cnn_claim)

# LSTM

## TimeDistributed for paragraphs

In [121]:
# train two lstms
lstm_body = TimeDistributed(LSTM(100))(embedded_body)

## Normal for claims

In [122]:
lstm_claim = LSTM(100)(embedded_claim)

# INFERENCE

In [123]:
p_lstm = dot([backend.transpose(lstm_claim), lstm_body], axes=(0, 1))
p_lstm = Activation('softmax')(p_lstm)  # shape: (samples, body_size, claim_size)

ValueError: Dimension incompatibility 100 != 9. Layer shapes: (100, None), (None, 9, 100)

In [37]:
p_cnn = dot([lstm_body, backend.transpose(lstm_claim)], axes=(0, 1))
p_cnn = Activation('softmax')(p_cnn)  # shape: (samples, body_size, claim_size)

In [None]:
output = ###

In [39]:
output = Dense(128, activation='relu')(output)
preds = Dense(output_size, activation='softmax')(output)

ValueError: Layer dense_3 was called with an input that isn't a symbolic tensor. Received type: <class 'keras.layers.core.Dense'>. Full input: [<keras.layers.core.Dense object at 0x000002065C138BA8>]. All inputs to the layer should be tensors.

In [None]:
# build the model
model = Model([input_body, input_claim], preds)
model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


# train
model.fit(train_body, train_labels,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(val_body, val_labels))

# print model summary
print(model.summary())