In [67]:
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation, Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D


In [26]:
# Read the train and test splits from their CSV files (paths are relative to
# the notebook's working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [14]:
# One-hot encode the 'Stance' column of both splits.
# NOTE(review): assumes 'Stance' holds non-negative integer class ids - confirm.
#
# The number of columns is taken from the training labels and passed explicitly
# to both calls. Left to infer it on its own, to_categorical uses max(label) + 1
# per call, so the test matrix would come out narrower than the training matrix
# whenever the highest class id happens to be absent from the test split.
num_classes = int(np.max(train['Stance'])) + 1
train_labels = to_categorical(np.array(train['Stance']), num_classes=num_classes)
test_labels = to_categorical(np.array(test['Stance']), num_classes=num_classes)

In [28]:
# Fit the vocabulary on the training text only, then convert the
# 'combinedText' column of each split into lists of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['combinedText'])
train_sequence, test_sequence = (
    tokenizer.texts_to_sequences(split['combinedText'])
    for split in (train, test)
)

In [38]:
# Fixed sequence length used for padding/truncation and as the model's input length.
# A hard-coded cap is used instead of the true maximum, which could be computed as
#   max(len(x) for x in train_sequence)
longest_sequence = 200

In [39]:
# Bring every sequence to exactly `longest_sequence` entries: shorter ones are
# padded at the end, longer ones are cut at the end.
train_pad, test_pad = (
    pad_sequences(
        sequences,
        maxlen=longest_sequence,
        padding='post',
        truncating='post',
    )
    for sequences in (train_sequence, test_sequence)
)

In [53]:
# Width of each word vector.
# NOTE(review): presumably has to match the "200d" GloVe file below - confirm.
embedding_dim = 200

# GloVe source file and the word2vec-format copy that gets written from it.
in_file, out_file = (
    '../data/glove/glove.6B.200d.txt',
    '../data/glove.200d.word2vec.txt',
)

# Write the converted file to `out_file`, then load it as text-format
# (binary=False) keyed vectors.
glove2word2vec(in_file, out_file)
w2v = KeyedVectors.load_word2vec_format(out_file, binary=False)


In [57]:
# Build the initial weights for the Embedding layer: row i holds the vector of
# the word whose Tokenizer index is i.
vocab = tokenizer.word_index.keys()
# Add one because index 0 is reserved and isn't assigned to any word
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer
embedding_matrix = np.zeros((len(vocab) + 1, embedding_dim))

# Dedicated, seeded generator: the random rows (and therefore the initial
# weights) are the same on every run instead of depending on the global
# NumPy random state.
oov_rng = np.random.RandomState(42)

# Row 0 (the reserved index) gets a random vector drawn uniformly from [0, 1).
embedding_matrix[0] = oov_rng.random_sample(embedding_dim)

# Use the index stored in word_index for each word rather than a running
# counter, so a row can never drift away from the id the tokenizer emits.
for word, index in tokenizer.word_index.items():
    try:
        embedding_matrix[index] = w2v[word]
    except KeyError:
        # Word is missing from the pretrained vectors: fall back to a random
        # vector drawn uniformly from [0, 1).
        embedding_matrix[index] = oov_rng.random_sample(embedding_dim)

In [68]:
# Hold out 15% of the training rows as a validation set, with a fixed seed.
# NOTE: train_pad and train_labels are overwritten with the reduced training
# portion, so running this cell a second time splits the already-reduced data
# again. Re-run the cells that create train_pad and train_labels first.
train_pad, val_pad, train_labels, val_labels = train_test_split(
    train_pad,
    train_labels,
    test_size=0.15,
    random_state=42,
)


In [77]:
# Start from a clean Keras session before building the model.
keras.backend.clear_session()

# 1D-CNN classifier over padded word-index sequences:
# embedding -> 3 x (Conv1D + MaxPooling1D) -> Flatten -> Dense -> Dropout -> softmax.
model = Sequential()

# Embedding weights start from `embedding_matrix` and are updated during
# training (trainable=True). One row per tokenizer index plus the reserved 0.
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                    output_dim=embedding_dim,
                    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                    input_length=longest_sequence,
                    trainable=True,
                    name='embedding_layer',
                    ))

# Three convolution/pooling stages; the filter count halves at each stage
# (256 -> 128 -> 64) and each pooling layer uses pool_size=2.
model.add(Conv1D(filters=256, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Classification head. The hidden layer reuses `embedding_dim` as its width.
model.add(Flatten())
model.add(Dense(embedding_dim, activation='relu'))
model.add(Dropout(0.2))
# 4 output units with softmax, trained against the one-hot labels.
model.add(Dense(4, activation='softmax'))

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 200, 200)          5575200   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 198, 256)          153856    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 99, 256)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 97, 128)           98432     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 48, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 46, 64)            24640     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 23, 64)           

In [78]:
# Train for 10 epochs in batches of 200, evaluating on the validation split
# after each epoch. The value returned by model.fit is kept in `fit`.
fit = model.fit(
    train_pad,
    train_labels,
    validation_data=(val_pad, val_labels),
    batch_size=200,
    epochs=10,
)

Train on 42476 samples, validate on 7496 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [79]:
 _, accuracy = model.evaluate(test_pad, test_labels, batch_size=200)
print("Test Set Accuracy = {:.4f}".format(accuracy))

Test Set Accuracy = 0.6882
