In [19]:
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation, Flatten, GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D, MaxPooling1D

In [36]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [43]:
training_labels = to_categorical(np.array(train['Stance']))
testing_labels = to_categorical(np.array(test['Stance']))

In [38]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['combinedText'])
train_sequence = tokenizer.texts_to_sequences(train['combinedText'])
test_sequence = tokenizer.texts_to_sequences(test['combinedText'])

In [16]:
# longest_sequences = [len(x) for x in (train_sequence)]
# longest_sequence = max(longest_sequences)
# longest_sequence = 200

4900

In [8]:
# train_pad = pad_sequences(train_sequence, maxlen=longest_sequence, padding='post', truncating='post')
# test_pad = pad_sequences(test_sequence, maxlen=longest_sequence, padding='post', truncating='post')

In [39]:
embedding_dim = 200
in_file = '../data/glove/glove.6B.200d.txt'
out_file = '../data/glove.200d.word2vec.txt'

glove2word2vec(in_file, out_file)
w2v = KeyedVectors.load_word2vec_format(out_file, binary=False)


In [40]:
vocab = tokenizer.word_index.keys()
# Add one because index 0 is reserved and isn't assigned to any word
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer
embedding_matrix = np.zeros((len(vocab)+1, embedding_dim))

embedding_matrix[0] = np.random.random((1, embedding_dim))
for i, word in enumerate(vocab, 1):
    try:
        embedding_matrix[i] = w2v[word]
    except KeyError as e:
        embedding_matrix[i] = np.random.random((1, embedding_dim))

In [11]:
# train_pad, val_pad, train_labels, val_labels = train_test_split(train_pad, train_labels, random_state = 42, test_size = 0.15)


In [None]:
# Hyperparameters to tune: 
# height/width: 3x3, 5x5, 7x7
# MaxPooling vs Max-Over-Time Pooling
# Different drop-out rates
# Seqeuence length
kernel_sizes = [3, 5, 7]
# pooling_method = [MaxPooling1D(pool_size=2)]
dropouts = [0.2, 0.3, 0.4]
sequences = [100, 200, 300, 500] 

best_model = Sequential()
best_accuracy = 0
for sequence_length in sequences:
    train_pad = pad_sequences(train_sequence, maxlen=sequence_length, padding='post', truncating='post')
    test_pad = pad_sequences(test_sequence, maxlen=sequence_length, padding='post', truncating='post')
    train_pad, val_pad, train_labels, val_labels = train_test_split(train_pad, training_labels, random_state = 42, test_size = 0.15)
    for kernel in kernel_sizes:
        for dropout in dropouts:
            keras.backend.clear_session()

            model = Sequential()
            model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                                output_dim=embedding_dim,
                                embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                                input_length=sequence_length,
                                trainable=True,
                                name='embedding_layer',
                                ))
            model.add(Conv1D(filters=256, kernel_size=kernel, activation='relu'))
            model.add(MaxPooling1D(pool_size=2))
            model.add(Conv1D(filters=128, kernel_size=kernel, activation='relu'))
            model.add(MaxPooling1D(pool_size=2))
            model.add(Conv1D(filters=64, kernel_size=kernel, activation='relu'))
            model.add(GlobalMaxPooling1D())
#             model.add(Flatten())
            model.add(Dense(embedding_dim, activation='relu'))
            model.add(Dropout(dropout))
            model.add(Dense(4, activation='softmax'))
#             print(model.summary())
            model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
            model.fit(train_pad, train_labels, epochs=10, batch_size=200, validation_data=(val_pad, val_labels))

            _, accuracy = model.evaluate(test_pad, testing_labels, batch_size=200)
            print("Test Set Accuracy = {:.4f}".format(accuracy))

            if (accuracy > best_accuracy):
                best_accuracy = accuracy
                best_model = model
                best_model_string = f"Best Model has seq_length={sequence_length}, kernel={kernel} and dropout={dropout}"
                print(best_model_string) 

Train on 42476 samples, validate on 7496 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Set Accuracy = 0.7214
Best Model has seq_length=100, kernel=3 and dropout=0.2
Train on 42476 samples, validate on 7496 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Set Accuracy = 0.6854
Train on 42476 samples, validate on 7496 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Set Accuracy = 0.6481
Train on 42476 samples, validate on 7496 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Set Accuracy = 0.5996
Train on 42476 samples, validate on 7496 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Set Accuracy = 0.6571
Train on 

In [None]:
best_model_string