In [1]:
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation, Flatten, GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D, MaxPooling1D

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [43]:
training_labels = to_categorical(np.array(train['Stance']))
testing_labels = to_categorical(np.array(test['Stance']))

In [38]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['combinedText'])
train_sequence = tokenizer.texts_to_sequences(train['combinedText'])
test_sequence = tokenizer.texts_to_sequences(test['combinedText'])

In [16]:
# longest_sequences = [len(x) for x in (train_sequence)]
# longest_sequence = max(longest_sequences)
longest_sequence = 200

4900

In [8]:
train_pad = pad_sequences(train_sequence, maxlen=longest_sequence, padding='post', truncating='post')
test_pad = pad_sequences(test_sequence, maxlen=longest_sequence, padding='post', truncating='post')

In [7]:
embedding_dim = 200
in_file = '../data/glove/glove.6B.200d.txt'
out_file = '../data/glove.200d.word2vec.txt'

glove2word2vec(in_file, out_file)
w2v = KeyedVectors.load_word2vec_format(out_file, binary=False)


In [8]:
vocab = tokenizer.word_index.keys()
# Add one because index 0 is reserved and isn't assigned to any word
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer
embedding_matrix = np.zeros((len(vocab)+1, embedding_dim))

embedding_matrix[0] = np.random.random((1, embedding_dim))
for i, word in enumerate(vocab, 1):
    try:
        embedding_matrix[i] = w2v[word]
    except KeyError as e:
        embedding_matrix[i] = np.random.random((1, embedding_dim))

In [11]:
# train_pad, val_pad, train_labels, val_labels = train_test_split(train_pad, train_labels, random_state = 42, test_size = 0.15)


In [None]:
# Hyperparameters to tune: 
# height/width: 3x3, 5x5, 7x7
# MaxPooling vs Max-Over-Time Pooling
# Different drop-out rates
# Seqeuence length
kernel_sizes = [3, 5, 7]
# pooling_method = [MaxPooling1D(pool_size=2)]
dropouts = [0.2, 0.3, 0.4]
sequences = [100, 200, 300, 500] 

best_model = Sequential()
best_accuracy = 0
for sequence_length in sequences:
    train_pad = pad_sequences(train_sequence, maxlen=sequence_length, padding='post', truncating='post')
    test_pad = pad_sequences(test_sequence, maxlen=sequence_length, padding='post', truncating='post')
    train_pad, val_pad, train_labels, val_labels = train_test_split(train_pad, training_labels, random_state = 42, test_size = 0.15)
    for kernel in kernel_sizes:
        for dropout in dropouts:
            keras.backend.clear_session()

            model = Sequential()
            model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                                output_dim=embedding_dim,
                                embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                                input_length=sequence_length,
                                trainable=True,
                                name='embedding_layer',
                                ))
            model.add(Conv1D(filters=256, kernel_size=kernel, activation='relu'))
            model.add(MaxPooling1D(pool_size=2))
            model.add(Conv1D(filters=128, kernel_size=kernel, activation='relu'))
            model.add(MaxPooling1D(pool_size=2))
            model.add(Conv1D(filters=64, kernel_size=kernel, activation='relu'))
            model.add(GlobalMaxPooling1D())
#             model.add(Flatten())
            model.add(Dense(embedding_dim, activation='relu'))
            model.add(Dropout(dropout))
            model.add(Dense(4, activation='softmax'))
#             print(model.summary())
            model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
            model.fit(train_pad, train_labels, epochs=10, batch_size=200, validation_data=(val_pad, val_labels))

            _, accuracy = model.evaluate(test_pad, testing_labels, batch_size=200)
            print("Test Set Accuracy = {:.4f}".format(accuracy))

            if (accuracy > best_accuracy):
                best_accuracy = accuracy
                best_model = model
                best_model_string = f"Best Model has seq_length={sequence_length}, kernel={kernel} and dropout={dropout}"
                print(best_model_string) 

In [3]:
word_count = lambda x:len(x.split())
train['text_wc'] = train['combinedText'].apply(word_count)

p = 75.0

print(' Summary :{} % of the summaries have a length less than or equal to {}'.format(p, np.percentile(train['text_wc'], p)))
MAX_LEN = int(np.percentile(train['text_wc'], p))

 Summary :75.0 % of the summaries have a length less than or equal to 478.0


In [4]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


oversample = RandomOverSampler(sampling_strategy={0:round(36545*0.25), 1: round(36545*0.5), 2: round(36545*0.5)})
undersample = RandomUnderSampler(sampling_strategy={3: round(36545*0.5)})

train_over, training_over_labels = oversample.fit_resample(train, train['Stance'])

print(Counter(training_over_labels))

train_combined_sampling, training_combined_sampling_labels = undersample.fit_resample(train_over, training_over_labels)

print(f"Combined:{Counter(training_combined_sampling_labels)}")

training_combined_labels = to_categorical(np.array(training_combined_sampling_labels))
testing_labels = to_categorical(np.array(test['Stance']))

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['combinedText'])
test_sequence = tokenizer.texts_to_sequences(test['combinedText'])
train_combined_sequence = tokenizer.texts_to_sequences(train_combined_sampling['combinedText'])


Counter({3: 36545, 1: 18272, 2: 18272, 0: 9136})
Combined:Counter({1: 18272, 2: 18272, 3: 18272, 0: 9136})


In [5]:
train_combined_pad = pad_sequences(train_combined_sequence, maxlen=MAX_LEN, padding='post', truncating='post')
test_pad = pad_sequences(test_sequence, maxlen=MAX_LEN, padding='post', truncating='post')
train_combined_pad, val_combined_pad, train_combined_labels, val_combined_labels = train_test_split(train_combined_pad, training_combined_labels, random_state = 42, test_size = 0.15)


In [9]:
kernel_sizes = [3, 5, 7]
dropouts = [0.2, 0.3, 0.4]

best_model = Sequential()
best_accuracy = 0
for kernel in kernel_sizes:
    for dropout in dropouts:
        keras.backend.clear_session()
        
        print(f"Current Model is: kernel={kernel}, dropout={dropout}")

        model = Sequential()
        model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                            output_dim=embedding_dim,
                            embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                            input_length=MAX_LEN,
                            trainable=True,
                            name='embedding_layer',
                            ))
        model.add(Conv1D(filters=256, kernel_size=kernel, activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Conv1D(filters=128, kernel_size=kernel, activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Conv1D(filters=64, kernel_size=kernel, activation='relu'))
        model.add(GlobalMaxPooling1D())
        #             model.add(Flatten())
        model.add(Dense(embedding_dim, activation='relu'))
        model.add(Dropout(dropout))
        model.add(Dense(4, activation='softmax'))
        #             print(model.summary())
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(train_combined_pad, train_combined_labels, epochs=10, batch_size=200, validation_data=(val_combined_pad, val_combined_labels))

        _, accuracy = model.evaluate(test_pad, testing_labels, batch_size=200)
        print("Test Set Accuracy = {:.4f}".format(accuracy))

        if (accuracy > best_accuracy):
            best_accuracy = accuracy
            best_model = model
            best_model_string = f"Best Model has kernel={kernel} and dropout={dropout}"
            print(best_model_string) 

Current Model is: kernel=3, dropout=0.2


Train on 54359 samples, validate on 9593 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Set Accuracy = 0.5930
Best Model has kernel=3 and dropout=0.2
Current Model is: kernel=3, dropout=0.3
Train on 54359 samples, validate on 9593 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Set Accuracy = 0.6555
Best Model has kernel=3 and dropout=0.3
Current Model is: kernel=3, dropout=0.4
Train on 54359 samples, validate on 9593 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Set Accuracy = 0.6959
Best Model has kernel=3 and dropout=0.4
Current Model is: kernel=5, dropout=0.2
Train on 54359 samples, validate on 9593 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/

In [11]:
best_model_string

'Best Model has kernel=3 and dropout=0.4'

In [None]:
# BEST MODEL

keras.backend.clear_session()
sequence_length = 200;
kernel = 3
dropout = 0.2;
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                    output_dim=embedding_dim,
                    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                    input_length=sequence_length,
                    trainable=True,
                    name='embedding_layer',
                    ))
model.add(Conv1D(filters=256, kernel_size=kernel, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=kernel, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=kernel, activation='relu'))
model.add(GlobalMaxPooling1D())
#             model.add(Flatten())
model.add(Dense(embedding_dim, activation='relu'))
model.add(Dropout(dropout))
model.add(Dense(4, activation='softmax'))
#             print(model.summary())
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_pad, train_labels, epochs=10, batch_size=200, validation_data=(val_pad, val_labels))

_, accuracy = model.evaluate(test_pad, testing_labels, batch_size=200)
print("Test Set Accuracy = {:.4f}".format(accuracy))

In [None]:
y_pred = model.predict(test_pad)
y_classes = y_pred.argmax(axis=-1)

for i in range(len(y_classes)):
    if y_classes[i] == 0: y_classes[i] = "disagree"
    
    if y_classes[i] == 1: y_classes[i] = "agree"
    
    if y_classes[i] == 2: y_classes[i] = "discuss"
    
    if y_classes[i] == 3: y_classes[i] = "unrelated"
    

In [None]:
test_df = pd.read_csv('../fnc-1-baseline-master/fnc-1/competition_test_stances_unlabeled.csv')
test_df['Stance'] = y_classes
test_df.to_csv('answer.csv', index=False, encoding='utf-8')