In [13]:
import numpy as np
import copy
from tqdm import tqdm
import pandas as pd
import re
import gensim
import keras
from keras import optimizers
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.layers import *
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from score import report_score
from sklearn.metrics import accuracy_score
from tensorflow.keras import layers
from tensorflow.keras import regularizers

In [28]:
datadir="fnc-1"
w2v_path = './w2v/GoogleNews-vectors-negative300.bin'
save_path = "./saved/"
batch_size = 128
max_sent_length = 350
random_state = 37
epoch = 20

In [15]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


### Pre-process Dataset

In [16]:
raw_train_bodies = pd.read_csv(datadir + '/train_bodies.csv')   
raw_train_stances = pd.read_csv(datadir + '/train_stances.csv')
raw_test_bodies = pd.read_csv(datadir + '/competition_test_bodies.csv') 
raw_test_stances = pd.read_csv(datadir + '/competition_test_stances.csv') 

In [17]:
stance_to_int = {"agree":0, "discuss": 1, "disagree": 2, "unrelated": 3}
int_to_stance = {0:"agree", 1:"discuss", 2:"disagree", 3: "unrelated"}

In [18]:
actual_test_stances = raw_test_stances['Stance']
raw_train_stances['Stance'] = raw_train_stances['Stance'].apply(lambda x: stance_to_int[x])
raw_test_stances['Stance'] = raw_test_stances['Stance'].apply(lambda x: stance_to_int[x])

In [19]:
train_df = raw_train_stances.join(raw_train_bodies.set_index('Body ID'), on='Body ID')
test_df = raw_test_stances.join(raw_test_bodies.set_index('Body ID'), on='Body ID')

In [20]:
def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()

# Pre-processing words
clean_train_headline = [text_to_word_sequence(clean(head)) for head in train_df['Headline']]
clean_train_bodies = [text_to_word_sequence(clean(body)) for body in train_df['articleBody']]
clean_test_headline = [text_to_word_sequence(clean(head)) for head in test_df['Headline']]
clean_test_bodies = [text_to_word_sequence(clean(body)) for body in test_df['articleBody']]

In [21]:
wordlist = []
for i in range(len(clean_train_headline)):
    wordlist.append(clean_train_headline[i])
for i in range(len(clean_train_bodies)):
    wordlist.append(clean_train_bodies[i])
for i in range(len(clean_test_headline)):
    wordlist.append(clean_test_headline[i])
for i in range(len(clean_test_bodies)):
    wordlist.append(clean_test_bodies[i])

In [22]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(wordlist)
len(tokenizer.word_index)

29451

In [23]:
train_lines = [] 
for i in range(len(clean_train_headline)):
    headline =  clean_train_headline[i]
    body = clean_train_bodies[i]
    newline = headline+body
    train_lines.append(newline)

test_lines = [] 
for i in range(len(clean_test_headline)):
    headline =  clean_test_headline[i]
    body = clean_train_bodies[i]
    newline = headline+body
    test_lines.append(newline)

In [34]:
X_train = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_length]) for seq in train_lines])
raw_X_train = pad_sequences(X_train, maxlen=max_sent_length, padding='post', truncating='post')
raw_y_train = train_df['Stance']

In [35]:
X_test = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_length]) for seq in test_lines])
X_test = pad_sequences(X_test, maxlen=max_sent_length, padding='post', truncating='post')
y_test = test_df['Stance']

In [36]:
# Convert y to onehot
y_train_onehot = np_utils.to_categorical(raw_y_train)
y_test_onehot = np_utils.to_categorical(y_test)

### Basic Model

In [29]:
embedding_dim = 300
embeddings = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(tokenizer.word_index)+1, embedding_dim))

for word, i in tokenizer.word_index.items():
    try:
        embeddings_vector = embeddings[word]
        embeddings_matrix[i] = embeddings_vector
    except KeyError:
        pass
        
del embeddings


In [30]:
def cnn_model(n_classes):
    kernel_sizes = [3, 4, 5]
    num_filters=[80, 80, 80]  
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                            output_dim=embedding_dim,
                            weights = [embeddings_matrix],
                            trainable=False, name='embedding_layer',
                            mask_zero=True))
    model.add(keras.layers.Conv1D(num_filters[0], kernel_sizes[0], padding='valid', activation='relu', kernel_regularizer=regularizers.L2(0.001)))
    model.add(Activation(activation='relu', name='activation_1'))
    model.add(keras.layers.MaxPooling1D(3))
    model.add(keras.layers.Conv1D(num_filters[1], kernel_sizes[1], padding='valid', activation='relu', kernel_regularizer=regularizers.L2(0.001)))
    model.add(Activation(activation='relu', name='activation_2'))
    model.add(keras.layers.MaxPooling1D(3))
    model.add(keras.layers.Conv1D(num_filters[2], kernel_sizes[2], padding='valid', activation='relu', kernel_regularizer=regularizers.L2(0.001)))
    model.add(Activation(activation='relu', name='activation_3'))
    model.add(GlobalMaxPooling1D())
    model.add(keras.layers.Dense(n_classes, activation='softmax', name='output_layer'))
    
    return model

### Basic model trained over to Stance Dataset

In [31]:
basic_model = cnn_model(n_classes=4)
basic_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [32]:
print(basic_model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, None, 300)         8835600   
_________________________________________________________________
conv1d (Conv1D)              (None, None, 80)          72080     
_________________________________________________________________
activation_1 (Activation)    (None, None, 80)          0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 80)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 80)          25680     
_________________________________________________________________
activation_2 (Activation)    (None, None, 80)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 80)          0

In [37]:
history = basic_model.fit(raw_X_train, y_train_onehot,
          batch_size=batch_size,
          epochs=epoch,
        )  

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [38]:
preds = basic_model.predict(X_test)
outputs = [int_to_stance[np.argmax(p, axis = -1)] for p in preds]
report_score(actual_test_stances,outputs)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    98     |    19     |    116    |   1670    |
-------------------------------------------------------------
| disagree  |    37     |     5     |    53     |    602    |
-------------------------------------------------------------
|  discuss  |    171    |    22     |    375    |   3896    |
-------------------------------------------------------------
| unrelated |    823    |    107    |   1337    |   16082   |
-------------------------------------------------------------
Score: 4603.0 out of 11651.25	(39.50649071987984%)


39.50649071987984