In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd drive/My\ Drive/NUS-Fake-News-Detection

In [None]:
import os
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Embedding, Bidirectional, CuDNNGRU, CuDNNLSTM, Activation,\
                        Dense, Input, concatenate, Dropout, GlobalMaxPool1D
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, TensorBoard


# Seed NumPy's global random generator (only NumPy is seeded here).
SEED = 4123
np.random.seed(SEED)

# Loading the dataset

In [None]:
# Show the contents of the `dataset` directory (shell command run through IPython's `!`).
!ls dataset

In [None]:
# Read the three tab-separated splits into DataFrames.
# NOTE(review): read_csv uses the first row of each file as the header by default;
# confirm the TSV files really start with a header row, otherwise the first
# sample of every split is silently consumed as column names.
split_paths = ('dataset/train2.tsv', 'dataset/val2.tsv', 'dataset/test2.tsv')
train_df, val_df, test_df = [pd.read_csv(path, sep='\t') for path in split_paths]

# Models

## Creating embedding layer

In [None]:
def readGloveFile(gloveFile):
    """Parse a GloVe text file into lookup tables.

    Every non-empty line is expected to hold a token followed by the
    whitespace-separated components of its embedding vector.

    Args:
        gloveFile: path of the GloVe text file to read.

    Returns:
        A tuple ``(wordToIndex, indexToWord, wordToGlove)``:
        ``wordToIndex`` maps each token to a 1-based index (tokens are numbered
        in sorted order, so index 0 is never assigned), ``indexToWord`` is the
        inverse mapping, and ``wordToGlove`` maps each token to its embedding
        as a float64 NumPy array.
    """
    wordToGlove = {}  # map from a token (word) to a GloVe embedding vector

    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default encoding.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.split()
            if not record:
                # Blank line (e.g. a trailing newline at the end of the file):
                # there is no token to read, so skip it instead of failing.
                continue
            wordToGlove[record[0]] = np.array(record[1:], dtype=np.float64)

    # Indices start at 1; index 0 is left unused by this mapping.
    wordToIndex = {tok: idx for idx, tok in enumerate(sorted(wordToGlove), start=1)}
    indexToWord = {idx: tok for tok, idx in wordToIndex.items()}  # inverse of wordToIndex

    return wordToIndex, indexToWord, wordToGlove

# Build a Keras Embedding layer whose weights come from the GloVe vectors
def createPretrainedEmbeddingLayer(wordToGlove, wordToIndex, isTrainable, inputLayer=None):
    """Create an ``Embedding`` layer initialised with pretrained vectors.

    Row ``i`` of the weight matrix is the vector of the word whose index in
    ``wordToIndex`` is ``i``; rows that no word maps to stay all zeros.

    Args:
        wordToGlove: dict mapping a word to its embedding vector (NumPy array).
        wordToIndex: dict mapping a word to its row index in the weight matrix.
        isTrainable: whether the embedding weights may be updated in training.
        inputLayer: optional tensor; when given, the layer is applied to it.

    Returns:
        The ``Embedding`` layer itself when ``inputLayer`` is None, otherwise
        the result of calling the layer on ``inputLayer``.
    """
    numRows = len(wordToIndex) + 1  # one extra row for index 0
    vectorSize = next(iter(wordToGlove.values())).shape[0]

    weightMatrix = np.zeros((numRows, vectorSize))
    for token, row in wordToIndex.items():
        weightMatrix[row, :] = wordToGlove[token]

    layer = Embedding(numRows, vectorSize, weights=[weightMatrix], trainable=isTrainable)
    return layer if inputLayer is None else layer(inputLayer)

# Load the GloVe lookup tables once; they are module-level state for the cells below.
GLOVE_PATH = 'glove.6B.100d.txt'
wordToIndex, indexToWord, wordToGlove = readGloveFile(GLOVE_PATH)

# Non-trainable embedding layer; the same instance is added to s_model and sj_model.
pretrainedEmbeddingLayer = createPretrainedEmbeddingLayer(
    wordToGlove, wordToIndex, isTrainable=False
)

## Loading relevant data corresponding to the condition

In [None]:
# Convert the textual truthfulness labels into integer class ids
def process_labels(labels, classifier):
    """Replace label strings with integer codes.

    Args:
        labels: pandas object supporting ``.replace`` (e.g. a Series of labels).
        classifier: ``'binary'`` selects the two-class coding; any other value
            selects the six-class coding.

    Returns:
        The result of ``labels.replace(...)`` with the chosen coding; values
        that are not keys of the coding are left unchanged.
    """
    binary_codes = {'half-true': 1, 'mostly-true': 1, 'true': 1,
                    'barely-true': 0, 'pants-fire': 0, 'false': 0}
    sixway_codes = {'pants-fire': 0, 'false': 1, 'barely-true': 2,
                    'half-true': 3, 'mostly-true': 4, 'true': 5}

    codes = binary_codes if classifier == 'binary' else sixway_codes
    return labels.replace(codes)


def s_condition(df, classifier='sixway'):
    """Build inputs and labels for the statement-only condition.

    Labels are read from column 2 and statements from column 3 of ``df``.
    Words are converted to indices with the module-level ``wordToIndex``, i.e.
    the same vocabulary the pretrained embedding matrix is built from. Fitting
    a new ``Tokenizer`` per call would assign frequency-ranked indices that
    neither match the rows of the embedding matrix nor agree between the
    train/validation/test splits.

    Args:
        df: DataFrame holding one sample per row.
        classifier: ``'binary'`` or ``'sixway'``; forwarded to ``process_labels``.

    Returns:
        ``(data, labels)`` where ``data`` is the array produced by
        ``pad_sequences`` (padded to the longest statement in ``df``) and
        ``labels`` are the integer-coded labels.
    """
    labels = process_labels(df.iloc[:, 2], classifier)

    statements = df.iloc[:, 3].tolist()
    # Words without a pretrained vector are dropped from the sequence.
    sequences = [
        [wordToIndex[word] for word in text_to_word_sequence(text) if word in wordToIndex]
        for text in statements
    ]
    data = pad_sequences(sequences)
    return data, labels


def sj_condition(df, classifier='sixway'):
    """Build inputs and labels for the statement + justification condition.

    Labels are read from column 2, statements from column 3 and justifications
    from column 15 of ``df``; statement and justification are joined into one
    text per sample. Words are converted to indices with the module-level
    ``wordToIndex`` so that they address the matching rows of the pretrained
    embedding matrix and are consistent across splits (a ``Tokenizer`` fitted
    per call guarantees neither).

    Args:
        df: DataFrame holding one sample per row.
        classifier: ``'binary'`` or ``'sixway'``; forwarded to ``process_labels``.

    Returns:
        ``(data, labels)`` where ``data`` is the array produced by
        ``pad_sequences`` (padded to the longest text in ``df``) and ``labels``
        are the integer-coded labels.
    """
    labels = process_labels(df.iloc[:, 2], classifier)

    # map(str) also turns missing values into the literal text 'nan'.
    statement = df.iloc[:, 3].map(str)
    justification = df.iloc[:, 15].map(str)
    # Join with a space so the last word of the statement and the first word
    # of the justification are not fused into a single token.
    texts = (statement + ' ' + justification).tolist()

    # Words without a pretrained vector are dropped from the sequence.
    sequences = [
        [wordToIndex[word] for word in text_to_word_sequence(text) if word in wordToIndex]
        for text in texts
    ]
    data = pad_sequences(sequences)
    return data, labels


def sjplus_condition(df, classifier='sixway', maxlen=800):
    """Build the two-input data and labels for the statement/justification model.

    Labels are read from column 2, statements from column 3 and justifications
    from column 15 of ``df``. Both texts are encoded separately with the
    module-level ``wordToIndex`` so that the indices address the matching rows
    of the pretrained embedding matrix and are consistent across splits (a
    ``Tokenizer`` fitted per call guarantees neither).

    Args:
        df: DataFrame holding one sample per row.
        classifier: ``'binary'`` or ``'sixway'``; forwarded to ``process_labels``.
        maxlen: length every sequence is padded/truncated to by
            ``pad_sequences``. Defaults to 800.

    Returns:
        ``(data, labels)`` where ``data`` is the list
        ``[statement_array, justification_array]`` and ``labels`` are the
        integer-coded labels.
    """
    labels = process_labels(df.iloc[:, 2], classifier)

    # map(str) also turns missing values into the literal text 'nan'.
    statement_texts = df.iloc[:, 3].map(str).tolist()
    justification_texts = df.iloc[:, 15].map(str).tolist()

    def encode(texts):
        # Words without a pretrained vector are dropped from the sequence.
        sequences = [
            [wordToIndex[word] for word in text_to_word_sequence(text) if word in wordToIndex]
            for text in texts
        ]
        return pad_sequences(sequences, maxlen=maxlen)

    data = [encode(statement_texts), encode(justification_texts)]
    return data, labels


## Creating models for each condition

In [None]:
def s_model(classifier='sixway'):
    """Build the model for the statement-only condition.

    The model stacks the shared ``pretrainedEmbeddingLayer``, a bidirectional
    ``CuDNNLSTM`` with 32 units and a dense output layer, and prints its
    summary before returning.

    Args:
        classifier: ``'binary'`` gives a single sigmoid output unit; any other
            value gives a 6-unit softmax output.

    Returns:
        The (uncompiled) ``Sequential`` model.
    """
    model = Sequential()
    model.add(pretrainedEmbeddingLayer)
    # The layer class is CuDNNLSTM; the misspelling `CuDNNSTM` is an undefined
    # name and raised NameError as soon as this function was called.
    model.add(Bidirectional(CuDNNLSTM(32)))
    if classifier == 'binary':
        model.add(Dense(1, activation='sigmoid'))
    else:
        model.add(Dense(6, activation='softmax'))
    model.summary()
    return model


def sj_model(classifier='sixway'):
    """Build the model for the statement + justification condition.

    Layers, in order: the shared ``pretrainedEmbeddingLayer``, a bidirectional
    ``CuDNNLSTM`` with 32 units, and a dense output layer. The model summary is
    printed before the model is returned.

    Args:
        classifier: ``'binary'`` gives a single sigmoid output unit; any other
            value gives a 6-unit softmax output.

    Returns:
        The (uncompiled) ``Sequential`` model.
    """
    encoder = Bidirectional(CuDNNLSTM(32))
    units, activation = (1, 'sigmoid') if classifier == 'binary' else (6, 'softmax')
    head = Dense(units, activation=activation)

    model = Sequential()
    for layer in (pretrainedEmbeddingLayer, encoder, head):
        model.add(layer)
    model.summary()
    return model


def sjplus_model(input_dims, classifier='sixway'):
    """Build the two-input model: one recurrent branch per input, then merge.

    Each branch embeds its input with a frozen embedding layer created from
    the module-level ``wordToGlove``/``wordToIndex``, runs it through a
    bidirectional ``CuDNNGRU`` and a bidirectional ``CuDNNLSTM`` (both
    returning sequences) with dropout in between, max-pools over the sequence
    axis and ends in a 50-unit ReLU dense layer with dropout. The two branch
    outputs are concatenated and fed to the output layer. The model summary is
    printed before the model is returned.

    Args:
        input_dims: two-element sequence with the input length of the first
            and second branch.
        classifier: ``'binary'`` gives a single sigmoid output unit; any other
            value gives a 6-unit softmax output.

    Returns:
        The (uncompiled) two-input ``Model``.
    """
    def build_branch(seq_len):
        # One input -> embedding -> GRU -> LSTM -> pooling -> dense sub-model.
        branch_input = Input(shape=(seq_len,))
        hidden = createPretrainedEmbeddingLayer(wordToGlove, wordToIndex, False, branch_input)
        hidden = Dropout(0.1)(hidden)
        hidden = Bidirectional(CuDNNGRU(32, return_sequences=True))(hidden)
        hidden = Dropout(0.1)(hidden)
        hidden = Bidirectional(CuDNNLSTM(32, return_sequences=True))(hidden)
        hidden = Dropout(0.1)(hidden)
        hidden = GlobalMaxPool1D()(hidden)
        hidden = Dense(50, activation="relu")(hidden)
        hidden = Dropout(0.4)(hidden)
        return Model(inputs=branch_input, outputs=hidden)

    first_branch = build_branch(input_dims[0])
    second_branch = build_branch(input_dims[1])

    merged = concatenate([first_branch.output, second_branch.output])
    units, activation = (1, 'sigmoid') if classifier == 'binary' else (6, 'softmax')
    out = Dense(units, activation=activation)(merged)

    model = Model(inputs=[first_branch.input, second_branch.input], outputs=out)
    model.summary()
    return model


# Training the model

In [None]:
# File the best weights are written to by the checkpoint callback below.
ckpt_path = 'fake_news_binary.hdf5'

# All three callbacks monitor validation accuracy ('val_acc').
earlystop = EarlyStopping(
    monitor='val_acc',
    patience=5,
    verbose=1,
    restore_best_weights=True,
)
reducelr = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=3,
    verbose=1,
    min_lr=1.e-6,
)
modelckpt_cb = ModelCheckpoint(
    ckpt_path,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)

callbacks = [earlystop, reducelr, modelckpt_cb]

In [None]:
classifier = 'binary'
batch_size = 16
np.random.seed(4123)

# Encode the training and validation splits for the two-input model.
x_train, y_train = sjplus_condition(train_df, classifier)
x_val, y_val = sjplus_condition(val_df, classifier)

# Input lengths of the two branches, taken from the first training row of each input.
input_dims = [len(x_train[0][0]), len(x_train[1][0])]
binary_model = sjplus_model(input_dims, classifier)

# Short first run: 2 epochs at a learning rate of 1e-5, without callbacks.
binary_model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=1.e-5),
    metrics=['acc'],
)
binary_history = binary_model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=batch_size,
    epochs=2,
)

In [None]:
# Compile the same model again with a new Adam optimizer at a learning rate of 3e-4,
# then train for up to 20 epochs with the early-stopping, LR-reduction and
# checkpoint callbacks. `binary_history` from the previous run is overwritten.
binary_model.compile(loss='binary_crossentropy', optimizer=Adam(lr=3.e-4), metrics=['acc'])
binary_history = binary_model.fit(x_train, y_train, validation_data=(x_val, y_val), 
                                  batch_size=batch_size, callbacks=callbacks, epochs=20)

# Model evaluation

In [None]:
# Encode the test split with the same function used for the training/validation splits.
x_test, y_test = sjplus_condition(test_df, classifier)

# NOTE(review): the weights are loaded from 'fake_news_binary_58379.hdf5', which is not
# the `ckpt_path` file ('fake_news_binary.hdf5') written by ModelCheckpoint — confirm
# this file exists and is the checkpoint that should be evaluated.
binary_model.load_weights('fake_news_binary_58379.hdf5')
# The model is compiled with a single metric ('acc'), so evaluate yields two values;
# the first one (`score`) is the loss.
score, acc = binary_model.evaluate(x_test, y_test)
print('Test accuracy: ', acc)