# Importing libraries

In [2]:
import re
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer

nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shengjie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shengjie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Importing data

In [3]:
d1_path = "./data/processed_tsv/K1.tsv"
d2_path = "./data/processed_tsv/K2.tsv"

d1 = pd.read_csv(d1_path, delimiter="\t")
d2 = pd.read_csv(d2_path, delimiter="\t")
d1['content'] = d1['content'].apply(lambda x: str(x))
d2['content'] = d2['content'].apply(lambda x: str(x))
d1['domain'] = 0
d2['domain'] = 1


In [4]:
print('===== d1 =====\n', d1.groupby(['label'])[['label']].count())
print('===== d2 =====\n', d2.groupby(['label'])[['label']].count())


===== d1 =====
        label
label       
0      23481
1      21417
===== d2 =====
        label
label       
0       4488
1       5752


# Text preprocessing

In [5]:
class Preprocess:

    def __init__(self):
        self.wordnet_lemmatizer = WordNetLemmatizer()
        self.porter = PorterStemmer()

    def clean_text(self, text):
        '''Clean text by removing unnecessary characters and altering the format of words.'''

        text = str(text)
        text = text.lower()
        text = re.sub(r"i'm", "i am", text)
        text = re.sub(r"he's", "he is", text)
        text = re.sub(r"she's", "she is", text)
        text = re.sub(r"it's", "it is", text)
        text = re.sub(r"that's", "that is", text)
        text = re.sub(r"what's", "that is", text)
        text = re.sub(r"where's", "where is", text)
        text = re.sub(r"how's", "how is", text)
        text = re.sub(r"\'ll", " will", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"won't", "will not", text)
        text = re.sub(r"can't", "cannot", text)
        text = re.sub(r"n't", " not", text)
        text = re.sub(r"n'", "ng", text)
        text = re.sub(r"'bout", "about", text)
        text = re.sub(r"'til", "until", text)
        text = re.sub(r"[()\"_#/@;*%:{}<>`+=~|.!?,'$-\[\]]", "", text)
        text = re.sub(r"[0-9]", "", text)

        return text

    def lemmatizer(self, text):
        sentence_words = nltk.word_tokenize(text)
        ret_text = []
        for word in sentence_words:
            ret_text.append(self.wordnet_lemmatizer.lemmatize(word))

        return " ".join(ret_text)

    def stemmer(self, text):
        sentence_words = nltk.word_tokenize(text)
        ret_text = []
        for word in sentence_words:
            ret_text.append(self.porter.stem(word))

        return " ".join(ret_text)


# Formating data

In [6]:
def encoder_data(df, lemmatize=False, stem=False):
    preprocessor = Preprocess()
    encoder_inputs = df['content'].apply(lambda x: preprocessor.clean_text(x))
    if lemmatize:
        encoder_inputs = df['content'].apply(
            lambda x: preprocessor.lemmatizer(x))
    if stem:
        encoder_inputs = df['content'].apply(lambda x: preprocessor.stemmer(x))
    encoder_inputs = np.array(encoder_inputs.values.tolist())
    return encoder_inputs


def fake_news_target(df):
    return np.array(df['label'].apply(lambda x: int(x)).values.tolist())


def domain_target(df):
    return np.array(df['domain'].apply(lambda x: int(x)).values.tolist())


class Shuffle:

    def __init__(self, data_len):
        self.idx = np.arange(data_len)
        np.random.shuffle(self.idx)

    def shuffle(self, data):
        return np.array(data)[self.idx]


In [7]:
# Creating the word2idx, idx2word mapping using the Keras Tokenizer
def create_vocab(text_lists, vocab_len):
    tokenizer = Tokenizer(oov_token="<UNK>")
    tokenizer.fit_on_texts(text_lists)
    # Due to ambiguity with regards to Keras Tokenizer num_words, below is a good enough fix,
    # though it changes the tokenizer word_index outside of the class
    num_words = vocab_len

    sorted_by_word_count = sorted(
        tokenizer.word_counts.items(), key=lambda kv: kv[1], reverse=True)
    tokenizer.word_index = {}
    word2idx = {}
    idx2word = {}
    i = 0
    for word, count in sorted_by_word_count:
        if i == num_words:
            break

        # <= because tokenizer is 1 indexed
        tokenizer.word_index[word] = i + 1
        word2idx[word] = i+1
        idx2word[i+1] = word
        i += 1

    tokenizer.word_index[tokenizer.oov_token] = num_words+1
    word2idx[tokenizer.oov_token] = num_words+1
    idx2word[num_words+1] = tokenizer.oov_token

    return word2idx, idx2word, tokenizer


def pad_tokenize_data(encoder_inputs, max_sentence_length, tokenizer):

    t_encoder_inputs = tokenizer.texts_to_sequences(encoder_inputs)
    t_encoder_inputs = pad_sequences(
        t_encoder_inputs, maxlen=max_sentence_length, padding='post', truncating='post')

    return t_encoder_inputs


In [8]:
def batch_generator(X, Y_CC, Y_DC, max_sentence_length, word2idx, batch_size=128):

    y_cc = to_categorical(Y_CC)
    y_dc = to_categorical(Y_DC)
    for idx in range(0, len(X), batch_size):
        encoder_input = np.zeros((batch_size, max_sentence_length))
        decoder_target = np.zeros(
            (batch_size, max_sentence_length, len(word2idx)+1))
        for j, input_seq in enumerate(X[idx:idx+batch_size]):
            for i, word_idx in enumerate(input_seq):
                encoder_input[j, i] = word_idx
                decoder_target[j, i, word_idx] = 1

        yield [encoder_input, [decoder_target, y_cc[idx:idx+batch_size], y_dc[idx:idx+batch_size]]]


def all_data_generator(X, Y_CC, Y_DC, max_sentence_length):
    encoder_input = np.zeros((len(X), max_sentence_length))
    for j, input_seq in enumerate(X):
        for i, word_idx in enumerate(input_seq):
            encoder_input[j, i] = word_idx

    y_cc = to_categorical(Y_CC)
    y_dc = to_categorical(Y_DC)

    return [encoder_input, y_cc, y_dc]


# Building Models

In [9]:
def classification_model(max_encoder_len, embedding_dim, latent_dim, vocab_len, include_glove=False):
    inputs = Input(shape=(max_encoder_len,), name="encoder_inputs")

    layer_embedding = Embedding(vocab_len+1, embedding_dim, trainable=True,
                                input_length=max_encoder_len, mask_zero=True, name="encoder_embedding")
    layer_lstm = LSTM(latent_dim, return_state=True)
    layer_nonlinear = Dense(128, activation="tanh", name="non_linear")
    layer_softmax = Dense(2, activation="softmax", name="softmax")

    embedded = layer_embedding(inputs)
    encoded, _, _ = layer_lstm(embedded)
    logits = layer_nonlinear(encoded)
    outputs = layer_softmax(logits)

    return Model(inputs, outputs)


## Domain Independent Model

In [10]:
def domain_independent_model(max_encoder_len, embedding_dim, latent_dim, vocab_len, include_glove=False):
    inputs = Input(shape=(max_encoder_len,), name="encoder_inputs")

    layer_embedding = Embedding(vocab_len+1, embedding_dim, trainable=True,
                                input_length=max_encoder_len, mask_zero=True, name="encoder_embedding")
    layer_lstm = LSTM(latent_dim, return_state=True)
    layer_nonlinear_cc = Dense(128, activation="tanh", name="non_linear")
    layer_softmax_cc = Dense(2, activation="softmax", name="softmax")
    layer_nonlinear_dc = Dense(128, activation="tanh", name="non_linear")
    layer_softmax_dc = Dense(2, activation="softmax", name="softmax")

    embedded = layer_embedding(inputs)
    encoded, _, _ = layer_lstm(embedded)
    logits_cc = layer_nonlinear_cc(encoded)
    outputs_cc = layer_softmax_cc(logits_cc)
    logits_dc = layer_nonlinear_dc(encoded)
    outputs_dc = layer_softmax_dc(logits_dc)

    return Model(inputs, [outputs_cc, outputs_dc])


# Experiments

## Experiment 1

In [18]:
def prepare_training_data(df, vocab_size, max_sentence_length):
    encoder_inputs = encoder_data(df)
    y_cc = fake_news_target(df)
    y_dc = domain_target(df)

    # Initializing the shuffle class instance
    shuffle = Shuffle(len(encoder_inputs))
    encoder_inputs = shuffle.shuffle(encoder_inputs)
    y_cc = shuffle.shuffle(y_cc)
    y_dc = shuffle.shuffle(y_dc)  # Not used

    word2idx, idx2word, tokenizer = create_vocab(encoder_inputs, vocab_size)
    vocab_len = len(word2idx)

    encoder_inputs = pad_tokenize_data(
        encoder_inputs, max_sentence_length, tokenizer)

    encoder_inputs, y_cc, y_dc = all_data_generator(
        encoder_inputs, y_cc, y_dc, max_sentence_length)

    train_X, test_X, train_Y, test_Y = train_test_split(
        encoder_inputs, y_cc, test_size=0.1, random_state=42)

    return [[train_X, test_X, train_Y, test_Y], [word2idx, idx2word, tokenizer], vocab_len]


def prepare_testing_data(df, max_sentence_length):
    encoder_inputs = encoder_data(df)
    y_cc = fake_news_target(df)
    y_dc = domain_target(df)

    encoder_inputs = pad_tokenize_data(
        encoder_inputs, max_sentence_length, tokenizer)

    encoder_inputs, y_cc, y_dc = all_data_generator(
        encoder_inputs, y_cc, y_dc, max_sentence_length)

    return [encoder_inputs, y_cc, y_dc]


In [19]:
max_sentence_length = 50
embedding_dim = 100
latent_dim = 64
vocab_size = 500

[train_X, test_X, train_Y, test_Y], \
    [word2idx, idx2word, tokenizer], vocab_len = prepare_training_data(
        d1, vocab_size, max_sentence_length)


In [20]:
cc_model = classification_model(
    max_sentence_length, embedding_dim, latent_dim, vocab_len, include_glove=False)
cc_model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_inputs (InputLayer)  [(None, 50)]             0         
                                                                 
 encoder_embedding (Embeddin  (None, 50, 100)          50200     
 g)                                                              
                                                                 
 lstm (LSTM)                 [(None, 64),              42240     
                              (None, 64),                        
                              (None, 64)]                        
                                                                 
 non_linear (Dense)          (None, 128)               8320      
                                                                 
 softmax (Dense)             (None, 2)                 258       
                                                             

In [21]:
cc_model.compile(optimizer="rmsprop",
                 loss='binary_crossentropy', metrics=['accuracy'])
es = EarlyStopping(monitor='val_loss', mode='min', patience=5, verbose=1)
mcp_save = ModelCheckpoint(
    '.mdl_wts.hdf5', save_best_only=True, monitor='val_loss', mode='min')
reduce_lr_loss = ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=7, verbose=1, min_delta=1e-4, mode='min')
history = cc_model.fit(train_X,
                       train_Y,
                       batch_size=256,
                       validation_split=0.1,
                       callbacks=[es, mcp_save, reduce_lr_loss],
                       epochs=30)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 00022: early stopping


In [23]:
y_pred = cc_model.predict(test_X)
y_pred = np.array([np.argmax(x) for x in y_pred])
y_true = np.array([np.argmax(x) for x in test_Y])

print(
    f"Prediction acuracy on same domain is {round(accuracy_score(y_true,y_pred),2)}")
print(f"Precision on same domain is {round(precision_score(y_true,y_pred),2)}")
print(f"Recall on same domain is {round(recall_score(y_true,y_pred),2)}")
print(f"F1 on same domain is {round(f1_score(y_true,y_pred),2)}")

cmtx = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=[1, 0]),
    index=['true:Fake', 'true:Real'],
    columns=['pred:Fake', 'pred:Real']
)
print("\n")
print(cmtx)

d2_encoder_inputs, d2_y_cc, d2_y_dc = prepare_testing_data(
    d2, max_sentence_length)

y_pred = cc_model.predict(d2_encoder_inputs)
y_pred = np.array([np.argmax(x) for x in y_pred])
y_true = np.array([np.argmax(x) for x in d2_y_cc])

print(
    f"Prediction acuracy on different domain is {round(accuracy_score(y_true,y_pred),2)}")
print(
    f"Precision on different domain is {round(precision_score(y_true,y_pred),2)}")
print(f"Recall on different domain is {round(recall_score(y_true,y_pred),2)}")
print(f"F1 on different domain is {round(f1_score(y_true,y_pred),2)}")
cmtx = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=[1, 0]),
    index=['true:Fake', 'true:Real'],
    columns=['pred:Fake', 'pred:Real']
)
print("\n")
print(cmtx)


Prediction acuracy on same domain is 0.95
Precision on same domain is 0.93
Recall on same domain is 0.96
F1 on same domain is 0.95


           pred:Fake  pred:Real
true:Fake       2104         88
true:Real        156       2142
Prediction acuracy on different domain is 0.46
Precision on different domain is 0.62
Recall on different domain is 0.1
F1 on different domain is 0.18


           pred:Fake  pred:Real
true:Fake        600       5152
true:Real        370       4118


## Experiment 2

In [11]:
def prepare_training_data(df, vocab_size, max_sentence_length):
    encoder_inputs = encoder_data(df)
    y_cc = fake_news_target(df)
    y_dc = domain_target(df)

    # Initializing the shuffle class instance
    shuffle = Shuffle(len(encoder_inputs))
    encoder_inputs = shuffle.shuffle(encoder_inputs)
    y_cc = shuffle.shuffle(y_cc)
    y_dc = shuffle.shuffle(y_dc)

    word2idx, idx2word, tokenizer = create_vocab(encoder_inputs, vocab_size)
    vocab_len = len(word2idx)

    encoder_inputs = pad_tokenize_data(
        encoder_inputs, max_sentence_length, tokenizer)

    encoder_inputs, y_cc, y_dc = all_data_generator(
        encoder_inputs, y_cc, y_dc, max_sentence_length)

    train_X, test_X, train_C_Y, test_C_Y, train_D_Y, test_D_Y = train_test_split(
        encoder_inputs, y_cc, y_dc, test_size=0.1, random_state=42)

    return [[train_X, test_X, train_C_Y, test_C_Y, train_D_Y, test_D_Y], [word2idx, idx2word, tokenizer], vocab_len]


In [12]:
max_sentence_length = 100
embedding_dim = 100
latent_dim = 64
vocab_size = 500

[train_X, test_X, train_C_Y, test_C_Y, train_D_Y, test_D_Y], \
    [word2idx, idx2word, tokenizer], vocab_len = prepare_training_data(
        pd.concat([d1, d2]), vocab_size, max_sentence_length)


MemoryError: Unable to allocate 7.77 GiB for an array with shape (55138,) and data type <U37846

In [28]:

dc_model = domain_independent_model(
    max_sentence_length, embedding_dim, latent_dim, vocab_len, include_glove=False)
dc_model.summary()


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_inputs (InputLayer)  [(None, 100)]            0         
                                                                 
 encoder_embedding (Embeddin  (None, 100, 100)         50200     
 g)                                                              
                                                                 
 lstm_1 (LSTM)               [(None, 64),              42240     
                              (None, 64),                        
                              (None, 64)]                        
                                                                 
 non_linear (Dense)          (None, 128)               8320      
                                                                 
 softmax (Dense)             (None, 2)                 258       
                                                           

In [29]:
dc_model.compile(optimizer="rmsprop", loss=[
              'binary_crossentropy', 'binary_crossentropy'], loss_weights=[0.7, -0.2], metrics=['accuracy'])
es = EarlyStopping(monitor='val_loss', mode='min', patience=30, verbose=1)
mcp_save = ModelCheckpoint(
    '.md2_wts.hdf5', save_best_only=True, monitor='val_loss', mode='min')
reduce_lr_loss = ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=7, verbose=1, min_delta=1e-4, mode='min')
history = dc_model.fit(train_X,
                    [train_C_Y, train_D_Y],
                    batch_size=256,
                    validation_split=0.1,
                    callbacks=[es, mcp_save, reduce_lr_loss],
                    epochs=30)


Epoch 1/30


AttributeError: in user code:

    File "c:\Users\Shengjie\Projects\fake-news-detection\.venv\lib\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Shengjie\Projects\fake-news-detection\.venv\lib\site-packages\keras\engine\training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Shengjie\Projects\fake-news-detection\.venv\lib\site-packages\keras\engine\training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\Shengjie\Projects\fake-news-detection\.venv\lib\site-packages\keras\engine\training.py", line 817, in train_step
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "c:\Users\Shengjie\Projects\fake-news-detection\.venv\lib\site-packages\keras\engine\compile_utils.py", line 439, in update_state
        self.build(y_pred, y_true)
    File "c:\Users\Shengjie\Projects\fake-news-detection\.venv\lib\site-packages\keras\engine\compile_utils.py", line 359, in build
        self._metrics = tf.__internal__.nest.map_structure_up_to(y_pred, self._get_metric_objects,
    File "c:\Users\Shengjie\Projects\fake-news-detection\.venv\lib\site-packages\keras\engine\compile_utils.py", line 485, in _get_metric_objects
        return [self._get_metric_object(m, y_t, y_p) for m in metrics]
    File "c:\Users\Shengjie\Projects\fake-news-detection\.venv\lib\site-packages\keras\engine\compile_utils.py", line 485, in <listcomp>
        return [self._get_metric_object(m, y_t, y_p) for m in metrics]
    File "c:\Users\Shengjie\Projects\fake-news-detection\.venv\lib\site-packages\keras\engine\compile_utils.py", line 506, in _get_metric_object
        y_t_rank = len(y_t.shape.as_list())

    AttributeError: 'tuple' object has no attribute 'shape'


In [None]:
y_pred,_ = dc_model.predict(test_X)
y_pred = np.array([np.argmax(x) for x in y_pred])
y_true = np.array([np.argmax(x) for x in test_C_Y])

print(f"Prediction acuracy on both domains is {round(accuracy_score(y_true,y_pred),2)}")
print(f"Precision on both domains is {round(precision_score(y_true,y_pred),2)}")
print(f"Recall on both domains is {round(recall_score(y_true,y_pred),2)}")
print(f"F1 on both domains is {round(f1_score(y_true,y_pred),2)}")
cmtx = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=[1, 0]), 
    index=['true:Fake', 'true:Real'], 
    columns=['pred:Fake', 'pred:Real']
)
print("\n")
print(cmtx)