In [1]:
#### Import all required models and flows ####

import re
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from mlflow import log_metric, log_params

import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow import keras
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.keras.backend import set_session

import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors
from pathlib import Path

In [2]:
##### Initialize all hyperparameters to none ####
MAX_ARTICLE_LENGTH = None
EMBEDDING_VECTOR_LENGTH = None
EMBEDDING_VOCAB_SIZE = None
LSTM_MEMORY_SIZE = None
NN_OPTIMIZER = None
NN_LOSS_FUNCTION = None
NN_EPOCHS = None
USE_GLOVE_EMBEDDINGS = None
NN_BATCH_SIZE = None
DATASET = None
DROPOUT_RATE = None
NN_ARCH_TYPE = None

##### Initialize dataset area ####
RANDOM_SEED = 139
DATASET_PATH = "fakeNewsDatasets"
ID_UNKNOWN = 399999

#### Helper functions used later ####
def cleanArticle(string):
    strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())

def article_to_word_id_list(article, model):
    word_index_list = []
    word_list = article.split()
    for i, word in enumerate(word_list):
        if word in model.vocab:
            word_index_list.append(model.vocab[word].index)
        else:
            # Unknown
            word_index_list.append(ID_UNKNOWN)
    return word_index_list    

In [3]:
#### Load glove model and convert to word2vec ####

def load_glove_model_v2(dim):
    "Adapted from https://www.programcreek.com/python/example/99240/gensim.models.KeyedVectors.load_word2vec_format"

    GLOVE_FILEPATH = 'glove.6B.%dd.txt' % EMBEDDING_VECTOR_LENGTH
    print("Loading Glove embedding")
    glove_data_file = GLOVE_FILEPATH
    word2vec_output_file = '%s.w2v' % glove_data_file

    if not Path(word2vec_output_file).exists():
        glove2word2vec(glove_input_file=glove_data_file, word2vec_output_file=word2vec_output_file)
    model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
    print("Loaded Glove embedding")

    embedding_matrix = np.zeros((len(model.vocab), dim))
    for i in range(len(model.vocab)):
        embedding_vector = model[model.index2word[i]]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return model, embedding_matrix

In [4]:
#### Read the dataset into a dataframe for further processing ####

def read_dataset(dataset_name):
    
    def remove_numbers(in_str):
        return re.sub(r'[0-9]+', '', in_str)
        
    print("Reading dataset")
    result_data_list = []
    data_dir = DATASET_PATH
    for news_type in ['fake', 'legit']:
        folder = '%s/%s/%s' % (data_dir, dataset_name, news_type)
        for fname in os.listdir(folder):
            result_data = {}
            result_data['dataset_name'] = dataset_name
            result_data['news_type'] = news_type
            if news_type == 'fake':
                result_data['is_fake'] = 1
            else:
                result_data['is_fake'] = 0
            if dataset_name == 'fakeNewsDataset':
                result_data['news_category'] = remove_numbers(fname.split('.')[0])
            result_data['file_name'] = fname
            filepath = os.path.join(folder, fname)
            with open(filepath, 'r', encoding="utf8") as f:
                file_data = f.read().split('\n')
                # Some articles don't have a headline, but only article body.
                if len(file_data) > 1:
                    news_content_data = ' '.join(file_data[2:])
                    result_data['news_headline'] = file_data[0]
                else:
                    news_content_data = file_data[0]
                    result_data['news_headline'] = ''
                result_data['news_content'] = news_content_data
                result_data['news_all'] = ' '.join(file_data[0:])
                result_data_list.append(result_data)
                
    df = pd.DataFrame(result_data_list)
    
    model, embedding_matrix = load_glove_model_v2(EMBEDDING_VECTOR_LENGTH)
    df['news_all_clean'] = df['news_all'].apply(lambda a: cleanArticle(a))
    df['news_embed_idx'] = df['news_all_clean'].apply(lambda a: article_to_word_id_list(a, model))
    
    X_train, X_test, y_train, y_test = train_test_split(df['news_embed_idx'], df['is_fake'], 
                                                        test_size=.2, random_state=RANDOM_SEED)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=RANDOM_SEED)
    
    print("Finished reading dataset")
    return X_train, X_val, X_test, y_train, y_val, y_test, embedding_matrix

In [5]:
def model_execute(hyperparameter_dict):

    np.random.seed(RANDOM_SEED)
    
    for k, v in hyperparameter_dict.items():
        globals()[k] = v
        
    X_train, X_val, X_test, y_train, y_val, y_test, embedding_matrix = read_dataset(DATASET)
    
    # Add padding if needed
    X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=MAX_ARTICLE_LENGTH)
    X_val = tf.keras.preprocessing.sequence.pad_sequences(X_val, maxlen=MAX_ARTICLE_LENGTH)
    X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=MAX_ARTICLE_LENGTH)
    
    # For easy reset of notebook state.
    tf.keras.backend.clear_session()  
    config_proto = tf.ConfigProto()
    off = rewriter_config_pb2.RewriterConfig.OFF
    config_proto.graph_options.rewrite_options.arithmetic_optimization = off
    session = tf.Session(config=config_proto)
    set_session(session)

    # Define model
    model = keras.Sequential()

    model.add(keras.layers.Embedding(EMBEDDING_VOCAB_SIZE, EMBEDDING_VECTOR_LENGTH, input_length=MAX_ARTICLE_LENGTH))

    # Neural network type
    if NN_ARCH_TYPE == '2layerLSTM':
        model.add(keras.layers.LSTM(LSTM_MEMORY_SIZE, dropout=DROPOUT_RATE, return_sequences=True, 
                                    input_shape=(MAX_ARTICLE_LENGTH, EMBEDDING_VECTOR_LENGTH)))
        model.add(keras.layers.LSTM(LSTM_MEMORY_SIZE, dropout=DROPOUT_RATE))
    elif NN_ARCH_TYPE == '1layerLSTM':
        model.add(keras.layers.LSTM(LSTM_MEMORY_SIZE, dropout=DROPOUT_RATE))
    elif NN_ARCH_TYPE == '1layerGRU':
        model.add(keras.layers.GRU(LSTM_MEMORY_SIZE, dropout=DROPOUT_RATE))
    else:
        assert False, "Unknown NN arch type"

    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss=NN_LOSS_FUNCTION, optimizer=tf.keras.optimizers.Adam(0.0001), metrics=['accuracy'])
    print(model.summary())
    
    # Train model
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=NN_EPOCHS, batch_size=NN_BATCH_SIZE)
    
    # Predict model on validation (Dev) set 
    scores = model.evaluate(X_val, y_val, verbose=1)
    accuracy = scores[1] * 100
    log_metric('accuracy', accuracy)
    print("Accuracy: %.2f%%" % accuracy)
    
    # Predict model on test set 
    scores = model.evaluate(X_test, y_test, verbose=1)
    accuracy = scores[1] * 100
    log_metric('accuracy', accuracy)
    print("Accuracy on Test Set: %.2f%%" % accuracy)
    
    # Confusion matrix of results (ensure it doesn't predict the same class for all records)
    y_pred = model.predict(X_test)
    y_pred = (y_pred > 0.5)
    print(confusion_matrix(y_test, y_pred))

In [14]:
hyperparameter = {
            'MAX_ARTICLE_LENGTH': 500,
            'EMBEDDING_VECTOR_LENGTH': 50,
            'EMBEDDING_VOCAB_SIZE': 400000,
            'LSTM_MEMORY_SIZE': 100,
            'NN_LOSS_FUNCTION': 'binary_crossentropy',
            'NN_EPOCHS': 35,
            'USE_GLOVE_EMBEDDINGS': False,
            'NN_BATCH_SIZE': 50,
            'DATASET': 'celebrityDataset',
            'DROPOUT_RATE': 0.5,
            'NN_ARCH_TYPE': '1layerLSTM',
        }
model_execute(hyperparameter)

Reading dataset
Loading Glove embedding
Loaded Glove embedding
Finished reading dataset
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           20000000  
_________________________________________________________________
lstm (LSTM)                  (None, 100)               60400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 20,060,501
Trainable params: 20,060,501
Non-trainable params: 0
___________________________________________________________

In [15]:
hyperparameter = {
            'MAX_ARTICLE_LENGTH': 500,
            'EMBEDDING_VECTOR_LENGTH': 50,
            'EMBEDDING_VOCAB_SIZE': 400000,
            'LSTM_MEMORY_SIZE': 100,
            'NN_LOSS_FUNCTION': 'binary_crossentropy',
            'NN_EPOCHS': 25,
            'USE_GLOVE_EMBEDDINGS': False,
            'NN_BATCH_SIZE': 50,
            'DATASET': 'celebrityDataset',
            'DROPOUT_RATE': 0.5,
            'NN_ARCH_TYPE': '2layerLSTM',
        }
model_execute(hyperparameter)

Reading dataset
Loading Glove embedding
Loaded Glove embedding
Finished reading dataset
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           20000000  
_________________________________________________________________
lstm (LSTM)                  (None, 500, 100)          60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 20,140,901
Trainable params: 20,140,901
Non-trainable params: 0
_________________________________________________________________
None
Train on 300 samples, validate on 100 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoc

In [16]:
hyperparameter = {
            'MAX_ARTICLE_LENGTH': 500,
            'EMBEDDING_VECTOR_LENGTH': 50,
            'EMBEDDING_VOCAB_SIZE': 400000,
            'LSTM_MEMORY_SIZE': 100,
            'NN_LOSS_FUNCTION': 'binary_crossentropy',
            'NN_EPOCHS': 45,
            'USE_GLOVE_EMBEDDINGS': False,
            'NN_BATCH_SIZE': 50,
            'DATASET': 'celebrityDataset',
            'DROPOUT_RATE': 0.5,
            'NN_ARCH_TYPE': '1layerGRU',
        }
model_execute(hyperparameter)

Reading dataset
Loading Glove embedding
Loaded Glove embedding
Finished reading dataset
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           20000000  
_________________________________________________________________
gru (GRU)                    (None, 100)               45300     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 20,045,401
Trainable params: 20,045,401
Non-trainable params: 0
_________________________________________________________________
None
Train on 300 samples, validate on 100 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoc

In [22]:
hyperparameter = {
            'MAX_ARTICLE_LENGTH': 200,
            'EMBEDDING_VECTOR_LENGTH': 50,
            'EMBEDDING_VOCAB_SIZE': 400000,
            'LSTM_MEMORY_SIZE': 100,
            'NN_LOSS_FUNCTION': 'binary_crossentropy',
            'NN_EPOCHS': 35,
            'USE_GLOVE_EMBEDDINGS': False,
            'NN_BATCH_SIZE': 50,
            'DATASET': 'fakeNewsDataset',
            'DROPOUT_RATE': 0.5,
            'NN_ARCH_TYPE': '1layerLSTM',
        }
model_execute(hyperparameter)

Reading dataset
Loading Glove embedding
Loaded Glove embedding
Finished reading dataset
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 50)           20000000  
_________________________________________________________________
lstm (LSTM)                  (None, 100)               60400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 20,060,501
Trainable params: 20,060,501
Non-trainable params: 0
___________________________________________________________

In [23]:
hyperparameter = {
            'MAX_ARTICLE_LENGTH': 200,
            'EMBEDDING_VECTOR_LENGTH': 50,
            'EMBEDDING_VOCAB_SIZE': 400000,
            'LSTM_MEMORY_SIZE': 100,
            'NN_LOSS_FUNCTION': 'binary_crossentropy',
            'NN_EPOCHS': 25,
            'USE_GLOVE_EMBEDDINGS': False,
            'NN_BATCH_SIZE': 50,
            'DATASET': 'fakeNewsDataset',
            'DROPOUT_RATE': 0.5,
            'NN_ARCH_TYPE': '2layerLSTM',
        }
model_execute(hyperparameter)

Reading dataset
Loading Glove embedding
Loaded Glove embedding
Finished reading dataset
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 50)           20000000  
_________________________________________________________________
lstm (LSTM)                  (None, 200, 100)          60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 20,140,901
Trainable params: 20,140,901
Non-trainable params: 0
_________________________________________________________________
None
Train on 288 samples, validate on 96 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch

In [24]:
hyperparameter = {
            'MAX_ARTICLE_LENGTH': 200,
            'EMBEDDING_VECTOR_LENGTH': 50,
            'EMBEDDING_VOCAB_SIZE': 400000,
            'LSTM_MEMORY_SIZE': 100,
            'NN_LOSS_FUNCTION': 'binary_crossentropy',
            'NN_EPOCHS': 45,
            'USE_GLOVE_EMBEDDINGS': False,
            'NN_BATCH_SIZE': 50,
            'DATASET': 'fakeNewsDataset',
            'DROPOUT_RATE': 0.5,
            'NN_ARCH_TYPE': '1layerGRU',
        }
model_execute(hyperparameter)

Reading dataset
Loading Glove embedding
Loaded Glove embedding
Finished reading dataset
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 50)           20000000  
_________________________________________________________________
gru (GRU)                    (None, 100)               45300     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 20,045,401
Trainable params: 20,045,401
Non-trainable params: 0
_________________________________________________________________
None
Train on 288 samples, validate on 96 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch

**BERT transformers**

In [9]:
from simpletransformers.classification import ClassificationModel
"Adapted from https://towardsdatascience.com/\
simple-transformers-introducing-the-easiest-bert-roberta-xlnet-and-xlm-library-58bf8c59b2a3"
train_args = {
    'evaluate_during_training': True,
    'logging_steps': 100,
    'num_train_epochs': 2,
    'evaluate_during_training_steps': 100,
    'save_eval_checkpoints': False,
    'train_batch_size': 32,
    'eval_batch_size': 64,
    'overwrite_output_dir': True,
    'fp16': False,
    'wandb_project': "visualization-demo"
}

In [10]:
"Adapted from https://towardsdatascience.com/\
simple-transformers-introducing-the-easiest-bert-roberta-xlnet-and-xlm-library-58bf8c59b2a3"
model_BERT = ClassificationModel('bert', 'bert-base-cased', num_labels=2, use_cuda=False, cuda_device=0, args=train_args)

In [14]:
#### Read the dataset into a dataframe for further processing ####
def bert_read_dataset(dataset_name):
    
    def remove_numbers(in_str):
        return re.sub(r'[0-9]+', '', in_str)
    
    print("Reading dataset")
    result_data_list = []
    data_dir = DATASET_PATH
    for news_type in ['fake', 'legit']:
        folder = '%s/%s/%s' % (data_dir, dataset_name, news_type)
        for fname in os.listdir(folder):
            result_data = {}
            result_data['dataset_name'] = dataset_name
            result_data['news_type'] = news_type
            if news_type == 'fake':
                result_data['is_fake'] = 1
            else:
                result_data['is_fake'] = 0
            if dataset_name == 'fakeNewsDataset':
                result_data['news_category'] = remove_numbers(fname.split('.')[0])
            result_data['file_name'] = fname
            filepath = os.path.join(folder, fname)
            with open(filepath, 'r', encoding="utf8") as f:
                file_data = f.read().split('\n')
                # Some articles don't have a headline, but only article body.
                if len(file_data) > 1:
                    news_content_data = ' '.join(file_data[2:])
                    result_data['news_headline'] = file_data[0]
                else:
                    news_content_data = file_data[0]
                    result_data['news_headline'] = ''
                result_data['news_content'] = news_content_data
                result_data['news_all'] = ' '.join(file_data[0:])
                result_data_list.append(result_data)
                
    df = pd.DataFrame(result_data_list)
    
    df['news_all_clean'] = df['news_all'].apply(lambda a: cleanArticle(a))
    
    X_train, X_test, y_train, y_test = train_test_split(df.drop(['is_fake','news_type','file_name'],axis = 1), 
                                                        df['is_fake'], 
                                                        test_size=.2, random_state=RANDOM_SEED)
    X_train.reset_index(drop=True),\
            y_train.reset_index(drop=True), X_test.reset_index(drop=True), y_test.reset_index(drop=True)
    print("Finished reading dataset")
    
    train_df_clean = pd.concat([X_train, y_train], axis=1)
    train_df_clean = train_df_clean.drop(['dataset_name', 'news_headline', 'news_content', 'news_all'], axis=1)
    train_df_clean = train_df_clean.drop(['news_category'], axis=1)
    print("Shape of training data set: ", train_df_clean.shape)
    print("View of data set: ", train_df_clean.head())

    eval_df_clean = pd.concat([X_test, y_test], axis=1)
    eval_df_clean = eval_df_clean.drop(['dataset_name', 'news_headline', 'news_content', 'news_all'], axis=1)
    eval_df_clean = eval_df_clean.drop(['news_category'], axis=1)
    print("Shape of Eval data set: ", eval_df_clean.shape)

    model_BERT.train_model(train_df_clean, eval_df=eval_df_clean)
    result, model_outputs, wrong_predictions = model_BERT.eval_model(eval_df_clean, acc=accuracy_score)
    print(result)

In [12]:
def bert_model(hyperparameter_dict):

    np.random.seed(RANDOM_SEED)
    for k, v in hyperparameter_dict.items():
        globals()[k] = v
    bert_read_dataset(DATASET)

In [13]:
hyperparameter = {
            'MAX_ARTICLE_LENGTH': 500,
            'EMBEDDING_VECTOR_LENGTH': 50,
            'DATASET': 'celebrityDataset',
        }
bert_model(hyperparameter)

Reading dataset
Finished reading dataset
Shape of training data set:  (400, 2)
View of data set:                                          news_all_clean  is_fake
477  lady gaga announces netflix documentary talks ...        0
439  this is what brad pitt has been texting jennif...        0
8    caitlyn jenner looks to new girlfriend for sup...        1
96   girl fight kanye west confronts anna wintour o...        1
232  caitlyn jenners memoir reportedly lands movie ...        1
Shape of Eval data set:  (100, 2)


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=13, style=ProgressStyle(description_w…

Running loss: 0.677338


HBox(children=(IntProgress(value=0, description='Current iteration', max=13, style=ProgressStyle(description_w…

Running loss: 0.508778



HBox(children=(IntProgress(value=0), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))


{'mcc': 0.339556023699088, 'tp': 31, 'tn': 36, 'fp': 18, 'fn': 15, 'acc': 0.67, 'eval_loss': 0.5950743854045868}


In [16]:
hyperparameter = {
            'MAX_ARTICLE_LENGTH': 200,
            'EMBEDDING_VECTOR_LENGTH': 50,
            'DATASET': 'fakeNewsDataset',
        }
bert_model(hyperparameter)

Reading dataset
Finished reading dataset
Shape of training data set:  (384, 2)
View of data set:                                          news_all_clean  is_fake
150  donald trumps win to the presidency causes fre...        1
284  trumps pick for education could face unusually...        0
26   macron and le pen fight for votes  anticipatin...        1
362  girls who code closing computer science gender...        0
179  the us supreme court has done the unthinkable ...        1
Shape of Eval data set:  (96, 2)


HBox(children=(IntProgress(value=0, max=384), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=12, style=ProgressStyle(description_w…

Running loss: 0.543366


HBox(children=(IntProgress(value=0, description='Current iteration', max=12, style=ProgressStyle(description_w…

Running loss: 0.276190



HBox(children=(IntProgress(value=0, max=96), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))


{'mcc': 0.3046525455707007, 'tp': 26, 'tn': 37, 'fp': 15, 'fn': 18, 'acc': 0.65625, 'eval_loss': 0.6550406515598297}
