In [None]:
import re
import os
import pathlib
import time
import numpy as np
import pandas as pd
import matplotlib
from matplotlib.offsetbox import AnchoredText
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.utils.vis_utils import model_to_dot
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D
from keras.layers import LSTM, Dropout
from keras.layers import Embedding, TextVectorization
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint
import nltk # for natural language processing
from nltk.corpus import stopwords # for removing english stopwords
from nltk.stem import WordNetLemmatizer # for term stemming
from prettytable import PrettyTable
import sklearn # for predictive data analysis
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from collections import defaultdict
from IPython.core.display import SVG
from IPython.core.interactiveshell import InteractiveShell # to modify Jupyter notebook configuration
# Render the value of every top-level expression in a cell rather than only the
# last one. Later cells rely on this: they place bare expressions between
# print() calls and expect each of those expressions to be displayed.
InteractiveShell.ast_node_interactivity = "all"

# # Suppress default INFO logging
# import logging
# logger = logging.getLogger()
# logger.setLevel(logging.CRITICAL)

In [None]:
# Read each CSV and attach its class label in the same step:
# genuine articles get target 1, fabricated articles get target 0.
true_news = pd.read_csv('True.csv').assign(target = 1)
fake_news = pd.read_csv('Fake.csv').assign(target = 0)

# Class sizes before balancing. Each frame carries a single label value, so the
# value count of `target` is simply that frame's row count.
print("Compare number of observations in true news and fake news data frames")
true_news.target.value_counts()
print()
fake_news.target.value_counts()
print()

In [None]:
# Downsample fake_news so both classes contribute the same number of rows.
# The surplus is derived from the frames themselves instead of being hard-coded
# (the original notebook used 2064 for these files). That also makes the cell
# safe to re-run: once the frames are balanced the surplus is 0 and no further
# rows are dropped.
np.random.seed(5) # fixed seed so a fresh run always drops the same rows
remove_n = max(len(fake_news) - len(true_news), 0)
drop_indices = np.random.choice(fake_news.index, remove_n, replace = False)
fake_news = fake_news.drop(drop_indices)

# Both value counts should now report the same number of observations
print("True and fake datasets should have same number of samples now...")
true_news['target'].value_counts()
print()
fake_news['target'].value_counts()
print()

# Preview the first and last five rows of each frame to confirm the import.
# head()/tail() are concatenated explicitly because head(-5) would instead
# return every row except the final five.
print("Preview of the raw datasets to ensure they imported properly:")
pd.concat([true_news.head(), true_news.tail()])
print()
pd.concat([fake_news.head(), fake_news.tail()])
print()

In [None]:
# Stack the two labelled frames into a single data set
dfs = [true_news, fake_news]
news_data = pd.concat(dfs)

# Merge the headline into the body text, then keep only the columns the model
# needs: the merged text and the label.
news_data['text'] = news_data['title'] + ' ' + news_data['text']
news_data = news_data[['text', 'target']]

# Shape of the combined frame (rows, columns)
print("Dimensions of data frame that will be cleaned:")
news_data.shape
print()

# Show exactly what the message promises: head() and tail() are concatenated
# because head(-5) would return every row except the final five.
print("First and last five rows of pre-cleaned concatenated dataset:")
pd.concat([news_data.head(), news_data.tail()])
print()

# Count of missing values per column
print("Null values by column:")
news_data.isnull().sum()
print()

In [None]:
# Fetch the NLTK resources used by the cleaning step. The calls stay as separate
# top-level statements, in the same order as before, so the notebook output is
# unchanged.

# Open Multilingual Wordnet data
nltk.download('omw-1.4')

# English stopword list
nltk.download('stopwords') 

# English WordNet dictionary
nltk.download('wordnet');

# Stopwords as a set for membership tests during cleaning
stop_words = {*stopwords.words('english')}

# Lemmatizer used by data_cleaning() to reduce words to their lemma
wnl = WordNetLemmatizer()

# Download pre-trained GloVe embeddings using the following commands: 
# "wget http://nlp.stanford.edu/data/glove.6B.zip"
# "unzip -q glove.6B.zip"

In [None]:
def data_cleaning(row):
    """Normalise one article into a space-separated string of lemmatized words.

    Steps, in order: lowercase the text, replace every non-letter character
    with a space, split on whitespace, drop tokens found in the module-level
    `stop_words`, lemmatize the survivors with the module-level `wnl`, and
    keep only lemmas that are at least three characters long.

    Parameters
    ----------
    row : str
        Raw article text.

    Returns
    -------
    str
        The retained lemmas joined by single spaces (empty string if none).
    """
    # Lowercase first, then blank out digits, punctuation and other non-letters
    letters_only = re.sub('[^a-zA-Z]', ' ', row.lower())

    kept = []
    for raw_word in letters_only.split():
        # Stopwords are discarded before lemmatization
        if raw_word in stop_words:
            continue
        lemma = wnl.lemmatize(raw_word)
        # The length filter applies to the lemma, not to the raw token
        if len(lemma) >= 3:
            kept.append(lemma)

    return ' '.join(kept)

In [None]:
# Clean the 'text' column in place - might take a couple minutes to run.
# data_cleaning is passed directly; wrapping it in a lambda added nothing.
news_data['text'] = news_data['text'].apply(data_cleaning)

# Show exactly what the message promises: head() and tail() are concatenated
# because head(-5) would return every row except the final five.
print("First and last five rows after cleaning the data:")
pd.concat([news_data.head(), news_data.tail()])
print()

In [None]:
# Missing values per column after cleaning (the goal is zero everywhere)
print("Null values by column:")
news_data.isna().sum()
print()

# Number of distinct values per column
print("Unique values by column:")
news_data.nunique()
print()

In [None]:
# Separate the label column from the cleaned text column
target = news_data.target # labels
text_dataset = news_data.text # predictor text

# Inspect both series. Note that head(-5) returns every element except the
# final five, not just the first five.
target.head(-5)
text_dataset.head(-5)

In [None]:
# 80/20 train/test split with a fixed seed so the partition is repeatable
train_data, test_data, train_target, test_target = train_test_split(
    text_dataset,
    target,
    train_size = 0.80,
    random_state = 5,
)

# Inspect the resulting text subsets (head(-5) returns all but the final five rows)
train_data.head(-5)
test_data.head(-5)

In [None]:
# Text inputs become string tensors. text_dataset (train and test combined) is
# converted too, because the vectorizer vocabulary is adapted on it further down.
train_data, test_data, text_dataset = (
    tf.convert_to_tensor(series, dtype = tf.string)
    for series in (train_data, test_data, text_dataset)
)

# Labels become 32-bit integer tensors
train_target, test_target = (
    tf.convert_to_tensor(labels, dtype = tf.int32)
    for labels in (train_target, test_target)
)

# Spot-check one feature tensor and one label tensor
test_data
train_target

In [None]:
# Values swept by the training loop below.

# Vocabulary sizes: how many of the top key words the vectorizer keeps
key_words = [
    100,
    200,
    500,
    1000,
    5000,
]

# n-gram settings. Each pair is intended as an inclusive (min_n, max_n) range:
# (1, 1) = single words only, (1, 2) = one- or two-word phrases, and so on.
n_grams = [
    (1, 1),
    (1, 2),
    (2, 2),
    (1, 3),
    (2, 3),
    (3, 3),
]

# Sequence lengths the vectorizer output is padded to
max_len = [
    300,
    600,
]

In [None]:
# Hyperparameter sweep. For every combination of n-gram range (n_grams),
# vocabulary size (key_words) and padded sequence length (max_len):
#   1. adapt a TextVectorization layer on the full text data set,
#   2. build an embedding matrix from pre-trained GloVe vectors,
#   3. train a TextVectorization -> Embedding -> Conv1D -> MaxPooling1D -> LSTM
#      -> Dense model and evaluate it on the test split,
#   4. plot the learning curves and record the run's metrics.


def _load_glove_vectors(path):
    """Parse a GloVe text file into a dict mapping each token to its float32 vector."""
    vectors = {}
    with open(path, encoding = "utf8") as f:
        for line in f:
            # Each line is "<token> <v1> <v2> ..."; split off the token only
            word, coefs = line.split(maxsplit = 1)
            vectors[word] = np.fromstring(coefs, "f", sep = " ")
    return vectors


def _build_embedding_matrix(word_index, embeddings_index, num_tokens, embedding_dim):
    """Return (matrix, hits, misses); rows of words missing from embeddings_index stay all-zero."""
    matrix = np.zeros((num_tokens, embedding_dim))
    hits = 0
    misses = 0
    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            matrix[i] = vector
            hits += 1
        else:
            misses += 1
    return matrix, hits, misses


def _plot_training_curve(history, metric, title, ylabel, note, note_loc):
    """Plot the training and validation series of `metric` from a Keras History object."""
    fig, ax = plt.subplots()
    ax.plot(history.history[metric]) # training series
    ax.plot(history.history[f'val_{metric}']) # validation series
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epochs')
    # One tick per recorded epoch instead of a hard-coded [0, 1, 2, 3, 4]
    ax.set_xticks(range(len(history.history[metric])))
    ax.legend(['Training', 'Validation'], loc = 'center right')
    text_box = AnchoredText(note, loc = note_loc, frameon = False, pad = 0.5)
    plt.setp(text_box.patch, facecolor = 'white', alpha = 0.5)
    ax.add_artist(text_box)
    plt.show()
    plt.close(fig) # the sweep creates many figures; release each one once shown


# Path to the 100-dimensional GloVe vectors. It can be overridden with the
# GLOVE_PATH environment variable; the default is the path the notebook was
# written with.
data_dir = os.environ.get(
    'GLOVE_PATH',
    'E:/Zack/School/Classes/Summer 22/CSC 7333 - Machine Learning/Group Project/glove.6B.100d.txt',
)
embedding_dim = 100
epochs = 5
batch_size = 64

# The GloVe file does not depend on any sweep parameter, so parse it once here
# rather than once per iteration.
embeddings_index = _load_glove_vectors(data_dir)
print("Found %s word vectors." % len(embeddings_index))
print()

# One dict per trained model, accumulated across the whole sweep
updatedMetrics = []

for ng in n_grams:

    # n_grams holds inclusive (min_n, max_n) ranges, but TextVectorization reads
    # a tuple as the explicit list of n-gram widths to generate. Passed through
    # unchanged, (1, 3) would request only 1-grams and 3-grams, and (1, 1) would
    # list the width 1 twice. Expand the range into the widths it stands for,
    # e.g. (1, 3) -> (1, 2, 3) and (2, 2) -> (2,).
    ngram_widths = tuple(range(ng[0], ng[1] + 1))

    # Max tokens parameter chooses the top n words (iterate over key_words list)
    for kw in key_words:

        # output_sequence_length defines the length every sequence is padded or
        # truncated to (iterate over max_len list)
        for ml in max_len:

            # Create the vocabulary index.
            vectorizer = tf.keras.layers.TextVectorization(
                max_tokens = kw, # number of top key words to keep
                standardize = None, # data already cleaned above
                split = 'whitespace', # cleaned text is space-separated tokens
                ngrams = ngram_widths, # explicit n-gram widths derived from ng
                output_mode = 'int', # one integer index per token
                output_sequence_length = ml, # pad/truncate every sequence to this length
                pad_to_max_tokens = False, # not valid argument for integer outputs
                vocabulary = None, # vocabulary is learned by adapt() below
                idf_weights = None, # not valid argument for integer outputs
                sparse = False, # not applicable argument for integer outputs
                ragged = False, # dense output so every sequence has the same length
            )

            # Learn the vocabulary from the combined (train + test) text
            vectorizer.adapt(text_dataset)

            # Vocabulary and a word -> index lookup
            voc = vectorizer.get_vocabulary()
            word_index = dict(zip(voc, range(len(voc))))

            # A bare expression inside a loop is never displayed, so print the
            # first entries explicitly.
            print(f"First 10 vocabulary entries: {voc[:10]}")
            print()

            # Size of the embedding table. The +2 is kept from the original, which
            # reserved index 0 for padding and 1 for out-of-vocabulary tokens.
            # NOTE(review): get_vocabulary() may already include those two entries,
            # in which case the extra rows are simply unused - confirm.
            num_tokens = len(voc) + 2

            # Rows for words without a GloVe vector stay all-zero.
            # NOTE(review): presumably the GloVe file only has single-word entries,
            # so multi-word n-grams would all count as misses - confirm.
            embedding_matrix, hits, misses = _build_embedding_matrix(
                word_index, embeddings_index, num_tokens, embedding_dim
            )
            print("Converted %d words (%d misses)" % (hits, misses))
            print()

            # Create the model that uses the vectorized text layer
            model = Sequential()

            # Explicit input layer: shape (1,) so there is one string per
            # sample, and the dtype must be 'string'.
            model.add(tf.keras.Input(shape = (1,), dtype = tf.string))

            # Strings -> vocabulary indices, shape (batch_size, ml)
            model.add(vectorizer)

            # Indices -> GloVe vectors; frozen so training does not update them
            model.add(Embedding(
                num_tokens, # len(voc) + 2
                embedding_dim, # 100-dimensional vectors
                input_length = ml, # size of the padded sequence
                embeddings_initializer = keras.initializers.Constant(embedding_matrix),
                trainable = False
            ))

            # Conv1D extracts local features, MaxPooling1D downsamples them by 2,
            # the LSTM consumes the pooled sequence, and a single sigmoid unit
            # produces the binary classification output.
            model.add(Conv1D(filters = 128, kernel_size = 5, padding = 'same', activation = 'relu'))
            model.add(MaxPooling1D(pool_size = 2))
            model.add(LSTM(32))
            model.add(Dense(1, activation = 'sigmoid'))
            model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

            # CNN-RNN hybrid model structure:
            print(f"CNN-RNN hybrid model structure using top {kw} words, an n-gram range of {ng}, and padded length of {ml} is:") 
            model.summary() # prints itself; wrapping it in print() also printed "None"
            print()

            # Start timing; the elapsed time covers both training and evaluation
            start_time = time.time()

            # Checkpoint the weights whenever validation accuracy improves
            filepath = f"best_weights_cnn-rnn_{kw}words_{ng}range_{ml}maxLength.tf"
            checkpoint = ModelCheckpoint(filepath, monitor = 'val_accuracy', verbose = 1, save_best_only = True, mode = 'max', save_weights_only = True)
            callbacks_list = [checkpoint]
            history = model.fit(train_data, train_target, epochs = epochs, batch_size = batch_size, verbose = 1, callbacks = callbacks_list, validation_split = 0.2)

            # Evaluate on the held-out test split.
            # NOTE(review): this uses the weights from the final epoch, not the
            # checkpointed best ones; load `filepath` first if the best weights
            # are what should be reported.
            scores = model.evaluate(test_data, test_target, verbose = 1)
            loss = (scores[0]) # test loss
            accuracy = (scores[1]*100) # test accuracy as a percentage
            print(f"CNN-RNN hybrid model accuracy using top {kw}, an n-gram range of {ng}, and padded length of {ml} is: {accuracy:.2f}%")
            print()

            # Runtime = current time minus start time
            current_time = time.time()
            elapsed_time = current_time - start_time
            print(f"Model training and evaluation time using top {kw} words, an n-gram range of {ng}, and padded length of {ml} is: {elapsed_time:.2f} seconds")
            print()

            # Training and validation accuracy
            _plot_training_curve(
                history,
                'accuracy',
                f'CNN-RNN Model Accuracy: n-gram range {ng},\n{kw} top words, and padded length of {ml}',
                'Accuracy',
                f"Accuracy: {accuracy:.2f}%",
                'lower right',
            )

            # Training and validation loss
            _plot_training_curve(
                history,
                'loss',
                f'CNN-RNN Model Loss: n-gram range {ng},\n{kw} top words, and padded length of {ml}',
                'Loss',
                f"Loss: {loss:.4f}",
                'upper right',
            )

            # Record this run. list.append() returns None, so its result must not
            # be assigned back; the list itself is created once before the loops
            # so earlier runs are kept.
            metrics_dict = {
                'model': filepath,
                'n_grams': ng,
                'key_words': kw,
                'padded_length': ml,
                'loss': loss,
                'accuracy': accuracy,
                'elapsed_time': elapsed_time,
            }
            updatedMetrics.append(metrics_dict)

            print(f"Updated model metrics for top {kw} words, an n-gram range of {ng}, and padded length of {ml} are:")
            print(tabulate(updatedMetrics, headers = 'keys'))
            print()