<a href="https://colab.research.google.com/github/sweet-addy/Fake-True-News-NLP-Models/blob/master/NLP_Fake_vs_Real_News_(Modeling_with_Basic_Deep_Neural_Networks).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
import unicodedata
import en_core_web_sm
import os
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn import svm
from time import process_time, time
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


  import pandas.util.testing as tm


In [6]:
from google.colab import files


In [17]:
uploaded = files.upload()


Saving Fake_True_news_shuffled.csv to Fake_True_news_shuffled (1).csv


In [18]:
%%time
import io
Fake_True_news = pd.read_csv(io.BytesIO(uploaded['Fake_True_news_shuffled.csv']))
# Dataset is now stored in a Pandas Dataframe

CPU times: user 1.96 s, sys: 198 ms, total: 2.16 s
Wall time: 2.17 s


In [19]:
Fake_True_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      44898 non-null  object
 1   text       44898 non-null  object
 2   subject    44898 non-null  object
 3   date       44898 non-null  object
 4   real/fake  44898 non-null  int64 
 5   title_nlp  44898 non-null  object
 6   text_nlp   44267 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.4+ MB


In [20]:
#didn't notice that during pre-processing some rows in 'text_nlp' got deleted. Drop the rows that contain a NaN in that column.
Fake_True_news = Fake_True_news.dropna(axis=0, subset=['text_nlp'])
Fake_True_news.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44267 entries, 0 to 44897
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      44267 non-null  object
 1   text       44267 non-null  object
 2   subject    44267 non-null  object
 3   date       44267 non-null  object
 4   real/fake  44267 non-null  int64 
 5   title_nlp  44267 non-null  object
 6   text_nlp   44267 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.7+ MB


**Build Traning and Test Datasets (same as in Jupyter NOtebook)**

In [23]:
#create feature data frame and isolate target variable

#feature dataframe
X_text = Fake_True_news['text_nlp'] # load the dataset as a pandas data frame

#target variable
y_realfake = Fake_True_news['real/fake']

# create training and testing vars
X_train_text, X_test_text, y_train_realfake, y_test_realfake = train_test_split(X_text, y_realfake, test_size=0.2,random_state=27)

In [24]:
%%time

# build BOW features on train news
cv = CountVectorizer(binary=False, min_df=5, max_df=1.0, ngram_range=(1,2))
cv_Xtrain_features = cv.fit_transform(X_train_text)


# build TFIDF features on train news
tv = TfidfVectorizer(use_idf=True, min_df=5, max_df=1.0, ngram_range=(1,2),
                     sublinear_tf=True)
tv_Xtrain_features = tv.fit_transform(X_train_text)

CPU times: user 1min 14s, sys: 634 ms, total: 1min 15s
Wall time: 1min 15s


In [25]:
#transform test news into features
cv_Xtest_features = cv.transform(X_test_text)
tv_Xtest_features = tv.transform(X_test_text)

In [26]:
print('BOW model:> Train features shape:', cv_Xtrain_features.shape, ' Test features shape:', cv_Xtest_features.shape)
print('TFIDF model:> Train features shape:', tv_Xtrain_features.shape, ' Test features shape:', tv_Xtest_features.shape)

BOW model:> Train features shape: (35413, 273556)  Test features shape: (8854, 273556)
TFIDF model:> Train features shape: (35413, 273556)  Test features shape: (8854, 273556)


**Newer Supervised Deep Learning Models**

In [27]:
import gensim
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Activation, Dense
from sklearn.preprocessing import LabelEncoder

**Prediction class label encoding**

In [28]:
%%time

le = LabelEncoder()
# tokenize train reviews & encode train labels
tokenized_train = [nltk.word_tokenize(text)
                       for text in X_train_text]
y_train = le.fit_transform(y_train_realfake)
# tokenize test reviews & encode test labels
tokenized_test = [nltk.word_tokenize(text)
                       for text in X_test_text]
y_test = le.fit_transform(y_test_realfake)

CPU times: user 57.2 s, sys: 607 ms, total: 57.8 s
Wall time: 58 s


In [29]:
#print class label encoding map and encoded labels
print('Sentiment class label map:', dict(zip(le.classes_, le.transform(le.classes_))))
print('Sample test label transformation:\n'+'-'*35,
      '\nActual Labels:', y_test_realfake[:3], '\nEncoded Labels:', y_test[:3])

Sentiment class label map: {0: 0, 1: 1}
Sample test label transformation:
----------------------------------- 
Actual Labels: 35267    0
19951    1
36697    1
Name: real/fake, dtype: int64 
Encoded Labels: [0 1 1]


**Feature Engineering with word embeddings**

In [30]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [31]:
%%time

# build word2vec model
w2v_num_features = 300
w2v_model = gensim.models.Word2Vec(tokenized_train, size=w2v_num_features, window=150,
                                   min_count=10, workers=4, iter=5)

2020-09-28 06:04:32,118 : INFO : collecting all words and their counts
2020-09-28 06:04:32,121 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-09-28 06:04:32,669 : INFO : PROGRESS: at sentence #10000, processed 2389521 words, keeping 66284 word types
2020-09-28 06:04:33,247 : INFO : PROGRESS: at sentence #20000, processed 4872953 words, keeping 91307 word types
2020-09-28 06:04:33,825 : INFO : PROGRESS: at sentence #30000, processed 7321798 words, keeping 108145 word types
2020-09-28 06:04:34,145 : INFO : collected 115591 word types from a corpus of 8639353 raw words and 35413 sentences
2020-09-28 06:04:34,146 : INFO : Loading a fresh vocabulary
2020-09-28 06:04:34,273 : INFO : effective_min_count=10 retains 29956 unique words (25% of original 115591, drops 85635)
2020-09-28 06:04:34,274 : INFO : effective_min_count=10 leaves 8430884 word corpus (97% of original 8639353, drops 208469)
2020-09-28 06:04:34,379 : INFO : deleting the raw counts dictionary of

CPU times: user 18min 20s, sys: 2.34 s, total: 18min 22s
Wall time: 9min 30s


In [32]:
def averaged_word2vec_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    
    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        
        for word in words:
            if word in vocabulary: 
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [33]:
# generate averaged word vector features from word2vec model
avg_wv_train_features = averaged_word2vec_vectorizer(corpus=tokenized_train, model=w2v_model,
                                                     num_features=w2v_num_features)
avg_wv_test_features = averaged_word2vec_vectorizer(corpus=tokenized_test, model=w2v_model,
                                                    num_features=w2v_num_features)

In [34]:
print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape, ' Test features shape:', avg_wv_test_features.shape)


Word2Vec model:> Train features shape: (35413, 300)  Test features shape: (8854, 300)


**Modeling with Deep Neural Networks**

**Building Deep neural network architecture**

In [35]:
def construct_deepnn_architecture(num_input_features):
    dnn_model = Sequential()
    dnn_model.add(Dense(512, input_shape=(num_input_features,)))
    dnn_model.add(Activation('relu'))
    dnn_model.add(Dropout(0.2))
    
    dnn_model.add(Dense(256))
    dnn_model.add(Activation('relu'))
    dnn_model.add(Dropout(0.2))
    
    dnn_model.add(Dense(256))
    dnn_model.add(Activation('relu'))
    dnn_model.add(Dropout(0.2))
    
    dnn_model.add(Dense(1))
    dnn_model.add(Activation('sigmoid'))

    dnn_model.compile(loss='binary_crossentropy', optimizer='adam',                 
                      metrics=['accuracy'])
    return dnn_model

In [36]:
w2v_dnn = construct_deepnn_architecture(num_input_features=w2v_num_features)

In [37]:
w2v_dnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               154112    
_________________________________________________________________
activation (Activation)      (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               6

**Model Training, Prediction, and Performance Evaluation**

In [38]:
batch_size = 100
w2v_dnn.fit(avg_wv_train_features, y_train, epochs=10, batch_size=batch_size, 
            shuffle=True, validation_split=0.1, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f85d9268550>

In [39]:
y_pred = w2v_dnn.predict_classes(avg_wv_test_features)
predictions = le.inverse_transform(y_pred)


Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
  y = column_or_1d(y, warn=True)


In [40]:
from sklearn.metrics import confusion_matrix, classification_report

labels = ['real', 'fake']
print(classification_report(y_test_realfake, predictions))
pd.DataFrame(confusion_matrix(y_test_realfake, predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4335
           1       1.00      1.00      1.00      4519

    accuracy                           1.00      8854
   macro avg       1.00      1.00      1.00      8854
weighted avg       1.00      1.00      1.00      8854



Unnamed: 0,real,fake
real,4323,12
fake,12,4507
