In [1]:
# setup our environment

import pandas as pd
import numpy as np

DATA_FILE = "data/fake_or_real_news.csv"
TEST_SIZE = 0.2

In [2]:
# define a function that allows us to evaluate our models

from sklearn.metrics import accuracy_score

def evaluate_model(predict_fun, X_train, y_train, X_test, y_test):
    '''
    evaluate the model, both training and testing errors are reported
    '''
    # training error
    y_predict_train = predict_fun(X_train)
    train_acc = accuracy_score(y_train,y_predict_train)
    
    # testing error
    y_predict_test = predict_fun(X_test)
    test_acc = accuracy_score(y_test,y_predict_test)
    
    return train_acc, test_acc

In [3]:
# estimate 95% confidence interval on error

# NOTE: based on conversation on stackexchange: 
# https://stats.stackexchange.com/questions/247551/how-to-determine-the-confidence-of-a-neural-network-prediction
# towards bottom of the page.

from math import sqrt

def error_conf(error, n):
    term = 1.96*sqrt((error*(1-error))/n)
    lb = error - term
    ub = error + term
    
    return lb, ub

In [4]:
# read in our data and preprocess it
# Note: the news items in the data set range from 0 words to 100,000 words
# we restrict ourselves to news items between 500 and 5,000 words.

df = pd.read_csv(DATA_FILE)
df.drop(labels=['id','title'], axis='columns', inplace=True)
# only select stories with between 500 and 5000 words
mask = list(df['text'].apply(lambda x: len(x) >= 500 and len(x) <= 5000))
df = df[mask]

In [5]:
print(df.shape)

(3510, 2)


In [6]:
# show that the sample data is pretty balanced
print(len(df[df['label'] == 'REAL']), len(df[df['label'] == 'FAKE']))

1472 2038


In [7]:
X = df['text']
y = df['label']

In [8]:
# convert labels to numeric labels
# NOTE: DNNs need numeric labels

def convert(x):
    if x == 'FAKE':
        return 0.0
    else:
        return 1.0


## Syntactic Features

In [9]:
# set up vector models for training and testing

from sklearn.feature_extraction.text import CountVectorizer

# data vectorizer
vectorizer = CountVectorizer(analyzer = "word", 
                             binary = True, 
                             min_df = 2,
                             stop_words='english')
docarray = vectorizer.fit_transform(X).toarray()
docterm = pd.DataFrame(docarray, columns=vectorizer.get_feature_names())

In [10]:
print(docterm.shape)

(3510, 23516)


In [11]:
# create training and test data
from sklearn.model_selection import train_test_split
docterm_train, docterm_test, y_train, y_test = train_test_split(docterm, y, test_size=TEST_SIZE)

### Naive Bayes

In [12]:
# Naive Bayes Model - train & test
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(docterm_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
# evaluate model
train_acc, test_acc = evaluate_model(model.predict, docterm_train, y_train, docterm_test, y_test)
print("Training Accuracy: {:.2f}%".format(train_acc*100))
print("Testing Accuracy: {:.2f}%".format(test_acc*100))

Training Accuracy: 96.69%
Testing Accuracy: 89.17%


In [14]:
# computing 95% confidence interval
n = docterm_test.shape[0]
lb, ub = error_conf(1-test_acc, n)

print("95% confidence interval: {:.2f}%-{:.2f}%".format((1-ub)*100,(1-lb)*100))

95% confidence interval: 86.88%-91.47%


### Random Forest

In [15]:
# Random Forest Model - train & test
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(min_samples_split=60)
model.fit(docterm_train, y_train)

  from numpy.core.umath_tests import inner1d


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=60,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
# evaluate model
train_acc, test_acc = evaluate_model(model.predict, docterm_train, y_train, docterm_test, y_test)
print("Training Accuracy: {:.2f}%".format(train_acc*100))
print("Testing Accuracy: {:.2f}%".format(test_acc*100))

Training Accuracy: 95.48%
Testing Accuracy: 83.05%


In [17]:
# computing 95% confidence interval
n = docterm_test.shape[0]
lb, ub = error_conf(1-test_acc, n)

print("95% confidence interval: {:.2f}%-{:.2f}%".format((1-ub)*100,(1-lb)*100))

95% confidence interval: 80.27%-85.82%


### Deep Neural Network

In [18]:
# DNN
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout

# build the model
model = Sequential()
model.add(Dense(1024, input_dim=docterm_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(docterm_train, y_train.apply(convert),
          epochs=20,
          batch_size=128,
          validation_data=(docterm_test, y_test.apply(convert)))

Using TensorFlow backend.


Train on 2808 samples, validate on 702 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fad3360cda0>

In [19]:
# evaluate model
train_acc, test_acc = evaluate_model(model.predict_classes, 
                                     docterm_train, 
                                     y_train.apply(convert), 
                                     docterm_test, 
                                     y_test.apply(convert))
print("Training Accuracy: {:.2f}%".format(train_acc*100))
print("Testing Accuracy: {:.2f}%".format(test_acc*100))

Training Accuracy: 57.98%
Testing Accuracy: 58.40%


In [20]:
# computing 95% confidence interval
n = docterm_test.shape[0]
lb, ub = error_conf(1-test_acc, n)

print("95% confidence interval: {:.2f}%-{:.2f}%".format((1-ub)*100,(1-lb)*100))

95% confidence interval: 54.76%-62.05%


## Semantic Features

In [21]:
# Load Spacy semantic model

import spacy

EMBEDDING_DIM = 300

# NOTE: for performance reasons disable everything in the pipeline except the tokenizer
nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger', 'ner', 'textcat'])

In [22]:
# preprocess text for semantic features

def embed(X):
    '''
    x is a list of strings and embed will compute
    an embedding vector for each and return an array
    of shape (len(x),EMBEDDING_DIM)
    '''
    vectors = []
    text_array = np.array(X)

    print(text_array.shape)
    
    for i in range(text_array.shape[0]):
        vector = nlp(text_array[i]).vector
        vectors.append(vector)
    
    return pd.DataFrame(vectors)

vectors = embed(X)

(3510,)


In [23]:
# create training and test data
from sklearn.model_selection import train_test_split
vectors_train, vectors_test, y_train, y_test = train_test_split(vectors, y, test_size=TEST_SIZE)

### Naive Bayes

In [24]:
# Naive Bayes Model 
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(vectors_train,y_train)

GaussianNB(priors=None)

In [25]:
# evaluate model
train_acc, test_acc = evaluate_model(model.predict, vectors_train, y_train, vectors_test, y_test)
print("Training Accuracy: {:.2f}%".format(train_acc*100))
print("Testing Accuracy: {:.2f}%".format(test_acc*100))

Training Accuracy: 73.29%
Testing Accuracy: 73.79%


In [26]:
# computing 95% confidence interval
n = docterm_test.shape[0]
lb, ub = error_conf(1-test_acc, n)

print("95% confidence interval: {:.2f}%-{:.2f}%".format((1-ub)*100,(1-lb)*100))

95% confidence interval: 70.54%-77.04%


### Random Forest

In [27]:
# Random Forest Model 
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(min_samples_split=10)
model.fit(vectors_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
# evaluate model
train_acc, test_acc = evaluate_model(model.predict, vectors_train, y_train, vectors_test, y_test)
print("Training Accuracy: {:.2f}%".format(train_acc*100))
print("Testing Accuracy: {:.2f}%".format(test_acc*100))

Training Accuracy: 98.68%
Testing Accuracy: 82.76%


In [29]:
# computing 95% confidence interval
n = docterm_test.shape[0]
lb, ub = error_conf(1-test_acc, n)

print("95% confidence interval: {:.2f}%-{:.2f}%".format((1-ub)*100,(1-lb)*100))

95% confidence interval: 79.97%-85.56%


### Deep Neural Network

In [30]:
# DNN - MLP
from keras.models import Sequential
from keras.layers import Dense, Dropout

# build the model
model = Sequential()
model.add(Dense(256, input_dim=EMBEDDING_DIM, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [31]:
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(vectors_train, y_train.apply(convert),
          epochs=15,
          batch_size=128,
          validation_data=(vectors_test, y_test.apply(convert)))

Train on 2808 samples, validate on 702 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fac8e0cfe10>

In [32]:
# evaluate model
train_acc, test_acc = evaluate_model(model.predict_classes, 
                                     vectors_train, 
                                     y_train.apply(convert), 
                                     vectors_test, 
                                     y_test.apply(convert))
print("Training Accuracy: {:.2f}%".format(train_acc*100))
print("Testing Accuracy: {:.2f}%".format(test_acc*100))

Training Accuracy: 88.25%
Testing Accuracy: 86.61%


In [33]:
# estimating 95% confidence interval
n = docterm_test.shape[0]
lb, ub = error_conf(1-test_acc, n)

print("95% confidence interval: {:.2f}%-{:.2f}%".format((1-ub)*100,(1-lb)*100))

95% confidence interval: 84.09%-89.13%
