In [1]:
# setup our environment

import pandas as pd
import numpy as np

DATA_FILE = "data/fake_or_real_news.csv"

In [2]:
# define a function that allows us to evaluate our models

from sklearn.metrics import accuracy_score

def evaluate_model(predict_fun, X_train, y_train, X_test, y_test):
    '''
    evaluate the model, both training and testing errors are reported
    '''
    # training error
    y_predict_train = predict_fun(X_train)
    print("Training Accuracy: {: 6.2f}%".format(accuracy_score(y_train,y_predict_train)*100))
    # testing error
    y_predict_test = predict_fun(X_test)
    print("Testing Accuracy: {: 6.2f}%".format(accuracy_score(y_test,y_predict_test)*100))

In [3]:
# read in our data and clean it
# NOTE: the data file contains empty 'text' entries

df = pd.read_csv(DATA_FILE)
df.drop(labels=['id'], axis='columns', inplace=True)

def drop_empty_rows(df):
    drop_list = []
    for i in range(df.shape[0]):
        if df.loc[i,'text'].isspace():
            print("found empty text @ {}...dropping".format(i))
            drop_list.append(i)
    new_df = df.drop(labels=drop_list, axis='index')
    new_index = [i for i in range(new_df.shape[0])]
    new_df.index = new_index
    return new_df

df = drop_empty_rows(df)

found empty text @ 106...dropping
found empty text @ 710...dropping
found empty text @ 806...dropping
found empty text @ 919...dropping
found empty text @ 940...dropping
found empty text @ 1664...dropping
found empty text @ 1736...dropping
found empty text @ 1851...dropping
found empty text @ 1883...dropping
found empty text @ 1941...dropping
found empty text @ 2244...dropping
found empty text @ 2426...dropping
found empty text @ 2576...dropping
found empty text @ 2662...dropping
found empty text @ 2788...dropping
found empty text @ 2832...dropping
found empty text @ 3073...dropping
found empty text @ 3350...dropping
found empty text @ 3511...dropping
found empty text @ 3641...dropping
found empty text @ 3642...dropping
found empty text @ 4014...dropping
found empty text @ 4142...dropping
found empty text @ 4253...dropping
found empty text @ 4713...dropping
found empty text @ 4744...dropping
found empty text @ 5017...dropping
found empty text @ 5088...dropping
found empty text @ 5213..

In [4]:
# create our training and test data

from sklearn.model_selection import train_test_split

TEST_SIZE = 0.2

X = df['text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE)

In [5]:
# convert labels to numeric labels
# NOTE: DNNs need numeric labels

def convert(x):
    if x == 'FAKE':
        return 0.0
    else:
        return 1.0

y_train_num = y_train.apply(convert)
y_test_num = y_test.apply(convert)

## Syntactic Features

In [6]:
# set up vector models for training and testing

from sklearn.feature_extraction.text import CountVectorizer

# training data vectorizer
vectorizer_train = CountVectorizer(analyzer = "word", 
                                   binary = True, 
                                   min_df = 2,
                                   stop_words='english')
docarray_train = vectorizer_train.fit_transform(X_train).toarray()
docterm_train = pd.DataFrame(docarray_train, columns=vectorizer_train.get_feature_names())

# testing data vectorizer
# NOTE: we have to make sure the features of the training and testing sets are the same!
vectorizer_test = CountVectorizer(analyzer = "word", 
                                  binary = True, 
                                  vocabulary=docterm_train.columns)
docarray_test = vectorizer_test.fit_transform(X_test).toarray()
docterm_test = pd.DataFrame(docarray_test, columns=docterm_train.columns)

In [7]:
print(docterm_train.shape)

(5039, 35697)


In [8]:
# Naive Bayes Model 
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(docterm_train, y_train)


# evaluate model
evaluate_model(model.predict, docterm_train, y_train, docterm_test, y_test)

Training Accuracy:  94.34%
Testing Accuracy:  90.32%


In [9]:
# Random Forest Model 
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(min_samples_split=60)
model.fit(docarray_train, y_train)

# evaluate model
evaluate_model(model.predict, docterm_train, y_train, docterm_test, y_test)


Training Accuracy:  95.67%
Testing Accuracy:  84.44%


In [10]:
# DNN
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout

# build the model
model = Sequential()
model.add(Dense(1024, input_dim=docterm_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])


model.fit(docterm_train, y_train_num,
          epochs=5,
          batch_size=128,
          validation_data=(docterm_test, y_test_num))

Using TensorFlow backend.


Train on 5039 samples, validate on 1260 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a2b91dd8>

In [11]:
# evaluate model

def predict(X):
    return np.rint(model.predict(X)) # threshold the predictions to retrieve labels

evaluate_model(predict, docterm_train, y_train_num, docterm_test, y_test_num)

Training Accuracy:  99.90%
Testing Accuracy:  94.37%


## Semantic Features

In [12]:
# Load Spacy semantic model

import spacy

EMBEDDING_DIM = 300

# NOTE: for performance reasons disable everything in the pipeline except the tokenizer
nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger', 'ner', 'textcat'])

In [13]:
# preprocess text for semantic features

def embed(X):
    '''
    x is a list of strings and embed will compute
    an embedding vector for each and return an array
    of shape (len(x),EMBEDDING_DIM)
    '''
    vectors = []
    text_array = np.array(X)

    print(text_array.shape)
    
    for i in range(text_array.shape[0]):
        vector = nlp(text_array[i]).vector
        vectors.append(vector)
    
    return pd.DataFrame(vectors)

vectors_train = embed(X_train)
vectors_test = embed(X_test)

(5039,)
(1260,)


In [14]:
# Naive Bayes Model 
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(vectors_train,y_train)

# evaluate model
evaluate_model(model.predict, vectors_train, y_train, vectors_test, y_test)

Training Accuracy:  70.69%
Testing Accuracy:  69.60%


In [15]:
# Random Forest Model 
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(min_samples_split=10)
model.fit(vectors_train, y_train)

# evaluate model
evaluate_model(model.predict, vectors_train, y_train, vectors_test, y_test)

Training Accuracy:  98.79%
Testing Accuracy:  84.60%


In [18]:
# DNN - MLP
from keras.models import Sequential
from keras.layers import Dense, Dropout

# build the model
model = Sequential()
model.add(Dense(256, input_dim=EMBEDDING_DIM, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(vectors_train, y_train_num,
          epochs=20,
          batch_size=128,
         validation_data=(vectors_test, y_test_num))

Train on 5039 samples, validate on 1260 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1cb7a8d30>

In [17]:
# evaluate model

def predict(X):
    return np.rint(model.predict(X)) # threshold the predictions to retrieve labels

evaluate_model(predict, vectors_train, y_train_num, vectors_test, y_test_num)

Training Accuracy:  92.42%
Testing Accuracy:  90.87%
