In [1]:
import numpy as np
import pandas as pd


In [2]:
# Every article is padded/truncated to this many token ids (pad_sequences maxlen, model Input shape).
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary cap handed to the Keras Tokenizer; also bounds the number of embedding matrix rows.
MAX_NUM_WORDS = 20000
# Width of each word vector; has to agree with the GloVe file named below (300d).
EMBEDDING_DIM = 300
# Fraction of the samples that train_test_split holds out for validation.
VALIDATION_SPLIT = 0.2

# Input files, given relative to the notebook's working directory.
TEXT_DATA = 'data/fake_or_real_news.csv'
GLOVE_DATA = 'data/glove.6B.300d.txt'

In [3]:
# define a function that allows us to evaluate our models

from sklearn.metrics import accuracy_score

def evaluate_model(predict_fun, X_train, y_train, X_test, y_test):
    '''
    Print the accuracy of a model on the training data and on the test data.

    predict_fun      -- callable mapping a batch of inputs to predicted labels
                        (in the same encoding as the y_* arguments)
    X_train, y_train -- training inputs and their true labels
    X_test, y_test   -- held-out inputs and their true labels

    Nothing is returned; both scores are written to stdout as percentages.
    '''
    # score on the data the model was fitted on
    train_accuracy = accuracy_score(y_train, predict_fun(X_train))
    print("Training Accuracy: {: 6.2f}%".format(train_accuracy * 100))

    # score on the held-out data
    test_accuracy = accuracy_score(y_test, predict_fun(X_test))
    print("Testing Accuracy: {: 6.2f}%".format(test_accuracy * 100))

In [4]:
# Indexing word vectors.
# build index mapping words in the embeddings set to their embedding vector 

# Each line of the GloVe file is "<word> <v1> <v2> ... <vN>", separated by spaces.
embeddings_index = {}
# Decode explicitly as UTF-8: without an encoding, open() falls back to the
# platform's locale default, which is not UTF-8 everywhere (e.g. Windows) and
# then fails or mis-decodes the non-ASCII tokens in the file.
with open(GLOVE_DATA, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate a blank (e.g. trailing) line instead of raising IndexError
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


In [5]:
# Processing text dataset
# NOTE: the data file contains empty 'text' entries

# Load the raw articles and discard the 'id' column in one chained step.
df = pd.read_csv(TEXT_DATA).drop(labels=['id'], axis='columns')

def drop_empty_rows(df):
    '''
    Return a copy of df without the rows whose 'text' entry is empty.

    A text counts as empty when it is missing (NaN/None), the empty string,
    or made up of whitespace only. One message is printed per dropped row,
    naming the row's index label. The input frame is left untouched and the
    result is re-indexed 0..n-1.
    '''
    # Missing values become '' so they are treated like any other empty text;
    # astype(str) keeps .str usable even if a non-string value slipped in.
    stripped = df['text'].fillna('').astype(str).str.strip()
    # Plain boolean array: selection is positional, so it works for any index
    # (not only the default 0..n-1 one).
    is_empty = np.asarray(stripped.eq(''), dtype=bool)

    for label in df.index[is_empty]:
        print("found empty text @ {}...dropping".format(label))

    return df[~is_empty].reset_index(drop=True)

# Replace df with the cleaned frame: whitespace-only articles removed, index renumbered from 0.
df = drop_empty_rows(df)

found empty text @ 106...dropping
found empty text @ 710...dropping
found empty text @ 806...dropping
found empty text @ 919...dropping
found empty text @ 940...dropping
found empty text @ 1664...dropping
found empty text @ 1736...dropping
found empty text @ 1851...dropping
found empty text @ 1883...dropping
found empty text @ 1941...dropping
found empty text @ 2244...dropping
found empty text @ 2426...dropping
found empty text @ 2576...dropping
found empty text @ 2662...dropping
found empty text @ 2788...dropping
found empty text @ 2832...dropping
found empty text @ 3073...dropping
found empty text @ 3350...dropping
found empty text @ 3511...dropping
found empty text @ 3641...dropping
found empty text @ 3642...dropping
found empty text @ 4014...dropping
found empty text @ 4142...dropping
found empty text @ 4253...dropping
found empty text @ 4713...dropping
found empty text @ 4744...dropping
found empty text @ 5017...dropping
found empty text @ 5088...dropping
found empty text @ 5213..

In [6]:
# prepare text samples and their labels                                                                  
texts = list(df['text'])
labels_index = {'FAKE': 0, 'REAL': 1}

# Encode through labels_index so the mapping is defined in exactly one place.
# Any value outside the mapping (typo, different casing, NaN) is reported
# instead of being silently counted as 'REAL'.
unknown_labels = set(df['label']) - set(labels_index)
if unknown_labels:
    raise ValueError('Unexpected label value(s): %s' % sorted(map(str, unknown_labels)))
labels = [labels_index[label] for label in df['label']]

print('Found %s texts.' %len(texts))

Found 6299 texts.


In [7]:
# vectorize the text samples into a 2D integer tensor                                                   
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Learn the vocabulary from the corpus (capped at MAX_NUM_WORDS when encoding).
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# Encode every article as a list of word ids.
sequences = tokenizer.texts_to_sequences(texts)
print('Found %s unique tokens.' % len(word_index))

# Bring all sequences to MAX_SEQUENCE_LENGTH and one-hot encode the labels.
# NOTE: `labels` is rebound here (class ids -> one-hot matrix).
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)


Using TensorFlow backend.


Found 98817 unique tokens.
Shape of data tensor: (6299, 1000)
Shape of label tensor: (6299, 2)


In [8]:
# split the data into a training set and a validation set   
from sklearn.model_selection import train_test_split
# Seed the shuffle so the train/validation partition is the same on every
# Restart & Run All; otherwise the reported accuracies change from run to run.
# (Only the split is made reproducible here; no other seed is set in this notebook.)
SPLIT_SEED = 42
x_train, x_val, y_train, y_val = train_test_split(
    data, labels, test_size=VALIDATION_SPLIT, random_state=SPLIT_SEED)

In [9]:
# prepare embedding matrix                                                                                       
from keras.layers import Embedding
from keras.initializers import Constant

# One row per word id; row 0 is left for the padding id, hence the +1.
num_words = 1 + min(MAX_NUM_WORDS, len(word_index))

# Start from zeros: words without a pre-trained vector keep an all-zero row.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word_index.items():
    if row <= MAX_NUM_WORDS:
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[row] = vector

# Wrap the matrix in an Embedding layer. trainable=False freezes the
# pre-trained vectors so that fitting the model does not update them.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)


In [10]:
# train a 1D convnet with global maxpooling                                                                      
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D
from keras.models import Model

# Input: a batch of padded word-id sequences, looked up in the frozen embedding.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Two convolution + local max-pooling stages ...
x = embedded_sequences
for stage in range(2):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
# ... then a last convolution pooled over the whole remaining sequence.
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)

# Classification head: one softmax unit per entry of labels_index.
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

# NOTE(review): this cell requests 10 epochs, but the output saved with the
# notebook shows 20 ("Epoch 1/20" ... "Epoch 20/20"). The saved output looks
# stale -- re-run the cell or reconcile the epoch count before relying on it.
model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          epochs=10,
          batch_size=128)


Train on 5039 samples, validate on 1260 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x10fe3b7f0>

In [11]:
# evaluate model

def predict(X):
    '''
    Return the model's predictions for X as one-hot rows.

    Each row of the result has a 1 in the column of the most probable class
    and 0 elsewhere, with the same shape and dtype as model.predict(X).

    The class is picked with arg-max rather than by rounding each probability:
    rounding treats the columns independently, so a 0.5/0.5 tie (or any row
    where no class exceeds 0.5) would become an all-zero row, which is not a
    valid one-hot label.
    '''
    probabilities = np.asarray(model.predict(X))
    one_hot = np.zeros_like(probabilities)
    # ties are resolved in favour of the lowest class index (argmax semantics)
    one_hot[np.arange(probabilities.shape[0]), probabilities.argmax(axis=1)] = 1
    return one_hot

# NOTE: x_val/y_val is the same split that was passed to model.fit as validation_data,
# so the "Testing Accuracy" line printed here is a validation score, not a score on unseen test data.
evaluate_model(predict, x_train, y_train, x_val, y_val)

Training Accuracy:  100.00%
Testing Accuracy:  92.30%
