In [2]:

import pandas as pd
import numpy as np
import random
import keras

def load_kagglefakenews():
  
    df = pd.read_csv('Kaggle_FakeNews/train.csv', nrows=200, encoding='utf8')
    train_data = df['text'].values.tolist() 
    train_labels = df['label'].values.tolist() 


    combo = list(zip(train_data, train_labels))
    random.shuffle(combo)
    train_data, train_labels = zip(*combo)
    del df

    return np.asarray(train_data).tolist(), np.asarray(train_labels).tolist()

Using TensorFlow backend.


In [3]:
train_data, train_labels = load_kagglefakenews()

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.utils import to_categorical
import pickle

MAX_NB_WORDS=50000 #dictionary size
MAX_SEQUENCE_LENGTH=1500 #max word length of each individual article
EMBEDDING_DIM=300 #dimensionality of the embedding vector (50, 100, 200, 300)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')

def tokenize_trainingdata(texts, labels):
    tokenizer.fit_on_texts(texts)
    pickle.dump(tokenizer, open('Models/tokenizer.p', 'wb'))

    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    labels = to_categorical(labels, num_classes=len(set(labels)))

    return data, labels, word_index

#and run it


Found 19554 unique tokens.


In [10]:
X, Y, word_index = tokenize_trainingdata(train_data, train_labels)    

Found 19554 unique tokens.


In [11]:
#split the data (90% train, 5% test, 5% validation)
train_data = X[:int(len(X)*0.9)]
train_labels = Y[:int(len(X)*0.9)]
test_data = X[int(len(X)*0.9):int(len(X)*0.95)]
test_labels = Y[int(len(X)*0.9):int(len(X)*0.95)]
valid_data = X[int(len(X)*0.95):]
valid_labels = Y[int(len(X)*0.95):]

In [12]:
def load_embeddings(word_index, embeddingsfile='wordEmbeddings/glove.6B.%id.txt' %EMBEDDING_DIM):
    embeddings_index = {}
    f = open(embeddingsfile, 'r', encoding='utf8')
    for line in f:
        #here we parse the data from the file
        values = line.split(' ') #split the line by spaces
        word = values[0] #each line starts with the word
        coefs = np.asarray(values[1:], dtype='float32') #the rest of the line is the vector
        embeddings_index[word] = coefs #put into embedding dictionary
    f.close()

    print('Found %s word vectors.' % len(embeddings_index))

    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    return embedding_layer
    
#and build the embedding layer
embedding_layer = load_embeddings(word_index)

Found 400000 word vectors.


In [13]:
from keras import Sequential, Model, Input
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Flatten, Dense, GlobalAveragePooling1D, Dropout, LSTM, CuDNNLSTM, RNN, SimpleRNN, Conv2D, GlobalMaxPooling1D
from keras import callbacks

def baseline_model(sequence_input, embedded_sequences, classes=2):
    x = Conv1D(64, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(256, 2, activation='relu')(x)
    x = GlobalAveragePooling1D()(x)
    x = Dense(2048, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    preds = Dense(classes, activation='softmax')(x)

    model = Model(sequence_input, preds)
    return model

In [14]:
#put embedding layer into input of the model
MAX_SEQUENCE_LENGTH=1500
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

model = baseline_model(sequence_input, embedded_sequences, classes=2)

model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['acc'])

print(model.summary())

model.fit(train_data, train_labels,
          validation_data=(valid_data, valid_labels),
          epochs=25, batch_size=64)


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1500)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1500, 300)         5866500   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1496, 64)          96064     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 299, 64)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 297, 128)          24704     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 59, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 58, 256)           657

<keras.callbacks.callbacks.History at 0x7fc5ebe33358>

In [15]:
model.evaluate(test_data, test_labels)



[0.5696607828140259, 0.800000011920929]

In [16]:
def tokenize_text(text):
    sequences = tokenizer.texts_to_sequences(text)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    return data

In [22]:
f1 = open('scraping/article.txt', "r")
text = f1.read()
  

#tokenize
tok = tokenize_text([text])


print(model.predict(tok))  #%change of being real , fake


[[0.9888337  0.01116626]]
