In [9]:
'''
CNN on kaggle fake news dataset 
'''
import pandas as pd
import numpy as np
import random
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json
import os
def load_kagglefakenews():
  
    df = pd.read_csv('Kaggle_FakeNews/train.csv', nrows=10000, encoding='utf8')
    train_data = df['text'].values.tolist() 
    train_labels = df['label'].values.tolist() 


    combo = list(zip(train_data, train_labels))
    random.shuffle(combo)
    train_data, train_labels = zip(*combo)
    del df

    return np.asarray(train_data).tolist(), np.asarray(train_labels).tolist()

In [10]:
train_data, train_labels = load_kagglefakenews()

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.utils import to_categorical
import pickle

MAX_NB_WORDS=50000 #dictionary size
MAX_SEQUENCE_LENGTH=1500 #max word length of each individual article
EMBEDDING_DIM=300 #dimensionality of the embedding vector (50, 100, 200, 300)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')

def tokenize_trainingdata(texts, labels):
    tokenizer.fit_on_texts(texts)
    pickle.dump(tokenizer, open('Models/tokenizer.p', 'wb'))

    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    labels = to_categorical(labels, num_classes=len(set(labels)))

    return data, labels, word_index



In [12]:
X, Y, word_index = tokenize_trainingdata(train_data, train_labels)    

Found 165537 unique tokens.


In [13]:
#splitting the data (90% train, 5% test, 5% validation)
train_data = X[:int(len(X)*0.9)]
train_labels = Y[:int(len(X)*0.9)]
test_data = X[int(len(X)*0.9):int(len(X)*0.95)]
test_labels = Y[int(len(X)*0.9):int(len(X)*0.95)]
valid_data = X[int(len(X)*0.95):]
valid_labels = Y[int(len(X)*0.95):]

In [14]:
def load_embeddings(word_index, embeddingsfile='wordEmbeddings/glove.6B.%id.txt' %EMBEDDING_DIM):
    embeddings_index = {}
    f = open(embeddingsfile, 'r', encoding='utf8')
    for line in f:
        
        values = line.split(' ') 
        word = values[0] 
        coefs = np.asarray(values[1:], dtype='float32') 
        embeddings_index[word] = coefs 
    f.close()

    print('Found %s word vectors.' % len(embeddings_index))

    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:

            embedding_matrix[i] = embedding_vector
    
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    return embedding_layer
    

embedding_layer = load_embeddings(word_index)

Found 400000 word vectors.


In [15]:
from keras import Sequential, Model, Input
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Flatten, Dense, GlobalAveragePooling1D, Dropout, LSTM, CuDNNLSTM, RNN, SimpleRNN, Conv2D, GlobalMaxPooling1D
from keras import callbacks

def baseline_model(sequence_input, embedded_sequences, classes=2):
    x = Conv1D(64, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(256, 2, activation='relu')(x)
    x = GlobalAveragePooling1D()(x)
    x = Dense(2048, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    preds = Dense(classes, activation='softmax')(x)

    model = Model(sequence_input, preds)
    return model

In [None]:

MAX_SEQUENCE_LENGTH=1500
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

model = baseline_model(sequence_input, embedded_sequences, classes=2)

model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['acc'])

print(model.summary())

model.fit(train_data, train_labels,
          validation_data=(valid_data, valid_labels),
          epochs=25, batch_size=64)

In [None]:
# # serialize model to JSON
# model_json = model.to_json()
# with open("cnn_model.json", "w") as json_file:
#     json_file.write(model_json)
# # serialize weights to HDF5
# model.save_weights("model.h5")
# print("Saved model to disk")

In [2]:

# load json and create model
json_file = open('cnn_model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
# load weights into new model
model.load_weights("model.h5")
print("Loaded model from disk")

NameError: name 'model_from_json' is not defined

In [None]:
model.evaluate(test_data, test_labels)

In [1]:
yhat_probs = model.predict(test_data)

NameError: name 'model' is not defined

In [13]:
print(yhat_probs)

[[5.62703995e-10 1.00000000e+00]
 [6.67451799e-01 3.32548231e-01]
 [9.91798639e-01 8.20138864e-03]
 [1.00000000e+00 5.56218478e-17]
 [0.00000000e+00 1.00000000e+00]
 [9.73284021e-02 9.02671576e-01]
 [9.02291577e-16 1.00000000e+00]
 [7.18725328e-15 1.00000000e+00]
 [3.43157902e-23 1.00000000e+00]
 [5.35201580e-05 9.99946475e-01]
 [1.00000000e+00 3.96010005e-17]
 [1.00000000e+00 3.77551384e-16]
 [7.65109303e-20 1.00000000e+00]
 [1.93075072e-16 1.00000000e+00]
 [1.00000000e+00 0.00000000e+00]
 [2.69220201e-24 1.00000000e+00]
 [9.99999881e-01 1.67683950e-07]
 [1.00000000e+00 1.97203636e-13]
 [9.99999523e-01 5.13540499e-07]
 [9.99075770e-01 9.24257038e-04]
 [7.66223085e-10 1.00000000e+00]
 [1.00000000e+00 2.48524552e-12]
 [1.00000000e+00 7.22512761e-13]
 [5.39050298e-08 1.00000000e+00]
 [1.00000000e+00 4.70323173e-12]
 [1.00000000e+00 2.43230226e-18]
 [1.19897209e-16 1.00000000e+00]
 [1.00000000e+00 3.30871242e-09]
 [1.65542059e-11 1.00000000e+00]
 [1.81566029e-05 9.99981880e-01]
 [9.999778

In [14]:
yhat_classes = []
for i in range(len(yhat_probs)):
    if(yhat_probs[i][0]<.5):
        yhat_classes.append(0.)
    else:
        yhat_classes.append(1.)
    
        

In [15]:

yhat_classes = np.array(yhat_classes)
# print(yhat_classes)


In [16]:
single_test_labels = []
for i in range(len(test_labels)):
    if(test_labels[i][0]==0):
        single_test_labels.append(0.) 
    else:
        single_test_labels.append(1.)
single_test_labels= np.array(single_test_labels)        
    
# print(single_test_labels)


In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(single_test_labels, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(single_test_labels, yhat_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(single_test_labels, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(single_test_labels, yhat_classes)
print('F1 score: %f' % f1)


Accuracy: 0.946000
Precision: 0.951613
Recall: 0.940239
F1 score: 0.945892


In [18]:
def tokenize_text(text):
    sequences = tokenizer.texts_to_sequences(text)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    return data

In [20]:
def tokenize_text(text):
    sequences = tokenizer.texts_to_sequences(text)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    return data

In [16]:
#Using LSTMS

In [16]:
def LSTM_model(sequence_input, embedded_sequences, classes=2):
    x = LSTM(32,return_sequences=True)(embedded_sequences)
    x = LSTM(64,return_sequences=True)(x)
    x = LSTM(128)(x)
    x = Dense(4096,activation='relu')(x)
    x = Dense(1024,activation='relu')(x)
    preds = Dense(classes, activation='softmax')(x)

    model = Model(sequence_input, preds)
    return model

In [17]:

MAX_SEQUENCE_LENGTH=1500
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
model = LSTM_model(sequence_input, embedded_sequences, classes=2)

model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['acc'])

print(model.summary())

model.fit(train_data, train_labels,
          validation_data=(valid_data, valid_labels),
          epochs=3, batch_size=64)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1500)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 1500, 300)         49661400  
_________________________________________________________________
lstm_1 (LSTM)                (None, 1500, 32)          42624     
_________________________________________________________________
lstm_2 (LSTM)                (None, 1500, 64)          24832     
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_1 (Dense)              (None, 4096)              528384    
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              4195

<keras.callbacks.callbacks.History at 0x7f16f3e3e610>

In [19]:

# load json and create model
json_file = open('lstm_model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
# load weights into new model
model.load_weights("lstm_model.h5")
print("Loaded model from disk")

Loaded model from disk


In [21]:
model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['acc'])

In [22]:
model.evaluate(test_data, test_labels)



[0.20191344308853149, 0.9139999747276306]

In [23]:
yhat_probs = model.predict(test_data)

In [25]:
yhat_classes = []
for i in range(len(yhat_probs)):
    if(yhat_probs[i][0]<.5):
        yhat_classes.append(0.)
    else:
        yhat_classes.append(1.)
    

In [26]:

yhat_classes = np.array(yhat_classes)

In [27]:
single_test_labels = []
for i in range(len(test_labels)):
    if(test_labels[i][0]==0):
        single_test_labels.append(0.) 
    else:
        single_test_labels.append(1.)
single_test_labels= np.array(single_test_labels)        
    

In [29]:
def tokenize_text(text):
    sequences = tokenizer.texts_to_sequences(text)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    return data

In [30]:
#prediction on real time data
f1 = open('scraping/article.txt', "r")
text = f1.read()
#tokenize
tok = tokenize_text([text])
pred = model.predict(tok) # % real %fake
print(pred*100)


[[ 3.3264155 96.67359  ]]


In [32]:
# XG Boost
from xgboost import XGBClassifier
xgmodel = XGBClassifier(learning_rate = 0.05, n_estimators = 300, max_depth = 5)

In [48]:
train_lables= train_labels[:,0]


In [50]:
xgmodel.fit(train_data, train_lables)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.05, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=300, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [73]:
#pickle model and save
import pickle
pickle.dump(xgmodel, open('xg_model', 'wb'))
print("Saved model to disk")

Saved model to disk


In [76]:
# to load the saved model
xgmodel = pickle.load(open('xg_model', 'rb'))
print("loaded model from disk")

loaded model from disk


In [52]:
y_pred_xg = model.predict(test_data)

In [53]:
test_lables = test_labels[:,0]

In [55]:
test_lables

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 1.,
       1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
       0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1.,
       1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1.,
       0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0.,
       0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 1.,
       0., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1.,
       1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1.,
       1., 1., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 1.,
       1., 1., 1., 1., 0.

In [60]:
y_pred_xgs = y_pred_xg[:,0]

In [62]:
print(y_pred_xgs)

[7.06128357e-03 6.19744062e-02 3.20575642e-03 1.35253310e-01
 5.81706390e-02 6.31131828e-02 7.70273182e-05 9.92321908e-01
 7.16926530e-02 9.98265207e-01 5.49906977e-02 4.14338941e-03
 9.63496804e-01 9.59134877e-01 9.97518301e-01 1.45598352e-01
 9.45894539e-01 9.87194240e-01 9.96788144e-01 1.97124124e-01
 9.93692875e-01 9.44889188e-01 9.83600676e-01 1.44483633e-02
 9.98581767e-01 7.68124938e-01 8.00622821e-01 9.98526931e-01
 9.98930752e-01 9.98025894e-01 9.95991409e-01 9.99679804e-01
 5.61084561e-02 9.95866776e-01 6.75858408e-02 6.65202916e-01
 9.34808850e-01 1.66921678e-03 9.71512496e-01 1.21723123e-01
 6.92266881e-01 1.41029134e-01 1.79453418e-02 9.93019760e-01
 9.38822091e-01 8.72700959e-02 6.05307927e-04 9.15731653e-05
 3.29608135e-02 9.76769626e-02 1.10870332e-01 2.61345506e-01
 1.74674860e-04 1.04473397e-01 3.04621691e-03 2.42952928e-01
 8.01896930e-01 5.43020703e-02 5.17410114e-02 9.93243992e-01
 9.67754602e-01 9.83128726e-01 8.43378603e-01 9.60045397e-01
 3.71625624e-03 1.703274

In [64]:
print(confusion_matrix(test_lables, y_pred_xgs.round(),normalize=None))

[[228   8]
 [ 35 229]]


In [70]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(test_lables, y_pred_xgs.round())
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(test_lables, y_pred_xgs.round())
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(test_lables, y_pred_xgs.round())
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(test_lables, y_pred_xgs.round())
print('F1 score: %f' % f1)

Accuracy: 0.914000
Precision: 0.966245
Recall: 0.867424
F1 score: 0.914172


In [77]:
#prediction on real time data
f1 = open('scraping/article.txt', "r")
text = f1.read()
#tokenize
tok = tokenize_text([text])
pred = xgmodel.predict(tok) # % real %fake
print(pred*100)


[100.]
