# Ensemble of all models trained on kaggle fake news dataset 


In [41]:

import pandas as pd
import numpy as np
import random

import keras
from keras.layers import Dense
from keras.models import model_from_json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.utils import to_categorical
from keras import Sequential, Model, Input
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Flatten, Dense, GlobalAveragePooling1D, Dropout, LSTM, CuDNNLSTM, RNN, SimpleRNN, Conv2D, GlobalMaxPooling1D

from keras import callbacks

import os
import tensorflow
import sklearn
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn import svm

from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report, confusion_matrix
from sklearn.metrics import cohen_kappa_score, roc_auc_score

import statsmodels
import statsmodels.api as sm

import pickle


#  Data-preporcessing

In [4]:
def load_kagglefakenews(flag):
    """
    agrs: flag
    if flag=1, loads first 10k rows,
    if flag=0, loads next remaning 10k rows
    Loads Kaggle Fake news net dataset (divided in chunks becasue ofo memory constraints)
    """
    if(flag==1):
        df = pd.read_csv('Kaggle_FakeNews/train.csv', nrows=10000, encoding='utf8')
    else:
        df = pd.read_csv('Kaggle_FakeNews/train.csv', skiprows=range(1, 15000),encoding='utf8')
    df.columns = ['id','title','author','text','label']
    train_data = df['text'].values.tolist() 
    train_labels = df['label'].values.tolist() 
    combo = list(zip(train_data, train_labels))
    random.shuffle(combo)
    train_data, train_labels = zip(*combo)
    del df
    return np.asarray(train_data).tolist(), np.asarray(train_labels).tolist()

In [5]:
train_data, train_labels = load_kagglefakenews(0)

In [6]:
train_data

['Rep. Adam Schiff ( ) threatened to have Congress appoint a special counsel, despite the Supreme Court’s decision that it is unconstitutional for Congress to appoint any federal official. This decision came from a case so famous that the lawmaker almost certainly learned it as a student at Harvard Law School. [Schiff tweeted Monday, “If President fired Bob Mueller, Congress would immediately   independent counsel and appoint Bob Mueller. Don’t waste our time”:  If President fired Bob Mueller, Congress would immediately   independent counsel and appoint Bob Mueller. Don’t waste our time.  —   Adam Schiff (@RepAdamSchiff) June 12, 2017,  The Harvard Law professors who taught the ranking member of the House Select Intelligence Committee are probably disappointed with their former student because the Supreme Court in 1976 unanimously said Congress would violate the Constitution by doing so. In the 1970s, Congress created the Federal Election Commission (FEC) which gave them the power to a

In [7]:
MAX_NB_WORDS=50000 #dictionary size
MAX_SEQUENCE_LENGTH=1500 #max word length of each individual article
EMBEDDING_DIM=300 #dimensionality of the embedding vector (50, 100, 200, 300)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')

def tokenize_trainingdata(texts, labels):
    """
    agrs: texts, labels
    """
    tokenizer.fit_on_texts(texts)
    pickle.dump(tokenizer, open('Models/tokenizer.p', 'wb'))
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(labels, num_classes=len(set(labels)))
    return data, labels, word_index



In [8]:
X, Y, word_index = tokenize_trainingdata(train_data, train_labels)    

Found 126095 unique tokens.


In [9]:
#splitting the data (90% train, 5% test, 5% validation)
train_data = X[:int(len(X)*0.9)]
train_labels = Y[:int(len(X)*0.9)]
test_data = X[int(len(X)*0.9):int(len(X)*0.95)]
test_labels = Y[int(len(X)*0.9):int(len(X)*0.95)]
valid_data = X[int(len(X)*0.95):]
valid_labels = Y[int(len(X)*0.95):]

# Functions for models

In [10]:
def load_embeddings(word_index, embeddingsfile='wordEmbeddings/glove.6B.%id.txt' %EMBEDDING_DIM):
    """
    agrs: word index
    embeddings file path
    retunrs: embedding layer
    """
    embeddings_index = {}
    f = open(embeddingsfile, 'r', encoding='utf8')
    for line in f:
        
        values = line.split(' ') 
        word = values[0] 
        coefs = np.asarray(values[1:], dtype='float32') 
        embeddings_index[word] = coefs 
    f.close()

    print('Found %s word vectors.' % len(embeddings_index))

    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:

            embedding_matrix[i] = embedding_vector
    
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    return embedding_layer
    

embedding_layer = load_embeddings(word_index)

Found 400000 word vectors.


In [11]:

def save_model_to_json(model, filename ):
    """
    agrs: model object, filename string
    saves model in filename.txt and weights in filename_weights.h5
    """
    model_json = model.to_json()
    model_filename = filename+".json"
    with open(model_filename, "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    weights_file = filename+ "_weights.h5"
    model.save_weights(weights_file)
    print("Saved model to disk")

In [12]:
def load_model_from_json(filename):
    """
    loads model from filename.txt and weights from filename_weights.h5
    """
#     model_filename = filename+ ".json"
    model_filename = "cnn_model.json"
    json_file = open(model_filename, 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)
#     weights_file = filename+ "_weights.h5"
    weights_file = "model.h5"
    model.load_weights(weights_file)
    print("Loaded model from disk")
    return model
   

In [13]:
def print_metrics(y_true, ypred):
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(y_true, ypred)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(y_true, ypred)
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(y_true, ypred)
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(y_true, ypred)
    print('F1 score: %f' % f1)


In [14]:
def tokenize_text(text):
    sequences = tokenizer.texts_to_sequences(text)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    return data

# Models 


# CNN

In [15]:
def cnn_model(sequence_input, embedded_sequences, classes=2):
    x = Conv1D(64, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(256, 2, activation='relu')(x)
    x = GlobalAveragePooling1D()(x)
    x = Dense(2048, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    preds = Dense(classes, activation='softmax')(x)

    model = Model(sequence_input, preds)
    return model

In [None]:
MAX_SEQUENCE_LENGTH=1500
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

model = cnn_model(sequence_input, embedded_sequences, classes=2)

model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['acc'])

print(model.summary())

model.fit(train_data, train_labels,
          validation_data=(valid_data, valid_labels),
          epochs=25, batch_size=64)

In [38]:
save_model_to_json(cnn_model, "CNN_model")

Saved model to disk


In [34]:
cnn_model = load_model_from_json("model")

Loaded model from disk


In [40]:
cnn_model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['acc'])

In [41]:
cnn_model.evaluate(test_data, test_labels)




[0.3339013277856866, 0.9580000042915344]

In [42]:
yhat_probs = cnn_model.predict(test_data)

In [24]:
def convert_oneD(yhat_probs,test_labels):
    """
    Takes 2d labels array of lists R,F where R= 0 means Fake, R=1 means real
    Same for F
    Takes array of [R,F] and makes it array of R(other is automatically inferred since values is binary(R/F))
    """
    yhat_classes = []
    for i in range(len(yhat_probs)):
        if(yhat_probs[i][0]<.5):
            yhat_classes.append(0.)
        else:
            yhat_classes.append(1.)
    yhat_classes = np.array(yhat_classes)
    single_test_labels = []
    for i in range(len(test_labels)):
        if(test_labels[i][0]==0):
            single_test_labels.append(0.) 
        else:
            single_test_labels.append(1.)
    single_test_labels= np.array(single_test_labels)
    return [single_test_labels,yhat_classes]

In [55]:
labels = convert_oneD(yhat_probs,test_labels)
oned_labels = labels[0]
yhat_classes = labels[1]

In [60]:
print_metrics(oned_labels, yhat_classes)

Accuracy: 0.958000
Precision: 0.973451
Recall: 0.936170
F1 score: 0.954447


# LSTM

In [61]:
def LSTM_model(sequence_input, embedded_sequences, classes=2):
    x = LSTM(32,return_sequences=True)(embedded_sequences)
    x = LSTM(64,return_sequences=True)(x)
    x = LSTM(128)(x)
    x = Dense(4096,activation='relu')(x)
    x = Dense(1024,activation='relu')(x)
    preds = Dense(classes, activation='softmax')(x)

    model = Model(sequence_input, preds)
    return model

In [None]:
MAX_SEQUENCE_LENGTH=1500
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
lstm_model = LSTM_model(sequence_input, embedded_sequences, classes=2)

lstm_model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['acc'])

print(lstm_model.summary())

lstm_model.fit(train_data, train_labels,
          validation_data=(valid_data, valid_labels),
          epochs=15, batch_size=64)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1500)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 1500, 300)         49661400  
_________________________________________________________________
lstm_4 (LSTM)                (None, 1500, 32)          42624     
_________________________________________________________________
lstm_5 (LSTM)                (None, 1500, 64)          24832     
_________________________________________________________________
lstm_6 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_4 (Dense)              (None, 4096)              528384    
_________________________________________________________________
dense_5 (Dense)              (None, 1024)              4195

In [19]:
lstm_model = save_model_to_json("lstm_model_15")


Loaded model from disk


In [91]:
lstm_model = load_model_from_json("lstm_model_15")

Loaded model from disk


In [92]:
lstm_model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['acc'])

In [93]:
lstm_model.evaluate(test_data, test_labels)



[0.3339013277856866, 0.9580000042915344]

In [94]:
yhat_probs = lstm_model.predict(test_data)

In [95]:
labels = convert_oneD(yhat_probs,test_labels)
oned_labels = labels[0]
yhat_classes = labels[1]
print_metrics(oned_labels, yhat_classes)

Accuracy: 0.958000
Precision: 0.973451
Recall: 0.936170
F1 score: 0.954447


In [29]:
def tokenize_text(text):
    sequences = tokenizer.texts_to_sequences(text)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    return data

In [68]:
def predict_on_url(model):
    f1 = open('scraping/article.txt', "r")
    text = f1.read()
    tok = tokenize_text([text])
    
    pred = model.predict(tok)
#     xgpred = xgmodel.predict(tok) # % 100 - real , 0 - fake
#     cnnpred = model.predict(tok)
#     lstmpred = lstm_model.predict(tok)
    return pred*100


In [30]:
#prediction on real time data
f1 = open('scraping/article.txt', "r")
text = f1.read()
#tokenize
tok = tokenize_text([text])
pred = model.predict(tok) # % real %fake
print(pred*100)


[[ 3.3264155 96.67359  ]]


# XG Boost

In [62]:

from xgboost import XGBClassifier
xgmodel = XGBClassifier(learning_rate = 0.05, n_estimators = 300, max_depth = 5)

In [63]:
train_lables= train_labels[:,0]


In [50]:
xgmodel.fit(train_data, train_lables)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.05, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=300, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [64]:
#pickle model and save
def save_in_pickle(model):
    pickle.dump(xgmodel, open('xg_model', 'wb'))
    print("Saved model to disk")

In [66]:
save_in_pickle(xgmodel)

Saved model to disk


In [65]:
# to load the saved model
xgmodel = pickle.load(open('xg_model', 'rb'))
print("loaded model from disk")

loaded model from disk


In [70]:
y_pred_xg = xgmodel.predict(test_data)

In [73]:
print(y_pred_xg)

[0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1.
 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1.
 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1.
 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0.
 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0.
 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0.
 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1.
 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0.
 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1.
 1. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1.
 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0.
 0. 1. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1.
 1. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0.

In [22]:
test_lables = test_labels[:,0]

In [75]:
print(confusion_matrix(test_lables, y_pred_xg.round(),normalize=None))

[[255  10]
 [ 14 221]]


In [76]:
print_metrics(test_lables, y_pred_xg.round())


Accuracy: 0.952000
Precision: 0.956710
Recall: 0.940426
F1 score: 0.948498


# EntraTressClassifier

In [32]:
Extr = ExtraTreesClassifier(n_estimators=5,n_jobs=4)
Extr.fit(train_data, train_labels)


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=4,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [26]:
y_pred_et = Extr.predict(test_data)
labels = convert_oneD(y_pred_et,test_labels)
oned_labels = labels[0]
yhat_classes = labels[1]

In [27]:

print(confusion_matrix(oned_labels, yhat_classes,normalize=None))

[[90 64]
 [47 89]]


In [28]:
print_metrics(oned_labels, yhat_classes)

Accuracy: 0.617241
Precision: 0.581699
Recall: 0.654412
F1 score: 0.615917


# Decision tree with Gini Index

In [97]:

clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100)
clf_gini.fit(train_data, train_lables)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=100, splitter='best')

In [98]:
y_pred_gi = clf_gini.predict(test_data)


In [99]:
print(confusion_matrix(test_lables, y_pred_gi.round(),normalize=None))

[[195  70]
 [ 63 172]]


In [101]:
print_metrics(test_lables, y_pred_gi.round())

Accuracy: 0.734000
Precision: 0.710744
Recall: 0.731915
F1 score: 0.721174


# Decision Tree with criterion entropy index [DT(E)] Classification

In [102]:
clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100)
clf_entropy.fit(train_data, train_lables)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=100, splitter='best')

In [103]:
y_pred_en = clf_entropy.predict(test_data)


In [104]:
print(confusion_matrix(test_lables, y_pred_gi.round(),normalize=None))

[[195  70]
 [ 63 172]]


In [105]:
print_metrics(test_lables, y_pred_en.round())

Accuracy: 0.716000
Precision: 0.697872
Recall: 0.697872
F1 score: 0.697872


# K-Nearest Neighbors (KNN) Classification

In [106]:
classifier = KNeighborsClassifier(n_neighbors=5)

In [107]:
classifier.fit(train_data, train_lables)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [108]:
y_pred_knn = classifier.predict(test_data)

In [109]:
print(confusion_matrix(test_lables, y_pred_knn.round(),normalize=None))

[[195  70]
 [154  81]]


In [110]:
print_metrics(test_lables, y_pred_knn.round())


Accuracy: 0.552000
Precision: 0.536424
Recall: 0.344681
F1 score: 0.419689


# Random Forest (RF) Classification

In [111]:
model_tree = RandomForestClassifier(random_state=100, n_estimators=100)

In [112]:
model_tree.fit(train_data, train_lables)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=100,
                       verbose=0, warm_start=False)

In [113]:
y_pred_rf = model_tree.predict(test_data)

In [114]:
print(confusion_matrix(test_lables, y_pred_rf.round(),normalize=None))

[[172  93]
 [ 49 186]]


In [115]:
print_metrics(test_lables, y_pred_rf.round())

Accuracy: 0.716000
Precision: 0.666667
Recall: 0.791489
F1 score: 0.723735


# SVM

In [107]:
clf_svm = svm.SVC(kernel='linear')

In [None]:
clf_svm.fit(train_data, train_lables)

In [None]:
y_pred_svm = clf.predict(X_test)

In [None]:
print(confusion_matrix(test_lables, y_pred_svm.round(),normalize=None))

In [None]:
print_metrics(test_lables, y_pred_svm.round())


# Logistic Regression

In [77]:
logisticRegr = LogisticRegression()

In [79]:
logisticRegr.fit(train_data, train_lables)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [86]:
y_pred_lr = logisticRegr.predict(test_data)

In [87]:
print(y_pred_lr.size)

500


In [88]:
print(test_lables.size)

500


In [89]:
print(confusion_matrix(test_lables, y_pred_lr.round(),normalize=None))

[[176  89]
 [100 135]]


In [90]:
print_metrics(test_lables, y_pred_lr.round())

Accuracy: 0.622000
Precision: 0.602679
Recall: 0.574468
F1 score: 0.588235
