In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os


# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


wordnet_lemmatizer = WordNetLemmatizer()
stopword = stopwords.words('english')

import re
def my_tokenizer(s):
    s = s.lower() # downcase
    s = re.sub(r'@\S+','',s)
    s = re.sub(r'[^a-zA-Z0-9\s]','',s)
    s = re.sub(r'http\S+', '', s)
    tokens = nltk.tokenize.word_tokenize(s) # spliting string into words (tokens)
    tokens = [t for t in tokens if len(t) > 2] # removing short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopword]
    return tokens

In [3]:
real = pd.read_csv('../input/fake-and-real-news-dataset/True.csv',parse_dates = ['date'])
fake = pd.read_csv('../input/fake-and-real-news-dataset/Fake.csv',parse_dates = ['date'])
real['article'] = real['title']+" "+real['text']
fake['article'] = fake['title']+" "+fake['text']
real['label'] = 1
fake['label'] = 0
all_data = real.append(fake)
all_data.shape
maxLen = len(max(all_data['article'], key=len).split())

In [4]:
import random
all_data = all_data.sample(frac = 1)
all_data.shape

(44898, 6)

In [5]:
data = {'article':[my_tokenizer(sentence) for sentence in all_data['article']],'label':[label for label in all_data['label']]}
doc = pd.DataFrame(data)

In [6]:
unseen_test = doc.sample(n = 5000) 
train_data = doc.loc[~doc.index.isin(unseen_test.index)]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_vec(article,label,unseen_article,unseen_label):
    tfidf = TfidfVectorizer(lowercase = False, tokenizer = my_tokenizer,use_idf = True,norm = 'l2',smooth_idf = True)
    X = tfidf.fit_transform(article)
    y = label
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=50, test_size=0.7, shuffle=True)
    w_i = tfidf.vocabulary_
    X_unseen = tfidf.transform(unseen_article)
    y_unseen = unseen_label
    return X_train,X_test,y_train,y_test,w_i,X_unseen,y_unseen

In [7]:
import tensorflow as tf
def tok_vec(articles,label,unseen_article,unseen_label):
    article = articles.tolist()
    labels = label.tolist()

    val_size = int(len(article)*0.9)
    training_articles = article[0:val_size]
    testing_articles = article[val_size:]
    training_labels = labels[0:val_size]
    testing_labels = labels[val_size:]

    training_labels_final = np.array(training_labels)
    testing_labels_final = np.array(testing_labels)
    
    unseen_article = unseen_article.tolist()
    unseen_labels = unseen_label.tolist()
    unseen_labels_final = np.array(unseen_labels)
    
    tok = tf.keras.preprocessing.text.Tokenizer(
        num_words=400000)

    tok.fit_on_texts(training_articles)
    sequences = tok.texts_to_sequences(training_articles)
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=maxLen, dtype='int32', padding='pre', truncating='pre',
        value=0.0)

    w_i = tok.word_index
    testing_sequences = tok.texts_to_sequences(testing_articles)
    testing_padded = tf.keras.preprocessing.sequence.pad_sequences(
        testing_sequences, maxlen=maxLen, dtype='int32', padding='pre', truncating='pre',
        value=0.0)
    unseen_sequences = tok.texts_to_sequences(unseen_article)
    unseen_padded = tf.keras.preprocessing.sequence.pad_sequences(
        unseen_sequences, maxlen=maxLen, dtype='int32', padding='pre', truncating='pre',
        value=0.0)
    return padded,testing_padded,training_labels_final,testing_labels_final,w_i,unseen_padded,unseen_labels_final

#choose one of the below vectorizers

In [None]:
X_train,X_test,y_train,y_test,w_i,X_unseen,y_unseen= tfidf_vec(train_data['article'],train_data['label'],
                                                               unseen_test['article'],unseen_test['label'])

In [8]:
X_train,X_test,y_train,y_test,w_i,X_unseen,y_unseen = tok_vec(train_data['article'],train_data['label'],
                                                              unseen_test['article'],unseen_test['label'])

In [9]:
word_embedding_file = '../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt'

print("Loading Glove Model")
f = open(word_embedding_file,'r')
glove_layer = {}
for line in f:
    splitLines = line.split()
    word = splitLines[0]
    wordEmbedding = np.array([float(value) for value in splitLines[1:]])
    glove_layer[word] = wordEmbedding
print("gloves loaded")
vocabLen = len(glove_layer)
embedding_dim = glove_layer['the'].shape[0]

Loading Glove Model
gloves loaded


In [None]:
X_train.shape,y_train

In [10]:
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.models import Model
from keras.layers.embeddings import Embedding



embedding_matrix = np.zeros((vocabLen+1,embedding_dim))
for word,idx in w_i.items():
    embedding_vec = glove_layer.get(word)
    if embedding_vec is not None:
        embedding_matrix[idx] = embedding_vec
        

input_layer = Input(shape = (maxLen,),dtype = 'int32')

embedded_layer = Embedding(vocabLen+1,embedding_dim,weights = [embedding_matrix],input_length = maxLen,trainable = False)

embedded_sequence = embedded_layer(input_layer)

me = LSTM(512,return_sequences = True)(embedded_sequence)

me = LSTM(512,return_sequences = False)(me)
me = Dropout(0.6)(me)
me = Dense(1,activation = 'sigmoid')(me)
model = Model(input_layer,me)

model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 8148)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 8148, 100)         40000100  
_________________________________________________________________
lstm (LSTM)                  (None, 8148, 512)         1255424   
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 513       
Total params: 43,355,237
Trainable params: 3,355,137
Non-trainable params: 40,000,100
__________________________________

In [None]:

from tensorflow.keras.utils import plot_model
plot_model(model,show_shapes = True, show_layer_names = True)
model.summary()

In [None]:
from tensorflow.keras import Input
from tensorflow.keras.layers import LSTM,Dense
from tensorflow.keras.models import Model
def rnn_model(hp):
    input_layer = Input(shape = (maxLen,1),dtype = 'float')
    lay = LSTM(hp.Int('units',min_value=32,max_value=512,step=32))(input_layer)
    lay = Dense(1,activation = 'softmax')(lay)
    model = Model(input_layer,lay)

    model.compile(optimizer = 'adam',loss = 'binary_crossentropy',metrics = ['accuracy'])
    return model

In [None]:
from tensorflow.keras import Input
from tensorflow.keras.layers import LSTM,Dense,Dropout
from tensorflow.keras.models import Model
input_layer = Input(shape = (maxLen,1),dtype = 'float')
lay = LSTM(512,return_sequences = True)(input_layer)
lay = LSTM(512,return_sequences = False)(lay)
lay = Dense(1,activation = 'softmax')(lay)
rnn_model = Model(input_layer,lay)

rnn_model.compile(optimizer = 'adam',loss = 'binary_crossentropy',metrics = ['accuracy'])
rnn_model.summary()

In [None]:
import kerastuner as kt

tuner = kt.tuners.Hyperband(rnn_model,objective = 'val_accuracy',max_epochs=10,executions_per_trial = 1,directory='my_dir',project_name = 'fakeandreal')
tuner.search(X_train,y_train,validation_data = (X_test,y_test))

In [None]:
models = tuner.get_best_models(1)[0]
tuner.results_summary()
best_hyperparameters = tuner.get_best_hyperparameters(1)[0]

In [11]:


model.fit(X_train,y_train,validation_data = (X_test,y_test),epochs = 3,batch_size = 32)

training_acc = model.evaluate(X_train,y_train)
val_acc = model.evaluate(X_test,y_test)
test_acc = model.evaluate(X_unseen,y_unseen)

Epoch 1/3
Epoch 2/3
Epoch 3/3
