In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/IST700')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import nltk
from nltk.corpus import stopwords
import gensim
from nltk import word_tokenize
import tensorflow as tf

In [4]:
true_df = pd.read_csv("True.csv")
fake_df = pd.read_csv("Fake.csv")

In [5]:
true_df.head
fake_df.head

<bound method NDFrame.head of                                                    title  \
0       Donald Trump Sends Out Embarrassing New Year’...   
1       Drunk Bragging Trump Staffer Started Russian ...   
2       Sheriff David Clarke Becomes An Internet Joke...   
3       Trump Is So Obsessed He Even Has Obama’s Name...   
4       Pope Francis Just Called Out Donald Trump Dur...   
...                                                  ...   
23476  McPain: John McCain Furious That Iran Treated ...   
23477  JUSTICE? Yahoo Settles E-mail Privacy Class-ac...   
23478  Sunnistan: US and Allied ‘Safe Zone’ Plan to T...   
23479  How to Blow $700 Million: Al Jazeera America F...   
23480  10 U.S. Navy Sailors Held by Iranian Military ...   

                                                    text      subject  \
0      Donald Trump just couldn t wish all Americans ...         News   
1      House Intelligence Committee Chairman Devin Nu...         News   
2      On Friday, it was revea

In [6]:
print("true news shape:", true_df.shape)
print("fake news shape:", fake_df.shape)

true news shape: (21417, 4)
fake news shape: (23481, 4)


In [7]:
true_df['true'] = 1
fake_df['true'] = 0

data = pd.concat([true_df, fake_df])

In [8]:
data = data.iloc[:,[0, -1]] # Removing other columns
data = shuffle(data).reset_index(drop=True) # Shuffle

data.head()

Unnamed: 0,title,true
0,Philippines president says China agrees to wor...,1
1,House Conservatives Planning To End Paul Ryan...,0
2,Hell Comes to Frogtown: Alt Right and Triumph ...,0
3,Texas House passes 'bathroom bill' targeting p...,1
4,Retired Colonel BLASTS Trump’s ‘Amateur’ Dema...,0


In [9]:
nltk.download("stopwords")
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
def preprocess(text):
    result = ""
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3 and token not in stop_words:
            result += " " + token
    result.strip()        
    return result

In [11]:
data['clean'] = data['title'].apply(preprocess)

In [12]:
data

Unnamed: 0,title,true,clean
0,Philippines president says China agrees to wor...,1,philippines president says china agrees work ...
1,House Conservatives Planning To End Paul Ryan...,0,house conservatives planning paul ryan stab s...
2,Hell Comes to Frogtown: Alt Right and Triumph ...,0,hell comes frogtown right triumph transhumanism
3,Texas House passes 'bathroom bill' targeting p...,1,texas house passes bathroom targeting public ...
4,Retired Colonel BLASTS Trump’s ‘Amateur’ Dema...,0,retired colonel blasts trump amateur demand n...
...,...,...,...
44893,CHARLIE DANIELS Rips Hollywood On Gun Control ...,0,charlie daniels rips hollywood control effort...
44894,THIS ONE PICTURE Tells You Everything You Need...,0,picture tells need know muslim refugee invasion
44895,WATCH: Guiliani Claims Trump Gave Up Birtheri...,0,watch guiliani claims trump gave birtherism c...
44896,U.S. concerned Iraqi Kurdish referendum will d...,1,concerned iraqi kurdish referendum distract s...


In [13]:
titles = [text for text in data.title]

max_len = 0
titles_len = []
for title in titles:
    titles_len.append(len(title.split()))
    max_len = max(len(title.split()), max_len)

print('Max length of the titles:', max_len)
print('Mean length of the titles:', np.mean(titles_len))

Max length of the titles: 42
Mean length of the titles: 12.453472315025168


In [14]:
train_val_data = data.sample(frac = 0.8)
test_data = data.drop(train_val_data.index)

train_data = train_val_data.sample(frac = 0.8)
val_data = train_val_data.drop(train_data.index)

# Reset Index
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

print('trainset size:', train_data.shape)
print('valset size:', val_data.shape)
print('testset size:', test_data.shape)

trainset size: (28734, 3)
valset size: (7184, 3)
testset size: (8980, 3)


In [15]:
list_of_words = []
for i in data.clean:
    for j in i.split():
        list_of_words.append(j)

total_words = len(list(set(list_of_words)))
total_words

19175

In [16]:
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer

# Creating A Tokenizer To Tokenize The Words And Create Sequences Of Tokenized Words
tokenizer = Tokenizer(num_words = total_words)
tokenizer.fit_on_texts(data['clean'])

train_sequences = tokenizer.texts_to_sequences(train_data['clean'])
val_sequences = tokenizer.texts_to_sequences(val_data['clean'])
test_sequences = tokenizer.texts_to_sequences(test_data['clean'])

In [17]:
word_index = tokenizer.word_index
index_word = {v: k for k, v in word_index.items()}

len(word_index)

19174

In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Adding Padding
padded_train = pad_sequences(train_sequences,maxlen = 42, padding = 'post', truncating = 'post')
padded_val = pad_sequences(val_sequences,maxlen = 42, padding = 'post', truncating = 'post')
padded_test = pad_sequences(test_sequences,maxlen = 42, padding = 'post', truncating = 'post')

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional, Dropout, BatchNormalization

## Creating model Using LSTM
embedding_vector_features=40
model1=Sequential()
model1.add(Embedding(total_words + 1,embedding_vector_features,input_length=42))
model1.add(Dropout(0.3))
model1.add(LSTM(100, return_sequences=True))
model1.add(Dropout(0.3))
model1.add(LSTM(100))
model1.add(Dropout(0.3))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 42, 40)            767040    
                                                                 
 dropout (Dropout)           (None, 42, 40)            0         
                                                                 
 lstm (LSTM)                 (None, 42, 100)           56400     
                                                                 
 dropout_1 (Dropout)         (None, 42, 100)           0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense (Dense)               (None, 1)                 1

In [91]:
tf.keras.backend.clear_session()

In [28]:
y_train = np.asarray(train_data['true'])
y_val = np.asarray(val_data['true'])

# Training the model
model1.fit(padded_train, y_train, batch_size = 64, validation_data=(padded_val, y_val), epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f0bc2a67610>

In [29]:
y_test = np.asarray(test_data["true"])

model1.evaluate(padded_test, y_test)



[0.18813155591487885, 0.9410912990570068]

In [39]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2022-05-03 19:49:18--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-05-03 19:49:18--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-05-03 19:49:19--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2022

In [40]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
replace glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [24]:
import os
embeddings_index = {}
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [25]:
num_tokens = total_words + 1
embedding_dim3 = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim3))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 18544 words (630 misses)


In [26]:
model2=Sequential()
model2.add(Embedding(total_words + 1, 100, input_length=42, weights = [embedding_matrix], trainable = False))
model2.add(Dropout(0.3))
model2.add(LSTM(100))
model2.add(Dropout(0.3))
model2.add(Dense(1,activation='sigmoid'))
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model2.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 42, 100)           1917600   
                                                                 
 dropout_4 (Dropout)         (None, 42, 100)           0         
                                                                 
 lstm_3 (LSTM)               (None, 100)               80400     
                                                                 
 dropout_5 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 1,998,101
Trainable params: 80,501
Non-trainable params: 1,917,600
_________________________________________________________________
None


In [30]:
model2.fit(padded_train, y_train, batch_size = 64, validation_data=(padded_val, y_val), epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f0bbfbdb210>

In [31]:
model2.evaluate(padded_test, y_test)



[0.17466209828853607, 0.9304009079933167]

In [21]:
from keras.layers import Layer
import keras.backend as K

In [22]:
class attention(Layer):
    def __init__(self, return_sequences=True):
        self.return_sequences = return_sequences

        super(attention,self).__init__()

    def build(self, input_shape):
        self.W=self.add_weight(name="att_weight", shape=(input_shape[-1],1),
                               initializer="normal")
        self.b=self.add_weight(name="att_bias", shape=(input_shape[1],1),
                               initializer="normal")
        super(attention,self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x,self.W)+self.b)
        a = K.softmax(e, axis=1)
        output = x*a
        if self.return_sequences:

            return output
        return K.sum(output, axis=1)

In [131]:
tf.keras.backend.clear_session()

In [32]:
model3 = Sequential()
model3.add(Embedding(total_words + 1, 40, input_length=42))
model3.add(Bidirectional(LSTM(100, return_sequences=True)))
model3.add(attention()) # receive 3D and output 3D
model3.add(Dropout(0.5))
model3.add(Dense(1, activation='sigmoid'))
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
model3.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 42, 40)            767040    
                                                                 
 bidirectional_1 (Bidirectio  (None, 42, 200)          112800    
 nal)                                                            
                                                                 
 attention_1 (attention)     (None, 42, 200)           242       
                                                                 
 dropout_6 (Dropout)         (None, 42, 200)           0         
                                                                 
 dense_3 (Dense)             (None, 42, 1)             201       
                                                                 
Total params: 880,283
Trainable params: 880,283
Non-trainable params: 0
________________________________________________

In [33]:
y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
y_val = np.asarray(y_val).astype('float32').reshape((-1,1))
y_test = np.asarray(y_test).astype('float32').reshape((-1,1))

In [117]:
tf.config.run_functions_eagerly(True)

In [34]:
model3.fit(padded_train, y_train, batch_size = 64, validation_data=(padded_val, y_val), epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0bbf44f790>

In [35]:
model3.evaluate(padded_test, y_test)



[0.2286677062511444, 0.9347254037857056]

In [36]:
model4 = Sequential()
model4.add(Embedding(total_words + 1, 100, input_length=42, weights = [embedding_matrix],trainable = False))
model4.add(Bidirectional(LSTM(100, return_sequences=True)))
model4.add(attention()) # receive 3D and output 3D
model4.add(Dropout(0.5))
model4.add(Dense(1, activation='sigmoid'))
model4.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
model4.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 42, 100)           1917600   
                                                                 
 bidirectional_2 (Bidirectio  (None, 42, 200)          160800    
 nal)                                                            
                                                                 
 attention_2 (attention)     (None, 42, 200)           242       
                                                                 
 dropout_7 (Dropout)         (None, 42, 200)           0         
                                                                 
 dense_4 (Dense)             (None, 42, 1)             201       
                                                                 
Total params: 2,078,843
Trainable params: 161,243
Non-trainable params: 1,917,600
______________________________________

In [37]:
model4.fit(padded_train, y_train, batch_size = 64, validation_data=(padded_val, y_val), epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0bbc978b10>

In [38]:
model4.evaluate(padded_test, y_test)



[0.1961296647787094, 0.9231388568878174]