In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score

import numpy as np

import keras
import keras.layers as layers
import tensorflow as tf
import os
from nltk.tokenize import word_tokenize
from tensorflow.keras.layers import TextVectorization

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences


In [2]:
# Load the pre-processed corpus from disk.
data = pd.read_csv("master_dataset/processed_data.csv")

# Only two columns are used from here on: the article text (`overall_content`)
# as the model input and the integer `class` column as the target. The text is
# vectorized later in the notebook and fed to an LSTM.
x, y = data["overall_content"], data["class"]

print(x, y)

0        donald trump sends out embarrassing new year’s...
1        drunk bragging trump staffer started russian c...
2        sheriff david clarke becomes an internet joke ...
3        trump is so obsessed he even has obama’s name ...
4        pope francis just called out donald trump duri...
                               ...                        
38653    'fully committed' nato backs new us approach a...
38654    lexisnexis withdrew two products chinese marke...
38655    minsk cultural hub becomes authorities in shad...
38656    vatican upbeat possibility pope francis visiti...
38657    indonesia buy $114 billion worth russian jets ...
Name: overall_content, Length: 38658, dtype: object 0        1
1        1
2        1
3        1
4        1
        ..
38653    0
38654    0
38655    0
38656    0
38657    0
Name: class, Length: 38658, dtype: int64


In [3]:
# Count how many rows carry each value of the `class` label across the whole dataset.
y.value_counts()

0    21196
1    17462
Name: class, dtype: int64

### Prepare the data

In [4]:
# Step 1: hold out 20% of all rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=4222
)

# Step 2: take 25% of the remaining rows as the validation set, which leaves
# roughly a 60/20/20 train/validation/test partition of the original data.
x_train, x_validation, y_train, y_validation = train_test_split(
    x_train, y_train, test_size=0.25, random_state=4222
)

# No resampling is applied: the data is already fairly balanced
# (Real - 55%, Fake - 45%).
#   * Oversampling would add duplicate rows that the model could overfit to.
#   * Undersampling could throw away informative examples.
# Check the class balance of the training split:
y_train.value_counts()

0    12717
1    10477
Name: class, dtype: int64

In [5]:
# Length, in characters, of the longest document in the training split.
longest_doc_chars = x_train.str.len().max()
print(longest_doc_chars)

39601


In [6]:
# Hyperparameters shared by the vectorizer and the model defined in later cells.
max_features = 10000  # vocabulary size cap
maxlen = 500  # tokens kept per document; larger values make the LSTM slower to run
embedding_dim = 128  # size of each learned token embedding
BATCH_SIZE = 1024


def make_raw_dataset(texts, labels, batch_size=BATCH_SIZE):
    """Build a batched ``tf.data.Dataset`` of (raw text, binary label) pairs.

    Args:
        texts: pandas Series of document strings.
        labels: pandas Series of integer class ids, expected to be 0 or 1.
        batch_size: number of examples per batch.

    Returns:
        A batched dataset whose elements are a string tensor of shape
        ``(batch,)`` and a float32 label tensor of shape ``(batch, 1)``.
    """
    # Feed the 0/1 labels straight through as a float column. Do NOT use
    # `to_categorical(labels - 1)` here: with labels in {0, 1} that yields the
    # indices {-1, 0}, one-hot width is inferred as 1, and both indices hit the
    # same single column, so every label silently becomes 1.0.
    label_column = labels.to_numpy(dtype="float32").reshape(-1, 1)
    text_tensor = tf.constant(texts.to_list())
    return tf.data.Dataset.from_tensor_slices((text_tensor, label_column)).batch(batch_size)


raw_train_ds = make_raw_dataset(x_train, y_train)
raw_test_ds = make_raw_dataset(x_test, y_test)
raw_val_ds = make_raw_dataset(x_validation, y_validation)

raw_train_ds



<_BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 1), dtype=tf.float32, name=None))>

In [7]:
# Text -> integer-id layer. Input is lower-cased and stripped of punctuation,
# the vocabulary is capped at `max_features` entries, and every document is
# emitted as a sequence of exactly `maxlen` token ids.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize="lower_and_strip_punctuation",
    output_mode="int",
    output_sequence_length=maxlen,
)



In [8]:
# Learn the vocabulary from the training texts only: drop the label from each
# (text, label) pair and let the vectorizer scan what is left.
text_ds = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)



In [9]:
# Inspect the learned vocabulary: its size, then its first 20 entries.
vocabulary = vectorize_layer.get_vocabulary()
print(len(vocabulary))
print(vocabulary[:20])



10000
['', '[UNK]', 'said', 'trump', 'the', 'us', 'would', 'president', 'i', 'people', 'it', 'one', 'state', 'new', 'also', 'donald', 'house', 'government', 'republican', 'states']


In [10]:
def vectorize_text(text, label):
    """Turn a batch of raw strings into token ids, passing the label through unchanged."""
    token_ids = vectorize_layer(text)
    return token_ids, label


# Vectorize each split, then cache the result and prefetch up to 10 batches so
# the input pipeline can run ahead of the training step.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)

In [11]:
# Peek at one batch to sanity-check the vectorized output: the first 20 token
# ids of the first document, and the shape of the label tensor.
# The loop variables are deliberately not called `x`/`y`, so the feature and
# label Series loaded at the top of the notebook are not overwritten.
for token_batch, label_batch in train_ds.take(1):
    print(token_batch[0][:20])
    print(label_batch.shape)

tf.Tensor(
[ 320  289 2666 1770    1   72 3148 2246    1   72  242 2997    1  665
 7422  251  289  589  209  143], shape=(20,), dtype=int64)
(1024, 1)


## Build a model

In [12]:

# Binary text classifier: embedding -> LSTM -> max-pool over time -> dense head.
model = keras.models.Sequential(
    [
        # Map each token id to a trainable `embedding_dim`-wide vector.
        layers.Embedding(max_features, embedding_dim),
        layers.Dropout(0.5),
        # Keep the LSTM output at every time step so it can be pooled over time.
        layers.LSTM(128, return_sequences=True),
        layers.GlobalMaxPooling1D(),
        # Small fully connected head with dropout for regularisation.
        layers.Dense(32, activation="relu"),
        layers.Dropout(0.5),
        # Single sigmoid unit, paired with the binary cross-entropy loss below.
        layers.Dense(1, activation="sigmoid", name="predictions"),
    ]
)

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])


## Train and evaluate the model

In [13]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         1280000   
                                                                 
 dropout (Dropout)           (None, None, 128)         0         
                                                                 
 lstm (LSTM)                 (None, None, 128)         131584    
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                        

In [14]:
# Train for two epochs, scoring on the validation split after each one.
# Author's note: this took about 10 minutes; a bigger batch size means fewer
# iterations per epoch.
model.fit(train_ds, validation_data=val_ds, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1d7c4fddb50>

### Analysis after Training Model

In [15]:
# Evaluation is currently disabled: uncomment the lines below to report the
# accuracy metric from `model.evaluate` on each split.
#print("Accuracy of the model on Training Data is - " , model.evaluate(train_ds)[1]*100 , "%")
#print("Accuracy of the model on Validation Data is - " , model.evaluate(val_ds)[1]*100 , "%")
#print("Accuracy of the model on Testing Data is - " , model.evaluate(test_ds)[1]*100 , "%")
