In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score

import numpy as np

import keras
import keras.layers as layers
import tensorflow as tf
import os
from nltk.tokenize import word_tokenize
from tensorflow.keras.layers import TextVectorization

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences


In [22]:
data = pd.read_csv("master_dataset/processed_data.csv")
list(data.columns)
# drop unwanted features

data.iloc[:,5:].columns

# Drop all the column , keep only class , text_without_stopwords and title_without_stopwords
data = data.drop(data.iloc[:,5:].columns, axis=1)

data = data.drop(['title', 'text'],axis=1)
    
# ['title', 'text', 'text_without_stopwords', 'title_without_stopwords','syllables', 'polarity_category', 'overall_content', 'polarity'], axis=1)
#'Topic 1 Probability', 'Topic 2 Probability', 'Topic 3 Probbility' , 'Topic 4 Probability' ,'Topic 5 Probability',
#'title_word_count', 'title_sentence_count', 'title_average_word_length','title_punctuation_count', 'title_stopwords_count'  
# 'polarity_category_Neutral' , 'polarity_category_Positive'

In [23]:
#dataset is slightly imbalanced so we will perform upsampling to balance the dataset.
data['class'].value_counts()

0    21196
1    17462
Name: class, dtype: int64

In [24]:
list(data.columns)

['class', 'text_without_stopwords', 'title_without_stopwords']

In [25]:
data["overall_text"] = data["text_without_stopwords"] + " " + data["title_without_stopwords"]

data

Unnamed: 0,class,text_without_stopwords,title_without_stopwords,overall_text
0,1,donald trump wish americans happy new year lea...,donald trump sends out embarrassing new year’s...,donald trump wish americans happy new year lea...
1,1,house intelligence committee chairman devin nu...,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...
2,1,on friday revealed former milwaukee sheriff da...,sheriff david clarke becomes an internet joke ...,on friday revealed former milwaukee sheriff da...
3,1,on christmas day donald trump announced would ...,trump is so obsessed he even has obama’s name ...,on christmas day donald trump announced would ...
4,1,pope francis used annual christmas day message...,pope francis just called out donald trump duri...,pope francis used annual christmas day message...
...,...,...,...,...
38653,0,nato allies tuesday welcomed president donald ...,'fully committed' nato backs new us approach a...,nato allies tuesday welcomed president donald ...
38654,0,lexisnexis provider legal regulatory business ...,lexisnexis withdrew two products chinese market,lexisnexis provider legal regulatory business ...
38655,0,in shadow disused soviet-era factories minsk s...,minsk cultural hub becomes authorities,in shadow disused soviet-era factories minsk s...
38656,0,vatican secretary state cardinal pietro paroli...,vatican upbeat possibility pope francis visiti...,vatican secretary state cardinal pietro paroli...


### Prepare the data

In [26]:
#first split the dataset into training and test sets


x_train, x_test, y_train, y_test = train_test_split(data['overall_text'],data['class'],test_size=0.2,random_state = 4222)

x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size = 0.25, random_state = 4222)


#balance x_train with oversampling
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy='minority')

# OverSampling only works on Dataframe, but current x_train and y_train are series
x_train, y_train = oversample.fit_resample(x_train.to_frame(), y_train.to_frame())


#check that train set is oversampled
y_train.value_counts()

class
0        12717
1        12717
dtype: int64

In [27]:
print(x_train)

                                            overall_text
0      venezuelans vote sunday nationwide mayoral pol...
1      the law order candidate broke law againdonald ...
2      the reason republican presidential frontrunner...
3      a senior israeli minister thursday declined co...
4      donald trump treated americans oppose enemy sa...
...                                                  ...
25429  the thin-skinned president-elect took angry li...
25430  donald trump 50 days presidency done nothing c...
25431  a bunch rabid gun-toting conservatives would l...
25432  donald trump new campaign manager days role al...
25433  according donald trump joe the plumber right s...

[25434 rows x 1 columns]


In [28]:
# Find the number of maximum text 
print(data.overall_text.str.len().max())

40567


In [29]:
# TA example  not working for me. 

# Change dataframe back to Series
# # Model constants.

max_features = 10000
maxlen = 500 # Set a max length of the array, if not it will do an array of like [1,10000] , and if i would to run the LSTM, it will take 30 hours

raw_train_ds = tf.data.Dataset.from_tensor_slices(
    (tf.constant(x_train.squeeze().to_list()),
     tf.keras.utils.to_categorical(y_train.to_numpy() -1))
).batch(2048)

raw_test_ds = tf.data.Dataset.from_tensor_slices(
    (tf.constant(x_test.squeeze().to_list()),
     tf.keras.utils.to_categorical(y_test.to_numpy() -1))
).batch(2048)

raw_val_ds = tf.data.Dataset.from_tensor_slices(
    (tf.constant(x_validation.squeeze().to_list()),
     tf.keras.utils.to_categorical(y_validation.to_numpy() -1))
).batch(2048)

raw_train_ds

vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=42000, # Based on how many words there are in overall_text
)

text_ds = raw_train_ds.map(lambda x, y : x)

vectorize_layer.adapt(text_ds)

print(len(vectorize_layer.get_vocabulary()))
print(vectorize_layer.get_vocabulary()[:20])

def vectorize_text(text,label):
    return vectorize_layer(text),label

# Vectorize the Data
train_ds = raw_train_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)

# Do async prefetching / buffering of the data for best performance on GPU
train_ds = train_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)
val_ds = val_ds.cache().prefetch(buffer_size=10)


10000
['', '[UNK]', 'trump', 'said', 'the', 'us', 'would', 'i', 'president', 'people', 'it', 'one', 'state', 'new', 'also', 'donald', 'house', 'government', 'republican', 'he']


In [41]:
for x, y in train_ds.take(1):
    print(x[0][:20])
    print(y.shape)

tf.Tensor(
[   1   74  259 3013    1  687 7707  253  337  638  218  144 1774    8
 6461 2423 8901  179  589 9024], shape=(20,), dtype=int64)
(2048, 1)


In [30]:
"""
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train.squeeze())
tokenized_train = tokenizer.texts_to_sequences(x_train.squeeze())
x_train = pad_sequences(tokenized_train , maxlen=maxlen)
"""


In [31]:
#x_train

array([[   0,    0,    0, ...,   73, 3343, 2400],
       [   0,    0,    0, ...,  188,   16,  990],
       [   0,    0,    0, ...,  129,  513, 1221],
       ...,
       [   0,    0,    0, ..., 1005,  573,  282],
       [   0,    0,    0, ...,   30, 1789,   44],
       [   0,    0,    0, ...,   59,  109, 1893]])

In [32]:
#tokenized_test = tokenizer.texts_to_sequences(x_validation)
#x_validation = pad_sequences(tokenized_test , maxlen=maxlen)

In [33]:
#x_validation

array([[   0,    0,    0, ..., 1239,  659,  706],
       [   0,    0,    0, ..., 1015,  158,   19],
       [   0,    0,    0, ...,  574,    3, 1097],
       ...,
       [   0,    0,    0, ...,  104, 6559, 1029],
       [   0,    0,    0, ..., 1304,  455,  930],
       [   0,    0,    0, ...,  194, 4895,   72]])

In [34]:
#tokenized_test = tokenizer.texts_to_sequences(x_test)
#x_test = pad_sequences(tokenized_test , maxlen=maxlen)


In [35]:
#x_test

array([[   0,    0,    0, ...,   40,  293,  121],
       [   0,    0,    0, ...,  800,   17,  340],
       [   0,    0,    0, ..., 2666, 4170,  957],
       ...,
       [   0,    0,    0, ...,  121,  896,  209],
       [   0,    0,    0, ..., 9783, 2734, 2634],
       [   0,    0,    0, ...,  298,   74,  798]])

## Build A model

In [36]:
max_features = 10000
batch_size = 256
epochs = 10
embed_size = 100

model = keras.models.Sequential()
model.add(layers.Embedding(max_features,embed_size))
model.add(layers.Dropout(0.5))

model.add(layers.LSTM(128, return_sequences=True))
model.add(layers.GlobalMaxPool1D())

model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1,activation="sigmoid",name="predictions"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])


## Train and evalute the model

In [37]:
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 100)         1000000   
                                                                 
 dropout_2 (Dropout)         (None, None, 100)         0         
                                                                 
 lstm_1 (LSTM)               (None, None, 128)         117248    
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_1 (Dense)             (None, 32)                4128      
                                                                 
 dropout_3 (Dropout)         (None, 32)                0         
                                                      

In [38]:
print(x_train.shape)
print(x_test.shape)
print(x_validation.shape)

(25434, 500)
(7732, 500)
(7732, 500)


In [39]:
#model.fit(x_train, y_train,validation_split=0.25,epochs=2)

Epoch 1/2
 15/597 [..............................] - ETA: 4:40 - loss: 0.6905 - accuracy: 0.5104

KeyboardInterrupt: 

In [None]:
model.fit(x_train, y_train,validation_data=(x_validation,y_validation),epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x259bfe258e0>

In [None]:
# Prediction if the news is fake 
# Class 1 (Fake) if predicted prob >= 0.5, else class 0 (Real)

y_pred = (model.predict(x_test) >= 0.5).astype("int")




In [None]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4235
           1       1.00      0.99      0.99      3497

    accuracy                           0.99      7732
   macro avg       0.99      0.99      0.99      7732
weighted avg       0.99      0.99      0.99      7732



### Analysis after Training Model

In [None]:
print("Accuracy of the model on Training Data is - " , model.evaluate(x_train,y_train))
print("Accuracy of the model on Validation Data is - " , model.evaluate(x_validation,y_validation))
print("Accuracy of the model on Testing Data is - " , model.evaluate(x_test,y_test))


Accuracy of the model on Training Data is -  [0.00954868458211422, 0.9972870945930481]
Accuracy of the model on Validation Data is -  [0.03321805223822594, 0.9891360402107239]
Accuracy of the model on Testing Data is -  [0.027062280103564262, 0.9905587434768677]
