In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn import metrics
import numpy as np

import keras
import keras.layers as layers
import tensorflow as tf
import os
from nltk.tokenize import word_tokenize
from tensorflow.keras.layers import TextVectorization

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences


In [48]:
data = pd.read_csv("master_dataset/processed_data.csv")
list(data.columns)
# drop unwanted features

data.iloc[:,5:].columns

# Drop all the column , keep only class , text_without_stopwords and title_without_stopwords
data = data.drop(data.iloc[:,5:].columns, axis=1)
data = data.drop(['title', 'text'],axis=1)
    
# ['title', 'text', 'text_without_stopwords', 'title_without_stopwords','syllables', 'polarity_category', 'overall_content', 'polarity'], axis=1)
#'Topic 1 Probability', 'Topic 2 Probability', 'Topic 3 Probbility' , 'Topic 4 Probability' ,'Topic 5 Probability',
#'title_word_count', 'title_sentence_count', 'title_average_word_length','title_punctuation_count', 'title_stopwords_count'  
# 'polarity_category_Neutral' , 'polarity_category_Positive'

In [49]:
#dataset is slightly imbalanced so we will perform upsampling to balance the dataset.
data['class'].value_counts()

0    21196
1    17462
Name: class, dtype: int64

In [50]:
list(data.columns)

['class', 'text_without_stopwords', 'title_without_stopwords']

In [51]:
data["overall_text"] = data["text_without_stopwords"] + " " + data["title_without_stopwords"]

data

Unnamed: 0,class,text_without_stopwords,title_without_stopwords,overall_text
0,1,donald trump wish americans happy new year lea...,donald trump sends out embarrassing new year’s...,donald trump wish americans happy new year lea...
1,1,house intelligence committee chairman devin nu...,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...
2,1,on friday revealed former milwaukee sheriff da...,sheriff david clarke becomes an internet joke ...,on friday revealed former milwaukee sheriff da...
3,1,on christmas day donald trump announced would ...,trump is so obsessed he even has obama’s name ...,on christmas day donald trump announced would ...
4,1,pope francis used annual christmas day message...,pope francis just called out donald trump duri...,pope francis used annual christmas day message...
...,...,...,...,...
38653,0,nato allies tuesday welcomed president donald ...,'fully committed' nato backs new us approach a...,nato allies tuesday welcomed president donald ...
38654,0,lexisnexis provider legal regulatory business ...,lexisnexis withdrew two products chinese market,lexisnexis provider legal regulatory business ...
38655,0,in shadow disused soviet-era factories minsk s...,minsk cultural hub becomes authorities,in shadow disused soviet-era factories minsk s...
38656,0,vatican secretary state cardinal pietro paroli...,vatican upbeat possibility pope francis visiti...,vatican secretary state cardinal pietro paroli...


### Prepare the data

In [52]:
#first split the dataset into training and test sets


x_train, x_test, y_train, y_test = train_test_split(data['overall_text'],data['class'],test_size=0.3,random_state = 1)

#balance x_train with oversampling
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy='minority')

# OverSampling only works on Dataframe, but current x_train and y_train are series
x_train, y_train = oversample.fit_resample(x_train.to_frame(), y_train.to_frame())


#check that train set is oversampled
y_train.value_counts()

class
0        14879
1        14879
dtype: int64

In [53]:
print(x_train)

                                            overall_text
0      another turn left hillster school choice teach...
1      melania trump rose husband’s defense monday de...
2      it incredibly unlikely britain able negotiate ...
3      the head us senate armed services committee we...
4      republicans congress struggled thursday effort...
...                                                  ...
29753  former attorney general michael mckasey lists ...
29754  former nh governor john sununu let alison cama...
29755  san fransisco 49er quarterback colin kaepernic...
29756  unhinged leftists calling boycott ivanka trump...
29757  donald trump called yet another foreign leader...

[29758 rows x 1 columns]


In [54]:
# Find the number of maximum text 
print(data.overall_text.str.len().max())

40567


In [None]:
# TA example  not working for me. 
"""
 # Change dataframe back to Series


raw_train_ds = tf.data.Dataset.from_tensor_slices(
    (tf.constant(x_train.squeeze().to_list()),
     tf.keras.utils.to_categorical(y_train.to_numpy() -1))
)

raw_test_ds = tf.data.Dataset.from_tensor_slices(
    (tf.constant(x_test.squeeze().to_list()),
     tf.keras.utils.to_categorical(y_test.to_numpy() -1))
)
raw_train_ds
vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=42000, # Based on how many words there are in overall_text
)
text_ds = raw_train_ds.map(lambda x,y:x)

vectorize_layer.adapt(text_ds)
print(len(vectorize_layer.get_vocabulary()))
print(vectorize_layer.get_vocabulary()[:20])
def vectorize_text(text,label):
    return vectorize_layer(text),label

# Vectorize the Data
train_ds = raw_train_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

# Do async prefetching / buffering of the data for best performance on GPU
train_ds = train_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)

"""


In [55]:
# # Model constants.


maxlen = 500 # Set a max length of the array, if not it will do an array of like [1,10000] , and if i would to run the LSTM, it will take 30 hours

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train.squeeze())
tokenized_train = tokenizer.texts_to_sequences(x_train.squeeze())
x_train = pad_sequences(tokenized_train , maxlen=maxlen)


In [56]:
x_train

array([[   0,    0,    0, ...,  129,   47,   19],
       [   0,    0,    0, ...,  440,   14,    1],
       [   0,    0,    0, ...,   77,  875,  140],
       ...,
       [   0,    0,    0, ..., 2180, 5321, 5813],
       [   0,    0,    0, ..., 6035,  325,  517],
       [   0,    0,    0, ...,   14,    1,   44]])

In [57]:
tokenized_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(tokenized_test , maxlen=maxlen)


In [None]:
x_test

array([[   0,    0,    0, ...,  447,  492, 1000],
       [   0,    0,    0, ...,  378, 1898, 1501],
       [   0,    0,    0, ...,   11,  112,   44],
       ...,
       [   0,    0,    0, ..., 1031,  189,   41],
       [   0,    0,    0, ...,  515,   25, 8007],
       [   0,    0,    0, ..., 1387,  486, 3205]])

## Build A model

In [62]:
max_features = 10000
batch_size = 256
epochs = 10
embed_size = 100

model = keras.models.Sequential()
model.add(layers.Embedding(max_features,embed_size))
model.add(layers.Dropout(0.5))

model.add(layers.LSTM(128, return_sequences=True))
model.add(layers.GlobalMaxPool1D())

model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1,activation="sigmoid",name="predictions"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])


## Train and evalute the model

In [63]:
model.summary()


Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 100)         1000000   
                                                                 
 dropout_6 (Dropout)         (None, None, 100)         0         
                                                                 
 lstm_9 (LSTM)               (None, None, 128)         117248    
                                                                 
 global_max_pooling1d_3 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_9 (Dense)             (None, 32)                4128      
                                                                 
 dropout_7 (Dropout)         (None, 32)                0         
                                                      

In [65]:
model.fit(x_train, y_train,epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x286d7328eb0>

In [69]:

batch_size = 256
epochs = 10
embed_size = 100
model2 = keras.models.Sequential()
#Non-trainable embeddidng layer
model2.add(layers.Embedding(max_features, input_length=500, output_dim=embed_size))
#LSTM 
model2.add(layers.LSTM(units=128 , return_sequences = True , recurrent_dropout = 0.25 , dropout = 0.25))
model2.add(layers.LSTM(units=64 , recurrent_dropout = 0.1 , dropout = 0.1))
model2.add(layers.Dense(units = 32 , activation = 'relu'))
model2.add(layers.Dense(1, activation='sigmoid'))
model2.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])

In [70]:
model2.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 500, 100)          1000000   
                                                                 
 lstm_12 (LSTM)              (None, 500, 128)          117248    
                                                                 
 lstm_13 (LSTM)              (None, 64)                49408     
                                                                 
 dense_12 (Dense)            (None, 32)                2080      
                                                                 
 dense_13 (Dense)            (None, 1)                 33        
                                                                 
Total params: 1,168,769
Trainable params: 1,168,769
Non-trainable params: 0
_________________________________________________________________


In [71]:
model2.fit(x_train, y_train,epochs=2)  # This took 1hr 10mins , dont run again. i will cry.

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x286e4a095e0>

### Analysis after Training Model

In [72]:
print("Accuracy of the model on Training Data is - " , model.evaluate(x_train,y_train))

print("Accuracy of the model on Testing Data is - " , model.evaluate(x_test,y_test))


Accuracy of the model on Training Data is -  [0.0009639053023420274, 0.9996639490127563]
Accuracy of the model on Testing Data is -  [0.01718318648636341, 0.9945680499076843]
