In [7]:
# importing the required libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import pad_sequences
from keras import preprocessing
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
%matplotlib inline

import boto3

In [8]:
# Import the dataset from S3
import boto3
bucket='seis736-bucket' # Or whatever you called your bucket
data_key = 'tweets_fake_real.csv' # Where the file is within your bucket
# Build the S3 URI from the two settings above so that editing `bucket` or
# `data_key` actually changes which object is loaded (the previous literal had
# no `{}` placeholders, so `.format` silently ignored both values).
data_location = 's3://{}/{}'.format(bucket, data_key)
# The file is decoded as latin1 rather than pandas' default UTF-8.
df_fake_real = pd.read_csv(data_location, encoding='latin1')

In [None]:
# Drop the columns that are not required by the classification model
# (the intent is to keep only post_text and label).
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so this cell can be
# re-run without raising KeyError.
df_fake_real = df_fake_real.drop(
    columns=['post_id', 'user_id', 'image_id(s)', 'username', 'timestamp'],
    errors='ignore',
)

In [None]:
# Model input: the post text column
X = df_fake_real["post_text"]

# Model target: the label column encoded by LabelEncoder, reshaped into a
# single-column 2-D array
le = LabelEncoder()
Y = le.fit_transform(df_fake_real["label"]).reshape(-1, 1)

In [None]:
# Splitting the dataset into Train and Test: 75% for training and 25% for testing.
# random_state is fixed so the same rows land in each split on every run;
# without it the reported test metrics change each time the notebook is executed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

In [None]:
# Tokenizer vocabulary cap (num_words) and the fixed length of every padded sequence
max_words, max_len = 1000, 150

# Fit the tokenizer on the training texts only
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)

# Turn the training texts into integer sequences, then pad them to max_len
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = keras.utils.pad_sequences(
    sequences,
    maxlen=max_len,
)

In [None]:
# Defining the RNN model
def RNN():
    """Assemble the uncompiled LSTM classifier.

    Layer stack: Embedding -> LSTM(64) -> Dense(256) + ReLU -> Dropout(0.5)
    -> Dense(1) + sigmoid. The notebook-level ``max_words`` and ``max_len``
    are read when the function is called.

    Returns:
        A keras ``Model`` taking inputs of length ``max_len`` and producing a
        single sigmoid output; compiling is left to the caller.
    """
    token_ids = Input(name='inputs', shape=[max_len])
    embedded = Embedding(max_words, 50, input_length=max_len)(token_ids)
    encoded = LSTM(64)(embedded)

    # Fully connected head with dropout applied after the ReLU
    hidden = Dense(256, name='FC1')(encoded)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)

    # Single output unit passed through a sigmoid
    raw_output = Dense(1, name='out_layer')(hidden)
    probability = Activation('sigmoid')(raw_output)

    return Model(inputs=token_ids, outputs=probability)

# Build the network, print its layer summary, then compile it with
# binary cross-entropy loss, the RMSprop optimizer and an accuracy metric
model = RNN()
model.summary()
model.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fitting the model
filepath="rnn_model_fake_real_text_classification.hdf5"

# Write a checkpoint to `filepath` only when the monitored value improves.
model_checkpoint = ModelCheckpoint(filepath, save_best_only=True, verbose=1)
# Stop training once val_loss stops improving by at least min_delta.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001)

# Both callbacks are handed to fit(); previously the checkpoint callback was
# created but never passed in, so no checkpoint file was ever written.
callbacks_list = [model_checkpoint, early_stopping]

model.fit(sequences_matrix, Y_train, batch_size=128, epochs=300,
          validation_split=0.2, callbacks=callbacks_list)


In [None]:
# Prepare the test set: reuse the tokenizer fitted on the training texts and
# pad to the same max_len as the training sequences
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = keras.utils.pad_sequences(
    test_sequences,
    maxlen=max_len,
)

In [None]:
# Save the trained model and upload it to S3.
# A ".hdf5" save is written through h5py, which only accepts local filesystem
# paths, so passing an "s3://" URL to model.save() does not put the file in the
# bucket. The model is written to a temporary local file first and then
# uploaded with boto3 to the same bucket/key the original URL pointed at.
import os
import tempfile

model_s3_bucket = "seis736-bucket"
model_s3_key = "rnn_model_fake_real_text_classification.hdf5"

# A temporary directory avoids overwriting any local file with the same name.
with tempfile.TemporaryDirectory() as tmp_dir:
    local_model_path = os.path.join(tmp_dir, model_s3_key)
    model.save(local_model_path)
    boto3.client("s3").upload_file(local_model_path, model_s3_bucket, model_s3_key)

In [None]:
# Loss and accuracy of the model on the test set
accr = model.evaluate(test_sequences_matrix, Y_test)
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))