In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import string as str
import regex as re
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping
import pickle
import keras
import matplotlib


In [None]:
# Load the labelled headlines from disk.
df = pd.read_csv('data.csv')
# Drop the first CSV column (presumably a saved index - TODO confirm).
df = df.drop(df.columns[0], axis=1)
df = df.dropna()
# reset_index returns a new frame; assign it so the index is contiguous again
# after dropna removed rows.
df = df.reset_index(drop=True)
df

Unnamed: 0,title,y
0,Donald Trump Sends Out Embarrassing New Year’...,0
1,Drunk Bragging Trump Staffer Started Russian ...,0
2,Sheriff David Clarke Becomes An Internet Joke...,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,0
4,Pope Francis Just Called Out Donald Trump Dur...,0
...,...,...
77212,vision of flames approaching corryong in victoria,1
77213,wa police and government backflip on drug amne...,1
77214,we have fears for their safety: victorian premier,1
77215,when do the 20s start,1


In [None]:
# Number of rows labelled 0 (the class that predict() reports as "Fake")
len(df[df['y']==0])

35800

In [None]:
# Number of rows labelled 1 (the class that predict() reports as "True")
len(df[df['y']==1])

41417

In [None]:
def clean_doc(doc):
    """Normalise a piece of text for tokenisation.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is deleted, digit runs are deleted, and any
    run of whitespace is collapsed to a single space.

    Args:
        doc: the raw text (a string).

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    doc = doc.lower()

    # remove punctuation: anything that is not a word character or whitespace
    doc = re.sub(r'[^\w\s]', '', doc)
    # remove digits (raw string avoids the invalid "\d" escape warning)
    doc = re.sub(r'\d+', '', doc)
    # remove excess whitespace
    doc = re.sub(r'\s+', ' ', doc)

    return doc.strip()

In [None]:
# Store the clean_doc()-normalised headline text in a new 'clean title' column
df['clean title'] = df['title'].apply(clean_doc)


In [None]:
# Whitespace-separated token count of every cleaned title
lens = [len(tokens) for tokens in df['clean title'].str.split()]

print(max(lens))

#We will use this as max_seq_length

67


In [None]:
def fetch_word_vectors(path='glove.6B.300d.txt'):
    """Read a whitespace-separated word-vector file into a dictionary.

    Each non-empty line is split on whitespace: the first field is the word,
    the remaining fields are parsed as floats.

    Args:
        path: location of the vector file; defaults to the file name the
            notebook originally hard-coded.

    Returns:
        dict mapping each word to a 1-D numpy float array.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component is not a valid float.
    """
    embedd_index = {}
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(path, encoding="utf-8") as f:
        for line in f:
            val = line.split()
            if not val:
                # skip blank lines instead of failing on val[0]
                continue
            embedd_index[val[0]] = np.asarray(val[1:], dtype='float')
    return embedd_index
  
def construct_embedding(embedd_index,index_of_words,embed_num_dims):
    """Build the embedding weight matrix for a vocabulary.

    Row ``i`` receives the vector that ``embedd_index`` holds for the word
    mapped to ``i`` in ``index_of_words``. Rows for words without a vector,
    and any row no word maps to, remain all zeros. The matrix has
    ``len(index_of_words) + 1`` rows and ``embed_num_dims`` columns.
    """
    n_rows = len(index_of_words) + 1
    matrix = np.zeros((n_rows, embed_num_dims))
    for token, row in index_of_words.items():
        vector = embedd_index.get(token)
        if vector is None:
            # no pretrained vector for this word: leave its row at zero
            continue
        matrix[row] = vector
    return matrix

In [None]:
def tokenize_sentences(sentences,num_words,embed_num_dims,max_seq_len):
    """Fit a Tokenizer on ``sentences`` and return padded integer sequences.

    ``embed_num_dims`` is accepted but not used by this function.

    Returns:
        A tuple ``(tokenizer, index_of_words, padded_seq)``: the fitted
        tokenizer, its ``word_index`` mapping, and the sequences passed
        through ``pad_sequences`` with ``maxlen=max_seq_len``.
    """
    fitted = Tokenizer(num_words=num_words)
    fitted.fit_on_texts(sentences)
    encoded = fitted.texts_to_sequences(sentences)
    padded = pad_sequences(encoded, maxlen=max_seq_len)
    return (fitted, fitted.word_index, padded)

def contruct_model(embedding_matrix,index_of_words,embed_num_dims,max_seq_len):
    """Build and compile the LSTM classifier.

    Layers, in order: an Embedding initialised from ``embedding_matrix``
    (``len(index_of_words) + 1`` rows, ``embed_num_dims`` columns, input
    length ``max_seq_len``), SpatialDropout1D(0.4), an LSTM with 196 units,
    and a 2-unit softmax Dense layer. The model is compiled with categorical
    cross-entropy, the Adam optimizer and the accuracy metric, and its
    summary is printed.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    embedd_layer = Embedding(len(index_of_words) + 1 , embed_num_dims , input_length = max_seq_len , weights = [embedding_matrix])
    model.add(embedd_layer)
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(196, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(2,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    # summary() prints by itself and returns None; wrapping it in print()
    # emitted a stray "None" line after the table.
    model.summary()
    return model

def train(model,X_train,Y_train,validation_split=0.1,patience=2):
    """Fit ``model`` for up to 20 epochs with early stopping.

    Early stopping previously watched the training loss, which cannot
    detect overfitting. It now watches the loss on a held-out slice of the
    training data and restores the best weights seen.

    Args:
        model: a compiled Keras model.
        X_train: training inputs.
        Y_train: training targets.
        validation_split: fraction of the training data held out for
            validation. Pass 0 to train on everything, in which case early
            stopping falls back to monitoring the training loss.
        patience: number of non-improving epochs tolerated before stopping.

    Returns:
        Whatever ``model.fit`` returns (the training history).
    """
    # Without held-out data there is no val_loss to monitor.
    monitor = 'val_loss' if validation_split else 'loss'
    es = EarlyStopping(monitor=monitor, mode='min', patience=patience,
                       restore_best_weights=True, verbose=1)
    return model.fit(X_train, Y_train, epochs = 20, batch_size=32, verbose = 2,
                     validation_split=validation_split, callbacks=[es])
    
def evaluate(model,X_test,Y_test):
    """Evaluate ``model`` on the given data and print the two results.

    Calls ``model.evaluate`` with batch size 32 and prints the first
    returned value as "score" and the second as "acc", each with two
    decimals. Nothing is returned.
    """
    results = model.evaluate(X_test, Y_test, verbose = 2, batch_size = 32)
    loss_value, accuracy = results
    for template, value in (("score: %.2f", loss_value), ("acc: %.2f", accuracy)):
        print(template % value)

In [None]:
# Hyper-parameters shared by the tokenizer, the embedding matrix and the model.
NUM_WORDS = 100000
EMBED_NUM_DIMS = 300   # must match the width of the vectors in glove.6B.300d.txt
MAX_SEQ_LEN = 67       # longest cleaned title, as printed by the length check above

embedd_index = fetch_word_vectors()

tokenizer,index_of_words,padded_seq = tokenize_sentences(df['clean title'],num_words=NUM_WORDS,embed_num_dims=EMBED_NUM_DIMS,max_seq_len=MAX_SEQ_LEN)

#construct embedding matrix
embedding_matrix = construct_embedding(embedd_index,index_of_words,embed_num_dims=EMBED_NUM_DIMS)

model = contruct_model(embedding_matrix,index_of_words,embed_num_dims=EMBED_NUM_DIMS,max_seq_len=MAX_SEQ_LEN)
# One-hot encode the labels. The explicit float dtype keeps the targets numeric
# on pandas >= 2.0, where get_dummies returns booleans by default.
Y = pd.get_dummies(df['y'], dtype='float32').values
X_train, X_test, Y_train, Y_test = train_test_split(padded_seq,Y, test_size = 0.3, random_state = 42)
print(X_train.shape,Y_train.shape,X_test.shape,Y_test.shape)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 67, 300)           12415500  
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 67, 300)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               389648    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 12,805,542
Trainable params: 12,805,542
Non-trainable params: 0
_________________________________________________________________
None
(54051, 67) (54051, 2) (23166, 67) (23166, 2)


In [None]:
# Fit the model on the training split; epochs, batch size and early stopping
# are configured inside train().
train(model,X_train,Y_train)
#evaluate(model,X_test,Y_test)

Epoch 1/20
1690/1690 - 568s - loss: 0.2503 - accuracy: 0.8936
Epoch 2/20
1690/1690 - 571s - loss: 0.1420 - accuracy: 0.9432
Epoch 3/20
1690/1690 - 578s - loss: 0.0948 - accuracy: 0.9629
Epoch 4/20
1690/1690 - 560s - loss: 0.0644 - accuracy: 0.9760
Epoch 5/20
1690/1690 - 557s - loss: 0.0436 - accuracy: 0.9838
Epoch 6/20
1690/1690 - 566s - loss: 0.0308 - accuracy: 0.9885
Epoch 7/20
1690/1690 - 551s - loss: 0.0219 - accuracy: 0.9922
Epoch 8/20
1690/1690 - 553s - loss: 0.0163 - accuracy: 0.9943
Epoch 9/20
1690/1690 - 556s - loss: 0.0122 - accuracy: 0.9956
Epoch 10/20
1690/1690 - 567s - loss: 0.0096 - accuracy: 0.9967
Epoch 11/20
1690/1690 - 556s - loss: 0.0087 - accuracy: 0.9971
Epoch 12/20
1690/1690 - 553s - loss: 0.0066 - accuracy: 0.9976
Epoch 13/20
1690/1690 - 551s - loss: 0.0064 - accuracy: 0.9977
Epoch 14/20
1690/1690 - 554s - loss: 0.0055 - accuracy: 0.9981
Epoch 15/20
1690/1690 - 554s - loss: 0.0039 - accuracy: 0.9989
Epoch 16/20
1690/1690 - 541s - loss: 0.0042 - accuracy: 0.9987
E

In [None]:
# Print the loss ("score") and accuracy measured on the held-out test split
evaluate(model,X_test,Y_test)

724/724 - 11s - loss: 0.4876 - accuracy: 0.9346
score: 0.49
acc: 0.93


In [None]:
def save_model_to_disk(model,model_name,tokenizer_to_save=None):
    """Write the model architecture, weights and tokenizer to disk.

    Three files are produced, all prefixed with ``model_name``:
    ``.json`` (architecture from ``model.to_json()``), ``.h5`` (weights from
    ``model.save_weights``) and ``.pickle`` (the pickled tokenizer).

    Args:
        model: object exposing ``to_json()`` and ``save_weights(path)``.
        model_name: path prefix for the output files.
        tokenizer_to_save: tokenizer to pickle. When omitted, the
            notebook-level ``tokenizer`` variable is used, which was the
            original (implicit) behaviour.
    """
    if tokenizer_to_save is None:
        # Backward-compatible default: fall back to the notebook global.
        tokenizer_to_save = tokenizer
    # serialize model to JSON
    with open(model_name+".json", "w") as json_file:
        json_file.write(model.to_json())
    # serialize weights to HDF5
    model.save_weights(model_name+".h5")
    # create pickle to save tokenizer
    with open(model_name+".pickle", 'wb') as handle:
        pickle.dump(tokenizer_to_save, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Saved model to disk")

In [None]:
# Save the full model under the 'lstm_model' path
model.save('lstm_model')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: lstm_model/assets


In [None]:
# Archive the saved model directory into /content/file.zip.
# NOTE(review): the absolute /content paths look environment-specific - confirm before running elsewhere.
!zip -r /content/file.zip /content/lstm_model

  adding: content/lstm_model/ (stored 0%)
  adding: content/lstm_model/variables/ (stored 0%)
  adding: content/lstm_model/variables/variables.data-00000-of-00001 (deflated 27%)
  adding: content/lstm_model/variables/variables.index (deflated 60%)
  adding: content/lstm_model/saved_model.pb (deflated 89%)
  adding: content/lstm_model/assets/ (stored 0%)


In [None]:
# Also export the architecture JSON, HDF5 weights and pickled tokenizer,
# each written with the "lstm_model" file-name prefix
save_model_to_disk(model,"lstm_model")

Saved model to disk


In [None]:
# Reload the model saved under 'lstm_model', replacing the in-memory `model`
model = keras.models.load_model('lstm_model')



In [None]:
def predict(text, max_seq_len=67):
    """Classify a headline with the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: a single headline string, or a sequence of headline strings
            (only the first one is classified, as before).
        max_seq_len: padded sequence length fed to the model; defaults to
            the length the model in this notebook was built with.

    Returns:
        0 (after printing "Fake") or 1 (after printing "True"): the index of
        the larger of the two softmax outputs.
    """
    # `import string as str` at the top of the notebook shadows the built-in
    # `str`, so obtain the string type from a literal instead.
    text_type = type("")
    # texts_to_sequences expects a list of texts; a bare string would be
    # iterated character by character.
    texts = [text] if isinstance(text, text_type) else list(text)
    # apply the same normalisation the training titles went through
    texts = [clean_doc(t) for t in texts]
    sequences = tokenizer.texts_to_sequences(texts)
    # pad to the sequence length the embedding layer was built for
    padded = pad_sequences(sequences, maxlen=max_seq_len, dtype='int32', value=0)
    y = model.predict(padded,batch_size=1,verbose = 2)[0]
    # the original compared np.argmax(7), which is always 0
    label = int(np.argmax(y))
    print("Fake" if label == 0 else "True")
    return label

In [None]:
# Classify one example headline; predict() prints the label and returns it
predict("Hubble Finds Exoplanet That Could Mirror Planet Nine")

21/21 - 0s
True


1