In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SpatialDropout1D

In [2]:
data = pd.read_csv('processedData.csv')
data.content=data.content.astype(str)
data.drop(['Unnamed: 0','id'],axis=1 , inplace=True)
data.head()

Unnamed: 0,label,content
0,1,darrel lucu hous dem aid ’ even see comey ’ le...
1,0,daniel j flynn flynn hillari clinton big woman...
2,1,consortiumnewscom truth might get fire
3,1,jessica purkiss 15 civilian kill singl u airst...
4,1,howard portnoy iranian woman jail fiction unpu...


In [3]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [9]:
test.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [14]:
test = test[test['title'].notna()]

In [15]:
test['content'] = test['author'] + " " + test['title']

In [4]:
embDim = 100
epochs = 10
mostCommonWords = 80000
maxLen = 250

In [5]:
tokenizer = Tokenizer(num_words=mostCommonWords , filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~' , lower=True )
tokenizer.fit_on_texts(data['content'].values)

In [6]:
wordIndex = tokenizer.word_index
print(f'Number of unique tokens is {len(wordIndex)}')

Number of unique tokens is 18405


#### Training Data

In [7]:
X = tokenizer.texts_to_sequences(data['content'].values)
X = pad_sequences(X, maxlen=maxLen)
print(f'X shape is {X.shape}')

X shape is (20242, 250)


In [8]:
Y = data['label']
print(f'Y shape is {Y.shape}')

Y shape is (20242,)


#### Test Data

In [17]:
test.content=test.content.astype(str)
X_test = tokenizer.texts_to_sequences(test['content'].values)
X = pad_sequences(X, maxlen=maxLen)
print(f'X shape is {X.shape}')

X shape is (20242, 250)


In [None]:
model = Sequential([
    Embedding(mostCommonWords + 1 , embDim, input_length= X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(100, dropout=0.2 , recurrent_dropout=0.2),
    Dense(1,activation='sigmoid')
])

In [None]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics='accuracy')
model.summary()

In [None]:
history = model.fit(X,Y,epochs=epochs , batch_size=128)