In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


## Loading The Necessary Libraries

In [2]:
from tensorflow import keras
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.layers import Dense

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import seaborn as sns
import matplotlib.pyplot as plt

import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

## Importing the Dataset

In [3]:
# Read the train and test CSV files of the fake-news dataset into DataFrames.
train = pd.read_csv('../input/simplified-fake-news-dataset/train.csv')
test = pd.read_csv('../input/simplified-fake-news-dataset/test.csv')

## Data Preprocessing

In [4]:
# Preview the first rows of the test frame.
test.head()

In [5]:
## Dropping NaN values, shuffling, and separating the labels

# Stack the train and test frames into one frame with a fresh 0..n-1 index.
dataset = pd.concat([train, test], ignore_index=True)

# Drop every row that has a null in any column.
# NOTE(review): if test.csv has no 'fake' column, all of its rows are NaN there and get dropped here — confirm.
dataset = dataset.dropna()

# Shuffle the rows. A fixed random_state keeps the row order (and therefore the
# later train/test split and metrics) reproducible across kernel restarts.
dataset = dataset.sample(frac=1, random_state=0)

# Separate the target column from the features.
labels = dataset['fake']
dataset = dataset.drop('fake', axis=1)

dataset.head()

In [6]:
# Show the shapes of the feature frame and the label series side by side.
[frame.shape for frame in (dataset, labels)]

In [7]:
# Plot how many samples fall into each label value.
sns.set_style("darkgrid")
sns.countplot(x=labels)

## Data Preprocessing

In [8]:
## Vocabulary size for one hot encoding
vocab_size = 8000

## Here we will only use title of news articles to train the model
messages = dataset.copy()
# Positional lookup: after dropna() and shuffling, the row labelled 0 may no
# longer exist, so a label-based `[0]` could raise KeyError.
messages["title"].iloc[0]

In [9]:
# Rebuild `messages` from `dataset` with a fresh 0..n-1 index; the previous
# index is kept as an 'index' column. Deriving it from `dataset` rather than
# resetting in place keeps this cell idempotent: an in-place reset adds a
# 'level_0' column on a second run and raises on a third.
messages = dataset.reset_index()
messages.head()

### stopwords
Stop words are words that are very common in a language, such as “the”, “of”, and “to”, but that generally carry little useful information.<br/>
We do not want these words to take up space or valuable processing time, so we remove them by checking each word against a stored list of stop words.

In [10]:
# Collect the English stop words into a set so membership tests are fast.
stop_words = {*stopwords.words('english')}

print(stop_words, end=" ")

## Stemming and Tokenizing the titles

In [11]:
### Stemming The Dataset
ps = PorterStemmer()


def _clean_title(title):
    """Return `title` lower-cased, stripped of non-letters and stop words, with each remaining word stemmed."""
    # Replace everything that is not an ASCII letter with a space, then tokenize on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    return ' '.join(ps.stem(token) for token in tokens if token not in stop_words)


# One cleaned string per row, in row order.
corpus = [_clean_title(messages['title'][row]) for row in range(len(messages))]

In [12]:
# Compare the first title before and after cleaning, then show the corpus size.
first_title = messages['title'][0]
first_stemmed = corpus[0]
print(f"Before Stemming: {first_title}")
print(f"After Stemming : {first_stemmed}")
len(corpus)

### One Hot Encoding
#### Keras `one_hot` maps each word of a title to an integer index bounded by `vocab_size`; it does not build one-hot vectors, and different words may share an index

In [13]:
# Encode every cleaned title as a list of integer word indices bounded by vocab_size.
onehot = [one_hot(title, vocab_size) for title in corpus]

# Show the first five titles next to their encodings.
for idx in range(5):
    print(f"Words : [{corpus[idx]}]")
    print(f"OneHot :{onehot[idx]}\n")

### Word Embedding 
- A word embedding (or word vector) is a numeric vector that represents a word in a lower-dimensional space. It allows words with similar meanings to have similar representations.<br/>
- Word embeddings are a method of extracting features from text so that those features can be fed into a machine learning model.<br/>
- They try to preserve syntactic and semantic information.

In [14]:
## Bring every encoded title to the same length; zeros are appended at the end of shorter ones

max_length = 30
embedded_docs = pad_sequences(onehot, maxlen=max_length, padding='post')
print(f"After applying padding, the representation becomes : \n\n{embedded_docs[0]}")

## Creating the model

In [42]:
# length of an embedding vector for each word
embedding_vector_length = 40

model = Sequential(name="Fake_News_Detector")

# Turn each of the max_length word indices into a dense vector of size embedding_vector_length.
model.add(Embedding(vocab_size, embedding_vector_length, input_length=max_length))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dropout(0.3))
# return_sequences=False: keep only the LSTM output of the last time step, so the
# network emits one prediction per title. With return_sequences=True the Dense
# layers are applied per time step and the output has one value per word position,
# which does not match the one-label-per-sample targets used for training and metrics.
model.add(LSTM(128, return_sequences=False))
model.add(keras.layers.Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(keras.layers.Dropout(0.3))
# Single sigmoid unit: the output is a value in (0, 1) that is thresholded later.
model.add(Dense(1, activation='sigmoid', name="Output_Layer"))

model.summary()

#### Compiling the Model

In [43]:
## Compiling the Model
# The output layer already applies a sigmoid, so the loss receives probabilities,
# not raw logits; from_logits must therefore be False (otherwise a second sigmoid
# is applied inside the loss).
model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=['accuracy'],
)

## Training

In [44]:
from sklearn.model_selection import train_test_split

## Final Dataset: padded index sequences as features, the 'fake' column as targets
final_dataset = np.array(embedded_docs)
final_labels = np.array(labels)

## Hold out 33% of the rows; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    final_dataset,
    final_labels,
    test_size=0.33,
    random_state=0,
)
final_labels[:5]

In [45]:

## Training the model
print("\n\nTraining The Model")

# Fit for 10 epochs in mini-batches of 64; (x_test, y_test) is passed as validation data.
Train = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=(x_test, y_test),
)

## Performance Metrics

In [47]:
## Evaluating the model on the held-out test split (x_test / y_test)

model.evaluate(x_test, y_test)

In [52]:
## Making Predictions
# Threshold the model outputs at 0.5 to obtain hard 0/1 labels, then peek at ten of them.
scores = model.predict(x_test)
y_pred = (scores > 0.5).astype("int32")
y_pred[50:60]

In [54]:
# Confusion Matrix: counts of true labels (rows) against predicted labels (columns)

from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(y_test, y_pred)
print(matrix)

In [55]:
# Classification report for the test split

from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)