
# Fake News Classification using LSTM
The data fake_or_real_news.csv is downloaded from the link given below. It contains four columns but we will use only the text and label part of it. The label for each text is either real (assigned the value 0) or fake (assigned the value 1).

In [28]:
import torch
from torchtext.data import Field, TabularDataset, BucketIterator
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time
import random

from torchtext import data

# Data cleaning i.e. converting the data to lower case and creating data frame which will be used afterwards
df = pd.read_csv('fake_or_real_news.csv', names=["text","label"], nrows=1000)
X = [ a[:200].lower() for a in df["text"]]
y = [ a.lower() for a in df["label"]]

dataset = pd.DataFrame({'text': X,'label': y})

In [29]:
SEED = 42

# This makes the result reproducible later
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Tokenizing text and labels to construct an embedding later
TEXT = data.Field(tokenize = 'spacy', include_lengths = True, batch_first=True)
LABEL = data.LabelField(dtype = torch.float, batch_first=True)

In [30]:
from numpy.random import RandomState

fields = [(None, None), ('text',TEXT), ('label',LABEL)]

!touch dataset.csv

dataset.to_csv("dataset.csv")

#loading custom dataset
training_data = data.TabularDataset(path = 'dataset.csv',format = 'csv',fields = fields,skip_header = True)

# splitting the data in training and test 
train_ds, test_ds = training_data.split(split_ratio=0.7, random_state = random.seed(SEED))

# Vocabulary Construction

In [31]:
MAX_VOCAB_SIZE = 25000

# We build the vocabulary using the training data set and pre trained vectors from glove for tokens
TEXT.build_vocab(train_ds, max_size = MAX_VOCAB_SIZE, vectors = 'glove.6B.200d', unk_init = torch.Tensor.zero_)

LABEL.build_vocab(train_ds)

BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Split the data into batches using Bucket Iterator which requires minimum amount of padding
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_ds, test_ds), 
    batch_size = BATCH_SIZE,
    sort = True,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True,
    device = device)

In [32]:
#No. of unique tokens in text
print("Size of TEXT vocabulary :",len(TEXT.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
print(TEXT.vocab.stoi) 

Size of TEXT vocabulary : 5464
[('the', 1192), (',', 977), ('.', 585), ('of', 568), ('to', 514), ('a', 509), ('in', 421), ('and', 412), ('-', 284), ('on', 267)]


# Declaring Hyper Parameters

In [33]:
num_epochs = 10
learning_rate = 0.001

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Model Specifications

In [34]:
class LSTM_net(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        # Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        # LSTM layer
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        # Linear Layers followed by dropout
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        
        self.fc2 = nn.Linear(hidden_dim, 1)
        
        self.dropout = nn.Dropout(dropout)
    def forward(self, text, text_lengths):

        embedded = self.embedding(text)
        
        # # embedded = [sent len, batch size, emb dim]
        
        # # pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        # # unpack sequence
        # # output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        # # output = [sent len, batch size, hid dim * num directions]
        # # output over padding tokens are zero tensors
        
        # # hidden = [num layers * num directions, batch size, hid dim]
        # # cell = [num layers * num directions, batch size, hid dim]
        
        # # concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        # apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        output = self.fc1(hidden)
        output = self.dropout(self.fc2(output))

        return output

In [35]:
model = LSTM_net(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

# Constructing the Embedding Matrix

In [36]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)

model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

model.to(device) #CNN to GPU

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

torch.Size([5464, 200])
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0715,  0.0935,  0.0237,  ...,  0.3362,  0.0306,  0.2558],
        ...,
        [ 0.1980,  0.4274,  0.2203,  ..., -0.3212,  0.2271,  0.3212],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.2944, -0.2474, -0.1772,  ...,  0.1119,  0.4397,  0.1824]])


In [37]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

# Training the Model

In [38]:
def train(model, iterator):
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        
        #convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()  

        #compute the loss
        loss = criterion(predictions, batch.label)        

        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)  

        #backpropagate the loss and compute the gradients
        loss.backward()       

        #update the weights
        optimizer.step()      

        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()   

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [39]:
def evaluate(model, iterator):
    
    epoch_acc = 0
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze()
            acc = binary_accuracy(predictions, batch.label)
            
            epoch_acc += acc.item()
        
    return epoch_acc / len(iterator)

In [40]:
t = time.time()
loss=[]
acc=[]
test_acc_list=[]

for epoch in range(num_epochs):

    train_loss, train_acc = train(model, train_iterator)
    test_acc = evaluate(model, test_iterator)
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Test. Acc: {test_acc*100:.2f}%')
    
    loss.append(train_loss)
    acc.append(train_acc)
    test_acc_list.append(test_acc)
    
print(f'time:{time.time()-t:.3f}')

	Train Loss: 0.683 | Train Acc: 51.95%
	 Test. Acc: 56.96%
	Train Loss: 0.599 | Train Acc: 65.34%
	 Test. Acc: 65.81%
	Train Loss: 0.480 | Train Acc: 76.32%
	 Test. Acc: 73.48%
	Train Loss: 0.425 | Train Acc: 77.21%
	 Test. Acc: 78.88%
	Train Loss: 0.398 | Train Acc: 79.15%
	 Test. Acc: 80.16%
	Train Loss: 0.331 | Train Acc: 81.94%
	 Test. Acc: 81.75%
	Train Loss: 0.247 | Train Acc: 85.95%
	 Test. Acc: 83.55%
	Train Loss: 0.249 | Train Acc: 84.52%
	 Test. Acc: 86.60%
	Train Loss: 0.212 | Train Acc: 85.41%
	 Test. Acc: 83.81%
	Train Loss: 0.203 | Train Acc: 86.43%
	 Test. Acc: 85.84%
time:205.805
