In [1]:
import numpy as np
import pandas as pd
import re

import nltk
from sklearn.model_selection import train_test_split
from collections import defaultdict
import random
from math import ceil

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.notebook import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!ls drive/MyDrive/BD_Model/Data 

Fake.csv  news_data_combined.h5  news_train.csv  syrian_war.csv  True.csv


In [4]:
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

print(device)

cuda:0


In [5]:
fakes = pd.read_csv(r"drive/MyDrive/BD_Model/Data/Fake.csv")
truth = pd.read_csv(r"drive/MyDrive/BD_Model/Data/True.csv")
news_train = pd.read_csv(r"drive/MyDrive/BD_Model/Data/news_train.csv")

In [6]:
print(f"Dataset shape: {fakes.shape}")
fakes.head()

Dataset shape: (23481, 4)


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [7]:
print(f"Dataset shape: {truth.shape}")
truth.head()

Dataset shape: (21417, 4)


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [8]:
print(f"Dataset shape: {news_train.shape}")
news_train.head()

Dataset shape: (20800, 5)


Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [9]:
fakes_data = fakes[~fakes.title.isna() & ~fakes.text.isna()][['title', 'text']]
fakes_data['is_truth'] = 0

truth_data = truth[~truth.title.isna() & ~truth.text.isna()][['title', 'text']]
truth_data['is_truth'] = 1

news_train_data = news_train[~news_train.title.isna() & ~news_train.text.isna()]
news_train_data.label = news_train_data.label.apply(lambda x: 1 if x==0 else 0)
news_train_data = news_train_data.rename(columns={'label': 'is_truth'})[['title', 'text', 'is_truth']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [10]:
data = pd.concat([fakes_data, truth_data], axis=0).reset_index(drop=True)
data = pd.concat([data, news_train_data], axis=0).reset_index(drop=True)

### Data cleaning and processing

In [11]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer() #lemmatization

def procces_text(text):
    
    text = re.sub("[^a-zA-Z]", " ",text)
    text = text.lower()
    text = [word for word in nltk.word_tokenize(text) if not word in stop_words]
    text = [lemmatizer.lemmatize(token) for token in text]

    
    return text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [12]:
data['text_processing'] = data.text.apply(procces_text)
data['title_processing'] = data.title.apply(procces_text)

In [13]:
vocab = defaultdict(lambda: 0)
vocab['<PAD>'] = 1

def word_to_number(text_list):
    for word in text_list:
        if word not in vocab:
            vocab[word] = len(vocab) + 1
    return text_list

data['text_processing'] = data['text_processing'].apply(word_to_number)
data['title_processing'] = data['title_processing'].apply(word_to_number)
print('Vocabulary size is: ', len(vocab))

Vocabulary size is:  187837


In [14]:
print(f'Average length of title: {np.mean(list(map(lambda x: len(x), data.title_processing)))}')
print(f'Average length of text: {np.mean(list(map(lambda x: len(x), data.text_processing)))}')

Average length of title: 9.188691417950569
Average length of text: 294.62945269657916


In [15]:
print(f'Median length of title: {np.median(list(map(lambda x: len(x), data.title_processing)))}')
print(f'Median length of text: {np.median(list(map(lambda x: len(x), data.text_processing)))}')

Median length of title: 9.0
Median length of text: 224.0


In [16]:
print("Count of papers with title is longer than 64 words: ", (np.array(list(map(lambda x: len(x), data.title_processing))) > 64).sum())
print("Count of papers with full text is longer than 1024 words: ", (np.array(list(map(lambda x: len(x), data.text_processing))) > 1024).sum())

Count of papers with title is longer than 64 words:  0
Count of papers with full text is longer than 1024 words:  1349


In [17]:
data["title_len"] = data.title_processing.apply(lambda x: len(x))
data["text_len"] = data.text_processing.apply(lambda x: len(x))

In [18]:
data_restricted_length = data[(data.title_len <= 64) & (data.text_len <= 1024)][['title_processing', 'text_processing', 'is_truth']].rename(
columns = {'title_processing' : 'title', 'text_processing': 'text'})

In [19]:
data_restricted_length.title = data_restricted_length.title.apply(lambda x: [vocab[i] for i in x])
data_restricted_length.text = data_restricted_length.text.apply(lambda x: [vocab[i] for i in x])

In [20]:
data_restricted_length.title = data_restricted_length.title.apply(lambda x: x if len(x) == 64 else x + [1 for i in range(64 - len(x))])
data_restricted_length.text = data_restricted_length.text.apply(lambda x: x if len(x) == 1024 else x + [1 for i in range(1024 - len(x))])

In [21]:
data_restricted_length.head()

Unnamed: 0,title,text,is_truth
0,"[2, 3, 49, 3926, 7, 8, 98, 107, 2834, 1, 1, 1,...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...",0
1,"[192, 5237, 3, 2396, 182, 197, 1014, 174, 1, 1...","[158, 159, 160, 161, 162, 163, 164, 165, 166, ...",0
2,"[286, 287, 288, 7049, 321, 2452, 7692, 330, 85...","[284, 195, 19, 285, 286, 287, 288, 289, 290, 2...",0
3,"[3, 603, 33, 483, 604, 533, 514, 157, 1, 1, 1,...","[474, 166, 2, 3, 475, 71, 476, 122, 477, 166, ...",0
4,"[607, 608, 359, 2, 3, 474, 971, 1, 1, 1, 1, 1,...","[607, 608, 609, 610, 474, 166, 107, 611, 2, 3,...",0


In [22]:
train_validate = data_restricted_length.sample(frac=0.8)
test = data_restricted_length.drop(train_validate.index)

train = train_validate.sample(frac=0.95)
validate = train_validate.drop(train.index)

In [23]:
title_train = np.array(train.title.tolist())
text_train = np.array(train.text.tolist())
y_train = np.array(train.is_truth.tolist())

title_test = np.array(test.title.tolist())
text_test = np.array(test.text.tolist())
y_test = np.array(test.is_truth.tolist())

title_validate = np.array(validate.title.tolist())
text_validate = np.array(validate.text.tolist())
y_validate = np.array(validate.is_truth.tolist())

In [24]:
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, title, text, label):
        super().__init__()
        self.title = title
        self.text = text
        self.label = label
        
    def __getitem__(self, idx):
        item = (self.title[idx], self.text[idx], self.label[idx])
        return item
    
    def __len__(self):
        return self.title.shape[0]

In [25]:
train_dataset = NewsDataset(title_train, text_train, y_train)
test_dataset = NewsDataset(title_test, text_test, y_test)
validate_dataset = NewsDataset(title_validate, text_validate, y_validate)

In [26]:
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=False)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False, drop_last=False)
validate_data_loader = torch.utils.data.DataLoader(validate_dataset, batch_size=64, shuffle=False, drop_last=False)

In [27]:
class FakeNewsDetector(nn.Module):
    def __init__(self, num_embeddings=len(vocab), embedding_dim=256, hidden=[128, 128, 64, 8, 1]):
        super().__init__()
        self.hidden = hidden
        self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_idx=1)
        self.conv1d_1 = nn.Sequential(
            nn.Conv1d(in_channels=embedding_dim, out_channels=self.hidden[0], kernel_size=3, stride=1),
            nn.BatchNorm1d(self.hidden[0]),
            nn.SiLU(),
            nn.Conv1d(in_channels=self.hidden[0], out_channels=self.hidden[1], kernel_size=3, stride=1),
            nn.BatchNorm1d(self.hidden[1]),
            nn.SiLU(),
            nn.MaxPool1d(2)
        )
        self.conv1d_2 = nn.Sequential(
            nn.Conv1d(in_channels=self.hidden[1], out_channels=self.hidden[2], kernel_size=1, stride=1),
            nn.BatchNorm1d(self.hidden[2]),
            nn.SiLU(),
            nn.MaxPool1d(2)
        )
        self.lstm = nn.LSTM(input_size=self.hidden[2], hidden_size=self.hidden[3], num_layers=1, batch_first =True, bidirectional =True)
        self.fn_act = nn.Tanh()
        
        self.flatten = nn.Flatten()
        lstm_out_dim = 255 * self.hidden[3] * 2
        self.linear = nn.Linear(lstm_out_dim, self.hidden[4])
        
    def forward(self, x):
        x = self.embedding(x)
        x = x.transpose(-1, -2)
        
        x = self.conv1d_1(x)
        x = self.conv1d_2(x)
        
        x = x.transpose(-1, -2)
        x, _ = self.lstm(x)
        x = self.fn_act(x)
        
        x = self.flatten(x)
        x = self.linear(x)
        
        return x

In [28]:
def train_model(model, optimizer, data_loader, loss_module, num_epochs=10, is_printed=True):
    # Set model to train mode

    loss_array = []
    # Training loop
    for epoch in tqdm(range(num_epochs)):
        model.train()
        for batch in data_loader:

            ## Step 1: Move input data to device (only strictly necessary if we use GPU)
            data_inputs, data_labels = batch[1], batch[2]
            data_inputs = data_inputs.to(device)
            data_labels = data_labels.to(device)

            ## Step 2: Run the model on the input data
            preds = model(data_inputs)
            preds = preds.squeeze(dim=1) # Output is [Batch size, 1], but we want [Batch size]

            ## Step 3: Calculate the loss
            loss = loss_module(preds, data_labels.float())

            ## Step 4: Perform backpropagation
            # Before calculating the gradients, we need to ensure that they are all zero.
            # The gradients would not be overwritten, but actually added to the existing ones.
            optimizer.zero_grad()
            # Perform backpropagation
            loss.backward()

            ## Step 5: Update the parameters
            optimizer.step()
        loss_array.append(loss.item())
        val_acc = eval_model(model, validate_data_loader, 0.75, False)
        if is_printed:
            print(f'Epoch {epoch}: loss = {loss_array[epoch]}, validation accuracy = {val_acc}')

In [29]:
def eval_model(model, data_loader, threshold=0.5, is_printed=True):
    model.eval() # Set model to eval mode
    true_preds, num_preds = 0., 0.

    with torch.no_grad(): # Deactivate gradients for the following code
        for batch in data_loader:

            # Determine prediction of model on dev set
            data_inputs, data_labels = batch[1], batch[2]
            data_inputs, data_labels = data_inputs.to(device), data_labels.to(device)
            preds = model(data_inputs)
            preds = preds.squeeze(dim=1)
            preds = torch.sigmoid(preds) # Sigmoid to map predictions between 0 and 1
            pred_labels = (preds >= threshold).long() # Binarize predictions to 0 and 1

            # Keep records of predictions for the accuracy metric (true_preds=TP+TN, num_preds=TP+TN+FP+FN)
            true_preds += (pred_labels == data_labels).sum()
            num_preds += data_labels.shape[0]

    acc = true_preds / num_preds
    if is_printed:
        print(f"Accuracy of the model: {100.0*acc:4.2f}%")
        return None
    return acc

In [30]:
model = FakeNewsDetector()

In [31]:
model.to(device)

FakeNewsDetector(
  (embedding): Embedding(187837, 256, padding_idx=1)
  (conv1d_1): Sequential(
    (0): Conv1d(256, 128, kernel_size=(3,), stride=(1,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): SiLU()
    (3): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
    (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): SiLU()
    (6): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv1d_2): Sequential(
    (0): Conv1d(128, 64, kernel_size=(1,), stride=(1,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): SiLU()
    (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (lstm): LSTM(64, 8, batch_first=True, bidirectional=True)
  (fn_act): Tanh()
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=4080, out_features=1, bias=True)
)

In [32]:
loss_module = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.1)

In [33]:
train_model(model, optimizer, train_data_loader, loss_module)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0: loss = 0.0063687097281217575, validation accuracy = 0.9509803652763367
Epoch 1: loss = 0.03896888718008995, validation accuracy = 0.9670588374137878
Epoch 2: loss = 0.07235900312662125, validation accuracy = 0.8941176533699036
Epoch 3: loss = 0.09373366832733154, validation accuracy = 0.9419607520103455
Epoch 4: loss = 0.0015552560798823833, validation accuracy = 0.9650980234146118
Epoch 5: loss = 8.024663111427799e-05, validation accuracy = 0.9666666388511658
Epoch 6: loss = 0.0034102534409612417, validation accuracy = 0.9690195918083191
Epoch 7: loss = 0.0038583395071327686, validation accuracy = 0.9694117307662964
Epoch 8: loss = 9.049410437000915e-05, validation accuracy = 0.9678431153297424
Epoch 9: loss = 0.004774333443492651, validation accuracy = 0.9701960682868958


In [35]:
eval_model(model, test_data_loader, 0.75, True)

Accuracy of the model: 97.07%


In [None]:
state_dict = model.state_dict()
print(state_dict)

In [None]:
torch.save(state_dict, r"drive/MyDrive/BD_Model/FakeNewsDetector_model.tar")

In [37]:
import json

In [39]:
vocab_file = open("drive/MyDrive/BD_Model/vocabulary.json", "w")
json.dump(vocab, vocab_file)
vocab_file.close()