In [52]:
import unicodedata
import sys
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from transformers import AutoModel, BertTokenizerFast

In [53]:
# Text-cleaning vocabulary. None of these constants is referenced in the
# visible part of the notebook; presumably they serve a preprocessing step
# defined elsewhere.

# Tokens treated as stop words.
STOP_WORDS = [
    "a", "an", "the", "this", "that",
    "of", "for", "or", "and", "on", "to",
    "be", "if", "we", "you", "in", "is", "at", "it",
    "rt", "mt",
    "with",
]

# Leading substrings that mark a token as noise (mentions, hashtags, links,
# HTML-escaped ampersands).
STOP_PREFIXES = (
    "@",
    "#",
    "http",
    "&amp",
)

# ASCII punctuation extended with the en dash and the ellipsis character.
punctuations = "".join((string.punctuation, '–', '…'))

We use two labeled datasets, one of fake news and one of true news. Each contains the news title, the news text, and additional metadata (subject and date).

In [54]:
### Data cleaning

# Load the two source files; each one holds a single class of news.
fake = pd.read_csv("data/fake.csv")
true = pd.read_csv("data/true.csv")

# Tag every row with its class as a string: 'True' = real news, 'False' = fake news.
fake['target'] = 'False'
true['target'] = 'True'

# Merge the two frames and shuffle the rows.
# - pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
# - random_state makes the shuffle reproducible across kernel restarts.
# - reset_index(drop=True) discards the pre-shuffle index instead of creating
#   an 'index' column that then has to be dropped again.
data = (
    pd.concat([true, fake])
    .sample(frac=1, random_state=1234)
    .reset_index(drop=True)
)

# Numeric label: 1 = fake news, 0 = real news.
# An explicit integer cast is used because pd.get_dummies returns booleans on
# recent pandas versions, which the tensor conversion / loss further down
# cannot use as class indices.
data['label'] = (data['target'] == 'False').astype(int)

  data = true.append(fake).sample(frac=1).reset_index().drop(columns=['index'])


In [55]:
# Preview the first rows of the merged, shuffled frame (fake and true news together):

data.head()

Unnamed: 0,title,text,subject,date,target,label
0,Russian with ties to former Trump aide wants i...,WASHINGTON (Reuters) - A Russian oligarch once...,politicsNews,"May 27, 2017",True,0
1,SURGE IN MUSLIM IMMIGRATION INTO WESTERN WORLD...,Thanks to the United Nations and our State Dep...,politics,"May 9, 2015",False,1
2,Trump campaign paid lawyer now representing so...,WASHINGTON (Reuters) - Donald Trump’s campaign...,politicsNews,"July 15, 2017",True,0
3,TRUMP CHALLENGES FAKE MEDIA: “Are we going to ...,You have to give it to President Trump who wen...,politics,"Aug 15, 2017",False,1
4,"WOW! STERLING HTS, MI CITY COUNCIL Caves On Me...","In September 2015, the Sterling Heights, MI Ci...",politics,"Feb 22, 2017",False,1


In [56]:
# Check the balance of the two classes in the merged dataset.
# 'target' is the string class column: 'True' = real news, 'False' = fake news.

n_total = len(data)
n_real = len(data[data['target'] == 'True'])
n_fake = len(data[data['target'] == 'False'])

# The third line previously repeated "real news" although it reports the
# share of rows whose target is 'False', i.e. fake news.
print('''
    The dataset has {} pieces of news in total.
    The portion of real news in this dataset is {}
    The portion of fake news in this dataset is {}'''.format(
        n_total,
        n_real/n_total,
        n_fake/n_total,
    ))


    The dataset has 44898 pieces of news in total.
    The portion of real news in this dataset is 0.47701456635039424
    The portion of real news in this dataset is 0.5229854336496058


Splitting dataset

In [85]:
# No subsampling happens here: `sample` is simply another name for the full
# `data` frame, so every row takes part in the split.
# Only the news title is used as model input; the numeric `label` is the target.

sample = data

# First split: 70% training / 30% held out, stratified on the string class column.
train_text, temp_text, train_labels, temp_labels = train_test_split(sample['title'], sample['label'], 
                                                                    random_state=1234, 
                                                                    test_size=0.3, 
                                                                    stratify=sample['target'])

# Second split: the held-out 30% is divided 40% validation / 60% test
# (about 12% / 18% of all rows), stratified on the numeric label.
val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, 
                                                                random_state=1234, 
                                                                test_size=0.6, 
                                                                stratify=temp_labels)

In [86]:
# Report how many titles ended up in each of the three splits.
split_report = '''
    Training data size: {},
    Validation data size: {},
    Testing data size: {}.
    '''
split_sizes = [len(part) for part in (train_text, val_text, test_text)]
print(split_report.format(*split_sizes))


    Training data size: 31428,
    Validation data size: 5388,
    Testing data size: 8082.
    


## Method 1: BERT

In [87]:
# Encoder and tokenizer are loaded from the same pretrained checkpoint so that
# the token ids produced by one match the vocabulary of the other.
checkpoint = 'bert-base-uncased'

bert = AutoModel.from_pretrained(checkpoint)
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [88]:
# Number of whitespace-separated words in each training title; the mean is
# displayed as the cell output.
seq_len = [
    len(title.split())
    for title in train_text
]
np.mean(seq_len)

12.438589792541682

In [89]:
# Fixed token length used for every title (the mean title length measured
# in the previous cell is about 12.4 words).
# NOTE(review): this budget counts tokens produced by the tokenizer, not
# words, and presumably also has to hold the special tokens it adds, so many
# titles are likely truncated - consider a larger value.
L = 13


def tokenize(text, max_len):
    '''
    Encode a collection of titles into fixed-length inputs for BERT.

    Uses the module-level `tokenizer`.

    Parameters
    ----------
    text : pandas.Series or any other iterable of str
        The titles to encode.
    max_len : int
        Every encoded sequence is padded or truncated to exactly this many
        tokens.

    Returns
    -------
    The batch encoding produced by the tokenizer. This notebook reads its
    'input_ids' and 'attention_mask' entries.
    '''
    tk = tokenizer.batch_encode_plus(
        # list() accepts a Series as well as a plain list or tuple.
        list(text),
        max_length=max_len,
        # `pad_to_max_length=True` is deprecated; this is its replacement.
        padding='max_length',
        truncation=True,
    )
    return tk


train_tk = tokenize(train_text, L)
val_tk = tokenize(val_text, L)
test_tk = tokenize(test_text, L)



In [90]:
## convert lists to tensors
import torch


def _as_tensors(encoded, labels):
    '''Return (token ids, attention mask, labels) of one split as tensors.'''
    return (
        torch.tensor(encoded['input_ids']),
        torch.tensor(encoded['attention_mask']),
        torch.tensor(labels.tolist()),
    )


# One (ids, mask, labels) triple per split.
train_seq, train_mask, train_y = _as_tensors(train_tk, train_labels)
val_seq, val_mask, val_y = _as_tensors(val_tk, val_labels)
test_seq, test_mask, test_y = _as_tensors(test_tk, test_labels)

In [91]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Number of titles per mini-batch, shared by both loaders.
batch_size = 32

# Training set: batches are drawn in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation set: batches are read in their stored order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

In [92]:
# Freeze the pretrained encoder: with gradients switched off for all of its
# weights, only layers added on top of it can be updated during training.
for encoder_weight in bert.parameters():
    encoder_weight.requires_grad_(False)

In [93]:
import torch.nn as nn

class BERT_Arch(nn.Module):
    '''
    Classification head on top of a BERT-style encoder.

    The encoder's pooled output is passed through
    Linear -> ReLU -> Dropout -> Linear -> LogSoftmax, so the module returns
    per-class log-probabilities (suitable for `nn.NLLLoss`).

    Parameters
    ----------
    bert : nn.Module
        Encoder called as `bert(sent_id, attention_mask=mask)`; its result
        must be indexable with 'pooler_output'.
    hidden_size : int, default 768
        Width of the encoder's pooled output.
    fc_size : int, default 512
        Width of the intermediate dense layer.
    num_classes : int, default 2
        Number of output classes.
    dropout : float, default 0.1
        Dropout probability applied after the first dense layer.

    The defaults reproduce the sizes that used to be hard-coded, and the
    attribute names are unchanged, so previously saved state dicts still load.
    '''

    def __init__(self, bert, hidden_size=768, fc_size=512, num_classes=2, dropout=0.1):
        super().__init__()

        self.bert = bert

        # dropout layer
        self.dropout = nn.Dropout(dropout)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(hidden_size, fc_size)

        # dense layer 2 (output layer)
        self.fc2 = nn.Linear(fc_size, num_classes)

        # log-softmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        '''
        Compute class log-probabilities for a batch.

        Parameters
        ----------
        sent_id : tensor of token ids, passed to the encoder unchanged.
        mask : attention mask, passed to the encoder as `attention_mask`.

        Returns
        -------
        Tensor with one row per sample and one column per class, holding
        log-probabilities.
        '''
        # pooled sentence representation from the encoder
        cls_hs = self.bert(sent_id, attention_mask=mask)['pooler_output']

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # normalise to log-probabilities
        return self.softmax(x)

In [94]:
# Wrap the pretrained encoder (frozen in an earlier cell) with the trainable
# classification head.
model = BERT_Arch(bert)

In [95]:
# The AdamW shipped with `transformers` is deprecated and missing from recent
# releases, so PyTorch's implementation is used instead.
# eps and weight_decay are pinned to the values the transformers version used
# by default (1e-6 and 0.0); PyTorch's own defaults differ.
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)



In [96]:
from sklearn.utils.class_weight import compute_class_weight

# Per-class weights from scikit-learn's "balanced" heuristic, computed on the
# training labels only. The order of the weights follows the sorted label values.
label_values = np.unique(train_labels)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=label_values,
    y=train_labels,
)
print("Class Weights:",class_weights)

Class Weights: [1.04815902 0.95607204]


In [97]:
# Number of passes over the training set.
epochs = 10

# Class weights as a float32 tensor for the loss.
weights = torch.tensor(class_weights, dtype=torch.float32)

# Negative log-likelihood loss; the model already outputs log-probabilities.
cross_entropy = nn.NLLLoss(weight=weights)

In [98]:
def train():
  '''
  Run one training epoch over `train_dataloader`.

  Uses the module-level `model`, `train_dataloader`, `cross_entropy` and
  `optimizer`. Every batch is moved to the device the model's parameters
  live on before the forward pass.

  Returns
  -------
  (avg_loss, total_preds)
      avg_loss    : mean of the per-batch losses over the epoch.
      total_preds : numpy array of the model outputs for every sample seen,
                    concatenated along axis 0 in iteration order.
  '''
  model.train()

  # Run the batches wherever the model is. On a CPU-only setup this is a no-op.
  device = next(model.parameters()).device

  total_loss = 0

  # model outputs of every batch
  total_preds = []

  for step, batch in enumerate(train_dataloader):

    # progress update after every 50 batches.
    if step % 50 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

    # move the batch to the model's device
    sent_id, mask, labels = [t.to(device) for t in batch]

    # clear previously calculated gradients
    model.zero_grad()

    # forward pass and loss for the current batch
    preds = model(sent_id, mask)
    loss = cross_entropy(preds, labels)
    total_loss = total_loss + loss.item()

    # backward pass to calculate the gradients
    loss.backward()

    # clip the gradient norm to 1.0 to guard against exploding gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # update parameters
    optimizer.step()

    # keep a detached CPU copy of the outputs
    total_preds.append(preds.detach().cpu().numpy())

  # average loss over the batches of this epoch
  avg_loss = total_loss / len(train_dataloader)

  # list of (batch, classes) arrays -> one (samples, classes) array
  total_preds = np.concatenate(total_preds, axis=0)

  return avg_loss, total_preds


def evaluate():
  '''
  Compute loss and model outputs over `val_dataloader` without training.

  Uses the module-level `model`, `val_dataloader` and `cross_entropy`.
  The model is switched to eval mode and gradients are disabled. Every batch
  is moved to the device the model's parameters live on.

  Returns
  -------
  (avg_loss, total_preds)
      avg_loss    : mean of the per-batch losses.
      total_preds : numpy array of the model outputs for every sample,
                    concatenated along axis 0 in iteration order.
  '''
  print("\nEvaluating...")

  # deactivate dropout layers
  model.eval()

  # Run the batches wherever the model is. On a CPU-only setup this is a no-op.
  device = next(model.parameters()).device

  total_loss = 0

  # model outputs of every batch
  total_preds = []

  for step, batch in enumerate(val_dataloader):

    # progress update every 50 batches.
    if step % 50 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

    # move the batch to the model's device
    sent_id, mask, labels = [t.to(device) for t in batch]

    # no gradients are needed for evaluation
    with torch.no_grad():

      preds = model(sent_id, mask)

      # validation loss between actual and predicted values
      loss = cross_entropy(preds, labels)
      total_loss = total_loss + loss.item()

      total_preds.append(preds.detach().cpu().numpy())

  # average loss over the validation batches
  avg_loss = total_loss / len(val_dataloader)

  # list of (batch, classes) arrays -> one (samples, classes) array
  total_preds = np.concatenate(total_preds, axis=0)

  return avg_loss, total_preds

In [99]:
# Lowest validation loss seen so far; the weights that achieved it are
# written to 'saved_weights.pt'.
best_valid_loss = float('inf')

# Loss history, one entry per epoch.
train_losses, valid_losses = [], []

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # one optimisation pass, then one pass over the validation set;
    # the per-sample outputs returned alongside the losses are not needed here
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # checkpoint only when the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # record the loss curves
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 10
  Batch    50  of    983.
  Batch   100  of    983.
  Batch   150  of    983.
  Batch   200  of    983.
  Batch   250  of    983.
  Batch   300  of    983.
  Batch   350  of    983.
  Batch   400  of    983.
  Batch   450  of    983.
  Batch   500  of    983.
  Batch   550  of    983.
  Batch   600  of    983.
  Batch   650  of    983.
  Batch   700  of    983.
  Batch   750  of    983.
  Batch   800  of    983.
  Batch   850  of    983.
  Batch   900  of    983.
  Batch   950  of    983.

Evaluating...
  Batch    50  of    169.
  Batch   100  of    169.
  Batch   150  of    169.

Training Loss: 0.579
Validation Loss: 0.509

 Epoch 2 / 10
  Batch    50  of    983.
  Batch   100  of    983.
  Batch   150  of    983.
  Batch   200  of    983.
  Batch   250  of    983.
  Batch   300  of    983.
  Batch   350  of    983.
  Batch   400  of    983.
  Batch   450  of    983.
  Batch   500  of    983.
  Batch   550  of    983.
  Batch   600  of    983.
  Batch   650  of    983.


In [102]:
#load weights of best model
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path))

# Switch to inference mode explicitly. If the last thing that ran was train()
# (for example because training was interrupted), the model is still in
# training mode and dropout would randomly perturb the test predictions.
model.eval()

# Predict in chunks of `batch_size` rows rather than pushing the whole test set
# through the encoder in a single forward pass, which keeps peak memory bounded.
with torch.no_grad():
    chunk_outputs = [
        model(ids_chunk, mask_chunk)
        for ids_chunk, mask_chunk in zip(test_seq.split(batch_size),
                                         test_mask.split(batch_size))
    ]
    # (samples, classes) array of log-probabilities
    preds = torch.cat(chunk_outputs, dim=0).detach().cpu().numpy()

In [112]:
test_y = np.array(test_y)

# The model outputs one row of class log-probabilities per sample, so the
# predicted class is the column with the highest value. Comparing the label
# vector with that 2-D matrix directly is not meaningful.
# If `preds` has already been reduced to class indices, it is used as is.
pred_labels = np.argmax(preds, axis=1) if np.ndim(preds) == 2 else np.asarray(preds)

# Fraction of test titles whose predicted class equals the true label.
test_accuracy = np.mean(test_y == pred_labels)
test_accuracy

0.8648849294729027

The accuracy on testing data is 0.865