In [192]:
# Import packages
import unicodedata
import sys
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from transformers import AutoModel, BertTokenizerFast
import torch
from transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


$$Part 1: Preparation$$

In [193]:
# Import and merge data
true_data = pd.read_csv('data/true.csv')
fake_data = pd.read_csv('data/fake.csv')
true_data["label"] = 1
fake_data["label"] = 0
data = true_data.append(fake_data).sample(frac=1).reset_index().drop(columns=['index', 'subject', 'date', 'text'])

  data = true_data.append(fake_data).sample(frac=1).reset_index().drop(columns=['index', 'subject', 'date', 'text'])


Need from util.py: a helper function to remover "Reuters" from real news without modifying the dataset

In [194]:
data.head()

Unnamed: 0,title,label
0,America’s Stepdad Tim Kaine Just Burned Trump...,0
1,Trump Just Issued A New But Utterly USELESS T...,0
2,"DEM PARTY OFFICIAL, Chair Of Black Caucus, Ber...",0
3,MUSLIM SCHOLAR CRITICIZES OBAMA…Explains Why A...,0
4,US Presidential Debates Much More Corrupt Than...,0


$$Part 1: Preparation$$

In [195]:
# Split data into training (70%), validation (15%) and testing (15%)

# To make the process faster, we only take a small portion of data (1000 samples)
data = data[:5000]

train_text, temp_text, train_labels, temp_labels = train_test_split(data['title'], data['label'], 
                                                                    random_state=2018, 
                                                                    test_size=0.3, 
                                                                    stratify=data['label'])

val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, 
                                                                random_state=2018, 
                                                                test_size=0.5, 
                                                                stratify=temp_labels)

In [196]:
# Check the balance of different classifications

print('''
    The dataset has {} pieces of news in total.
    The portion of real news in this dataset is {}
    The portion of real news in this dataset is {}'''.format(
        len(data),
        len(data[data['label'] == 1])/len(data),
        len(data[data['label'] == 0])/len(data),
    ))

print('''
    Training data size: {},
    Validation data size: {},
    Testing data size: {}.
    '''.format(len(train_text), len(val_text), len(test_text))
)


    The dataset has 5000 pieces of news in total.
    The portion of real news in this dataset is 0.4756
    The portion of real news in this dataset is 0.5244

    Training data size: 3500,
    Validation data size: 750,
    Testing data size: 750.
    


$$Part 2: Model Building$$

In [197]:
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [198]:
# Measure the average length of the context (title)
np.mean([len(i.split()) for i in train_text])

12.449142857142856

In [199]:
# Now that we have the average length of our sample, we tokenize them into an object for BERT to learn

L = 20

def tokenize(text, max_len):
    '''
    A function to turn English text into a token by the max length
    Inputs: 
      text (str): the text to process
      max_len (int): the maximum length of a token
    Returns:
      tk: an BERT Encoding object to train the model
    '''

    tk = tokenizer.batch_encode_plus(
        text.tolist(),
        max_length = max_len,
        pad_to_max_length=True,
        truncation=True
        )

    return tk

train_tk = tokenize(train_text, L)
val_tk = tokenize(val_text, L)
test_tk = tokenize(test_text, L)



In [200]:
# Vectorizing every sentense, create a quantified database for machine learning

train_seq = torch.tensor(train_tk['input_ids'])
train_mask = torch.tensor(train_tk['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(val_tk['input_ids'])
val_mask = torch.tensor(val_tk['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

test_seq = torch.tensor(test_tk['input_ids'])
test_mask = torch.tensor(test_tk['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

In [201]:
# An intuition: what we are doing: Vectorizing every sentense
print(train_text[0])
print(train_tk['input_ids'][0])
print(train_tk['attention_mask'][0])
print(train_labels.tolist()[0])

 America’s Stepdad Tim Kaine Just Burned Trump HARD On Colbert
[101, 28514, 17248, 4187, 2342, 2005, 1057, 1012, 1055, 1012, 4040, 4335, 1010, 2231, 4804, 102, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
1


In [202]:
# Convert
# In each epoch of training, the model randomly select training datasets

batch_size = 150

train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

In [203]:
for param in bert.parameters():
    param.requires_grad = False

class BERT_Arch(torch.nn.Module):

    def __init__(self, bert):
      
        super(BERT_Arch, self).__init__()

        self.bert = bert 
        self.dropout = torch.nn.Dropout(0.1)
        self.relu =  torch.nn.ReLU()
        self.fc1 = torch.nn.Linear(768,512)
        self.fc2 = torch.nn.Linear(512,2)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    #define the forward pass
    def forward(self, sent_id, mask):
        cls_hs = self.bert(sent_id, attention_mask=mask)['pooler_output']
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [204]:
model = BERT_Arch(bert)

# define the optimizer
optimizer = AdamW(model.parameters(),lr = 1e-5) # learning rate



$$Part3: Training$$

In [205]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
                                        class_weight = "balanced",
                                        classes = np.unique(train_labels),
                                        y = train_labels                                                    
                                    )
print("Class Weights:",class_weights)

Class Weights: [0.95367847 1.05105105]


In [206]:
weights= torch.tensor(class_weights,dtype=torch.float)

cross_entropy  = torch.nn.NLLLoss(weight=weights) 

epochs = 10

In [207]:
def train():
  
    model.train()

    total_loss, total_accuracy = 0, 0
    
    # empty list to save model predictions
    total_preds=[]
    
    # iterate over batches
    for step,batch in enumerate(train_dataloader):
    
        # progress update after every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        batch = [r for r in batch]
        sent_id, mask, labels = batch
        model.zero_grad()        
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss = total_loss + loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        preds=preds.detach().cpu().numpy()
        total_preds.append(preds)

    avg_loss = total_loss / len(train_dataloader)

    total_preds  = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [208]:
def evaluate():
    
    print("\nEvaluating...")
    
    model.eval()

    total_loss, total_accuracy = 0, 0
    total_preds = []
    for step,batch in enumerate(val_dataloader):
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        batch = [t for t in batch]

        sent_id, mask, labels = batch

        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds,labels)
            total_loss = total_loss + loss.item()
            preds = preds.detach().cpu().numpy()
            total_preds.append(preds)

    avg_loss = total_loss / len(val_dataloader) 

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [209]:
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

#for each epoch
for epoch in range(epochs):
     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    train_loss, _ = train()

    valid_loss, _ = evaluate()

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 10

Evaluating...

Training Loss: 0.686
Validation Loss: 0.654

 Epoch 2 / 10

Evaluating...

Training Loss: 0.649
Validation Loss: 0.618

 Epoch 3 / 10

Evaluating...

Training Loss: 0.625
Validation Loss: 0.594

 Epoch 4 / 10

Evaluating...

Training Loss: 0.605
Validation Loss: 0.580

 Epoch 5 / 10

Evaluating...

Training Loss: 0.599
Validation Loss: 0.569

 Epoch 6 / 10

Evaluating...

Training Loss: 0.590
Validation Loss: 0.560

 Epoch 7 / 10

Evaluating...

Training Loss: 0.578
Validation Loss: 0.555

 Epoch 8 / 10

Evaluating...

Training Loss: 0.580
Validation Loss: 0.547

 Epoch 9 / 10

Evaluating...

Training Loss: 0.569
Validation Loss: 0.540

 Epoch 10 / 10

Evaluating...

Training Loss: 0.568
Validation Loss: 0.539


In [210]:
#load weights of best model
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path))
with torch.no_grad():
    preds = model(test_seq, test_mask)
    preds = preds.detach().cpu().numpy()

$$Part4: Performance$$

In [211]:
# Check precision, recall and f1-score 
preds = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds))

              precision    recall  f1-score   support

           0       0.88      0.56      0.68       394
           1       0.65      0.91      0.76       356

    accuracy                           0.73       750
   macro avg       0.76      0.74      0.72       750
weighted avg       0.77      0.73      0.72       750



In [212]:
def accuracy(matrix):
    tot =matrix[0][0] + matrix[0][1] + matrix[1][0] + matrix[1][1]
    part = matrix[1][1] + matrix[0][0]
    return part/tot

In [213]:
# Compute a confusion maatrix on prediction results 
x = confusion_matrix(preds,test_y)
print(x)
accuracy(x)

[[220  31]
 [174 325]]


0.7266666666666667