In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset path used later in this notebook (under /content/drive/MyDrive) resolves.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [78]:
#!pip install transformers

'''
# download Kaggle true fake news dataset
!pip install kaggle
import os
os.environ["KAGGLE_CONFIG_DIR"] = "/content/drive/MyDrive/Colab Notebooks/CS6120 NLP/Assignment 6 BERT"
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset
'''

from sklearn.model_selection import train_test_split
from sklearn import model_selection                   # K fold library
from torch.utils.data import DataLoader                 
from torch.optim import AdamW
import torch
import pandas as pd
import os
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, average_precision_score, average_precision_score
from sklearn.metrics import classification_report, plot_confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import make_scorer  
from sklearn import metrics 

In [79]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU,
# and report which one was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('Using GPU ', torch.cuda.get_device_name(0))
else:
    print('Using CPU')

Using CPU


In [80]:
# Prepare tokenizer and model
# Both are loaded from the pretrained 'bert-base-uncased' checkpoint. No
# num_labels is passed, so the library default for the classification head
# applies. The loader warning printed below reports that some
# BertForSequenceClassification weights were not initialised from the
# checkpoint, so the model must be fine-tuned before its predictions mean anything.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
print(tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [81]:
# Model configuration Settings
#NUM_LABELS = 1
BATCH_SIZE = 32        # batch size handed to both DataLoaders in the search loop
MAX_LEN = 30           # max_length passed to the tokenizer via NewsDataset (titles are padded/truncated to it)
EPOCHS = 20            # upper bound on training epochs per fold
LEARNING_RATE = 1e-5   # NOTE(review): not referenced by any visible cell; the search loop iterates its own learning_rates list

In [82]:
# Prepare dataset
path = "/content/drive/MyDrive/Colab Notebooks/CS6120 NLP/Assignment 6 BERT/fake-and-real-news-dataset/"
df_real = pd.read_csv(path + 'True.csv')
df_fake = pd.read_csv(path + 'Fake.csv')

# Add y_true: 1 marks rows from True.csv, 0 marks rows from Fake.csv
df_real['Category'] = 1
df_fake['Category'] = 0

# Combine true news and fake news into one single frame.
# DataFrame.append is deprecated (it emitted a FutureWarning here) and was
# removed in pandas 2.0, so use pd.concat. ignore_index=True gives the combined
# frame a unique 0..n-1 index instead of two overlapping per-file ranges; the
# rest of the notebook only uses positional (.iloc) access, so this is safe.
df = pd.concat([df_real, df_fake], ignore_index=True)

  df = df_real.append(df_fake)


In [83]:
# Diminish dataset to 25% due to training time.
# random_state pins the shuffle so the same 25% subset is drawn on every run
# (42 matches the seed used for KFold further down); without it each
# Restart & Run All would train on different rows.
# NOTE: this cell rebinds `df`, so re-running it without re-running the load
# cell shrinks the data again.
dataset_75, dataset_25 = train_test_split(df, test_size=0.25, shuffle=True, random_state=42)
df = dataset_25

In [84]:
'''
# convert news title to input ids using BERT tokenizer
def process_data(df, tokenizer, max_len=30):
    """
    Process the data to feed into the pretrained model
    """
    tokenizer_dict = tokenizer(df.title.values.tolist(), return_tensors='pt', padding=True, truncation=True, max_length=10)
    tokens = tokenizer_dict['input_ids']
    attention_mask = tokenizer_dict['attention_mask']
    y = torch.tensor(df.Category.values)

    return tokens, attention_mask, y

print(process_data(df, tokenizer, max_len=MAX_LEN)[:10])
'''

'\n# convert news title to input ids using BERT tokenizer\ndef process_data(df, tokenizer, max_len=30):\n    """\n    Process the data to feed into the pretrained model\n    """\n    tokenizer_dict = tokenizer(df.title.values.tolist(), return_tensors=\'pt\', padding=True, truncation=True, max_length=10)\n    tokens = tokenizer_dict[\'input_ids\']\n    attention_mask = tokenizer_dict[\'attention_mask\']\n    y = torch.tensor(df.Category.values)\n\n    return tokens, attention_mask, y\n\nprint(process_data(df, tokenizer, max_len=MAX_LEN)[:10])\n'

In [90]:
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    """Torch dataset that tokenizes one news title per item, on demand.

    Args:
        train_kf: DataFrame (e.g. one K-fold slice) with a "title" column
            holding the text and a "Category" column holding the label.
        tokenizer: callable invoked as
            ``tokenizer(text, return_tensors='pt', padding='max_length',
            truncation=True, max_length=max_len)`` and returning a mapping
            with 'input_ids' and 'attention_mask'.
        max_len: length every title is padded/truncated to.
    """

    def __init__(self, train_kf, tokenizer, max_len):
        self.train_kf = train_kf
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` for the row at position ``index``.

        The tokenizer output is returned unchanged, so with return_tensors='pt'
        the two tensors keep their leading size-1 axis; train/validate squeeze
        it after batching.
        """
        row = self.train_kf.iloc[index]

        # Use the tokenizer given to this instance rather than whatever global
        # happens to be named `tokenizer` in the notebook namespace.
        encoded = self.tokenizer(
            row["title"],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
        )
        input_ids = encoded['input_ids']
        attention_mask = encoded['attention_mask']
        y = torch.tensor(row["Category"])

        return input_ids, attention_mask, y

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.train_kf)

In [91]:
def train(train_dataloader, model, device, lr=2e-5, warmup_steps=200):
    """Run one training pass over ``train_dataloader`` and return the mean batch loss.

    A new AdamW optimizer and linear warmup/decay schedule are created on every
    call, so when this is called once per epoch the schedule restarts each epoch.

    Args:
        train_dataloader: sized iterable of ``(input_ids, attention_mask, labels)``
            batches; input_ids/attention_mask carry an extra size-1 axis at
            dim 1 (as produced by NewsDataset), which is squeezed here.
        model: module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``.loss`` tensor.
        device: torch device the model and each batch are moved to.
        lr: peak learning rate for AdamW.
        warmup_steps: number of linear warmup steps for the scheduler.

    Returns:
        float: mean of the per-batch losses over this pass.

    Raises:
        ValueError: if ``train_dataloader`` contains no batches.
    """
    num_batches = len(train_dataloader)
    if num_batches == 0:
        raise ValueError("train_dataloader contains no batches")

    model.to(device)
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    # The schedule needs the real number of optimizer steps: with
    # num_training_steps=-1 the linear decay factor is clamped to 0 as soon as
    # warmup ends, which silently stops learning.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_batches)
    total_loss = 0.0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = tuple(item.to(device) for item in batch)

        # Batching the per-item tokenizer output gives [batch, 1, seq_len];
        # drop the middle axis to get [batch, seq_len]. Labels arrive as
        # [batch] and are passed through unchanged.
        input_ids = input_ids.squeeze(1)
        attention_mask = attention_mask.squeeze(1)

        optimizer.zero_grad()

        # Forward pass; the model computes the loss itself when labels are given.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # .item() detaches the scalar so the running total does not keep every
        # batch's autograd graph alive.
        total_loss += loss.item()
        print("loss: ", loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()

    return total_loss / num_batches

In [None]:
def validate(val_dataloader, model, device, lr=2e-5, warmup_steps=200):
    """Evaluate ``model`` on ``val_dataloader`` and print a classification report.

    Predictions and true labels are collected over all batches, so the report
    covers the whole validation set.

    Args:
        val_dataloader: iterable of ``(input_ids, attention_mask, labels)``
            batches; input_ids/attention_mask carry an extra size-1 axis at
            dim 1, which is squeezed here.
        model: module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.logits`` with the class scores on the last axis.
        device: torch device the model and each batch are moved to.
        lr: unused; kept so the signature mirrors ``train``.
        warmup_steps: unused; kept so the signature mirrors ``train``.

    Returns:
        None. The report is printed.
    """
    model.to(device)
    model.eval()

    y_true, y_pred = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = tuple(item.to(device) for item in batch)

            # [batch, 1, seq_len] -> [batch, seq_len]
            input_ids = input_ids.squeeze(1)
            attention_mask = attention_mask.squeeze(1)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            # argmax over the last axis picks the highest-scoring class per sample.
            y_pred.extend(outputs.logits.argmax(-1).cpu().tolist())
            y_true.extend(labels.cpu().tolist())

    # classification_report expects (y_true, y_pred), in that order.
    print(classification_report(y_true, y_pred))
        

In [None]:
# Define learning rates to test
learning_rates = [0.0001, 0.001, 0.01]
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
best_lr = None
best_avg_loss = float('inf')
early_stopping_patience = 4   # consecutive non-improving epochs tolerated per fold

for lr in learning_rates:

    fold_losses = []   # best training loss reached in each fold for this lr

    print("lr", lr)

    # kf.split() yields one (train indices, validation indices) pair per fold.
    for train_index, valid_index in kf.split(df):

        # Get the training and validation data for this fold
        train_kf = df.iloc[train_index]
        val_kf = df.iloc[valid_index]

        print(len(train_kf))  # 5 fold is 80%
        print(len(val_kf))   # 5 fold is 20%

        # Get tokens and attention mask
        train_dataset = NewsDataset(train_kf, tokenizer, max_len=MAX_LEN)
        val_dataset = NewsDataset(val_kf, tokenizer, max_len=MAX_LEN)

        # Get dataloader
        train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
        # NOTE: the validation loader is built but no validation pass runs in
        # this loop yet; model selection below uses the training loss.
        val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

        # Start every fold from the pretrained checkpoint. Reusing one model
        # would carry weights trained on other folds (and other learning
        # rates) into this one, making the folds incomparable.
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

        previous_loss = float('inf')
        # The counter must live outside the epoch loop; resetting it every
        # epoch would make early stopping unreachable.
        num_epochs_no_improvement = 0

        for epoch in range(EPOCHS):

            # Train the model on this fold with the current learning rate
            loss = train(train_dataloader, model, device, lr=lr, warmup_steps=200)
            print("Epoch: {}, loss: {}".format(epoch, loss))

            if loss < previous_loss:
                previous_loss = loss
                num_epochs_no_improvement = 0
            else:
                num_epochs_no_improvement += 1
                if num_epochs_no_improvement >= early_stopping_patience:
                    print(f"Training loss has not improved for {early_stopping_patience} epochs. Stopping training.")
                    # `return` is a SyntaxError at notebook top level; break
                    # ends this fold and moves on to the next one.
                    break

        fold_losses.append(float(previous_loss))

    # Mean of the per-fold best losses decides which learning rate wins.
    avg_loss = sum(fold_losses) / len(fold_losses)
    print("lr: {}, average loss over folds: {}".format(lr, avg_loss))

    if avg_loss < best_avg_loss:
        best_avg_loss = avg_loss
        best_lr = lr

print("Best lr: {}, average loss: {}".format(best_lr, best_avg_loss))

In [None]:
# Manual single-step walkthrough starts here: put the model on the selected
# device and switch it to training mode.
model.to(device)
model.train()

In [None]:
# Toy batch of three sentences of very different lengths, used to show what
# the tokenizer returns when padding and truncation (max_length=10) are enabled.
text_batch = ["I love Pixar.",
              "I don't care for Pixar.",
              "This is such a super duper long sentence with so many words you can barely understand it oh my gosh"]
encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True, max_length=10)
# Pull out the two tensors the model's forward pass is given below.
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

In [None]:
# Display the token-id tensor produced for the three example sentences.
input_ids

In [None]:
# Display the attention mask returned alongside the token ids.
attention_mask

In [None]:
from torch.nn import functional as F
# Manual forward pass on the toy batch. The model was moved to `device`
# earlier, so the inputs and labels must be on the same device; on a GPU
# runtime, CPU tensors here raise a device-mismatch error.
labels = torch.tensor([1,0,0], device=device)
outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
# softmaxed_output = softmax_fn(outputs)
# Loss is computed directly from the raw logits and the integer class labels.
loss = F.cross_entropy(outputs.logits, labels)

In [None]:
# Display the cross-entropy loss tensor for the toy batch.
loss

In [None]:
# Backpropagate the toy-batch loss to populate gradients on the model parameters.
loss.backward()

In [None]:
# Use PyTorch's AdamW, the same optimizer class train() uses. The AdamW
# re-exported by transformers is deprecated in favour of torch.optim.AdamW
# and is not importable from recent transformers releases.
# NOTE: torch's AdamW defaults to weight_decay=0.01, whereas the transformers
# one defaulted to 0.0.
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5)

In [None]:
# Apply one parameter update using the gradients left by loss.backward().
optimizer.step()

In [None]:
from transformers import get_linear_schedule_with_warmup

In [None]:
# Upload the TensorBoard logs found under --logdir as a named experiment.
# NOTE(review): the log directory ("CS6120 Project 6") and the experiment name
# ("image captioning 6.5 epochs") do not match this notebook's fake-news BERT
# task or its "Assignment 6 BERT" data path, and no visible cell writes
# TensorBoard logs. Confirm this cell belongs in this notebook.
!tensorboard dev upload --logdir "/content/drive/MyDrive/Colab Notebooks/CS6120 NLP/CS6120 Project 6" --name "image captioning 6.5 epochs"
