In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#!pip install transformers

'''
# download Kaggle true fake news dataset
!pip install kaggle
import os
os.environ["KAGGLE_CONFIG_DIR"] = "/content/drive/MyDrive/Colab Notebooks"
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset
'''

from sklearn.model_selection import train_test_split
from sklearn import model_selection                   # K fold library
from torch.utils.data import DataLoader                 
from torch.optim import AdamW
import torch
import pandas as pd
import os
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, average_precision_score, average_precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer  
from sklearn import metrics 
import torch.nn.functional as F

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading fake-and-real-news-dataset.zip to /content
 71% 29.0M/41.0M [00:00<00:00, 85.3MB/s]
100% 41.0M/41.0M [00:00<00:00, 98.9MB/s]


In [5]:
# Check GPU resources
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('Using GPU ', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('Using CPU')

Using GPU  Tesla T4


In [6]:
# Prepare tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
print(tokenizer)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [7]:
# Model configuration Settings
#NUM_LABELS = 1
BATCH_SIZE = 32
MAX_LEN = 32
EPOCHS = 20

In [9]:
# Prepare dataset
path = "/content/drive/MyDrive/Colab Notebooks/CS6120_NLP/BERT/fake-and-real-news-dataset/"
df_real = pd.read_csv(path + 'True.csv')
df_fake = pd.read_csv(path + 'Fake.csv')

# Add y_true
df_real['Category'] = 1
df_fake['Category'] = 0

# Combine true news and fake news into one single file
df = df_real.append(df_fake)

  df = df_real.append(df_fake)


In [10]:
# Diminish dataset to 25% due to training time
dataset_75, dataset_25 = train_test_split(df, test_size=0.01, shuffle=True)
df = dataset_25

In [11]:
from torch.utils.data import Dataset

class NewsDataset(Dataset):
  def __init__(self, train_kf, tokenizer, max_len):
    self.train_kf = train_kf
    self.tokenizer = tokenizer
    self.max_len = max_len
    

  def __getitem__(self, index):
    # convert news docment to input ids using BERT tokenizer
    text = self.train_kf.iloc[index]["text"]
    category = self.train_kf.iloc[index]["Category"]

    tokenizer_dict = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_len)
    input_ids = tokenizer_dict['input_ids']
    attention_mask = tokenizer_dict['attention_mask'] 
    y = torch.tensor(category)

    #print(input_ids, attention_mask, y)
    #print(input_ids.shape, attention_mask.shape, y.shape)

    return input_ids, attention_mask, y

  def __len__(self):
    return len(self.train_kf)

In [12]:
def train(train_dataloader, model, device, lr=2e-5, warmup_steps=200):
    model.to(device)
    model.train()
    
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1)
    batch_loss = 0
  
    for batch in train_dataloader:
      input_ids, attention_mask, labels = tuple(item.to(device) for item in batch)
      
      # without these 3 statements, despite tensor shape [batch size, seq_len] is correctly output, 
      # at here it still in [batch size, 1, seq_len] shape, so need to reshape here
      input_ids = input_ids.squeeze(1)
      attention_mask = attention_mask.squeeze(1)
      labels = labels.unsqueeze(0)

      optimizer.zero_grad() 

      # forward pass, outputs object contains the model's predictions, as well as the loss and other optional outputs.
      outputs = model(input_ids, attention_mask, labels=labels)  

      # default loss function by huggingface BERT - binary_cls: nll, multiclass: cross entropy
      # extracts the loss value from the outputs object
      loss = outputs.loss
      batch_loss += outputs.loss        # tensor in format 

      loss.backward() 
      optimizer.step() 
      scheduler.step()
    
    return batch_loss/len(train_dataloader)

In [13]:
def validate(val_dataloader, model, device):
    model.to(device)
    model.eval()

    total_loss, total_acc = 0, 0
    y_val = []
    labels_list = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = tuple(item.to(device) for item in batch)

            input_ids = input_ids.squeeze(1)
            attention_mask = attention_mask.squeeze(1)
            labels = labels.unsqueeze(0)

            # outputs produces 2 parameters: outputs.loss, outputs.logits
            outputs = model(input_ids, attention_mask, labels=labels) 

            # store prediction
            # outputs.logits represents the model's confidence scores for each class label
            y_val.append(outputs.logits.argmax(axis=1).tolist())
            labels_list.append(labels.tolist())

            #print("outputs.logits.argmax(axis=1)\n", outputs.logits.argmax(axis=1))

            y_val_flatten = [item for sublist in y_val for item in sublist]
            labels_flatten = [item for sublist1 in labels_list for sublist2 in sublist1 for item in sublist2]

    print("y_val_flatten\n", y_val_flatten)  
    print("labels_flatten\n", labels_flatten)
    print(classification_report(y_val_flatten, labels_flatten))
        

In [14]:
# Define learning rates to test
learning_rates = [0.0001, 0.001, 0.01]
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
best_lr = None
best_avg_loss = float('inf')

for fold, (train_index, valid_index) in enumerate(kf.split(df)):    # kf.split() method returns an iterator that generates a set of train and validation indices for each fold.

    # Get the training and validation data for this fold
    train_kf = df.iloc[train_index]
    val_kf = df.iloc[valid_index]

    print("fold", fold)
    print(len(train_kf))  # 5 fold is 80%
    print(len(val_kf))   # 5 fold is 20%

    # Get tokens and attention mask
    train_dataset = NewsDataset(train_kf, tokenizer, max_len=MAX_LEN)
    val_dataset = NewsDataset(val_kf, tokenizer, max_len=MAX_LEN)

    # Get dataloader
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    early_stopping_patience = 4

    for lr in learning_rates:
        print("lr", lr)

        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        previous_loss = 100

        for epoch in range(EPOCHS):

            # Train the model on this fold with the current learning rate
            loss = train(train_dataloader, model, device, lr=lr, warmup_steps=200)
            print("Epoch: {}, loss: {}".format(epoch, loss))

            if loss < previous_loss:
                previous_loss = loss
                num_epochs_no_improvement = 0
            else:
                num_epochs_no_improvement += 1
                if num_epochs_no_improvement == early_stopping_patience:
                    print(f"Validation loss has not improved for {early_stopping_patience} epochs. Stopping training.")
                    break

fold 0
359
90
lr 0.0001


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.6933373212814331
Epoch: 1, loss: 0.6306621432304382
Epoch: 2, loss: 0.5519832372665405
Epoch: 3, loss: 0.47289609909057617
Epoch: 4, loss: 0.39492639899253845
Epoch: 5, loss: 0.3263108730316162
Epoch: 6, loss: 0.26183566451072693
Epoch: 7, loss: 0.204838365316391
Epoch: 8, loss: 0.15628312528133392
Epoch: 9, loss: 0.11685194820165634
Epoch: 10, loss: 0.09029720723628998
Epoch: 11, loss: 0.06820162385702133
Epoch: 12, loss: 0.049122169613838196
Epoch: 13, loss: 0.035888563841581345
Epoch: 14, loss: 0.027322284877300262
Epoch: 15, loss: 0.019134994596242905
Epoch: 16, loss: 0.014420858584344387
Epoch: 17, loss: 0.010616103187203407
Epoch: 18, loss: 0.007773540448397398
Epoch: 19, loss: 0.005856617819517851
lr 0.001


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.5726157426834106
Epoch: 1, loss: 0.1076301783323288
Epoch: 2, loss: 0.017837565392255783
Epoch: 3, loss: 0.002397841541096568
Epoch: 4, loss: 0.0004013814905192703
Epoch: 5, loss: 6.212640437297523e-05
Epoch: 6, loss: 1.1269766218902078e-05
Epoch: 7, loss: 2.4452062916680006e-06
Epoch: 8, loss: 7.826212709005631e-07
Epoch: 9, loss: 3.477380801086838e-07
Epoch: 10, loss: 1.8768365350751992e-07
Epoch: 11, loss: 1.235111142250389e-07
Epoch: 12, loss: 9.069307083109379e-08
Epoch: 13, loss: 3.1354524310245324e-08
Epoch: 14, loss: 7.450580152834618e-09
Epoch: 15, loss: 1.8626450382086546e-09
Epoch: 16, loss: 6.208816794028849e-10
Epoch: 17, loss: 0.0
Epoch: 18, loss: 0.0
Epoch: 19, loss: 0.0
lr 0.01


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.3717775046825409
Epoch: 1, loss: 0.031136855483055115
Epoch: 2, loss: 0.085277259349823
Epoch: 3, loss: 0.038666460663080215
Epoch: 4, loss: 0.2792375683784485
Epoch: 5, loss: 0.003461233340203762
Epoch: 6, loss: 0.9927558302879333
Epoch: 7, loss: 0.1384667456150055
Epoch: 8, loss: 0.13701076805591583
Epoch: 9, loss: 1.1637580394744873
Validation loss has not improved for 4 epochs. Stopping training.
fold 1
359
90
lr 0.0001


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.675321102142334
Epoch: 1, loss: 0.5714397430419922
Epoch: 2, loss: 0.49107539653778076
Epoch: 3, loss: 0.4244074821472168
Epoch: 4, loss: 0.3671208620071411
Epoch: 5, loss: 0.313989520072937
Epoch: 6, loss: 0.2725454866886139
Epoch: 7, loss: 0.23188090324401855
Epoch: 8, loss: 0.19240133464336395
Epoch: 9, loss: 0.16692054271697998
Epoch: 10, loss: 0.1342887282371521
Epoch: 11, loss: 0.10876017063856125
Epoch: 12, loss: 0.08532239496707916
Epoch: 13, loss: 0.06848642230033875
Epoch: 14, loss: 0.05365026742219925
Epoch: 15, loss: 0.04161304235458374
Epoch: 16, loss: 0.03343868628144264
Epoch: 17, loss: 0.025806061923503876
Epoch: 18, loss: 0.019753899425268173
Epoch: 19, loss: 0.01587940938770771
lr 0.001


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.47390827536582947
Epoch: 1, loss: 0.1389220952987671
Epoch: 2, loss: 0.05389779806137085
Epoch: 3, loss: 0.014330225065350533
Epoch: 4, loss: 0.0027428537141531706
Epoch: 5, loss: 0.0004945567343384027
Epoch: 6, loss: 8.186270133592188e-05
Epoch: 7, loss: 1.7314265278400853e-05
Epoch: 8, loss: 4.824813458981225e-06
Epoch: 9, loss: 1.5779698969708988e-06
Epoch: 10, loss: 6.792445219616638e-07
Epoch: 11, loss: 3.562530253020668e-07
Epoch: 12, loss: 2.1007974737585755e-07
Epoch: 13, loss: 1.291433875394432e-07
Epoch: 14, loss: 1.0404203010239144e-07
Epoch: 15, loss: 4.767484185208559e-08
Epoch: 16, loss: 1.3659397168908072e-08
Epoch: 17, loss: 9.313225191043273e-10
Epoch: 18, loss: 6.208816794028849e-10
Epoch: 19, loss: 0.0
lr 0.01


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.35295936465263367
Epoch: 1, loss: 0.0008285859366878867
Epoch: 2, loss: 3.013049933997536e-07
Epoch: 3, loss: 3.1044083970144243e-10
Epoch: 4, loss: 0.0
Epoch: 5, loss: 0.0
Epoch: 6, loss: 0.0
Epoch: 7, loss: 0.0
Epoch: 8, loss: 0.0
Validation loss has not improved for 4 epochs. Stopping training.
fold 2
359
90
lr 0.0001


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.6748515367507935
Epoch: 1, loss: 0.6004210710525513
Epoch: 2, loss: 0.4972965121269226
Epoch: 3, loss: 0.3953087031841278
Epoch: 4, loss: 0.3189016282558441
Epoch: 5, loss: 0.2537478804588318
Epoch: 6, loss: 0.20475271344184875
Epoch: 7, loss: 0.16088852286338806
Epoch: 8, loss: 0.13002070784568787
Epoch: 9, loss: 0.10561826080083847
Epoch: 10, loss: 0.08172130584716797
Epoch: 11, loss: 0.06467749178409576
Epoch: 12, loss: 0.05101723223924637
Epoch: 13, loss: 0.04023638367652893
Epoch: 14, loss: 0.03112196922302246
Epoch: 15, loss: 0.024756968021392822
Epoch: 16, loss: 0.019038015976548195
Epoch: 17, loss: 0.01611737534403801
Epoch: 18, loss: 0.011785443872213364
Epoch: 19, loss: 0.009257662110030651
lr 0.001


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.5355309844017029
Epoch: 1, loss: 0.17111264169216156
Epoch: 2, loss: 0.029598599299788475
Epoch: 3, loss: 0.00577586842700839
Epoch: 4, loss: 0.0007275542593561113
Epoch: 5, loss: 0.00013872365525458008
Epoch: 6, loss: 2.1805048163514584e-05
Epoch: 7, loss: 4.02104251406854e-06
Epoch: 8, loss: 1.1093820830865297e-06
Epoch: 9, loss: 4.618472644324356e-07
Epoch: 10, loss: 2.3052449193983193e-07
Epoch: 11, loss: 1.431132261586754e-07
Epoch: 12, loss: 1.0674730077653294e-07
Epoch: 13, loss: 5.698806759824038e-08
Epoch: 14, loss: 1.2728074594292593e-08
Epoch: 15, loss: 3.104408285992122e-09
Epoch: 16, loss: 1.2417633588057697e-09
Epoch: 17, loss: 3.1044083970144243e-10
Epoch: 18, loss: 0.0
Epoch: 19, loss: 0.0
lr 0.01


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.2685356140136719
Epoch: 1, loss: 0.0329718142747879
Epoch: 2, loss: 0.08042903244495392
Epoch: 3, loss: 0.15772709250450134
Epoch: 4, loss: 0.2915842533111572
Epoch: 5, loss: 0.03631614148616791
Validation loss has not improved for 4 epochs. Stopping training.
fold 3
359
90
lr 0.0001


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.671873927116394
Epoch: 1, loss: 0.5847324132919312
Epoch: 2, loss: 0.4757404923439026
Epoch: 3, loss: 0.37809914350509644
Epoch: 4, loss: 0.3119433522224426
Epoch: 5, loss: 0.25296521186828613
Epoch: 6, loss: 0.19689498841762543
Epoch: 7, loss: 0.15343256294727325
Epoch: 8, loss: 0.11668255180120468
Epoch: 9, loss: 0.08979269117116928
Epoch: 10, loss: 0.06880930066108704
Epoch: 11, loss: 0.05349137634038925
Epoch: 12, loss: 0.0398697666823864
Epoch: 13, loss: 0.03131585568189621
Epoch: 14, loss: 0.024910500273108482
Epoch: 15, loss: 0.019262714311480522
Epoch: 16, loss: 0.014913946390151978
Epoch: 17, loss: 0.011793000623583794
Epoch: 18, loss: 0.009361863136291504
Epoch: 19, loss: 0.007232384290546179
lr 0.001


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.4657285213470459
Epoch: 1, loss: 0.11775851249694824
Epoch: 2, loss: 0.024110518395900726
Epoch: 3, loss: 0.0037870770320296288
Epoch: 4, loss: 0.0006374684744514525
Epoch: 5, loss: 0.00013025292719248682
Epoch: 6, loss: 3.0129442166071385e-05
Epoch: 7, loss: 7.93740400695242e-06
Epoch: 8, loss: 2.436691374896327e-06
Epoch: 9, loss: 9.238274287781678e-07
Epoch: 10, loss: 4.5178012442192994e-07
Epoch: 11, loss: 2.422325451334473e-07
Epoch: 12, loss: 1.4453239316480904e-07
Epoch: 13, loss: 1.1450832460013771e-07
Epoch: 14, loss: 6.368472327267227e-08
Epoch: 15, loss: 1.707424601704588e-08
Epoch: 16, loss: 3.104408285992122e-09
Epoch: 17, loss: 3.1044083970144243e-10
Epoch: 18, loss: 0.0
Epoch: 19, loss: 0.0
lr 0.01


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.34676656126976013
Epoch: 1, loss: 0.06390742212533951
Epoch: 2, loss: 0.0009935671696439385
Epoch: 3, loss: 9.889747616398381e-07
Epoch: 4, loss: 0.8907050490379333
Epoch: 5, loss: 1.4549498558044434
Epoch: 6, loss: 0.3267859220504761
Epoch: 7, loss: 0.00025574100436642766
Validation loss has not improved for 4 epochs. Stopping training.
fold 4
360
89
lr 0.0001


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.7137813568115234
Epoch: 1, loss: 0.6369782090187073
Epoch: 2, loss: 0.592963457107544
Epoch: 3, loss: 0.5337007641792297
Epoch: 4, loss: 0.4498696029186249
Epoch: 5, loss: 0.3756522834300995
Epoch: 6, loss: 0.30392521619796753
Epoch: 7, loss: 0.24760225415229797
Epoch: 8, loss: 0.21193699538707733
Epoch: 9, loss: 0.1835031360387802
Epoch: 10, loss: 0.1507362723350525
Epoch: 11, loss: 0.1330476999282837
Epoch: 12, loss: 0.11025282740592957
Epoch: 13, loss: 0.09186389297246933
Epoch: 14, loss: 0.0761229395866394
Epoch: 15, loss: 0.05707354471087456
Epoch: 16, loss: 0.04175810515880585
Epoch: 17, loss: 0.031134789809584618
Epoch: 18, loss: 0.024029970169067383
Epoch: 19, loss: 0.017606772482395172
lr 0.001


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.5783482789993286
Epoch: 1, loss: 0.16038468480110168
Epoch: 2, loss: 0.015141311101615429
Epoch: 3, loss: 0.002870416734367609
Epoch: 4, loss: 0.000390561152016744
Epoch: 5, loss: 5.723785943700932e-05
Epoch: 6, loss: 1.0933961675618775e-05
Epoch: 7, loss: 2.368660716456361e-06
Epoch: 8, loss: 8.055937996687135e-07
Epoch: 9, loss: 3.501772596337105e-07
Epoch: 10, loss: 1.9340464518791123e-07
Epoch: 11, loss: 1.260389836943432e-07
Epoch: 12, loss: 8.84756374830431e-08
Epoch: 13, loss: 4.0978189730367376e-08
Epoch: 14, loss: 4.3461718668424965e-09
Epoch: 15, loss: 1.8626450382086546e-09
Epoch: 16, loss: 0.0
Epoch: 17, loss: 6.208816794028849e-10
Epoch: 18, loss: 0.0
Epoch: 19, loss: 0.0
lr 0.01


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.3879358768463135
Epoch: 1, loss: 0.2394430935382843
Epoch: 2, loss: 0.039990607649087906
Epoch: 3, loss: 0.0005602133460342884
Epoch: 4, loss: 1.2749783309118357e-06
Epoch: 5, loss: 3.104408285992122e-09
Epoch: 6, loss: 0.0
Epoch: 7, loss: 0.1186998039484024
Epoch: 8, loss: 0.7502238750457764
Epoch: 9, loss: 0.10780690610408783
Epoch: 10, loss: 0.5293318033218384
Validation loss has not improved for 4 epochs. Stopping training.


In [20]:
train_df, val_df = train_test_split(df, test_size=0.25, shuffle=True)

train_dataset = NewsDataset(train_df, tokenizer, max_len=MAX_LEN)
val_dataset = NewsDataset(val_df, tokenizer, max_len=MAX_LEN)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

lr = 0.001
previous_loss = 100

model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

for epoch in range(EPOCHS):

    loss = train(train_dataloader, model, device, lr=lr, warmup_steps=200)
    print("Epoch: {}, loss: {}".format(epoch, loss))

    if loss < previous_loss:
        previous_loss = loss
        num_epochs_no_improvement = 0
    else:
        num_epochs_no_improvement += 1
        if num_epochs_no_improvement == early_stopping_patience:
            print(f"Validation loss has not improved for {early_stopping_patience} epochs. Stopping training.")
            break

validate(val_dataloader, model, device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.5782375335693359
Epoch: 1, loss: 0.24238736927509308
Epoch: 2, loss: 0.06533688306808472
Epoch: 3, loss: 0.017122521996498108
Epoch: 4, loss: 0.003183237509801984
Epoch: 5, loss: 0.0006189445266500115
Epoch: 6, loss: 9.495987615082413e-05
Epoch: 7, loss: 1.899367998703383e-05
Epoch: 8, loss: 4.916691068501677e-06
Epoch: 9, loss: 1.6245639926637523e-06
Epoch: 10, loss: 6.553123625963053e-07
Epoch: 11, loss: 3.3798542631302553e-07
Epoch: 12, loss: 1.981177035759174e-07
Epoch: 13, loss: 1.337717776550562e-07
Epoch: 14, loss: 1.1040405212270343e-07
Epoch: 15, loss: 6.637789340402378e-08
Epoch: 16, loss: 2.2690402445846303e-08
Epoch: 17, loss: 8.127905459787144e-09
Epoch: 18, loss: 2.709301893943916e-09
Epoch: 19, loss: 1.015988182473393e-09
y_val_flatten
 [0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 

In [None]:
#!tensorboard dev upload --logdir "/content/drive/MyDrive/Colab Notebooks/CS6120 NLP/CS6120 Project 6" --name "image captioning 6.5 epochs"
