CS6120 NLP Assignment 7 - BERT to perform text classification<br>
Wing Man Casca, Kwok<br>
Apr 4 2023

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The dataset path used when loading the CSV files lives under this mount point,
# so this cell has to run before the data-loading cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Import libraries

In [1]:
!pip install transformers

'''
# download Kaggle true fake news dataset
!pip install kaggle
import os
os.environ["KAGGLE_CONFIG_DIR"] = "/content/drive/MyDrive/Colab Notebooks"
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset
'''

from sklearn.model_selection import train_test_split
from sklearn import model_selection                   # K fold library
from torch.utils.data import DataLoader                 
from torch.optim import AdamW
import torch
import pandas as pd
import os
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, average_precision_score, average_precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer  
from sklearn import metrics 
import torch.nn.functional as F

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was selected (and the GPU model name when applicable).
if device.type == "cuda":
    print('Using GPU ', torch.cuda.get_device_name(0))
else:
    print('Using CPU')

Using GPU  Tesla T4


In [3]:
# Load the pretrained tokenizer and the sequence-classification model.
# Both are built from the same checkpoint name so the vocabulary matches the weights.
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForSequenceClassification.from_pretrained(checkpoint_name)

# Show the tokenizer configuration (vocab size, max length, special tokens).
print(tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [4]:
# Model configuration settings (tune these in one place).
#NUM_LABELS = 1
EPOCHS = 20       # upper bound on training passes per (fold, learning rate) run
MAX_LEN = 32      # token length every document is padded / truncated to
BATCH_SIZE = 32   # samples per DataLoader batch

In [5]:
# Prepare dataset: read the real and fake news CSV files from the mounted Drive folder.
path = "/content/drive/MyDrive/Colab Notebooks/CS6120_NLP/BERT/fake-and-real-news-dataset/"
df_real = pd.read_csv(path + 'True.csv')
df_fake = pd.read_csv(path + 'Fake.csv')

# Add y_true: 1 marks a real article, 0 marks a fake one.
df_real['Category'] = 1
df_fake['Category'] = 0

# Combine true news and fake news into one single frame.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
# Like append, concat keeps each frame's original row labels, so the index contains
# duplicates; rows are therefore addressed by position (.iloc) further down.
df = pd.concat([df_real, df_fake])

  df = df_real.append(df_fake)


In [6]:
# Reduce the dataset to a 1% sample to keep training time manageable.
# random_state pins the sample so a fresh "Restart & Run All" trains on the same
# rows (the K-fold split below is seeded with the same value).
# NOTE: this cell rebinds `df`; re-running it without re-running the loading cell
# samples 1% of the already reduced frame.
dataset_99, dataset_1 = train_test_split(df, test_size=0.01, shuffle=True, random_state=42)
df = dataset_1

In [7]:
# Prepare dataset for dataloading
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one news article per item.

    Args:
        train_kf: DataFrame holding a "text" column (article body) and a
            "Category" column (the label). Rows are read by position.
        tokenizer: callable invoked as
            ``tokenizer(text, return_tensors='pt', padding='max_length',
            truncation=True, max_length=max_len)`` and returning a mapping with
            'input_ids' and 'attention_mask' entries.
        max_len: length every document is padded / truncated to.
    """

    def __init__(self, train_kf, tokenizer, max_len):
        self.train_kf = train_kf
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` for the row at position ``index``.

        The tokenizer output is returned as-is. With ``return_tensors='pt'`` a
        single text presumably comes back with a leading dimension of 1, which
        is why the training / validation loops squeeze dimension 1 after batching.
        """
        row = self.train_kf.iloc[index]
        text = row["text"]
        category = row["Category"]

        # Use the tokenizer handed to the constructor. The previous code reached
        # for the notebook-level `tokenizer` global, silently ignoring the argument.
        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
        )
        input_ids = encoded['input_ids']
        attention_mask = encoded['attention_mask']
        y = torch.tensor(category)

        return input_ids, attention_mask, y

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.train_kf)

In [8]:
# model training 
def train(train_dataloader, model, device, lr=2e-5, warmup_steps=200, num_training_steps=None):
    """Run one pass over ``train_dataloader`` and return the mean batch loss.

    A fresh AdamW optimizer and linear warmup/decay scheduler are created on
    every call, so when this function is called once per epoch the warmup (and
    the optimizer state) restarts each epoch.

    Args:
        train_dataloader: sized iterable of ``(input_ids, attention_mask, labels)``
            tensor batches.
        model: model called as ``model(input_ids, attention_mask, labels=labels)``
            whose output exposes a scalar ``.loss``.
        device: torch device the model and every batch are moved to.
        lr: peak learning rate handed to AdamW.
        warmup_steps: number of scheduler steps over which the learning rate
            ramps up linearly from 0.
        num_training_steps: total number of scheduler steps the linear decay is
            planned over. Defaults to ``len(train_dataloader)``, i.e. the number
            of steps this call actually performs. If ``warmup_steps`` is not
            smaller than this value the learning rate never reaches ``lr``.

    Returns:
        0-dim tensor: sum of the per-batch losses divided by the number of
        batches, detached from the autograd graph.

    Raises:
        ValueError: if ``train_dataloader`` has no batches.
    """
    model.to(device)
    model.train()

    num_batches = len(train_dataloader)
    if num_batches == 0:
        raise ValueError("train_dataloader has no batches to train on")

    # The scheduler needs the real step count: the previous hard-coded -1 made the
    # linear decay factor non-positive (clamped to 0) as soon as warmup finished.
    if num_training_steps is None:
        num_training_steps = num_batches

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)

    running_loss = 0.0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = tuple(item.to(device) for item in batch)

        # Per the original author's note, batches arrive as [batch size, 1, seq_len];
        # drop the singleton dimension so the model receives [batch size, seq_len].
        input_ids = input_ids.squeeze(1)
        attention_mask = attention_mask.squeeze(1)
        # Kept from the original: labels are given a leading dimension of 1.
        labels = labels.unsqueeze(0)

        optimizer.zero_grad()

        # Forward pass; passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask, labels=labels)
        loss = outputs.loss

        # Accumulate a detached copy so the running total does not keep every
        # batch's autograd graph alive.
        running_loss += loss.detach()

        loss.backward()
        optimizer.step()
        scheduler.step()

    return running_loss / num_batches

In [9]:
# validate model
def validate(val_dataloader, model, device, whole_ds=False):
    """Predict on ``val_dataloader`` and print / return the micro-averaged F1 score.

    Args:
        val_dataloader: iterable of ``(input_ids, attention_mask, labels)``
            tensor batches.
        model: model called as ``model(input_ids, attention_mask)`` whose output
            exposes ``.logits`` with one row per sample and one column per class.
        device: torch device the model and every batch are moved to.
        whole_ds: when truthy, additionally print the full classification report.

    Returns:
        The value returned by ``f1_score(..., average='micro')``.

    Raises:
        ValueError: if ``val_dataloader`` yields no batches.
    """
    model.to(device)
    model.eval()

    predictions = []
    targets = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = tuple(item.to(device) for item in batch)

            # Batches arrive with a singleton dimension at position 1 (see train()).
            input_ids = input_ids.squeeze(1)
            attention_mask = attention_mask.squeeze(1)

            # Only the logits are needed here, so labels are not passed to the
            # model and no loss is computed.
            outputs = model(input_ids, attention_mask)

            # The predicted class is the index of the largest logit in each row.
            predictions.extend(outputs.logits.argmax(dim=1).tolist())
            targets.extend(labels.reshape(-1).tolist())

    if not predictions:
        raise ValueError("val_dataloader yielded no batches to validate on")

    print("y_val_flatten\n", predictions)
    print("labels_flatten\n", targets)

    # scikit-learn metrics take (y_true, y_pred). The previous call passed them
    # swapped, which exchanges precision and recall in the per-class report.
    score = f1_score(targets, predictions, average='micro')
    print("F1 score\n", score)
    if whole_ds:
        print(classification_report(targets, predictions))

    return score
        

In [10]:
# Define learning rates to test
learning_rates = [0.0001, 0.001, 0.01]

# Define k fold
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
early_stopping_patience = 4

# Perform KFold cross validation to finetune the language model.
# Every fold is trained once per learning rate, starting from fresh pretrained
# weights each time; the validation F1 of each run is printed so the best
# learning rate can be read off the output (no automatic selection happens here).
# kf.split() returns an iterator that generates a set of train and validation indices for each fold.

for fold, (train_index, valid_index) in enumerate(kf.split(df)):

    # Get the training and validation data for this fold
    train_kf = df.iloc[train_index]
    val_kf = df.iloc[valid_index]

    print("fold", fold)
    print(len(train_kf))  # 5 fold is 80%
    print(len(val_kf))   # 5 fold is 20%

    # Get tokens and attention mask
    train_dataset = NewsDataset(train_kf, tokenizer, max_len=MAX_LEN)
    val_dataset = NewsDataset(val_kf, tokenizer, max_len=MAX_LEN)

    # Get dataloader
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    for lr in learning_rates:
        print("lr", lr)

        # Early-stopping state is reset for every run. Previously the counter was
        # only assigned inside the "improved" branch, so it could be undefined
        # (or carry over from the previous run) when the first epoch did not improve.
        best_avg_loss = float('inf')
        num_epochs_no_improvement = 0

        # reinitialize model weight after individual training
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

        for epoch in range(EPOCHS):

            # Train the model on this fold with the current learning rate
            loss = train(train_dataloader, model, device, lr=lr, warmup_steps=200)
            print("Epoch: {}, loss: {}".format(epoch, loss))

            # determine if we need early stop
            # NOTE: the monitored quantity is the mean *training* loss returned by
            # train(); no validation loss is computed inside this loop.
            if loss < best_avg_loss:
                best_avg_loss = loss
                num_epochs_no_improvement = 0
            else:
                num_epochs_no_improvement += 1
                if num_epochs_no_improvement >= early_stopping_patience:
                    print(f"Training loss has not improved for {early_stopping_patience} epochs. Stopping training.")
                    break

        # check F1 score of validation set after training
        validate(val_dataloader, model, device)

fold 0
359
90
lr 0.0001


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.6815046072006226
Epoch: 1, loss: 0.5478965044021606
Epoch: 2, loss: 0.3821585178375244
Epoch: 3, loss: 0.2740594446659088
Epoch: 4, loss: 0.21389099955558777
Epoch: 5, loss: 0.16713014245033264
Epoch: 6, loss: 0.1334145963191986
Epoch: 7, loss: 0.1109265685081482
Epoch: 8, loss: 0.09352567046880722
Epoch: 9, loss: 0.07697620242834091
Epoch: 10, loss: 0.06284703314304352
Epoch: 11, loss: 0.052361488342285156
Epoch: 12, loss: 0.04226803779602051
Epoch: 13, loss: 0.03532828763127327
Epoch: 14, loss: 0.027576133608818054
Epoch: 15, loss: 0.022041842341423035
Epoch: 16, loss: 0.017797401174902916
Epoch: 17, loss: 0.014349168166518211
Epoch: 18, loss: 0.011440555565059185
Epoch: 19, loss: 0.009380526840686798
y_val_flatten
 [1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.5237127542495728
Epoch: 1, loss: 0.15846599638462067
Epoch: 2, loss: 0.04347994551062584
Epoch: 3, loss: 0.015885667875409126
Epoch: 4, loss: 0.007186021655797958
Epoch: 5, loss: 0.0012055672705173492
Epoch: 6, loss: 9.608168329577893e-05
Epoch: 7, loss: 1.7128557374235243e-05
Epoch: 8, loss: 4.056032139487797e-06
Epoch: 9, loss: 1.1642414392554201e-06
Epoch: 10, loss: 4.665482435939339e-07
Epoch: 11, loss: 2.4569175138822175e-07
Epoch: 12, loss: 1.4626198208134156e-07
Epoch: 13, loss: 1.1047259107499485e-07
Epoch: 14, loss: 5.654457879700203e-08
Epoch: 15, loss: 1.459071974352355e-08
Epoch: 16, loss: 2.1730859334212482e-09
Epoch: 17, loss: 1.2417633588057697e-09
Epoch: 18, loss: 3.1044083970144243e-10
Epoch: 19, loss: 3.1044083970144243e-10
y_val_flatten
 [1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.37498709559440613
Epoch: 1, loss: 0.12406545877456665
Epoch: 2, loss: 0.46116065979003906
Epoch: 3, loss: 0.06247485801577568
Epoch: 4, loss: 0.2923194169998169
Epoch: 5, loss: 0.29637616872787476
Epoch: 6, loss: 0.03540700301527977
Epoch: 7, loss: 0.39987850189208984
Epoch: 8, loss: 0.2582245469093323
Epoch: 9, loss: 1.9356141090393066
Epoch: 10, loss: 0.9297534227371216
Validation loss has not improved for 4 epochs. Stopping training.
y_val_flatten
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
labels_flatten
 [1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.6801482439041138
Epoch: 1, loss: 0.5914452075958252
Epoch: 2, loss: 0.5346781015396118
Epoch: 3, loss: 0.46095508337020874
Epoch: 4, loss: 0.40346965193748474
Epoch: 5, loss: 0.35374388098716736
Epoch: 6, loss: 0.30199962854385376
Epoch: 7, loss: 0.2624644935131073
Epoch: 8, loss: 0.22376707196235657
Epoch: 9, loss: 0.18876442313194275
Epoch: 10, loss: 0.15299168229103088
Epoch: 11, loss: 0.12951891124248505
Epoch: 12, loss: 0.10710626095533371
Epoch: 13, loss: 0.0870715007185936
Epoch: 14, loss: 0.06957133114337921
Epoch: 15, loss: 0.0551266223192215
Epoch: 16, loss: 0.04402410238981247
Epoch: 17, loss: 0.0374133326113224
Epoch: 18, loss: 0.0296679325401783
Epoch: 19, loss: 0.02625170163810253
y_val_flatten
 [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.5624526739120483
Epoch: 1, loss: 0.20680934190750122
Epoch: 2, loss: 0.056972842663526535
Epoch: 3, loss: 0.014807017520070076
Epoch: 4, loss: 0.0013200503308326006
Epoch: 5, loss: 0.00018010876374319196
Epoch: 6, loss: 3.281419049017131e-05
Epoch: 7, loss: 5.758613951911684e-06
Epoch: 8, loss: 1.4407992239284795e-06
Epoch: 9, loss: 5.408322749644867e-07
Epoch: 10, loss: 2.5150143301289063e-07
Epoch: 11, loss: 1.498098782803936e-07
Epoch: 12, loss: 1.1486311279895745e-07
Epoch: 13, loss: 5.747590137161751e-08
Epoch: 14, loss: 1.5078555293257523e-08
Epoch: 15, loss: 1.8626450382086546e-09
Epoch: 16, loss: 9.313225191043273e-10
Epoch: 17, loss: 3.1487568108445885e-09
Epoch: 18, loss: 0.0
Epoch: 19, loss: 0.0
y_val_flatten
 [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.295303612947464
Epoch: 1, loss: 0.078341044485569
Epoch: 2, loss: 0.07774752378463745
Epoch: 3, loss: 0.2685577869415283
Epoch: 4, loss: 0.28067412972450256
Epoch: 5, loss: 0.1634032130241394
Epoch: 6, loss: 0.02445177733898163
Epoch: 7, loss: 0.05546518787741661
Epoch: 8, loss: 3.7344987504184246e-05
Epoch: 9, loss: 1.4811649322509766
Epoch: 10, loss: 1.4612674713134766
Epoch: 11, loss: 0.7027384638786316
Epoch: 12, loss: 0.8168746829032898
Validation loss has not improved for 4 epochs. Stopping training.
y_val_flatten
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
labels_flatten
 [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.6738466024398804
Epoch: 1, loss: 0.5766671895980835
Epoch: 2, loss: 0.4730331599712372
Epoch: 3, loss: 0.39425286650657654
Epoch: 4, loss: 0.3371387720108032
Epoch: 5, loss: 0.2852504253387451
Epoch: 6, loss: 0.2459568977355957
Epoch: 7, loss: 0.20998826622962952
Epoch: 8, loss: 0.17586080729961395
Epoch: 9, loss: 0.14801761507987976
Epoch: 10, loss: 0.12129811942577362
Epoch: 11, loss: 0.10454405844211578
Epoch: 12, loss: 0.08438308537006378
Epoch: 13, loss: 0.06891581416130066
Epoch: 14, loss: 0.05767641216516495
Epoch: 15, loss: 0.047928422689437866
Epoch: 16, loss: 0.041324831545352936
Epoch: 17, loss: 0.03456743806600571
Epoch: 18, loss: 0.025738000869750977
Epoch: 19, loss: 0.020817792043089867
y_val_flatten
 [1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.5455480813980103
Epoch: 1, loss: 0.19675123691558838
Epoch: 2, loss: 0.059578027576208115
Epoch: 3, loss: 0.023638024926185608
Epoch: 4, loss: 0.002290291478857398
Epoch: 5, loss: 0.00040708843152970076
Epoch: 6, loss: 7.156375068007037e-05
Epoch: 7, loss: 1.5693822206230834e-05
Epoch: 8, loss: 3.46890351465845e-06
Epoch: 9, loss: 1.1028186008843477e-06
Epoch: 10, loss: 4.752849349642929e-07
Epoch: 11, loss: 2.5841984552243957e-07
Epoch: 12, loss: 1.4404454873329087e-07
Epoch: 13, loss: 1.1393179022434197e-07
Epoch: 14, loss: 5.995943297421036e-08
Epoch: 15, loss: 1.5078555293257523e-08
Epoch: 16, loss: 3.068004161832505e-07
Epoch: 17, loss: 6.208816794028849e-10
Epoch: 18, loss: 0.0
Epoch: 19, loss: 0.0
y_val_flatten
 [1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.44918638467788696
Epoch: 1, loss: 0.36764252185821533
Epoch: 2, loss: 0.009587215259671211
Epoch: 3, loss: 0.43280646204948425
Epoch: 4, loss: 0.8796525001525879
Epoch: 5, loss: 1.1880242824554443
Epoch: 6, loss: 0.747535228729248
Validation loss has not improved for 4 epochs. Stopping training.
y_val_flatten
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
labels_flatten
 [1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1]
F1 score
 0.4444444444444444
fold 3
359
90
lr 0.0001


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.6944572925567627
Epoch: 1, loss: 0.6286720633506775
Epoch: 2, loss: 0.5370960235595703
Epoch: 3, loss: 0.44843846559524536
Epoch: 4, loss: 0.3628273904323578
Epoch: 5, loss: 0.3050786852836609
Epoch: 6, loss: 0.25128424167633057
Epoch: 7, loss: 0.1996871680021286
Epoch: 8, loss: 0.15627875924110413
Epoch: 9, loss: 0.12514711916446686
Epoch: 10, loss: 0.0965019017457962
Epoch: 11, loss: 0.07332255691289902
Epoch: 12, loss: 0.05900314450263977
Epoch: 13, loss: 0.045581601560115814
Epoch: 14, loss: 0.03285162150859833
Epoch: 15, loss: 0.02686314284801483
Epoch: 16, loss: 0.019699014723300934
Epoch: 17, loss: 0.014545018784701824
Epoch: 18, loss: 0.01171054970473051
Epoch: 19, loss: 0.009038178250193596
y_val_flatten
 [1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.5343654751777649
Epoch: 1, loss: 0.15462151169776917
Epoch: 2, loss: 0.047387223690748215
Epoch: 3, loss: 0.020261185243725777
Epoch: 4, loss: 0.002940199803560972
Epoch: 5, loss: 0.0002516770618967712
Epoch: 6, loss: 2.916858284152113e-05
Epoch: 7, loss: 5.198802682571113e-06
Epoch: 8, loss: 1.299016730627045e-06
Epoch: 9, loss: 4.4468433202382585e-07
Epoch: 10, loss: 2.2511396480240364e-07
Epoch: 11, loss: 1.3131648302078247e-07
Epoch: 12, loss: 9.814365142801762e-08
Epoch: 13, loss: 3.4458935260772705e-08
Epoch: 14, loss: 7.140139146599722e-09
Epoch: 15, loss: 2.1730859334212482e-09
Epoch: 16, loss: 9.313225191043273e-10
Epoch: 17, loss: 3.1044083970144243e-10
Epoch: 18, loss: 0.0
Epoch: 19, loss: 3.1044083970144243e-10
y_val_flatten
 [1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.4150233864784241
Epoch: 1, loss: 0.2253381907939911
Epoch: 2, loss: 0.2125651091337204
Epoch: 3, loss: 0.08583762496709824
Epoch: 4, loss: 0.41449999809265137
Epoch: 5, loss: 0.14732831716537476
Epoch: 6, loss: 0.05679789185523987
Epoch: 7, loss: 0.04826667904853821
Epoch: 8, loss: 0.2511391341686249
Epoch: 9, loss: 0.8311008810997009
Epoch: 10, loss: 0.8887655138969421
Epoch: 11, loss: 0.7157316207885742
Validation loss has not improved for 4 epochs. Stopping training.
y_val_flatten
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
labels_flatten
 [1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.6691948175430298
Epoch: 1, loss: 0.5739316940307617
Epoch: 2, loss: 0.45790597796440125
Epoch: 3, loss: 0.383735716342926
Epoch: 4, loss: 0.31793493032455444
Epoch: 5, loss: 0.26698219776153564
Epoch: 6, loss: 0.22489634156227112
Epoch: 7, loss: 0.18529759347438812
Epoch: 8, loss: 0.1521136462688446
Epoch: 9, loss: 0.12010088562965393
Epoch: 10, loss: 0.09649302065372467
Epoch: 11, loss: 0.07742787152528763
Epoch: 12, loss: 0.06194230541586876
Epoch: 13, loss: 0.04915152117609978
Epoch: 14, loss: 0.03914428874850273
Epoch: 15, loss: 0.030947215855121613
Epoch: 16, loss: 0.02426809072494507
Epoch: 17, loss: 0.020137740299105644
Epoch: 18, loss: 0.016130130738019943
Epoch: 19, loss: 0.012820957228541374
y_val_flatten
 [1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.5427283048629761
Epoch: 1, loss: 0.1266109049320221
Epoch: 2, loss: 0.019600633531808853
Epoch: 3, loss: 0.0022214194759726524
Epoch: 4, loss: 0.00023544064606539905
Epoch: 5, loss: 3.213989475625567e-05
Epoch: 6, loss: 6.38822530163452e-06
Epoch: 7, loss: 1.2920543213112978e-06
Epoch: 8, loss: 4.547958383227524e-07
Epoch: 9, loss: 2.297262255979149e-07
Epoch: 10, loss: 1.3069559656742058e-07
Epoch: 11, loss: 9.840974257713242e-08
Epoch: 12, loss: 4.31512745535656e-08
Epoch: 13, loss: 9.623666308300471e-09
Epoch: 14, loss: 1.552204142996061e-09
Epoch: 15, loss: 3.1044083970144243e-10
Epoch: 16, loss: 3.1044083970144243e-10
Epoch: 17, loss: 0.0
Epoch: 18, loss: 0.0
Epoch: 19, loss: 0.0
y_val_flatten
 [1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0]
label

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.6611869931221008
Epoch: 1, loss: 0.49268388748168945
Epoch: 2, loss: 0.0766557902097702
Epoch: 3, loss: 0.34869584441185
Epoch: 4, loss: 0.9363691210746765
Epoch: 5, loss: 0.16223829984664917
Epoch: 6, loss: 0.1727580726146698
Validation loss has not improved for 4 epochs. Stopping training.
y_val_flatten
 [1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0]
labels_flatten
 [1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0]
F1 score
 0.9887640449438202


Based on the results above, a learning rate of 0.001 was selected because it yields both the highest F1 score and the lowest average cost.

In [11]:
# Evaluate the whole model using the found optimal learning rate.
# 25% of the data is held out for the final validation pass.
# NOTE(review): the split is unseeded, so the metrics printed below will vary
# from run to run; consider passing random_state for reproducibility.
train_df, val_df = train_test_split(df, test_size=0.25, shuffle=True)

train_dataset = NewsDataset(train_df, tokenizer, max_len=MAX_LEN)
val_dataset = NewsDataset(val_df, tokenizer, max_len=MAX_LEN)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

lr = 0.001

# Early-stopping state is initialised here so this cell does not depend on
# values left behind by the learning-rate sweep (Restart & Run All safe).
# +inf guarantees the first epoch always counts as an improvement.
previous_loss = float("inf")
num_epochs_no_improvement = 0

# Start from fresh pretrained weights rather than a model already trained above.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

for epoch in range(EPOCHS):

    loss = train(train_dataloader, model, device, lr=lr, warmup_steps=200)
    print("Epoch: {}, loss: {}".format(epoch, loss))

    if loss < previous_loss:
        # New best loss: remember it and reset the patience counter.
        previous_loss = loss
        num_epochs_no_improvement = 0
    else:
        num_epochs_no_improvement += 1
        # NOTE(review): the monitored value is whatever train() returns over
        # train_dataloader (presumably the training loss), although the message
        # below calls it validation loss -- confirm and align if needed.
        # ">=" rather than "==" so the loop still stops if the counter ever
        # starts at or above the patience threshold.
        if num_epochs_no_improvement >= early_stopping_patience:
            print(f"Validation loss has not improved for {early_stopping_patience} epochs. Stopping training.")
            break

# Score the trained model on the held-out split.
validate(val_dataloader, model, device, whole_ds=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 0, loss: 0.577066957950592
Epoch: 1, loss: 0.2207532823085785
Epoch: 2, loss: 0.054890044033527374
Epoch: 3, loss: 0.017414940521121025
Epoch: 4, loss: 0.0034328140318393707
Epoch: 5, loss: 0.0006608632393181324
Epoch: 6, loss: 0.00014309595280792564
Epoch: 7, loss: 2.6409838028484955e-05
Epoch: 8, loss: 6.890405984449899e-06
Epoch: 9, loss: 2.2097719920566306e-06
Epoch: 10, loss: 9.238717666448792e-07
Epoch: 11, loss: 4.565173696846614e-07
Epoch: 12, loss: 2.7533280899660895e-07
Epoch: 13, loss: 1.6729939034121344e-07
Epoch: 14, loss: 1.283531787521497e-07
Epoch: 15, loss: 9.821219038030904e-08
Epoch: 16, loss: 4.9783420053017835e-08
Epoch: 17, loss: 1.9303776355172886e-08
Epoch: 18, loss: 4.063952729893572e-09
Epoch: 19, loss: 1.6933137114705232e-09
y_val_flatten
 [1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1,