In [None]:
# Mount Google Drive (Colab only): load_datasets() below reads the Dreaddit
# CSV files from /content/drive/MyDrive/dreaddit.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install datasets

In [None]:
pip install transformers

In [None]:
pip install GPUtil 

In [None]:
# reference - https://towardsdatascience.com/text-classification-with-cnns-in-pytorch-1113df31e79f

In [None]:
# modules
import pandas as pd
import os
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import tqdm
import numpy as np
from numpy import mean
from numpy import std
from datasets import load_metric
import random
from IPython.display import FileLink, FileLinks

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam, SGD
from transformers import pipeline

import transformers
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, AutoModel, BertTokenizer, BertModel
from sklearn.model_selection import KFold
import gc
from GPUtil import showUtilization as gpu_usage

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 32.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 62.3 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 73.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [None]:
from transformers import AutoTokenizer, AutoModel
import random, math
from typing import Union
import numpy as np
import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import transformers
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def load_datasets(
    train_path: str = r'/content/drive/MyDrive/dreaddit/dreaddit-train.csv',
    test_path: str = r'/content/drive/MyDrive/dreaddit/dreaddit-test.csv',
) -> "tuple[dict, dict]":
    """Load the Dreaddit train and test CSV files into plain dictionaries.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the CSV files. Each file must provide a ``text`` and a
        ``label`` column. The defaults are the Google Drive paths this
        notebook was written against, so ``load_datasets()`` keeps working.

    Returns
    -------
    (train, test) : tuple of dict
        Each dict has two parallel lists: ``"texts"`` (the ``text`` column)
        and ``"label"`` (the ``label`` column).
    """

    def _as_dict(csv_path):
        # One read per split; the column lists stay index-aligned because
        # they come from the same frame.
        frame = pd.read_csv(csv_path)
        return {"texts": frame["text"].tolist(), "label": frame["label"].tolist()}

    return _as_dict(train_path), _as_dict(test_path)



# K-Fold Cross-Validation
def cross_validation(model, _X, _y, _cv=5):
    """Run k-fold cross-validation and summarise train/validation metrics.

    Parameters
    ----------
    model : estimator
        The machine learning algorithm to be evaluated; forwarded to
        ``cross_validate`` as ``estimator``.
    _X : array
        Matrix of features.
    _y : array
        Target variable.
    _cv : int, default=5
        Number of folds.

    Returns
    -------
    dict
        For both the training and the validation split: the per-fold
        accuracy, precision, recall and F1 arrays plus their means.
        The accuracy means are expressed in percent; the other means are
        left as fractions.
    """
    metric_names = ['accuracy', 'precision', 'recall', 'f1']
    cv_results = cross_validate(
        estimator=model,
        X=_X,
        y=_y,
        cv=_cv,
        scoring=metric_names,
        return_train_score=True,
    )

    # (result key, label for the per-fold scores, label for the mean, percent?)
    report_layout = (
        ('train_accuracy', "Training Accuracy scores", "Mean Training Accuracy", True),
        ('train_precision', "Training Precision scores", "Mean Training Precision", False),
        ('train_recall', "Training Recall scores", "Mean Training Recall", False),
        ('train_f1', "Training F1 scores", "Mean Training F1 Score", False),
        ('test_accuracy', "Validation Accuracy scores", "Mean Validation Accuracy", True),
        ('test_precision', "Validation Precision scores", "Mean Validation Precision", False),
        ('test_recall', "Validation Recall scores", "Mean Validation Recall", False),
        ('test_f1', "Validation F1 scores", "Mean Validation F1 Score", False),
    )

    summary = {}
    for result_key, scores_label, mean_label, as_percent in report_layout:
        fold_scores = cv_results[result_key]
        summary[scores_label] = fold_scores
        if as_percent:
            summary[mean_label] = fold_scores.mean()*100
        else:
            summary[mean_label] = fold_scores.mean()
    return summary


class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over two parallel lists stored in a dictionary.

    ``data_dict`` must provide ``"texts"`` and ``"label"`` sequences of equal
    length; item ``i`` is the pair ``(texts[i], label[i])``.
    """

    def __init__(self, data_dict: dict):
        # Fail fast on misaligned inputs instead of surfacing an IndexError
        # from inside a DataLoader worker much later.
        n_texts = len(data_dict["texts"])
        n_labels = len(data_dict["label"])
        if n_texts != n_labels:
            raise ValueError(
                f"'texts' and 'label' must have the same length, got {n_texts} and {n_labels}"
            )
        self.data_dict = data_dict

    def __len__(self):
        return len(self.data_dict["texts"])

    def __getitem__(self, idx):
        dd = self.data_dict
        return dd["texts"][idx], dd["label"][idx]


def train_CNN(model, loader, device, optimizer=None):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    Parameters
    ----------
    model : nn.Module
        Must expose ``embedding(X)`` (turns a raw batch into input features)
        and be callable on those features, returning probabilities in [0, 1].
    loader : iterable of (X, y)
        Batches; ``y`` must be a tensor of 0/1 labels.
    device : torch.device or str
        Device the features and targets are moved to.
    optimizer : torch.optim.Optimizer, optional
        Optimizer to step. When omitted, the module-level variable named
        ``optimizer`` is used, which is what earlier callers relied on.

    Returns
    -------
    float
        Sum of the per-batch binary cross-entropy losses divided by the
        number of batches (``0.0`` when ``loader`` yields nothing).

    Raises
    ------
    ValueError
        If no optimizer is passed and no global ``optimizer`` exists.
    """
    if optimizer is None:
        # Backward compatibility with callers that set a global `optimizer`.
        optimizer = globals().get("optimizer")
        if optimizer is None:
            raise ValueError(
                "train_CNN needs an optimizer: pass optimizer=... or define a global 'optimizer'"
            )

    model.train()
    total_loss = 0.0
    num_batches = 0

    for X, y in tqdm(loader):
        optimizer.zero_grad()

        inputs = model.embedding(X).to(device)
        target = y.to(device, dtype=torch.float32)

        # reshape guards against a model that squeezes a batch of one down to
        # a 0-dim tensor, which binary_cross_entropy would reject.
        pred = model(inputs).reshape(target.shape)

        loss = F.binary_cross_entropy(pred, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Count batches ourselves: avoids ZeroDivisionError on an empty loader
    # and does not require the loader to implement __len__.
    return total_loss / num_batches if num_batches else 0.0

@torch.no_grad()
def eval_mentalbert(model, loader, device):
    """Run ``model`` over ``loader`` in eval mode and collect predictions.

    Parameters
    ----------
    model : nn.Module
        Must expose ``embedding(X)`` and be callable on its result.
    loader : iterable of (X, y)
        Batches; ``y`` must be a tensor of labels.
    device : torch.device or str
        Device the features and targets are moved to.

    Returns
    -------
    (preds, targets) : tuple of torch.Tensor
        1-D tensors with one entry per example, in loader order. ``targets``
        is float32. Both are empty when ``loader`` yields nothing.
    """
    model.eval()

    preds = []
    targets = []
    for X, y in tqdm(loader):
        inputs = model.embedding(X).to(device)
        target = y.to(device, dtype=torch.float32)

        # reshape keeps a batch of one as shape (1,); a 0-dim prediction
        # cannot be concatenated by torch.cat below.
        pred = model(inputs).reshape(target.shape)

        preds.append(pred)
        targets.append(target)

    if not preds:
        # torch.cat raises on an empty list; return empty tensors instead.
        return torch.empty(0, device=device), torch.empty(0, device=device)

    return torch.cat(preds), torch.cat(targets)

class ConvNeuralNet(nn.Module):
    """Multi-kernel 1-D CNN binary classifier on top of MentalBERT hidden states.

    The pretrained transformer is used as a fixed feature extractor (it is
    only ever run under ``torch.no_grad()`` in :meth:`embedding`). Its output,
    expected as ``(batch, seq_len, embedding_size)``, is fed to four parallel
    Conv1d -> ReLU -> MaxPool1d branches that treat the ``seq_len`` token
    positions as input channels. The pooled maps are concatenated, flattened,
    regularised with dropout and mapped to one sigmoid probability.

    Note: ``model.embedding`` always resolved to the :meth:`embedding` method.
    The ``nn.Embedding`` layer that used to be registered under the same name
    was never reachable, so it is no longer created.
    """

    def __init__(self):
        super(ConvNeuralNet, self).__init__()

        self.tokenizer = AutoTokenizer.from_pretrained("mental/mental-bert-base-uncased")
        self.pretrained_model = AutoModel.from_pretrained("mental/mental-bert-base-uncased")

        # Parameters regarding text preprocessing
        self.seq_len = 100          # tokens per text; also Conv1d in_channels
        self.num_words = 50000      # kept for compatibility; no longer used
        self.embedding_size = 768   # width of each token vector

        # Dropout definition
        self.dropout = nn.Dropout(0.25)

        # CNN parameters definition
        # Kernel sizes
        self.kernel_1 = 2
        self.kernel_2 = 3
        self.kernel_3 = 4
        self.kernel_4 = 5

        # Output size for each convolution
        self.out_size = 768
        # Number of strides for each convolution
        self.stride = 1

        # Convolution layers definition
        self.conv_1 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_1, self.stride)
        self.conv_2 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_2, self.stride)
        self.conv_3 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_3, self.stride)
        self.conv_4 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_4, self.stride)

        # Max pooling layers definition
        self.pool_1 = nn.MaxPool1d(self.kernel_1, self.stride)
        self.pool_2 = nn.MaxPool1d(self.kernel_2, self.stride)
        self.pool_3 = nn.MaxPool1d(self.kernel_3, self.stride)
        self.pool_4 = nn.MaxPool1d(self.kernel_4, self.stride)

        # Fully connected layer definition
        self.fc = nn.Linear(self.in_features_fc(), 1)

        # forward() returns sigmoid probabilities for a single output unit,
        # so binary cross-entropy is the matching loss.
        self.criterion = nn.BCELoss()

    def get_pretrained_model(self):
        """Return the pretrained transformer used to produce token features."""
        return self.pretrained_model

    def get_tokenizer(self):
        """Return the tokenizer paired with the pretrained transformer."""
        return self.tokenizer

    def get_criterion(self):
        """Return the loss module that matches this model's output."""
        return self.criterion

    def assign_optimizer(self, lr=0.0001, weight_decay = 0.005, momentum = 0.9):
        """Build an SGD optimizer over this model's own parameters.

        The hyper-parameters passed in are the ones actually used.
        """
        return torch.optim.SGD(
            self.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum
        )

    def tokenize(
        self,
        texts: "list[str]",
        max_length: int = 100,
        truncation: bool = True,
        padding: Union[bool, str] = True,
    ):
        """Tokenize ``texts`` into PyTorch tensors.

        Parameters
        ----------
        texts : iterable of str
            Raw texts; materialised with ``list(texts)`` before tokenizing.
        max_length : int
            Forwarded to the tokenizer as ``max_length``.
        truncation : bool
            Forwarded to the tokenizer as ``truncation``.
        padding : bool or str
            Forwarded to the tokenizer as ``padding``. ``True`` pads to the
            longest text in the batch; ``"max_length"`` pads every text to
            ``max_length``.

        Returns
        -------
        The tokenizer's output with ``return_tensors="pt"``.
        """
        tokenizer = self.get_tokenizer()
        return tokenizer(
            list(texts),
            max_length=max_length,
            truncation=truncation,
            padding=padding,
            return_tensors="pt",
        )

    def embedding(self, inputs):
        """Encode raw texts with the pretrained transformer, without gradients.

        Every text is truncated/padded to exactly ``self.seq_len`` tokens so
        the result always matches the ``in_channels`` of the Conv1d layers,
        even when all texts in a batch are short.

        Returns the first element of the transformer output (the per-token
        hidden states for Hugging Face BERT models), on the transformer's own
        device.
        """
        bert = self.get_pretrained_model()
        # Use the device the weights live on instead of a notebook global.
        target_device = next(bert.parameters()).device

        encoded_input = self.tokenize(
            inputs, max_length=self.seq_len, truncation=True, padding="max_length"
        )

        bert.eval()
        with torch.no_grad():
            model_output = bert(**encoded_input.to(target_device))
            hid_states = model_output[0]
        return hid_states

    def _pooled_length(self, kernel):
        """Length left after one Conv1d and one MaxPool1d with this kernel.

        Uses the Conv1d output-length formula with padding 0 and dilation 1:
        ``floor((L_in - (kernel - 1) - 1) / stride + 1)``, applied twice.
        """
        conv_len = math.floor(((self.embedding_size - 1 * (kernel - 1) - 1) / self.stride) + 1)
        return math.floor(((conv_len - 1 * (kernel - 1) - 1) / self.stride) + 1)

    def in_features_fc(self):
        """Number of input features of the fully connected layer.

        Sum of the pooled lengths of the four branches, times ``out_size``
        channels per branch.

        source: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
        """
        kernels = (self.kernel_1, self.kernel_2, self.kernel_3, self.kernel_4)
        return sum(self._pooled_length(kernel) for kernel in kernels) * self.out_size

    def forward(self, x):
        """Map features of shape ``(batch, seq_len, embedding_size)`` to probabilities.

        Returns a 1-D tensor of shape ``(batch,)`` with values in [0, 1].
        """
        branches = (
            (self.conv_1, self.pool_1),
            (self.conv_2, self.pool_2),
            (self.conv_3, self.pool_3),
            (self.conv_4, self.pool_4),
        )
        pooled = [pool(torch.relu(conv(x))) for conv, pool in branches]

        # The output of each convolutional branch is concatenated and flattened
        union = torch.cat(pooled, 2)
        union = union.reshape(union.size(0), -1)

        # Dropout regularises the features. Applying it to the single output
        # logit instead would randomly force predictions to exactly 0.5.
        union = self.dropout(union)

        out = torch.sigmoid(self.fc(union))

        # squeeze(-1) drops only the unit feature axis, so a batch of one
        # stays 1-D instead of collapsing to a 0-dim scalar.
        return out.squeeze(-1)

if __name__ == "__main__":
    # load_datasets() needs `pd`, which this cell's own import list does not
    # bring in; importing it here keeps the cell runnable on its own.
    import pandas as pd

    # Reproducibility: seed every RNG in play.
    random.seed(2022)
    np.random.seed(2022)
    torch.manual_seed(2022)

    sample_size = None   # set to an int to run on a small prefix of each split
    batch_size = 64
    n_epochs = 10

    # `device`, `model` and `optimizer` are deliberately module-level names:
    # helper code in this notebook may look them up as globals.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_raw, test_raw = load_datasets()

    if sample_size is not None:
        for key in ["texts", "label"]:
            train_raw[key] = train_raw[key][:sample_size]
            test_raw[key] = test_raw[key][:sample_size]

    print("=" * 80)
    print("Running test code for part 1")
    print("-" * 80)

    test_loader = torch.utils.data.DataLoader(
        Dataset(test_raw), batch_size=batch_size, shuffle=False
    )

    X = np.array(train_raw['texts'])
    y = np.array(train_raw['label'])

    # NOTE(review): folds are contiguous slices (no shuffling); confirm the
    # CSV rows are not ordered in a way that biases the folds.
    k = 10
    kf = KFold(n_splits=k, random_state=None)

    metric_names = ("accuracy", "precision", "recall", "f1")

    def _binary_metrics(targets, preds):
        """Accuracy/precision/recall/F1 for 0/1 tensors, moved to CPU once."""
        y_true = targets.cpu()
        y_pred = preds.cpu()
        return {
            "accuracy": accuracy_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred),
            "recall": recall_score(y_true, y_pred),
            "f1": f1_score(y_true, y_pred),
        }

    def _mean(values):
        """Arithmetic mean of a non-empty list."""
        return sum(values) / len(values)

    # One entry per fold for each metric.
    fold_train = {name: [] for name in metric_names}
    fold_valid = {name: [] for name in metric_names}

    for f, (train_index, test_index) in enumerate(kf.split(X), start=1):

        train = {'texts': X[train_index].tolist(), 'label': y[train_index].tolist()}
        valid = {'texts': X[test_index].tolist(), 'label': y[test_index].tolist()}

        train_loader = torch.utils.data.DataLoader(
            Dataset(train), batch_size=batch_size, shuffle=True
        )

        valid_loader = torch.utils.data.DataLoader(
            Dataset(valid), batch_size=batch_size, shuffle=False
        )

        # A fresh model and optimizer per fold so no weights leak across folds.
        model = ConvNeuralNet().to(device)
        optimizer = model.assign_optimizer(lr=1e-4)

        # One entry per epoch for each metric.
        epoch_train = {name: [] for name in metric_names}
        epoch_valid = {name: [] for name in metric_names}

        for epoch in range(n_epochs):

            loss = train_CNN(model, train_loader, device=device)

            train_preds, train_targets = eval_mentalbert(model, train_loader, device=device)
            train_scores = _binary_metrics(train_targets, train_preds.round())

            eval_preds, eval_targets = eval_mentalbert(model, valid_loader, device=device)
            valid_scores = _binary_metrics(eval_targets, eval_preds.round())

            for name in metric_names:
                epoch_train[name].append(train_scores[name])
                epoch_valid[name].append(valid_scores[name])

            print("Epoch:", epoch)
            print("Training loss:", loss)
            print("Train Accuracy:", train_scores["accuracy"])
            print("Validation Accuracy:", valid_scores["accuracy"])

            print('---'*10)

        # NOTE(review): the per-fold score is the average over ALL epochs
        # (including early, barely-trained ones), as in the original run;
        # confirm whether the final-epoch score was intended instead.
        for name in metric_names:
            fold_train[name].append(_mean(epoch_train[name]))
            fold_valid[name].append(_mean(epoch_valid[name]))

        print('Fold',f, '>>>...'*10)

    print("Total Training Accuracy:", _mean(fold_train["accuracy"]))
    print("Total Training Precision:", _mean(fold_train["precision"]))
    print("Total Training Recall:", _mean(fold_train["recall"]))
    print("Total Training F1_Score:", _mean(fold_train["f1"]))

    print('>>><<<'*10)

    print("Total Validation Accuracy:", _mean(fold_valid["accuracy"]))
    print("Total Validation Precision:", _mean(fold_valid["precision"]))
    print("Total Validation Recall:", _mean(fold_valid["recall"]))
    print("Total Validation F1_Score:", _mean(fold_valid["f1"]))

    print('>>><<<'*10)

    # The held-out test set is scored with the model trained in the LAST fold.
    test_preds, test_targets = eval_mentalbert(model, test_loader, device=device)
    test_scores = _binary_metrics(test_targets, test_preds.round())

    print("Total TEST Accuracy:", test_scores["accuracy"])
    print("Total TEST Precision:", test_scores["precision"])
    print("Total TEST Recall:", test_scores["recall"])
    print("Total TEST F1_Score:", test_scores["f1"])


Running test code for part 1
--------------------------------------------------------------------------------


Some weights of the model checkpoint at mental/mental-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 0
Training loss: 1.4098035112023353
Train Accuracy: 0.5219263899765074
Validation Accuracy: 0.545774647887324
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 1
Training loss: 1.7861625842750073
Train Accuracy: 0.8574784651527017
Validation Accuracy: 0.7605633802816901
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 2
Training loss: 0.37295989841222765
Train Accuracy: 0.8441660140955364
Validation Accuracy: 0.75
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 3
Training loss: 0.6659469345584512
Train Accuracy: 0.9385277995301488
Validation Accuracy: 0.7852112676056338
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 4
Training loss: 0.25943852718919513
Train Accuracy: 0.9138606108065779
Validation Accuracy: 0.7676056338028169
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 5
Training loss: 0.23260679170489312
Train Accuracy: 0.9690681284259984
Validation Accuracy: 0.7535211267605634
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 6
Training loss: 0.10824923720210791
Train Accuracy: 0.9600626468285043
Validation Accuracy: 0.75
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 7
Training loss: 0.12555863047018648
Train Accuracy: 0.9933437744714174
Validation Accuracy: 0.778169014084507
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 8
Training loss: 0.07458470109850168
Train Accuracy: 0.9949099451840251
Validation Accuracy: 0.795774647887324
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 9
Training loss: 0.0633189044892788
Train Accuracy: 0.9980422866092404
Validation Accuracy: 0.7887323943661971
------------------------------
Fold 1 >>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...


Some weights of the model checkpoint at mental/mental-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 0
Training loss: 1.942244605720043
Train Accuracy: 0.7325763508222396
Validation Accuracy: 0.6338028169014085
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 1
Training loss: 1.3452190980315208
Train Accuracy: 0.6714956930305404
Validation Accuracy: 0.5880281690140845
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 2
Training loss: 0.5258067052811384
Train Accuracy: 0.8958496476115897
Validation Accuracy: 0.7570422535211268
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 3
Training loss: 0.4022868718951941
Train Accuracy: 0.8794048551292091
Validation Accuracy: 0.7112676056338029
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 4
Training loss: 0.19042825233191252
Train Accuracy: 0.9612372748629601
Validation Accuracy: 0.7640845070422535
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 5
Training loss: 0.124464625492692
Train Accuracy: 0.9792482380579484
Validation Accuracy: 0.7746478873239436
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 6
Training loss: 0.11324091292917729
Train Accuracy: 0.9487079091620987
Validation Accuracy: 0.7323943661971831
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 7
Training loss: 0.09148663450032472
Train Accuracy: 0.9949099451840251
Validation Accuracy: 0.7746478873239436
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 8
Training loss: 0.08059937153011561
Train Accuracy: 0.9988253719655442
Validation Accuracy: 0.7605633802816901
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 9
Training loss: 0.06614867877215147
Train Accuracy: 0.9980422866092404
Validation Accuracy: 0.7711267605633803
------------------------------
Fold 2 >>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...


Some weights of the model checkpoint at mental/mental-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 0
Training loss: 37.540328186750415
Train Accuracy: 0.5250587314017228
Validation Accuracy: 0.5176056338028169
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 1
Training loss: 47.487877178192136
Train Accuracy: 0.5250587314017228
Validation Accuracy: 0.5176056338028169
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 2
Training loss: 47.49595909118652
Train Accuracy: 0.5250587314017228
Validation Accuracy: 0.5176056338028169
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 3
Training loss: 47.50404090881348
Train Accuracy: 0.5250587314017228
Validation Accuracy: 0.5176056338028169
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 4
Training loss: 47.49595909118652
Train Accuracy: 0.5250587314017228
Validation Accuracy: 0.5176056338028169
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 5
Training loss: 47.5
Train Accuracy: 0.5250587314017228
Validation Accuracy: 0.5176056338028169
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 6
Training loss: 47.5
Train Accuracy: 0.5250587314017228
Validation Accuracy: 0.5176056338028169
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 7
Training loss: 47.487877178192136
Train Accuracy: 0.5250587314017228
Validation Accuracy: 0.5176056338028169
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 8
Training loss: 47.51616382598877
Train Accuracy: 0.5250587314017228
Validation Accuracy: 0.5176056338028169
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 9
Training loss: 47.51616382598877
Train Accuracy: 0.5250587314017228
Validation Accuracy: 0.5176056338028169
------------------------------
Fold 3 >>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...


Some weights of the model checkpoint at mental/mental-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 0
Training loss: 1.6657104007899761
Train Accuracy: 0.5411119812059515
Validation Accuracy: 0.47183098591549294
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 1
Training loss: 0.6069211006164551
Train Accuracy: 0.7552858261550509
Validation Accuracy: 0.6514084507042254
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 2
Training loss: 0.43489958830177783
Train Accuracy: 0.8234142521534847
Validation Accuracy: 0.6690140845070423
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 3
Training loss: 0.24299126565456391
Train Accuracy: 0.89389193422083
Validation Accuracy: 0.7676056338028169
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 4
Training loss: 0.22870861254632474
Train Accuracy: 0.9768989819890368
Validation Accuracy: 0.7992957746478874
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 5
Training loss: 0.1374806050211191
Train Accuracy: 0.9882537196554424
Validation Accuracy: 0.778169014084507
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 6
Training loss: 0.18808551803231238
Train Accuracy: 0.6832419733750978
Validation Accuracy: 0.5669014084507042
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 7
Training loss: 0.22422890327870845
Train Accuracy: 0.9964761158966328
Validation Accuracy: 0.7922535211267606
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 8
Training loss: 0.094987434707582
Train Accuracy: 0.9972592012529365
Validation Accuracy: 0.7922535211267606
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 9
Training loss: 0.06655817413702607
Train Accuracy: 0.9964761158966328
Validation Accuracy: 0.7852112676056338
------------------------------
Fold 4 >>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...


Some weights of the model checkpoint at mental/mental-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 0
Training loss: 32.898042215406896
Train Accuracy: 0.5211433046202036
Validation Accuracy: 0.5528169014084507
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 1
Training loss: 47.878502178192136
Train Accuracy: 0.5211433046202036
Validation Accuracy: 0.5528169014084507
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 2
Training loss: 47.878502178192136
Train Accuracy: 0.5211433046202036
Validation Accuracy: 0.5528169014084507
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 3
Training loss: 47.88658409118652
Train Accuracy: 0.5211433046202036
Validation Accuracy: 0.5528169014084507
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 4
Training loss: 47.890625
Train Accuracy: 0.5211433046202036
Validation Accuracy: 0.5528169014084507
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 5
Training loss: 47.902747821807864
Train Accuracy: 0.5211433046202036
Validation Accuracy: 0.5528169014084507
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 6
Training loss: 47.87446117401123
Train Accuracy: 0.5211433046202036
Validation Accuracy: 0.5528169014084507
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 7
Training loss: 47.890625
Train Accuracy: 0.5211433046202036
Validation Accuracy: 0.5528169014084507
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 8
Training loss: 47.89466590881348
Train Accuracy: 0.5211433046202036
Validation Accuracy: 0.5528169014084507
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 9
Training loss: 47.862338352203366
Train Accuracy: 0.5211433046202036
Validation Accuracy: 0.5528169014084507
------------------------------
Fold 5 >>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...


Some weights of the model checkpoint at mental/mental-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 0
Training loss: 0.9687975704669952
Train Accuracy: 0.6319498825371965
Validation Accuracy: 0.5950704225352113
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 1
Training loss: 0.7840157553553582
Train Accuracy: 0.8057948316366484
Validation Accuracy: 0.7288732394366197
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 2
Training loss: 0.7850919481366873
Train Accuracy: 0.9036805011746281
Validation Accuracy: 0.7746478873239436
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 3
Training loss: 0.3371526677161455
Train Accuracy: 0.922866092404072
Validation Accuracy: 0.7676056338028169
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 4
Training loss: 0.2460744708776474
Train Accuracy: 0.9686765857478465
Validation Accuracy: 0.7992957746478874
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 5
Training loss: 0.20552024021744728
Train Accuracy: 0.9812059514487079
Validation Accuracy: 0.795774647887324
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 6
Training loss: 0.09967048279941082
Train Accuracy: 0.9956930305403289
Validation Accuracy: 0.8063380281690141
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 7
Training loss: 0.08351591397076845
Train Accuracy: 0.9988253719655442
Validation Accuracy: 0.795774647887324
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 8
Training loss: 0.08034364329650998
Train Accuracy: 0.9984338292873923
Validation Accuracy: 0.7887323943661971
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 9
Training loss: 0.0685228538699448
Train Accuracy: 0.9992169146436961
Validation Accuracy: 0.795774647887324
------------------------------
Fold 6 >>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...


Some weights of the model checkpoint at mental/mental-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 0
Training loss: 2.968592632561922
Train Accuracy: 0.5509005481597494
Validation Accuracy: 0.5105633802816901
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 1
Training loss: 0.8270316652953624
Train Accuracy: 0.7858261550509006
Validation Accuracy: 0.7077464788732394
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 2
Training loss: 0.3413453184068203
Train Accuracy: 0.7564604541895067
Validation Accuracy: 0.6795774647887324
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 3
Training loss: 0.43378783129155635
Train Accuracy: 0.5712607674236492
Validation Accuracy: 0.5211267605633803
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 4
Training loss: 0.3209982100874186
Train Accuracy: 0.9490994518402506
Validation Accuracy: 0.7887323943661971
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 5
Training loss: 0.1529525952413678
Train Accuracy: 0.9577133907595928
Validation Accuracy: 0.7816901408450704
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 6
Training loss: 0.1316866115666926
Train Accuracy: 0.8496476115896633
Validation Accuracy: 0.7112676056338029
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 7
Training loss: 0.12622862309217453
Train Accuracy: 0.9909945184025059
Validation Accuracy: 0.8028169014084507
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 8
Training loss: 0.0895279142074287
Train Accuracy: 0.9988253719655442
Validation Accuracy: 0.8133802816901409
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 9
Training loss: 0.09671111740171909
Train Accuracy: 0.9960845732184808
Validation Accuracy: 0.7992957746478874
------------------------------
Fold 7 >>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...


Some weights of the model checkpoint at mental/mental-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 0
Training loss: 0.8736565738916398
Train Accuracy: 0.6143304620203602
Validation Accuracy: 0.5985915492957746
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 1
Training loss: 0.875999779254198
Train Accuracy: 0.884886452623336
Validation Accuracy: 0.7816901408450704
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 2
Training loss: 0.38600695356726644
Train Accuracy: 0.7439310884886453
Validation Accuracy: 0.6584507042253521
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 3
Training loss: 0.4195846213027835
Train Accuracy: 0.8884103367267032
Validation Accuracy: 0.7359154929577465
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 4
Training loss: 0.18315517976880075
Train Accuracy: 0.8574784651527017
Validation Accuracy: 0.6901408450704225
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 5
Training loss: 0.23364175017923117
Train Accuracy: 0.9862960062646828
Validation Accuracy: 0.795774647887324
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 6
Training loss: 0.10190398693084717
Train Accuracy: 0.9702427564604542
Validation Accuracy: 0.7570422535211268
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 7
Training loss: 0.10497630890458823
Train Accuracy: 0.9823805794831637
Validation Accuracy: 0.7570422535211268
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 8
Training loss: 0.08121287040412425
Train Accuracy: 0.9976507439310884
Validation Accuracy: 0.7992957746478874
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
58


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
28
Epoch: 9
Training loss: 0.06574068143963814
Train Accuracy: 0.9984338292873923
Validation Accuracy: 0.7887323943661971
------------------------------
Fold 8 >>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...


Some weights of the model checkpoint at mental/mental-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 0
Training loss: 2.406443925946951
Train Accuracy: 0.5350293542074364
Validation Accuracy: 0.5653710247349824
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 1
Training loss: 1.0438283037394285
Train Accuracy: 0.7898238747553816
Validation Accuracy: 0.7385159010600707
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 2
Training loss: 0.5319370184093714
Train Accuracy: 0.8857142857142857
Validation Accuracy: 0.8021201413427562
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 3
Training loss: 0.24763338267803192
Train Accuracy: 0.9495107632093933
Validation Accuracy: 0.8127208480565371
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 4
Training loss: 0.17037013582885266
Train Accuracy: 0.9812133072407045
Validation Accuracy: 0.7985865724381626
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 5
Training loss: 0.14631592016667128
Train Accuracy: 0.9608610567514677
Validation Accuracy: 0.7950530035335689
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 6
Training loss: 0.11088586058467627
Train Accuracy: 0.9632093933463797
Validation Accuracy: 0.7809187279151943
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 7
Training loss: 0.1238863597624004
Train Accuracy: 0.9800391389432486
Validation Accuracy: 0.784452296819788
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 8
Training loss: 0.08152355113998055
Train Accuracy: 0.997651663405088
Validation Accuracy: 0.784452296819788
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 9
Training loss: 0.06531934915110468
Train Accuracy: 0.9984344422700587
Validation Accuracy: 0.7985865724381626
------------------------------
Fold 9 >>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...


Some weights of the model checkpoint at mental/mental-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 0
Training loss: 1.2897528767585755
Train Accuracy: 0.5232876712328767
Validation Accuracy: 0.5335689045936396
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 1
Training loss: 1.064134930074215
Train Accuracy: 0.6900195694716242
Validation Accuracy: 0.6607773851590106
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 2
Training loss: 0.47840975113213063
Train Accuracy: 0.761252446183953
Validation Accuracy: 0.6890459363957597
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 3
Training loss: 0.2616744298487902
Train Accuracy: 0.773385518590998
Validation Accuracy: 0.6890459363957597
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 4
Training loss: 0.21246044114232063
Train Accuracy: 0.9522504892367906
Validation Accuracy: 0.8056537102473498
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 5
Training loss: 0.13922757301479577
Train Accuracy: 0.9240704500978474
Validation Accuracy: 0.7597173144876325
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 6
Training loss: 0.1330608768388629
Train Accuracy: 0.9682974559686889
Validation Accuracy: 0.7809187279151943
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 7
Training loss: 0.10235722884535789
Train Accuracy: 0.9968688845401175
Validation Accuracy: 0.8197879858657244
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 8
Training loss: 0.08040987430140376
Train Accuracy: 0.9980430528375733
Validation Accuracy: 0.8162544169611308
------------------------------


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/40 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
59


  0%|          | 0/5 [00:00<?, ?it/s]

64
64
64
64
27
Epoch: 9
Training loss: 0.07228826014325022
Train Accuracy: 0.9980430528375733
Validation Accuracy: 0.8091872791519434
------------------------------
Fold 10 >>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...>>>...
Total Training Accuracy: 0.8133021652080232
Total Training Preciion: 0.8266327994467476
Total Training Recall: 0.9434487553600631
Total Training F1_Score: 0.8553878884837054
>>><<<>>><<<>>><<<>>><<<>>><<<>>><<<>>><<<>>><<<>>><<<>>><<<
Total Validation Accuracy: 0.6967262230627582
Total Validation Precision: 0.7198916892791158
Total Validation Recall: 0.8385343312084517
Total Validation F1_Score: 0.7408843869017463
>>><<<>>><<<>>><<<>>><<<>>><<<>>><<<>>><<<>>><<<>>><<<>>><<<


  0%|          | 0/12 [00:00<?, ?it/s]

64
64
64
64
64
64
64
64
64
64
64
11
Total TEST Accuracy: 0.7454545454545455
Total TEST Precision: 0.8213058419243986
Total TEST Recall: 0.6476964769647696
Total TEST F1_Score: 0.7242424242424242
