In [2]:
# standard library
import copy
import itertools
import logging
import os
import pickle
import shutil
import sys
import time
from functools import reduce
from pathlib import Path

# base libraries
import numpy as np
import pandas as pd
import regex as re
import sklearn.metrics as sk

# deep learning libraries
import torch
import transformers
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# hyperparameter optimization
import optuna
import optuna.visualization.matplotlib as oviz
from optuna.samplers import TPESampler

Matplotlib created a temporary config/cache directory at /scratch/matplotlib-7p99_1fw because the default path (/users/tt377/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [3]:
# define directories (the results folder sits inside the data folder)
data_dir = Path("Storage/Bert/")
results_dir = data_dir / "Results"

In [4]:
# seed the PyTorch and NumPy random number generators so reruns draw the same numbers
torch.manual_seed(42)
np.random.seed(42)

# check CUDA availability (the flag is reused later when the model is created)
cuda_available = torch.cuda.is_available()
print("Is CUDA available? ", ("No", "Yes")[cuda_available])

Is CUDA available?  Yes


In [5]:
# configure logging options:
# the "transformers" logger only reports warnings and above,
# while the root logger is configured at INFO level
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level = logging.INFO)

In [6]:
def prepare_data(trial, source_dir = None):
    """Load the pre-split train / validation / test sets with normalised column names.

    Args:
        trial: Optuna trial. Accepted so the call signature stays unchanged;
            it is not used when loading the data.
        source_dir: Directory containing ``train_8_11.csv``, ``valid_8_11.csv``
            and ``test_8_11.csv``. Defaults to the notebook-level ``data_dir``.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of DataFrames whose columns
        are renamed to ``["PatientID", "text", "labels"]``.

    Raises:
        ValueError: Raised by pandas when a CSV does not have exactly three
            columns (the rename is positional).
    """
    # NOTE(review): the splits are read as-is from disk. Presumably they were
    # produced by the stratified split of always_patterns.csv / manual_review.csv
    # that used to live here as commented-out code -- confirm provenance.
    base_dir = Path(data_dir if source_dir is None else source_dir)

    splits = []
    for file_name in ("train_8_11.csv", "valid_8_11.csv", "test_8_11.csv"):
        split = pd.read_csv(base_dir / file_name)
        # positional rename: first column -> PatientID, second -> text, third -> labels
        split.columns = ["PatientID", "text", "labels"]
        splits.append(split)

    train_data, val_data, test_data = splits
    return train_data, val_data, test_data

In [7]:
def define_model(trial, trial_dir):
    """Sample hyperparameters from the trial and build the classification model.

    Args:
        trial: Optuna trial used to suggest the learning rate, Adam epsilon,
            number of training epochs and early-stopping patience.
        trial_dir: Output directory handed to the model arguments as ``output_dir``.

    Returns:
        A ``ClassificationModel`` ("bert" / "emilyalsentzer/Bio_ClinicalBERT",
        3 labels) configured with the sampled hyperparameters. CUDA usage
        follows the notebook-level ``cuda_available`` flag.
    """
    # hyperparameter tuning -- the dict is evaluated top to bottom, so the
    # suggestions are requested in a fixed order
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-8, 1e-4, log = True),
        # "adam_epilson" (sic) is the name the parameter is registered under in
        # the study; it is kept verbatim so stored trials stay comparable
        "adam_epsilon": trial.suggest_float("adam_epilson", 1e-8, 1e-4, log = True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 3),
        "early_stopping_patience": trial.suggest_int("early_stopping_patience", 1, 3),
    }

    # report the sampled values
    report = (
        ("- Learning Rate: {}", "learning_rate"),
        ("- Adam Epsilon: {}", "adam_epsilon"),
        ("- Training Epochs: {}", "num_train_epochs"),
        ("- Early Stopping Patience: {}", "early_stopping_patience"),
    )
    for template, key in report:
        print(template.format(sampled[key]))

    ## NLP ARGUMENTS (fixed, not tuned)
    nlp_options = dict(
        sliding_window = False,
        train_batch_size = 8,
        eval_batch_size = 4,
        do_lower_case = False,
        max_seq_length = 512,  # maximum sequence length the model will support
    )

    ## TRAINING LOOP
    training_loop_options = dict(
        logging_steps = 50,
        manual_seed = 1234,  # fixed seed for reproducible results
        n_gpu = 2,  # number of GPUs to use
        save_steps = 2000,  # checkpoint interval, in steps
        output_dir = trial_dir,
        overwrite_output_dir = True,  # allow overwriting models already saved in output_dir
    )

    ## EVALUATE DURING TRAINING
    evaluation_options = dict(
        evaluate_during_training = True,
        evaluate_during_training_steps = 2000,
        evaluate_during_training_verbose = True,
    )

    ## EARLY STOPPING (driven by the evaluation loss, lower is better)
    early_stopping_options = dict(
        use_early_stopping = True,
        early_stopping_delta = 0,
        early_stopping_metric = "eval_loss",
        early_stopping_metric_minimize = True,
    )

    model_args = ClassificationArgs(
        **sampled,
        **nlp_options,
        **training_loop_options,
        **evaluation_options,
        **early_stopping_options,
    )

    # create the classification model
    return ClassificationModel(
        "bert",
        "emilyalsentzer/Bio_ClinicalBERT",
        num_labels = 3,
        args = model_args,
        use_cuda = cuda_available,
    )

In [8]:
def _timestamp():
    """Return the current local time formatted as HH:MM:SS for log messages."""
    return time.strftime("%H:%M:%S", time.localtime())


def _class_probabilities(model_outputs):
    """Turn per-sequence raw model outputs into probabilities, their row maxima and predicted classes."""
    logits = torch.from_numpy(np.asarray(model_outputs))
    # softmax across the class axis of every row; argmax returns the first
    # maximal index, matching a first-match-wins comparison chain
    probabilities = torch.softmax(logits, dim = 1).numpy().astype(float)
    return probabilities, probabilities.max(axis = 1), probabilities.argmax(axis = 1)


def objective(trial):
    """Optuna objective: train one model and score it on the validation set.

    The trial's artefacts are written to ``Storage/Bert/Results/trial_<number>``:
    the pickled data splits, the pickled model with its evaluation results, and
    a CSV with the per-sequence top probability and predicted class.

    Args:
        trial: Optuna trial that supplies the trial number and the hyperparameters.

    Returns:
        Validation accuracy of the trained model.

    Raises:
        AssertionError: If the training data does not contain exactly three
            distinct labels (the model is built for three classes).
    """
    # log time
    start_time = time.localtime()

    # log message
    print("\n-------- TRIAL #{} --------".format(trial.number))

    # create a fresh output directory; kept as a plain string because it is
    # concatenated below and passed on to define_model
    trial_dir = "Storage/Bert/Results/trial_{}".format(trial.number)
    if os.path.isdir(trial_dir):
        shutil.rmtree(trial_dir)
        print("\n>>> {}: Removing Directory {}\n".format(_timestamp(), trial_dir))
    os.makedirs(trial_dir)  # also creates missing parent folders

    # log message
    print("\n>>> {}: Preparing Data\n".format(_timestamp()))

    train_data, val_data, test_data = prepare_data(trial)

    # the comparison must sit outside len(); otherwise the check is always true
    n_labels = train_data["labels"].nunique()
    assert n_labels == 3, "expected 3 distinct training labels, found {}".format(n_labels)

    # save the data splits to file
    with open(Path(trial_dir, "data_{}.pkl".format(trial.number)), "wb") as f:
        pickle.dump([train_data, val_data, test_data], f)

    # log message
    print("\n>>> {}: Defining Model\n".format(_timestamp()))

    model = define_model(trial, trial_dir)

    # log message
    print("\n>>> {}: Started Training\n".format(_timestamp()))

    # train model
    model.train_model(train_data, eval_df = val_data)

    print("\n>>> {}: Started Evaluation on Validation Set\n".format(_timestamp()))

    results, model_outputs, wrong_predictions = model.eval_model(val_data)

    # save model and evaluation results to file
    with open(Path(trial_dir, "training_results_{}.pkl".format(trial.number)), "wb") as f:
        pickle.dump([model, results, model_outputs, wrong_predictions], f)

    # output message
    print(">>> {}: Get Sequence Probabilities\n".format(_timestamp()))

    # per-sequence class probabilities, top probability and predicted class
    val_probs, max_probs, val_preds = _class_probabilities(model_outputs)

    cw_probs = pd.DataFrame({
        "PatientID": val_data["PatientID"],
        "Prob": max_probs,
        "Pred": val_preds,
    })
    cw_probs.to_csv(trial_dir + "/sequence_probabilities{}.csv".format(trial.number))

    # compute metrics
    true_labels = val_data["labels"].to_list()
    best_auc = sk.roc_auc_score(true_labels, val_probs, multi_class = "ovr", average = "weighted")
    best_acc = sk.accuracy_score(true_labels, cw_probs["Pred"].to_list())
    print(">>> {}: Current AUC: {}\n".format(_timestamp(), best_auc))
    print(">>> {}: Current ACC: {}\n".format(_timestamp(), best_acc))
    print(">>> {}: Start Training Time\n".format(time.strftime("%H:%M:%S", start_time)))
    print(">>> {}: Finish Training Time\n".format(_timestamp()))

    return best_acc

In [9]:
# add stream handler of stdout to show the messages; attach it only once so
# that re-running this cell does not print every Optuna message several times
optuna_logger = optuna.logging.get_logger("optuna")
already_on_stdout = any(
    isinstance(handler, logging.StreamHandler) and getattr(handler, "stream", None) is sys.stdout
    for handler in optuna_logger.handlers
)
if not already_on_stdout:
    optuna_logger.addHandler(logging.StreamHandler(sys.stdout))

# unique identifier of the study
study_name = "slat-study"

# create study database (an existing study with the same name is resumed,
# so every run of this cell adds further trials to it)
storage_name = "sqlite:///{}.db".format("Storage/Bert/Results/" + study_name)
study = optuna.create_study(
    direction = "maximize",  # the objective returns validation accuracy
    sampler = TPESampler(seed = 1234, multivariate = True),
    study_name = study_name,
    storage = storage_name,
    load_if_exists = True,
)
study.optimize(objective, n_trials = 20, gc_after_trial = True)

pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print("\nStudy Statistics:")
print("- Finished Trials: ", len(study.trials))
print("- Pruned Trials: ", len(pruned_trials))
print("- Complete Trials: ", len(complete_trials))

print("\nBest Trial:")
best_trial = study.best_trial

print("- Number: ", best_trial.number)
print("- Value: ", best_trial.value)
print("- Hyperparameters: ")

for key, value in best_trial.params.items():
    print("   - {}: {}".format(key, value))

[32m[I 2021-09-06 09:25:41,339][0m Using an existing study with name 'slat-study' instead of creating a new one.[0m


Using an existing study with name 'slat-study' instead of creating a new one.

-------- TRIAL #15 --------

>>> 09:25:41: Preparing Data


>>> 09:25:41: Defining Model

- Learning Rate: 2.3253948332153446e-05
- Adam Epsilon: 5.179831195176861e-06
- Training Epochs: 3
- Early Stopping Patience: 1


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 09:25:44: Started Training



INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8459588487627712, 'eval_loss': 0.35998507888837794}


Running Epoch 1 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8695427891205677, 'eval_loss': 0.3057558276187414}


Running Epoch 2 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8645777834427238, 'eval_loss': 0.3686744125410058}
INFO:simpletransformers.classification.classification_model: No improvement in eval_loss
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 1
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8500776064840067, 'eval_loss': 0.35360427491966334}
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Storage/Bert/Results/trial_15.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



>>> 09:59:41: Started Evaluation on Validation Set



  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2


Running Evaluation:   0%|          | 0/174 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.8500776064840067, 'eval_loss': 0.35360427491966334}
[32m[I 2021-09-06 10:00:02,302][0m Trial 15 finished with value: 0.909221902017291 and parameters: {'learning_rate': 2.3253948332153446e-05, 'adam_epilson': 5.179831195176861e-06, 'num_train_epochs': 3, 'early_stopping_patience': 1}. Best is trial 13 with value: 0.9279538904899135.[0m


>>> 10:00:02: Get Sequence Probabilities

>>> 10:00:02: Current AUC: 0.987457090339034

>>> 10:00:02: Current ACC: 0.909221902017291

>>> 09:25:41: Start Training Time

>>> 10:00:02: Finish Training Time

Trial 15 finished with value: 0.909221902017291 and parameters: {'learning_rate': 2.3253948332153446e-05, 'adam_epilson': 5.179831195176861e-06, 'num_train_epochs': 3, 'early_stopping_patience': 1}. Best is trial 13 with value: 0.9279538904899135.

-------- TRIAL #16 --------

>>> 10:00:02: Preparing Data


>>> 10:00:02: Defining Model

- Learning Rate: 9.272030580359681e-05
- Adam Epsilon: 7.82420326054716e-07
- Training Epochs: 2
- Early Stopping Patience: 3


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 10:00:04: Started Training



  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8360645529092684, 'eval_loss': 0.47070251662155677}


Running Epoch 1 of 2:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8644108709992477, 'eval_loss': 0.2964124720672081}
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Storage/Bert/Results/trial_16.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



>>> 10:22:37: Started Evaluation on Validation Set



  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2


Running Evaluation:   0%|          | 0/174 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.8644108709992477, 'eval_loss': 0.2964124720672081}
[32m[I 2021-09-06 10:23:00,479][0m Trial 16 finished with value: 0.9164265129682997 and parameters: {'learning_rate': 9.272030580359681e-05, 'adam_epilson': 7.82420326054716e-07, 'num_train_epochs': 2, 'early_stopping_patience': 3}. Best is trial 13 with value: 0.9279538904899135.[0m


>>> 10:23:00: Get Sequence Probabilities

>>> 10:23:00: Current AUC: 0.9809571719565615

>>> 10:23:00: Current ACC: 0.9164265129682997

>>> 10:00:02: Start Training Time

>>> 10:23:00: Finish Training Time

Trial 16 finished with value: 0.9164265129682997 and parameters: {'learning_rate': 9.272030580359681e-05, 'adam_epilson': 7.82420326054716e-07, 'num_train_epochs': 2, 'early_stopping_patience': 3}. Best is trial 13 with value: 0.9279538904899135.

-------- TRIAL #17 --------

>>> 10:23:00: Preparing Data


>>> 10:23:00: Defining Model

- Learning Rate: 9.964962382649844e-05
- Adam Epsilon: 1.1317996951007706e-06
- Training Epochs: 3
- Early Stopping Patience: 2


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 10:23:02: Started Training



  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8176325100465327, 'eval_loss': 0.447599904290561}


Running Epoch 1 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8428333840252515, 'eval_loss': 0.3539542960024428}


Running Epoch 2 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.856578551714867, 'eval_loss': 0.31879039468436404}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8606960900960889, 'eval_loss': 0.28777324194195625}
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Storage/Bert/Results/trial_17.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



>>> 10:57:09: Started Evaluation on Validation Set



  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2


Running Evaluation:   0%|          | 0/174 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.8606960900960889, 'eval_loss': 0.28777324194195625}
[32m[I 2021-09-06 10:57:32,525][0m Trial 17 finished with value: 0.9135446685878963 and parameters: {'learning_rate': 9.964962382649844e-05, 'adam_epilson': 1.1317996951007706e-06, 'num_train_epochs': 3, 'early_stopping_patience': 2}. Best is trial 13 with value: 0.9279538904899135.[0m


>>> 10:57:32: Get Sequence Probabilities

>>> 10:57:32: Current AUC: 0.9757027444432944

>>> 10:57:32: Current ACC: 0.9135446685878963

>>> 10:23:00: Start Training Time

>>> 10:57:32: Finish Training Time

Trial 17 finished with value: 0.9135446685878963 and parameters: {'learning_rate': 9.964962382649844e-05, 'adam_epilson': 1.1317996951007706e-06, 'num_train_epochs': 3, 'early_stopping_patience': 2}. Best is trial 13 with value: 0.9279538904899135.

-------- TRIAL #18 --------

>>> 10:57:32: Preparing Data


>>> 10:57:32: Defining Model

- Learning Rate: 2.660982629344484e-05
- Adam Epsilon: 3.7512264002029025e-08
- Training Epochs: 2
- Early Stopping Patience: 1


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 10:57:34: Started Training



  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8676032763441391, 'eval_loss': 0.3677904372927786}


Running Epoch 1 of 2:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8646482749400378, 'eval_loss': 0.27947936181364386}
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Storage/Bert/Results/trial_18.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



>>> 11:20:16: Started Evaluation on Validation Set



  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2


Running Evaluation:   0%|          | 0/174 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.8646482749400378, 'eval_loss': 0.27947936181364386}
[32m[I 2021-09-06 11:20:39,426][0m Trial 18 finished with value: 0.9178674351585014 and parameters: {'learning_rate': 2.660982629344484e-05, 'adam_epilson': 3.7512264002029025e-08, 'num_train_epochs': 2, 'early_stopping_patience': 1}. Best is trial 13 with value: 0.9279538904899135.[0m


>>> 11:20:39: Get Sequence Probabilities

>>> 11:20:39: Current AUC: 0.9870058343829111

>>> 11:20:39: Current ACC: 0.9178674351585014

>>> 10:57:32: Start Training Time

>>> 11:20:39: Finish Training Time

Trial 18 finished with value: 0.9178674351585014 and parameters: {'learning_rate': 2.660982629344484e-05, 'adam_epilson': 3.7512264002029025e-08, 'num_train_epochs': 2, 'early_stopping_patience': 1}. Best is trial 13 with value: 0.9279538904899135.

-------- TRIAL #19 --------

>>> 11:20:39: Preparing Data


>>> 11:20:39: Defining Model

- Learning Rate: 2.478655195871051e-08
- Adam Epsilon: 1.8639103000924185e-08
- Training Epochs: 1
- Early Stopping Patience: 1


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 11:20:41: Started Training



  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
INFO:simpletransformers.classification.classification_model:{'mcc': 0.0, 'eval_loss': 0.9697079713317169}
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Storage/Bert/Results/trial_19.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



>>> 11:32:03: Started Evaluation on Validation Set



  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2


Running Evaluation:   0%|          | 0/174 [00:00<?, ?it/s]

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
INFO:simpletransformers.classification.classification_model:{'mcc': 0.0, 'eval_loss': 0.9697079713317169}
[32m[I 2021-09-06 11:32:26,353][0m Trial 19 finished with value: 0.5259365994236311 and parameters: {'learning_rate': 2.478655195871051e-08, 'adam_epilson': 1.8639103000924185e-08, 'num_train_epochs': 1, 'early_stopping_patience': 1}. Best is trial 13 with value: 0.9279538904899135.[0m


>>> 11:32:26: Get Sequence Probabilities

>>> 11:32:26: Current AUC: 0.7165658761656053

>>> 11:32:26: Current ACC: 0.5259365994236311

>>> 11:20:39: Start Training Time

>>> 11:32:26: Finish Training Time

Trial 19 finished with value: 0.5259365994236311 and parameters: {'learning_rate': 2.478655195871051e-08, 'adam_epilson': 1.8639103000924185e-08, 'num_train_epochs': 1, 'early_stopping_patience': 1}. Best is trial 13 with value: 0.9279538904899135.

-------- TRIAL #20 --------

>>> 11:32:26: Preparing Data


>>> 11:32:26: Defining Model

- Learning Rate: 2.920735249891083e-06
- Adam Epsilon: 2.6230083765667476e-06
- Training Epochs: 2
- Early Stopping Patience: 3


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 11:32:28: Started Training



  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7738119759594191, 'eval_loss': 0.43575980745512866}


Running Epoch 1 of 2:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7931451453770462, 'eval_loss': 0.43729386384459745}
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Storage/Bert/Results/trial_20.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



>>> 11:54:58: Started Evaluation on Validation Set



  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2


Running Evaluation:   0%|          | 0/174 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.7931451453770462, 'eval_loss': 0.43729386384459745}
[32m[I 2021-09-06 11:55:22,337][0m Trial 20 finished with value: 0.8731988472622478 and parameters: {'learning_rate': 2.920735249891083e-06, 'adam_epilson': 2.6230083765667476e-06, 'num_train_epochs': 2, 'early_stopping_patience': 3}. Best is trial 13 with value: 0.9279538904899135.[0m


>>> 11:55:22: Get Sequence Probabilities

>>> 11:55:22: Current AUC: 0.9614324518429661

>>> 11:55:22: Current ACC: 0.8731988472622478

>>> 11:32:26: Start Training Time

>>> 11:55:22: Finish Training Time

Trial 20 finished with value: 0.8731988472622478 and parameters: {'learning_rate': 2.920735249891083e-06, 'adam_epilson': 2.6230083765667476e-06, 'num_train_epochs': 2, 'early_stopping_patience': 3}. Best is trial 13 with value: 0.9279538904899135.

-------- TRIAL #21 --------

>>> 11:55:22: Preparing Data


>>> 11:55:22: Defining Model

- Learning Rate: 2.359720345321485e-06
- Adam Epsilon: 9.996366669466755e-05
- Training Epochs: 3
- Early Stopping Patience: 3


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 11:55:24: Started Training



  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.6267114658745542, 'eval_loss': 0.5447068488460848}


Running Epoch 1 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7151631296776112, 'eval_loss': 0.4715135947041128}


Running Epoch 2 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7151007426111852, 'eval_loss': 0.46186374795847923}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7260927944666324, 'eval_loss': 0.45222556454011764}
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Storage/Bert/Results/trial_21.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



>>> 12:29:39: Started Evaluation on Validation Set



  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2


Running Evaluation:   0%|          | 0/174 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.7260927944666324, 'eval_loss': 0.45222556454011764}
[32m[I 2021-09-06 12:30:03,041][0m Trial 21 finished with value: 0.8328530259365994 and parameters: {'learning_rate': 2.359720345321485e-06, 'adam_epilson': 9.996366669466755e-05, 'num_train_epochs': 3, 'early_stopping_patience': 3}. Best is trial 13 with value: 0.9279538904899135.[0m


>>> 12:30:02: Get Sequence Probabilities

>>> 12:30:03: Current AUC: 0.9447438405934814

>>> 12:30:03: Current ACC: 0.8328530259365994

>>> 11:55:22: Start Training Time

>>> 12:30:03: Finish Training Time

Trial 21 finished with value: 0.8328530259365994 and parameters: {'learning_rate': 2.359720345321485e-06, 'adam_epilson': 9.996366669466755e-05, 'num_train_epochs': 3, 'early_stopping_patience': 3}. Best is trial 13 with value: 0.9279538904899135.

-------- TRIAL #22 --------

>>> 12:30:03: Preparing Data


>>> 12:30:03: Defining Model

- Learning Rate: 4.4729180145319904e-05
- Adam Epsilon: 3.917636609396379e-05
- Training Epochs: 3
- Early Stopping Patience: 2


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 12:30:05: Started Training



  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.851039806366591, 'eval_loss': 0.3637489655922199}


Running Epoch 1 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8852443707156854, 'eval_loss': 0.26824407536407996}


Running Epoch 2 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8739113968626494, 'eval_loss': 0.3107228734712491}
INFO:simpletransformers.classification.classification_model: No improvement in eval_loss
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 2
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.879540974025724, 'eval_loss': 0.29862670240731076}
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Storage/Bert/Results/trial_22.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



>>> 13:04:18: Started Evaluation on Validation Set



  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2


Running Evaluation:   0%|          | 0/174 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.879540974025724, 'eval_loss': 0.29862670240731076}
[32m[I 2021-09-06 13:04:41,924][0m Trial 22 finished with value: 0.9265129682997119 and parameters: {'learning_rate': 4.4729180145319904e-05, 'adam_epilson': 3.917636609396379e-05, 'num_train_epochs': 3, 'early_stopping_patience': 2}. Best is trial 13 with value: 0.9279538904899135.[0m


>>> 13:04:41: Get Sequence Probabilities

>>> 13:04:41: Current AUC: 0.9883149848972057

>>> 13:04:41: Current ACC: 0.9265129682997119

>>> 12:30:03: Start Training Time

>>> 13:04:41: Finish Training Time

Trial 22 finished with value: 0.9265129682997119 and parameters: {'learning_rate': 4.4729180145319904e-05, 'adam_epilson': 3.917636609396379e-05, 'num_train_epochs': 3, 'early_stopping_patience': 2}. Best is trial 13 with value: 0.9279538904899135.

-------- TRIAL #23 --------

>>> 13:04:42: Preparing Data


>>> 13:04:42: Defining Model

- Learning Rate: 2.561091114086183e-08
- Adam Epsilon: 1.5652121859194013e-08
- Training Epochs: 2
- Early Stopping Patience: 3


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 13:04:44: Started Training



  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
INFO:simpletransformers.classification.classification_model:{'mcc': 0.0, 'eval_loss': 0.9614071900817169}


Running Epoch 1 of 2:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
INFO:simpletransformers.classification.classification_model:{'mcc': 0.0, 'eval_loss': 0.9529997726966595}
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Storage/Bert/Results/trial_23.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



>>> 13:27:19: Started Evaluation on Validation Set



  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2


Running Evaluation:   0%|          | 0/174 [00:00<?, ?it/s]

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
INFO:simpletransformers.classification.classification_model:{'mcc': 0.0, 'eval_loss': 0.9529997726966595}
[32m[I 2021-09-06 13:27:43,328][0m Trial 23 finished with value: 0.5259365994236311 and parameters: {'learning_rate': 2.561091114086183e-08, 'adam_epilson': 1.5652121859194013e-08, 'num_train_epochs': 2, 'early_stopping_patience': 3}. Best is trial 13 with value: 0.9279538904899135.[0m


>>> 13:27:43: Get Sequence Probabilities

>>> 13:27:43: Current AUC: 0.7684939948549364

>>> 13:27:43: Current ACC: 0.5259365994236311

>>> 13:04:42: Start Training Time

>>> 13:27:43: Finish Training Time

Trial 23 finished with value: 0.5259365994236311 and parameters: {'learning_rate': 2.561091114086183e-08, 'adam_epilson': 1.5652121859194013e-08, 'num_train_epochs': 2, 'early_stopping_patience': 3}. Best is trial 13 with value: 0.9279538904899135.

-------- TRIAL #24 --------

>>> 13:27:43: Preparing Data


>>> 13:27:43: Defining Model

- Learning Rate: 1.5454548042997283e-05
- Adam Epsilon: 8.566176825278376e-05
- Training Epochs: 3
- Early Stopping Patience: 2


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 13:27:45: Started Training



  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7910497793850809, 'eval_loss': 0.47258988194081974}


Running Epoch 1 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8333935258273405, 'eval_loss': 0.39858346423883545}


Running Epoch 2 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8406106146290592, 'eval_loss': 0.4227320627234448}
INFO:simpletransformers.classification.classification_model: No improvement in eval_loss
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 2
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8376933824997106, 'eval_loss': 0.40373977847482967}
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Storage/Bert/Results/trial_24.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



>>> 14:01:58: Started Evaluation on Validation Set



  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2


Running Evaluation:   0%|          | 0/174 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.8376933824997106, 'eval_loss': 0.40373977847482967}
[32m[I 2021-09-06 14:02:22,456][0m Trial 24 finished with value: 0.9005763688760807 and parameters: {'learning_rate': 1.5454548042997283e-05, 'adam_epilson': 8.566176825278376e-05, 'num_train_epochs': 3, 'early_stopping_patience': 2}. Best is trial 13 with value: 0.9279538904899135.[0m


>>> 14:02:22: Get Sequence Probabilities

>>> 14:02:22: Current AUC: 0.979920571927459

>>> 14:02:22: Current ACC: 0.9005763688760807

>>> 13:27:43: Start Training Time

>>> 14:02:22: Finish Training Time

Trial 24 finished with value: 0.9005763688760807 and parameters: {'learning_rate': 1.5454548042997283e-05, 'adam_epilson': 8.566176825278376e-05, 'num_train_epochs': 3, 'early_stopping_patience': 2}. Best is trial 13 with value: 0.9279538904899135.

-------- TRIAL #25 --------

>>> 14:02:22: Preparing Data


>>> 14:02:22: Defining Model

- Learning Rate: 6.399573419599201e-05
- Adam Epsilon: 8.358075530495174e-05
- Training Epochs: 3
- Early Stopping Patience: 2


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 14:02:24: Started Training



  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8492510126699169, 'eval_loss': 0.3670082010071853}


Running Epoch 1 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8699800355220089, 'eval_loss': 0.28199116971300936}


Running Epoch 2 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8770998868417814, 'eval_loss': 0.3319356801181004}
INFO:simpletransformers.classification.classification_model: No improvement in eval_loss
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 2
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8742904177154086, 'eval_loss': 0.3006961606014734}
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Storage/Bert/Results/trial_25.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



>>> 14:36:37: Started Evaluation on Validation Set



  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2


Running Evaluation:   0%|          | 0/174 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.8742904177154086, 'eval_loss': 0.3006961606014734}
[32m[I 2021-09-06 14:37:02,257][0m Trial 25 finished with value: 0.9236311239193083 and parameters: {'learning_rate': 6.399573419599201e-05, 'adam_epilson': 8.358075530495174e-05, 'num_train_epochs': 3, 'early_stopping_patience': 2}. Best is trial 13 with value: 0.9279538904899135.[0m


>>> 14:37:02: Get Sequence Probabilities

>>> 14:37:02: Current AUC: 0.9888553624622908

>>> 14:37:02: Current ACC: 0.9236311239193083

>>> 14:02:22: Start Training Time

>>> 14:37:02: Finish Training Time

Trial 25 finished with value: 0.9236311239193083 and parameters: {'learning_rate': 6.399573419599201e-05, 'adam_epilson': 8.358075530495174e-05, 'num_train_epochs': 3, 'early_stopping_patience': 2}. Best is trial 13 with value: 0.9279538904899135.

-------- TRIAL #26 --------

>>> 14:37:02: Preparing Data


>>> 14:37:02: Defining Model

- Learning Rate: 4.3237680408766715e-08
- Adam Epsilon: 2.1962679832144807e-05
- Training Epochs: 3
- Early Stopping Patience: 1


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 14:37:04: Started Training



  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
INFO:simpletransformers.classification.classification_model:{'mcc': 0.0, 'eval_loss': 0.9587942539960489}


Running Epoch 1 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.04237320884104599, 'eval_loss': 0.938555838047773}


Running Epoch 2 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.04237320884104599, 'eval_loss': 0.9373793327945402}
  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.04237320884104599, 'eval_loss': 0.93206787109375}
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Storage/Bert/Results/trial_26.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



>>> 15:11:25: Started Evaluation on Validation Set



  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2


Running Evaluation:   0%|          | 0/174 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.04237320884104599, 'eval_loss': 0.93206787109375}
[32m[I 2021-09-06 15:11:49,643][0m Trial 26 finished with value: 0.5273775216138329 and parameters: {'learning_rate': 4.3237680408766715e-08, 'adam_epilson': 2.1962679832144807e-05, 'num_train_epochs': 3, 'early_stopping_patience': 1}. Best is trial 13 with value: 0.9279538904899135.[0m


>>> 15:11:49: Get Sequence Probabilities

>>> 15:11:49: Current AUC: 0.8110726492993022

>>> 15:11:49: Current ACC: 0.5273775216138329

>>> 14:37:02: Start Training Time

>>> 15:11:49: Finish Training Time

Trial 26 finished with value: 0.5273775216138329 and parameters: {'learning_rate': 4.3237680408766715e-08, 'adam_epilson': 2.1962679832144807e-05, 'num_train_epochs': 3, 'early_stopping_patience': 1}. Best is trial 13 with value: 0.9279538904899135.

-------- TRIAL #27 --------

>>> 15:11:49: Preparing Data


>>> 15:11:49: Defining Model

- Learning Rate: 4.025335798567959e-06
- Adam Epsilon: 3.128773758142213e-06
- Training Epochs: 3
- Early Stopping Patience: 2


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 15:11:52: Started Training



  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7916056192633907, 'eval_loss': 0.45641659046041555}


Running Epoch 1 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.821358860905201, 'eval_loss': 0.42163711580736885}


Running Epoch 2 of 3:   0%|          | 0/959 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8175646322636638, 'eval_loss': 0.44134882400775777}
INFO:simpletransformers.classification.classification_model: No improvement in eval_loss
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 2
  torch.nn.utils.clip_grad_norm_(
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8244941055525482, 'eval_loss': 0.4384940739335685}
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Storage/Bert/Results/trial_27.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



>>> 15:46:08: Started Evaluation on Validation Set



  0%|          | 0/694 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_512_3_2


Running Evaluation:   0%|          | 0/174 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.8244941055525482, 'eval_loss': 0.4384940739335685}
[32m[I 2021-09-06 15:46:32,966][0m Trial 27 finished with value: 0.8919308357348703 and parameters: {'learning_rate': 4.025335798567959e-06, 'adam_epilson': 3.128773758142213e-06, 'num_train_epochs': 3, 'early_stopping_patience': 2}. Best is trial 13 with value: 0.9279538904899135.[0m


>>> 15:46:32: Get Sequence Probabilities

>>> 15:46:32: Current AUC: 0.975311501096442

>>> 15:46:32: Current ACC: 0.8919308357348703

>>> 15:11:49: Start Training Time

>>> 15:46:32: Finish Training Time

Trial 27 finished with value: 0.8919308357348703 and parameters: {'learning_rate': 4.025335798567959e-06, 'adam_epilson': 3.128773758142213e-06, 'num_train_epochs': 3, 'early_stopping_patience': 2}. Best is trial 13 with value: 0.9279538904899135.

-------- TRIAL #28 --------

>>> 15:46:33: Preparing Data


>>> 15:46:33: Defining Model

- Learning Rate: 8.073120236427379e-05
- Adam Epsilon: 3.9441532970755586e-05
- Training Epochs: 2
- Early Stopping Patience: 3


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model


>>> 15:46:35: Started Training



  0%|          | 0/7669 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_512_3_2


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/959 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(


KeyboardInterrupt: 