For colab

In [1]:
import os
import sys
from google.colab import drive

# Mount google drive.
drive.mount('/gdrive')

!pip install simpletransformers
!pip install wandb -qU

from simpletransformers.classification import (MultiLabelClassificationModel, MultiLabelClassificationArgs)
import pandas as pd
import logging
import sklearn
import torch
import wandb

Colab=True

Mounted at /gdrive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.11-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.7/250.7 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=4.6.0 (from simpletransformers)
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m109.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from simpletransformers)
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.5 MB/s[0m eta [36m0:

wandb.login()

In [2]:
#Query the current cuda version
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [3]:
#For Colab
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Using device:', device)

Using device: cuda


## Dataset prep

Final Adjustments to the dataset before training. Some outputs are removed to protect Propriety information

In [None]:
pd.set_option('display.max_columns', None)
if Colab==False:
    train_df=pd.read_csv('path to/train_data.csv', encoding='ISO-8859-1')
    test_df=pd.read_csv('path to/test_data.csv', encoding='ISO-8859-1')
    val_df=pd.read_csv('path to/val_data.csv', encoding='ISO-8859-1')
if Colab==True:
    train_df=pd.read_csv('path to/train_data.csv', encoding='ISO-8859-1')
    test_df=pd.read_csv('path to/test_data.csv', encoding='ISO-8859-1')
    val_df=pd.read_csv('path to/val_data.csv', encoding='ISO-8859-1')

train_df.head()

In [None]:
train_df['labels'] = train_df['Discipline'].astype(str) + train_df['Object Type'].astype(str)
test_df['labels'] = test_df['Discipline'].astype(str) + test_df['Object Type'].astype(str)
val_df['labels'] = val_df['Discipline'].astype(str) + val_df['Object Type'].astype(str)

# Convert the concatenated values to a list of integers
train_df['labels'] = train_df['labels'].apply(lambda x: [int(val) for val in x])
test_df['labels'] = test_df['labels'].apply(lambda x: [int(val) for val in x])
val_df['labels'] = val_df['labels'].apply(lambda x: [int(val) for val in x])

# Rename the "Object Text" column to "text"
train_df.rename(columns={'Object Text': 'text'}, inplace=True)
test_df.rename(columns={'Object Text': 'text'}, inplace=True)
val_df.rename(columns={'Object Text': 'text'}, inplace=True)


# Drop all columns except "text" and "labels"
train_df_MC = train_df[['text', 'labels']]
test_df_MC = test_df[['text', 'labels']]
val_df_MC = val_df[['text', 'labels']]

train_df_MC.head()

In [6]:
train_df_MC.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9542 entries, 0 to 9541
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    9542 non-null   object
 1   labels  9542 non-null   object
dtypes: object(2)
memory usage: 149.2+ KB


## Configuring the model

In [7]:
sweep_config = {
    "method": "grid",  # grid, random, bayes
    "metric": {"name": 'eval_loss', "goal": 'minimize'},
    "parameters": {
        "optimizer": {"values": ['AdamW', 'Adafactor']},
        "train_batch_size" :{"values": [32, 8, 4]},
        "learning_rate": {"values": [4e-5, 1e-5]}
        #"learning_rate": {"min": 5e-5, "max": 4e-4},
    },
    "early_terminate" : {
        "type": "hyperband",
        "eta": 2,
        "min_iter":2
    }
}

sweep_id = wandb.sweep(sweep_config, project="MultiLabelRobertaV1.01")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: 6t9no7db
Sweep URL: https://wandb.ai/alanpaddy/MultiLabelRobertaV1.01/sweeps/6t9no7db


In [8]:
ML_args={
    'output_dir':'/gdrive/MyDrive/Transformers/V2/MultiLabel/Checkpoints/', #where all outputs will be stored. This includes model checkpoints and evaluation results.
    'best_model_dir':'/gdrive/MyDrive/Transformers/V2/MultiLabel/BestModel/', #The directory where the best model (model checkpoints) will be saved (based on eval_during_training)
    'cache_dir': '/gdrive/MyDrive/Transformers/V2/MultiLabel/Cache/', #The directory where cached files will be saved.
    'tensorboard_dir': '/gdrive/MyDrive/Transformers/V2/MultiLabel/TensorBoard/', 
    'overwrite_output_dir': True, # trained model will be saved to the ouput_dir but will overwrite existing saved models in the same directory.
    'reprocess_input_data': True, # input data will be reprocessed even if a cached file of the input data exists in the cache_dir.
    'train_batch_size':32, #Larger batch sizes train faster but consume more memory and might show lower accuracy
    'eval_batch_size': 8, #Evaluation batch size
    'gradient_accumulation_steps':1, #The number of training steps to execute before performing a optimizer.step(). Effectively increases the training batch size while sacrificing training time to lower memory consumption.
    'learning_rate': 4e-5,#Lower LR increases accuracy but also increases training time.
    'adafactor_relative_step': True,# If True, time-dependent learning rate is computed instead of external learning rate.
    'num_train_epochs': 4, #3 passes through the training set
    'show_running_loss': True, #Log the loss to the console
    'max_seq_length': 128, #512 #Max number of input tokens
    'weight_decay': 0, #Prevents overfitting by penalizing large weights via adding the L2 penalty.
    'adam_epsilon': 1e-8, #Epsilon parameter used in AdamOptimizer. AdamOptimizer is the extended version of SGD.
    'warmup_ratio': 0.06, #Ratio of total training steps where learning rate will “warm up”.The learning rate is increased linearly over the warm-up period.
    'warmup_steps': 0,#Overrides warmup_ratio if not zero.
    'max_grad_norm': 1.0, #Maximum Gradient clipping. Avoids exploding gradients.
    'logging_steps': 1, #Log training loss and learning at every specified number of steps
    'evaluate_during_training': True, #perform evaluation while training model
    'save_steps': 500, #Save a model checkpoint at every specified number of steps
    'use_early_stopping' : True, #Use early stopping to stop training when early_stopping_metric doesn’t improve (based on early_stopping_patience, and early_stopping_delta)
    'early_stopping_metric': 'eval_loss', #Use validation loss as the early stopping metric
    'early_stopping_patience': 3, #Terminate training after this many evaluations without an improvement in the evaluation metric greater then early_stopping_delta.
    'early_stopping_delta': 0.005, #The improvement over best_eval_loss necessary to count as a better checkpoint.
    #'encoding': #Specify an encoding to be used when reading text files.
    'use_tensorboard': True,
    'optimizer': 'AdamW', #OR Adafactor
    'polynomial_decay_schedule_lr_end': 1e-7, #The end learning rate.
    'scheduler' : 'linear_schedule_with_warmup', #The scheduler to use when training. 
    'fp16': False, #fp16 mode requires NVidia Apex library. Can process up to 8 times faster than FP32 on modern GPUs and require less memory.
    'n_gpu': 1, #Number of GPUs to be used
    'wandb_project':'MultiLabelRobertaV1.01', #Name of W&B project. This will log all hyperparameter values, training losses, and evaluation metrics to the given project.
    'wandb_kwargs' :{'dir': '/gdrive/MyDrive/Transformers/V2/MultiLabel/WandB/' }
}

In [10]:
def train():
    wandb.init(dir='/gdrive/MyDrive/Transformers/V2/MultiLabel/WandB/')

    modelML = MultiLabelClassificationModel('roberta', 'roberta-base', num_labels=2, args=ML_args, use_cuda=True)

    modelML.train_model(train_df_MC,
                        output_dir='/gdrive/MyDrive/Transformers/V2/MultiLabel/Checkpoints/',
                        show_running_loss=True,
                        eval_df=val_df_MC,
                        verbose=True
                        )
    modelML.eval_model(val_df_MC)
    wandb.join()

wandb.agent(sweep_id, train, count=14) #Count should really be 12, since there are only 12 combinations. But wandb agent has a known bug of creating duplicates https://github.com/wandb/wandb/issues/3522

[34m[1mwandb[0m: Agent Starting Run: u1dlmca1 with config:
[34m[1mwandb[0m: 	learning_rate: 4e-05
[34m[1mwandb[0m: 	optimizer: AdamW
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: Currently logged in as: [33malanpaddy[0m. Use [1m`wandb login --relogin`[0m to force relogin


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

  0%|          | 0/9542 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/299 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.391 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.002957…

0,1
LRAP,▆▁▆█
Training loss,█▆▄▂▃▂▂▁▂▁▂▁▁▂▂▃▁▁▃▂▂▁▁▁▁▁▂▁▃▁▃▁▁▂▂▂▂▁▁▁
eval_loss,█▄▂▁
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
lr,▃▄▇███▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train_loss,▇▂█▁

0,1
LRAP,0.99832
Training loss,0.00185
eval_loss,0.07114
global_step,1196.0
lr,0.0
train_loss,0.00185


[34m[1mwandb[0m: Agent Starting Run: 54bqbgo2 with config:
[34m[1mwandb[0m: 	learning_rate: 4e-05
[34m[1mwandb[0m: 	optimizer: AdamW
[34m[1mwandb[0m: 	train_batch_size: 8


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 

  0%|          | 0/9542 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/299 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.396 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.002921…

0,1
LRAP,▁▄▇█
Training loss,█▅▄▃▃▃▄▂▂▂▁▁▃▂▂▁▂▂▁▂▁▁▂▁▁▂▁▂▂▁▁▁▁▁▁▂▁▁▂▁
eval_loss,█▁▂▁
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
lr,▃▄▇███▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train_loss,▂█▁▁

0,1
LRAP,0.99874
Training loss,0.00243
eval_loss,0.07244
global_step,1196.0
lr,0.0
train_loss,0.00243


[34m[1mwandb[0m: Agent Starting Run: wyk45m11 with config:
[34m[1mwandb[0m: 	learning_rate: 4e-05
[34m[1mwandb[0m: 	optimizer: AdamW
[34m[1mwandb[0m: 	train_batch_size: 4


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 

  0%|          | 0/9542 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/299 [00:00<?, ?it/s]

0,1
LRAP,▁██▇
Training loss,█▆▄▃▃▃▃▂▂▁▁▂▃▂▂▂▁▁▂▂▂▂▁▁▁▁▁▂▁▁▂▁▁▂▁▁▁▁▂▁
eval_loss,█▂▁▂
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
lr,▃▄▇███▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train_loss,█▁▁▁

0,1
LRAP,0.99832
Training loss,0.00213
eval_loss,0.07503
global_step,1196.0
lr,0.0
train_loss,0.00213


[34m[1mwandb[0m: Agent Starting Run: gnjpc7m3 with config:
[34m[1mwandb[0m: 	learning_rate: 4e-05
[34m[1mwandb[0m: 	optimizer: Adafactor
[34m[1mwandb[0m: 	train_batch_size: 32


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 

  0%|          | 0/9542 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/299 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
LRAP,▁███
Training loss,█▇▅▃▄▃▃▂▃▂▁▁▂▂▂▁▂▁▁▁▂▂▃▁▁▂▂▁▂▂▁▁▃▂▁▁▁▁▁▁
eval_loss,█▁▁▂
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
lr,▃▄▇███▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train_loss,▃█▁▁

0,1
LRAP,0.99832
Training loss,0.00263
eval_loss,0.07635
global_step,1196.0
lr,0.0
train_loss,0.00263


[34m[1mwandb[0m: Agent Starting Run: 6ds9myk5 with config:
[34m[1mwandb[0m: 	learning_rate: 4e-05
[34m[1mwandb[0m: 	optimizer: Adafactor
[34m[1mwandb[0m: 	train_batch_size: 8


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 

  0%|          | 0/9542 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

0,1
LRAP,▁▁
Training loss,█▆▅▄▃▂▂▄▂▃▂▃▃▃▂▁▂▃▁▁▁▂▂▂▂▁▃▁▁▁▁▁▂▁▁▂▂▃▁▁
eval_loss,█▁
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▁▃▅████▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂
train_loss,█▁

0,1
LRAP,0.99853
Training loss,0.07125
eval_loss,0.09048
global_step,889.0
lr,1e-05
train_loss,0.00315


[34m[1mwandb[0m: Agent Starting Run: ssckcrv2 with config:
[34m[1mwandb[0m: 	learning_rate: 4e-05
[34m[1mwandb[0m: 	optimizer: Adafactor
[34m[1mwandb[0m: 	train_batch_size: 4


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 

  0%|          | 0/9542 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/299 [00:00<?, ?it/s]

0,1
LRAP,▁█▅█
Training loss,█▅▄▃▃▂▃▃▃▃▁▂▂▁▂▁▂▁▂▁▁▁▁▂▁▂▁▁▂▁▂▂▂▁▁▂▂▁▁▁
eval_loss,█▁▃▄
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
lr,▃▄▇███▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train_loss,█▂▄▁

0,1
LRAP,0.99832
Training loss,0.00276
eval_loss,0.07485
global_step,1196.0
lr,0.0
train_loss,0.00276


[34m[1mwandb[0m: Agent Starting Run: yyaqzyem with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: AdamW
[34m[1mwandb[0m: 	train_batch_size: 32


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 

  0%|          | 0/9542 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

0,1
LRAP,▁█
Training loss,█▆▄▃▃▂▂▃▂▂▁▃▃▂▃▂▁▁▂▂▁▂▂▁▂▂▂▁▂▁▁▁▁▁▁▂▂▁▂▁
eval_loss,█▁
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▁▃▅▇████▇▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃
train_loss,█▁

0,1
LRAP,0.99895
Training loss,0.02128
eval_loss,0.09747
global_step,829.0
lr,1e-05
train_loss,0.0073


[34m[1mwandb[0m: Agent Starting Run: f0kbvl2a with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: AdamW
[34m[1mwandb[0m: 	train_batch_size: 8


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 

  0%|          | 0/9542 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/299 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
LRAP,▁▃█▅
Training loss,█▆▄▄▃▂▃▂▃▁▁▂▃▁▂▂▂▂▁▁▁▁▁▁▂▄▁▁▁▁▁▁▁▂▂▁▁▁▁▁
eval_loss,█▃▃▁
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
lr,▃▄▇███▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train_loss,█▇▁▁

0,1
LRAP,0.99832
Training loss,0.00195
eval_loss,0.06805
global_step,1196.0
lr,0.0
train_loss,0.00195


[34m[1mwandb[0m: Agent Starting Run: tu9dnxc1 with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: AdamW
[34m[1mwandb[0m: 	train_batch_size: 4


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 

  0%|          | 0/9542 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/299 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
LRAP,▁▅▁█
Training loss,█▇▃▄▂▃▂▂▃▄▃▃▂▃▁▁▁▁▂▂▁▁▃▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
eval_loss,█▁▄▃
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
lr,▃▄▇███▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train_loss,█▃▅▁

0,1
LRAP,0.99874
Training loss,0.00135
eval_loss,0.07315
global_step,1196.0
lr,0.0
train_loss,0.00135


[34m[1mwandb[0m: Agent Starting Run: zuam5rpq with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: Adafactor
[34m[1mwandb[0m: 	train_batch_size: 32


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 

  0%|          | 0/9542 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
LRAP,█▁
Training loss,█▆▄▄▂▂▂▂▂▂▁▂▂▂▃▃▂▁▁▂▂▂▂▁▁▂▁▁▁▁▃▁▁▁▂▁▁▂▁▁
eval_loss,▁█
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
lr,▁▃▅▆████▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▄▄▃
train_loss,█▁

0,1
LRAP,0.9979
Training loss,0.03149
eval_loss,0.09782
global_step,730.0
lr,2e-05
train_loss,0.00693


[34m[1mwandb[0m: Agent Starting Run: 0dml4ocm with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: Adafactor
[34m[1mwandb[0m: 	train_batch_size: 8


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 

  0%|          | 0/9542 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

0,1
LRAP,▁█
Training loss,█▆▅▄▃▄▄▅▃▂▁▂▃▃▂▃▂▁▂▁▂▁▂▁▃▃▂▁▂▁▂▁▁▁▁▁▁▁▁▁
eval_loss,█▁
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▁▃▅▇████▇▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃
train_loss,█▁

0,1
LRAP,0.99832
Training loss,0.02662
eval_loss,0.09193
global_step,816.0
lr,1e-05
train_loss,0.01234


[34m[1mwandb[0m: Agent Starting Run: rmu5q238 with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: Adafactor
[34m[1mwandb[0m: 	train_batch_size: 4


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 

  0%|          | 0/9542 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/2386 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/299 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
LRAP,▁█
Training loss,█▆▆▃▄▂▃▂▅▃▂▂▃▃▂▁▃▂▁▂▁▂▃▂▁▂▂▂▃▁▁▁▁▁▁▂▂▂▂▂
eval_loss,▁█
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▁▃▅████▇▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃
train_loss,█▁

0,1
LRAP,0.99853
Training loss,0.06109
eval_loss,0.10574
global_step,853.0
lr,1e-05
train_loss,0.01035


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [12]:
modelMLTrained= MultiLabelClassificationModel('roberta', '/gdrive/MyDrive/Transformers/V2/MultiLabel/BestModel/',num_labels=2)
wandb.init() 
result, model_outputs, wrong_predictions = modelMLTrained.eval_model(test_df_MC,
                                                            output_dir='/gdrive/MyDrive/Transformers/V2/MultiLabel/Results/',
                                                            verbose=True)

VBox(children=(Label(value='0.001 MB of 0.010 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.119389…

  0%|          | 0/2983 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/373 [00:00<?, ?it/s]