### Manually evaluating model and dataset

In [1]:
import transformers
from transformers import BertForSequenceClassification
import torch
import torch.nn as nn
from common.trainers.bert_glue_trainer import BertGLUETrainer
from data.bert_processors.processors import *
from common.evaluators.bert_glue_evaluator import BertGLUEEvaluator
from loguru import logger
import pandas as pd
from data.h5_processors.h5_processors import *
import numpy as np

In [130]:
class Args(object):
    """Settings bundle handed to ``BertGLUEEvaluator`` for one GLUE task.

    Parameters
    ----------
    max_seq_length : int
        Stored unchanged on ``self.max_seq_length``.
    num_labels : int
        Stored on ``self.num_labels``. Values above 1 select
        ``nn.CrossEntropyLoss``; anything else selects ``nn.MSELoss``.
    model : str
        Task name. It is interpolated into the checkpoint file name, and a
        name containing ``'QQP'`` selects a dedicated file for base models.
    checkpoint : str
        Pretrained checkpoint identifier. If it contains ``'base'`` the
        base-model checkpoint directory is used, otherwise ``bert_large``.

    Attributes
    ----------
    state : str
        Path of the fine-tuned weights file for this task/checkpoint pair.
    device : torch.device
        CUDA when available, CPU otherwise.
    """

    def __init__(self,
                 max_seq_length = 128,
                 num_labels = 2,
                 model = 'RTE',
                 checkpoint = 'bert-base-uncased'):
        # Raw strings keep the Windows separators literal. The previous
        # non-raw literals relied on invalid escape sequences (\w, \d, \e, \%)
        # happening to be left untouched by the parser.
        if 'base' in checkpoint and 'QQP' in model:
            self.state = r"C:\w266\data2\checkpoints\BERT-qqpairs_epoch_1.pt"
        elif 'base' in checkpoint:
            self.state = r"C:\w266\data\embed_checkpoints\%s_epoch_1.pt" % model
        else:
            self.state = r"C:\w266\data\embed_checkpoints\bert_large\%s_epoch_1.pt" % model
            # Alternative checkpoint kept for reference:
            #self.state = r"C:\BERTVision\code\torch\model_checkpoints\bert-large-uncased\CoLA\2021-03-28_17-41-17.pt"
        self.batch_size = 16
        self.num_workers = 0
        self.n_gpu = 1

        # Classification (more than one label) vs. regression (single output).
        if num_labels > 1:
            self.criterion = nn.CrossEntropyLoss()
        else:
            self.criterion = nn.MSELoss()
        self.checkpoint = checkpoint
        self.max_seq_length = max_seq_length
        self.num_labels = num_labels
        self.model = model
        self.error = False
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Module-level default settings; later cells read `self.checkpoint`,
# `self.num_labels` and `self.state` from this instance.
self = Args()

In [123]:
# Shared classifier that evaluate_results uses for every task except STSB.
# NOTE: only `checkpoint` is reassigned here; `self.state` keeps whatever path
# Args.__init__ computed when `self` was created.
self.checkpoint = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(
    self.checkpoint,
    num_labels=self.num_labels,
    output_hidden_states=True,
).to(self.device)  # was hard-coded 'cuda'; self.device falls back to CPU
model.eval();

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [131]:
# Single-output (regression) variant; evaluate_results picks it for STSB.
model_r = BertForSequenceClassification.from_pretrained(
    self.checkpoint,
    num_labels=1,
    output_hidden_states=True,
).to(self.device)  # was hard-coded 'cuda'; self.device falls back to CPU
model_r.eval();

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [117]:
# Three-label classifier used by evaluate_mnli.
model_3 = BertForSequenceClassification.from_pretrained(
    self.checkpoint,
    num_labels=3,
    output_hidden_states=True,
).to(self.device)  # was hard-coded 'cuda'; self.device falls back to CPU

# Derive the weights path from Args so it always matches the architecture
# selected by `self.checkpoint`. For 'bert-large-uncased' this resolves to the
# same bert_large\MNLI_epoch_1.pt file that used to be hard-coded here.
# NOTE(review): for base checkpoints this assumes an MNLI file exists under the
# base checkpoint directory, following Args' naming scheme - confirm.
model_3.load_state_dict(
    torch.load(
        Args(model='MNLI', num_labels=3, checkpoint=self.checkpoint).state,
        map_location=self.device,
    )
)
model_3.eval();

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [125]:
def evaluate_results(processor, **kwargs):
    """Load a task's fine-tuned weights into a shared model and score the dev split.

    Parameters
    ----------
    processor
        Passed through to ``BertGLUEEvaluator`` unchanged.
    **kwargs
        Forwarded to ``Args`` (``model``, ``max_seq_length``, ``num_labels``,
        ``checkpoint``); omitted keys take the ``Args`` defaults.

    Returns
    -------
    The value returned by ``BertGLUEEvaluator.get_loss(type='dev')``.

    Notes
    -----
    Uses the module-level ``model_r`` (for STSB) or ``model`` (everything
    else) and overwrites its weights in place via ``load_state_dict``. The
    chosen global must already have the architecture that matches the
    checkpoint file in ``Args.state``.
    """
    args = Args(**kwargs)

    # Read the task name from `args` rather than `kwargs['model']`, so the
    # Args default applies instead of a KeyError when 'model' is omitted.
    m = model_r if args.model == 'STSB' else model

    # map_location keeps loading working when the checkpoint was saved on a
    # device that is not available in this session.
    m.load_state_dict(torch.load(args.state, map_location=args.device))
    return BertGLUEEvaluator(m,
                             processor,
                             args,
                             logger,
                             standalone_eval = True).get_loss(type='dev')

def evaluate_mnli(processor, **kwargs):
    """Score the module-level ``model_3`` on both MNLI dev splits.

    ``kwargs`` are forwarded to ``Args``. No weights are loaded here;
    ``model_3`` is evaluated in whatever state it currently holds.

    Returns a 2-tuple: the ``get_loss`` result for ``'dev_matched'`` followed
    by the result for ``'dev_mismatched'``.
    """
    settings = Args(**kwargs)

    def _score(split):
        # A fresh evaluator is built for every split.
        evaluator = BertGLUEEvaluator(model_3,
                                      processor,
                                      settings,
                                      logger,
                                      standalone_eval = True)
        return evaluator.get_loss(type=split)

    return (_score('dev_matched'), _score('dev_mismatched'))

In [122]:
# bert-large for STSB.
# The settings are spelled out instead of being read from `kwargs[6]`, which
# is only defined further down and only carried the large checkpoint because
# another cell's loop had injected it.
# NOTE: evaluate_results loads the weights into the global `model_r`, so
# `model_r` must currently be a bert-large instance.
evaluate_results(STSB,
                 model='STSB',
                 max_seq_length=128,
                 num_labels=1,
                 checkpoint='bert-large-uncased')

2021-03-28 21:50:19.696 | INFO     | common.evaluators.bert_glue_evaluator:get_loss:88 - Generating metrics
Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 94/94 [00:12<00:00,  7.60it/s]


(0.8622907880795017, 0.8618186687172965, 0.5767521862971022)

In [132]:
# bert-base for STSB.
# The settings are spelled out so this cell neither depends on the position of
# STSB inside `kwargs` nor mutates that shared list entry in place.
# NOTE: evaluate_results loads the weights into the global `model_r`, so
# `model_r` must currently be a bert-base instance.
evaluate_results(STSB,
                 model='STSB',
                 max_seq_length=128,
                 num_labels=1,
                 checkpoint='bert-base-uncased')

2021-03-28 21:52:49.584 | INFO     | common.evaluators.bert_glue_evaluator:get_loss:88 - Generating metrics
Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 94/94 [00:04<00:00, 21.36it/s]


(0.8735816166189053, 0.8685228797436788, 0.5343236320830406)

In [97]:
# bert-base: evaluate every single-split task and keep the first element of
# each result. MNLI is scored separately through evaluate_mnli.
results = {}
kwargs = [
    (CoLA, dict(model='CoLA', max_seq_length=128)),
    (MSR, dict(model='MSR', max_seq_length=128)),
    (QNLI, dict(model='QNLI', max_seq_length=128)),
    (QQP, dict(model='QQP', max_seq_length=128)),
    (RTE, dict(model='RTE', max_seq_length=250)),
    (SST, dict(model='SST', max_seq_length=128)),
    (STSB, dict(model='STSB', max_seq_length=128, num_labels=1)),
]

for processor, task_args in kwargs:
    task_result = evaluate_results(processor, **task_args)
    results[task_args['model']] = task_result[0]

2021-03-28 20:30:13.357 | INFO     | common.evaluators.bert_glue_evaluator:get_loss:88 - Generating metrics
Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 66/66 [00:02<00:00, 22.39it/s]
2021-03-28 20:30:16.866 | INFO     | common.evaluators.bert_glue_evaluator:get_loss:88 - Generating metrics
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108/108 [00:05<00:00, 20.80it/s]
b'Skipping line 660: expected 4 fields, saw 5\n'
2021-03-28 20:30:22.662 | INFO     | common.evaluators.bert_glue_evaluator:get_loss:88 - Generating metrics
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 330/330 [00:16<00:00, 20.14it/s]
2021-03-28 20:30:39.708 | INFO     | common.evaluators.bert_

In [98]:
# MNLI has two dev splits; keep the first element of each result.
matched, mismatched = evaluate_mnli(MNLI, model='MNLI', max_seq_length=128)
results.update(MNLI_matched=matched[0], MNLI_mismatched=mismatched[0])

2021-03-28 20:32:49.038 | INFO     | common.evaluators.bert_glue_evaluator:get_loss:88 - Generating metrics
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 614/614 [00:29<00:00, 21.01it/s]
2021-03-28 20:33:18.517 | INFO     | common.evaluators.bert_glue_evaluator:get_loss:88 - Generating metrics
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 615/615 [00:29<00:00, 20.99it/s]


In [106]:
# bert-base: one "<task> <score>" line per entry, in insertion order.
for task, score in results.items():
    print(task, score)

CoLA 0.5336603793351659
MSR 0.8202898550724638
QNLI 0.9018230155715914
QQP 0.8962156814246847
RTE 0.6389891696750902
SST 0.926605504587156
STSB 0.8735816171329647
MNLI_matched 0.8231278655119715
MNLI_mismatched 0.8327908868999186


In [114]:
# bert-large: same task list as the bert-base sweep, but every entry is
# switched to the large checkpoint right before it is evaluated.
# MNLI is scored separately through evaluate_mnli.
results2 = {}
kwargs = [
    (CoLA, dict(model='CoLA', max_seq_length=128)),
    (MSR, dict(model='MSR', max_seq_length=128)),
    (QNLI, dict(model='QNLI', max_seq_length=128)),
    (QQP, dict(model='QQP', max_seq_length=128)),
    (RTE, dict(model='RTE', max_seq_length=250)),
    (SST, dict(model='SST', max_seq_length=128)),
    (STSB, dict(model='STSB', max_seq_length=128, num_labels=1)),
]

for processor, task_args in kwargs:
    # The dict inside `kwargs` is updated in place, so the list keeps the
    # 'checkpoint' key after this cell has run.
    task_args['checkpoint'] = 'bert-large-uncased'
    task_result = evaluate_results(processor, **task_args)
    results2[task_args['model']] = task_result[0]

2021-03-28 20:40:40.860 | INFO     | common.evaluators.bert_glue_evaluator:get_loss:88 - Generating metrics
Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 66/66 [00:08<00:00,  7.80it/s]
2021-03-28 20:40:51.237 | INFO     | common.evaluators.bert_glue_evaluator:get_loss:88 - Generating metrics
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108/108 [00:14<00:00,  7.51it/s]
b'Skipping line 660: expected 4 fields, saw 5\n'
2021-03-28 20:41:07.017 | INFO     | common.evaluators.bert_glue_evaluator:get_loss:88 - Generating metrics
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 330/330 [00:44<00:00,  7.47it/s]
2021-03-28 20:42:02.063 | INFO     | common.evaluators.bert_

In [119]:
# MNLI has two dev splits; keep the first element of each result.
matched, mismatched = evaluate_mnli(MNLI, model='MNLI', max_seq_length=128)
results2.update(MNLI_matched=matched[0], MNLI_mismatched=mismatched[0])

2021-03-28 20:52:30.081 | INFO     | common.evaluators.bert_glue_evaluator:get_loss:88 - Generating metrics
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 614/614 [01:21<00:00,  7.52it/s]
2021-03-28 20:53:52.060 | INFO     | common.evaluators.bert_glue_evaluator:get_loss:88 - Generating metrics
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 615/615 [01:22<00:00,  7.47it/s]


In [120]:
# bert-large: one "<task> <score>" line per entry, in insertion order.
for task, score in results2.items():
    print(task, score)

CoLA 0.20666007059005728
MSR 0.7634782608695653
QNLI 0.9073300417774401
QQP 0.8962651496413554
RTE 0.5306859205776173
SST 0.9288990825688074
STSB 0.862290788524249
MNLI_matched 0.8518593988792664
MNLI_mismatched 0.8513018714401953


## Example

In [45]:
# Settings for the worked example: CoLA weights fine-tuned from bert-large.
self = Args(checkpoint='bert-large-uncased', model='CoLA')

In [46]:
# Build the classifier described by `self` and load its fine-tuned weights.
model = BertForSequenceClassification.from_pretrained(
    self.checkpoint,
    num_labels=self.num_labels,
    output_hidden_states=True,
).to(self.device)  # was hard-coded 'cuda'; self.device falls back to CPU
# map_location keeps loading working when the checkpoint was saved on a device
# that is not available in this session.
model.load_state_dict(torch.load(self.state, map_location=self.device))
model.eval();

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [47]:
# Score the loaded model on the CoLA dev split.
# NOTE: this rebinds `results`, which earlier cells use as a dict of scores.
results = BertGLUEEvaluator(
    model, CoLA, self, logger, standalone_eval=True
).get_loss(type='dev')


2021-03-28 18:17:09.886 | INFO     | common.evaluators.bert_glue_evaluator:get_loss:88 - Generating metrics
Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [00:08<00:00,  6.80it/s]


In [48]:
# Display the value returned by get_loss(type='dev') for the CoLA example.
results

(0.20666007059005728, 0.5573520136290583)