# PYTORCH HUGGINGFACE TRANSFORMERS BERT
Additional reference: https://mccormickml.com/2019/07/22/BERT-fine-tuning/

!pip install transformers
!pip install sentencepiece
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_lg
!pip install -U nltk
!pip install ftfy
!pip install pycld2
!pip install emoji
!pip install tqdm
!pip install openpyxl
!pip3 install ipywidgets

### RESTART KERNEL AFTER INSTALLING `IPYWIDGETS`

In [2]:
import numpy as np
import pandas as pd
import os
import itertools
import random
import time
import gc
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

from transformers import DistilBertTokenizer
from transformers import DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [3]:
seed = random.randint(0,500)
#seed = 195
random.seed( seed )
np.random.seed( seed )
torch.manual_seed( seed )
torch.cuda.manual_seed_all( seed )
print('Seed:', seed)

Seed: 103


In [4]:
def plot_confusion_matrix(cm, classes,                          
                          title='CONFUSION MATRIX',
                          cmap=plt.cm.PuBu):               # originally plt.cm.Blues; also good: BuPu,RdPu,PuRd,OrRd,Oranges
    '''
    Plot the confusion matrix    
    '''
    plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False
    plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True
            
    plt.figure(figsize=(5,5))
    im = plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar(im, fraction=0.046, pad=0.05)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)
    
    fmt = 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True labels')
    plt.xlabel('Predicted labels')
    plt.tight_layout()
    plt.show()

In [5]:
def dedupe( df1, df2, col_ ):
    '''
        df2 should not contain annything from df1 in column col_
        (e.g. df2 = smaller test set OR smaller set for 1 category)
        to preserve smaller df2, duplicates are deleted from df1
    '''
    original_length = df1.shape[0]
    df2_sents = df2[col_].values
    df1 = df1[ ~df1[col_].isin(df2_sents) ]
    print( f'\tDropping {original_length - df1.shape[0]} duplicates')
    return df1, df2


def upsample( df_, to_oversample_ ):
    '''
        Upsample df_ by to_oversample_ more samples excluding re-evaluation data
    '''    
    # EXCLUDE RE-EVALUATED DATA FROM OVERSMPLING IF IT'S PRESENT    
    if 'source' in df_.columns:
        df_to_oversample = df_[ df_['source'] != 'reeval_2021' ]
        print( '\tData shape for this category without re-eval:', df_to_oversample.shape )
    else:
        df_to_oversample = df_
        
    # OVERSAMPLE AND CONCAT W/ORIGINAL DF_
    replace = False
    if len(df_to_oversample) < to_oversample_:
        replace = True        

    df_upsampled = df_to_oversample.sample( n=to_oversample_, replace=replace )
    df_          = pd.concat([ df_, df_upsampled ])
        
    return df_.sample( frac=1 )


def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return ( round( f1_score(labels_flat, preds_flat, average='micro'), 4 ),
             round( f1_score(labels_flat, preds_flat, average='macro'), 4 ),
           )

## Load data

In [6]:
file = ''
df   = pd.read_csv( file , sep='\t', encoding='utf-8' )

In [7]:
ml_categories = [ ]
#df = df[ df['is_subtle'] == 0 ]
df = df[ df['label'].isin( ml_categories ) ]
df['target'] = df['label'].apply( lambda x: 0 if x == 'unk' else 1 )
df = df.drop(['award_reason', 'is_group_award', 'group_award_id',], axis=1)

In [8]:
# DEDUPE BETWEEN CATEGORIES. FAVOR CATEGORY 1
print(df.shape)
df1 = df[ df['target'] == 1 ].copy()
df0 = df[ df['target'] == 0 ].copy()
df0, df1 = dedupe( df0, df1, 'sentence' )
df = pd.concat([ df0, df1 ]).copy().sample(frac=1).reset_index(drop=True)
print(df.shape)

(10843, 12)
	Dropping 6 duplicates
(10837, 12)


#### DEDUPE TRAIN / VAL / TEST SETS. FAVOR TEST, THEN VAL SET - better results if not doing it, but it's not right
print(df.shape)
df_train = df[ df['subset'] == 'train' ].copy()
df_val   = df[ df['subset'] == 'val' ].copy()
df_test  = df[ df['subset'] == 'test' ].copy()

df_train, df_test = dedupe( df_train, df_test, 'sentence' )
df_val, df_test   = dedupe( df_val, df_test, 'sentence' )
df_train, df_val  = dedupe( df_train, df_val, 'sentence' )
df = pd.concat([ df_train, df_val, df_test ]).copy().sample(frac=1).reset_index(drop=True)
print(df.shape)

In [9]:
# INCORRECT WAY TO ESTIMATE MAXLEN - USE BERT TOKENIZER
df['length'] = df['sentence'].apply( lambda x: len(x.split()) )
maxlen = df['length'].max()
maxlen

99

### TRAIN-TEST SPLIT

#### IF TRAINING ON FULL DATASET
X_train = df['sentence'].values
y_train = df['target'].values
X_val   = df[ df['subset'] == 'val' ]['sentence'].values
y_val   = df[ df['subset'] == 'val' ]['target'].values
X_test  = df[ df['subset'] == 'test' ]['sentence'].values
y_test  = df[ df['subset'] == 'test' ]['target'].values

In [25]:
X_train = df[ df['subset'] == 'train' ]['sentence'].values
y_train = df[ df['subset'] == 'train' ]['target'].values
X_val   = df[ df['subset'] == 'val' ]['sentence'].values
y_val   = df[ df['subset'] == 'val' ]['target'].values
X_test  = df[ df['subset'] == 'test' ]['sentence'].values
y_test  = df[ df['subset'] == 'test' ]['target'].values

In [26]:
print( X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape, )

(24476,) (24476,) (2794,) (2794,) (918,) (918,)


## Train

In [27]:
def get_dataloader( X, y, tokenizer, batch_size, maxlen ):
    
    # `batch_encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_data = tokenizer.batch_encode_plus( X, 
                                                add_special_tokens    = True, 
                                                return_attention_mask = True, 
                                                pad_to_max_length     = True, 
                                                max_length            = maxlen,
                                                return_tensors        = 'pt',
                                              )
    
    input_ids       = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    labels          = torch.tensor( y )
    
    dataset         = TensorDataset( input_ids, attention_masks, labels )
    dataloader      = DataLoader(    dataset, 
                                     sampler    = RandomSampler( dataset ), 
                                     batch_size = batch_size,
                                 )
    
    return dataloader

In [28]:
def evaluate( dataloader ):

    # put model in eval mode
    model.eval()
    
    loss_val_total = 0
    preds, true_vals = [], []
    
    for batch in dataloader:
        
        # add batch to device (GPU)
        batch = tuple(b.to(device) for b in batch)
        
        # unpack inputs from dataloader
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        # tell the model not to compute gradients => save memory, speed up prediction
        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        loss_val_total += loss.item()
        logits = outputs[1]  

        # move logits, labels to CPU (logits = raw classifier output)
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
                
        preds.append( logits )
        true_vals.append( label_ids )
    
    loss_val_avg = loss_val_total/len(dataloader) 
    
    preds = np.concatenate(preds, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, preds, true_vals

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


#### BERT HAS A LOT MORE TOKENS IN A SENTENCE THAN ACCORDING TO PYTHON'S SENTENCES.SPLIT()!!!!
* maxlen per split() - 99
* maxlen per Bert    - 152!!!!

In [30]:
# MAXLEN BASED ON BERT TOKENIZATION
#data_tokenized = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(i)) for i in X_train]
#mlen = max([len(i) for i in data_tokenized])
#mlen
# OUTPUT FOR SUMMER 2021 DATASET - 152

In [31]:
maxlen = 200

In [32]:
seed

103

In [33]:
epochs         = 5
maxlen         = maxlen
learning_rates = [5e-5]
batch_sizes    = [16]                      # [ 32, 16, 8, ]
douts          = [0.0]                     # 0.4, 0.5    Done: 0.1, 0.25, 

all_combinations = list(itertools.product( *[learning_rates, batch_sizes, douts] ))

time_stamp1 = time.strftime("%Y%m%dT%H%M") 
file_name   = f'logs/log_{time_stamp1}.txt'
wdir        = 'ckpts/current/'

with open( file_name, 'w', encoding='utf-8' ) as f:
    experiment_name = 'BERT PYTORCH\n'
    f.write( experiment_name )
    for LR, batch_size, dout in all_combinations:

        time_stamp = time.strftime("%Y%m%dT%H%M") 
        params1    = f'LR={LR}, batch_size={batch_size}, classifier_dropout={dout}'
        params2    = f'epochs={epochs}, maxlen={maxlen}, seed={seed}'
        print( 'Timestamp:', time_stamp, '\n', params1, '\n', params2, sep='')
        f.write( '\nTimestamp: ' + time_stamp + '\n' + params1 + '\n' + params2 + '\n' )
        
        tokenizer  = DistilBertTokenizer.from_pretrained( 'distilbert-base-uncased', 
                                                           do_lower_case=True,
                                                           padding_side = 'right',
                                                        )
        dataloader_train = get_dataloader( X_train, y_train, tokenizer, batch_size, maxlen )
        dataloader_val   = get_dataloader( X_val, y_val, tokenizer, batch_size, maxlen )
        dataloader_test  = get_dataloader( X_test, y_test, tokenizer, batch_size, maxlen )

        model = DistilBertForSequenceClassification.from_pretrained( 'distilbert-base-uncased',
                                                                      num_labels=2,
                                                                      output_attentions=False,
                                                                      output_hidden_states=False,
                                                                      #classifier_dropout = dout,
                                                                      #attention_probs_dropout_prob=dout,
                                                                      #hidden_dropout_prob=dout,
                                                                   )
        optimizer = AdamW( model.parameters(),
                           lr=LR,                 # 1e-5
                           eps=1e-8,              # very small number to avoid division by 0
                         )             
        
        # Note: len(dataloader_train) = len(X_train) / batch_size
        # in case of augmented / oversampled data, len(X_train) == 24459, steps = 1529
        scheduler = get_linear_schedule_with_warmup( optimizer, 
                                                     num_warmup_steps=500,
                                                     num_training_steps=len(dataloader_train)*epochs,
                                                   )
        model.to(device)
        for epoch in tqdm(range(1, epochs+1)):

            model.train()
            loss_train_total = 0
            progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
                        
            for batch in progress_bar:

                model.zero_grad()
                batch = tuple(b.to(device) for b in batch)
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'labels':         batch[2],
                         }       

                outputs = model(**inputs)
                loss = outputs[0]
                loss_train_total += loss.item()
                loss.backward()

                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                optimizer.step()
                scheduler.step()

                progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

            tqdm.write( f'\nEpoch {epoch}' )
            f.write( f'\nEpoch {epoch}' + '\n' )

            loss_train_avg = round( loss_train_total/len(dataloader_train), 4 )
            val_loss, preds, y_val = evaluate( dataloader_val )
            val_loss = round( val_loss, 4 )
            val_f1 = f1_score_func( preds, y_val )
            
            metrics = f'Training loss: {loss_train_avg}\n' + f'Validation loss: {val_loss}\n' +\
                      f'F1 Score (micro): {val_f1[0]}\n' + f'F1 Score (macro): {val_f1[1]}\n'
            
            tqdm.write( metrics )
            f.write( metrics + '\n')
            
            filepath = wdir + time_stamp + f'-epoch_{epoch}-val_loss_{val_loss}-f1micro_{val_f1[0]}-f1macro{val_f1[1]}.model'
            torch.save(model.state_dict(), filepath )

Timestamp:20211010T0608
LR=5e-05, batch_size=16, classifier_dropout=0.0
epochs=5, maxlen=200, seed=103


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=28.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=466062.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=483.0), HTML(value='')))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.





HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=267967963.0), HTML(value='')))




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=1530.0), HTML(value='')))

[2021-10-10 06:09:26.095 pytorch-1-6-gpu-py3-ml-g4dn-xlarge-cfec521e9f0eef638bc93c1751d2:2073 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2021-10-10 06:09:26.129 pytorch-1-6-gpu-py3-ml-g4dn-xlarge-cfec521e9f0eef638bc93c1751d2:2073 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.

Epoch 1
Training loss: 0.3389
Validation loss: 0.6347
F1 Score (micro): 0.7856
F1 Score (macro): 0.7763



HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=1530.0), HTML(value='')))


Epoch 2
Training loss: 0.1217
Validation loss: 0.8273
F1 Score (micro): 0.8017
F1 Score (macro): 0.7954



HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=1530.0), HTML(value='')))


Epoch 3
Training loss: 0.0473
Validation loss: 1.1179
F1 Score (micro): 0.8128
F1 Score (macro): 0.8063



HBox(children=(HTML(value='Epoch 4'), FloatProgress(value=0.0, max=1530.0), HTML(value='')))


Epoch 4
Training loss: 0.0239
Validation loss: 1.3356
F1 Score (micro): 0.7989
F1 Score (macro): 0.7952



HBox(children=(HTML(value='Epoch 5'), FloatProgress(value=0.0, max=1530.0), HTML(value='')))


Epoch 5
Training loss: 0.007
Validation loss: 1.5414
F1 Score (micro): 0.7931
F1 Score (macro): 0.7872




In [None]:
unknown_variable

In [34]:
del model
gc.collect()
torch.cuda.empty_cache()

## Inference: saved models on test set

In [35]:
maxlen

200

In [37]:
maxlen     = maxlen
batch_size = 16
tokenizer  = DistilBertTokenizer.from_pretrained( 'distilbert-base-uncased', 
                                                   do_lower_case=True,
                                                   padding_side = 'right',
                                                )

In [38]:
wdir = 'ckpts/current/'
res  = []

dataloader_test = get_dataloader( X_test, y_test, tokenizer, batch_size, maxlen )
for path, directories, files in os.walk( wdir ):
    for file in sorted(files):

        try:
            if not file.endswith( '.model' ):
                continue

            model = DistilBertForSequenceClassification.from_pretrained( 'distilbert-base-uncased',
                                                                         num_labels=2,
                                                                         output_attentions=False,
                                                                         output_hidden_states=False,
                                                                         #classifier_dropout = dout,
                                                                         #attention_probs_dropout_prob=dout,
                                                                         #hidden_dropout_prob=dout,
                                                                       )

            model.to(device)
            model.load_state_dict( torch.load( wdir + file, map_location=torch.device('cpu') ) )
            _, preds, y_test = evaluate( dataloader_test )

            preds_flat  = np.argmax( preds, axis=1 ).flatten()
            y_test_flat = y_test.flatten()
            clf_report = classification_report( y_test_flat, preds_flat, digits=4 )
            print( file, '\n', clf_report )            
            res.append((file, clf_report))
            
        except Exception as e:
            print(e)
                        
        print( '='*50, '\n' )

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from

20211010T0608-epoch_1-val_loss_0.6347-f1micro_0.7856-f1macro0.7763.model 
               precision    recall  f1-score   support

           0     0.7827    0.8289    0.8051       491
           1     0.7889    0.7354    0.7612       427

    accuracy                         0.7854       918
   macro avg     0.7858    0.7821    0.7832       918
weighted avg     0.7856    0.7854    0.7847       918




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

20211010T0608-epoch_2-val_loss_0.8273-f1micro_0.8017-f1macro0.7954.model 
               precision    recall  f1-score   support

           0     0.8012    0.7882    0.7947       491
           1     0.7609    0.7752    0.7680       427

    accuracy                         0.7821       918
   macro avg     0.7811    0.7817    0.7813       918
weighted avg     0.7825    0.7821    0.7823       918




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

20211010T0608-epoch_3-val_loss_1.1179-f1micro_0.8128-f1macro0.8063.model 
               precision    recall  f1-score   support

           0     0.7947    0.7963    0.7955       491
           1     0.7653    0.7635    0.7644       427

    accuracy                         0.7810       918
   macro avg     0.7800    0.7799    0.7799       918
weighted avg     0.7810    0.7810    0.7810       918




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

20211010T0608-epoch_4-val_loss_1.3356-f1micro_0.7989-f1macro0.7952.model 
               precision    recall  f1-score   support

           0     0.8188    0.7821    0.8000       491
           1     0.7617    0.8009    0.7808       427

    accuracy                         0.7908       918
   macro avg     0.7902    0.7915    0.7904       918
weighted avg     0.7922    0.7908    0.7911       918




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

20211010T0608-epoch_5-val_loss_1.5414-f1micro_0.7931-f1macro0.7872.model 
               precision    recall  f1-score   support

           0     0.8151    0.7902    0.8025       491
           1     0.7670    0.7939    0.7802       427

    accuracy                         0.7919       918
   macro avg     0.7910    0.7921    0.7913       918
weighted avg     0.7927    0.7919    0.7921       918




In [None]:
for i in res:
    print(i[0])
    print(i[1])
    print()