In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset, Dataset
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

from utils import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

In [2]:
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [3]:
MODEL_NAMES = ['bert-base-uncased'] # ['bert-base-uncased', 'xlnet-base-cased']

In [8]:
use_pretrain = False

for t in ['ORIG', 'INV', 'SIB']: 
    for MODEL_NAME in MODEL_NAMES:
        
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        
        dataset = load_dataset('glue', 'sst2')['train']
        dataset.rename_column_('sentence', 'text')
        dataset = dataset.train_test_split(test_size=0.1)
        train_dataset = dataset['train']
        test_dataset = dataset['test']
        
        if t == 'ORIG':
            checkpoint = 'pretrained/bert-base-uncased-sst2-ORIG'
            if use_pretrain and os.path.exists(checkpoint):
                MODEL_NAME = checkpoint
        if t == 'INV':
            text = npy_load("./assets/SST2/sentiment/INV/text.npy")
            label = npy_load("./assets/SST2/sentiment/INV/label.npy")
            df = pd.DataFrame({'text': text, 'label': label})
            df.text = df.text.astype(str)
            df.label = df.label.astype(int)
            train_dataset = Dataset.from_pandas(df)
            checkpoint = 'pretrained/bert-base-uncased-sst2-INV'
            if use_pretrain and os.path.exists(checkpoint):
                MODEL_NAME = checkpoint
        if t == 'SIB':
            text = npy_load("./assets/SST2/sentiment/SIB/text.npy")
            label = npy_load("./assets/SST2/sentiment/SIB/label.npy")
            df = pd.DataFrame({'text': text, 'label': label})
            df.text = df.text.astype(str)
            df.label = df.label.astype(int)
            train_dataset = Dataset.from_pandas(df)
            checkpoint = 'pretrained/bert-base-uncased-sst2-SIB'
            if use_pretrain and os.path.exists(checkpoint):
                MODEL_NAME = checkpoint
        # if t == 'BOTH':
        #     text = npy_load("./assets/SST2/sentiment/BOTH/text.npy")
        #     label = npy_load("./assets/SST2/sentiment/BOTH/label.npy")
        #     df = pd.DataFrame({'text': text, 'label': label})
        #     df.text = df.text.astype(str)
        #     df.label = df.label.astype(int)
        #     train_dataset = Dataset.from_pandas(df)
        
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)
            
        train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
        test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
        train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
        test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

        training_args = TrainingArguments(
            output_dir='./pretrained/' + MODEL_NAME + '-sst2-' + t,
            overwrite_output_dir=True,
            max_steps=100000,
            save_steps=10000,
            save_total_limit=1,
            per_device_train_batch_size=20,
            per_device_eval_batch_size=20,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10000,
            load_best_model_at_end=True,
            metric_for_best_model="loss",
            greater_is_better=False,
            evaluation_strategy="steps"
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            # callbacks=[EarlyStoppingCallback]
        )

        trainer.train()
        out = trainer.evaluate()
        print(out)

Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Loading cached split indices for dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-e21c50a49f2c027a.arrow and C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-f230ca07febc706e.arrow
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
10000,0.164793,0.276628,0.944024,0.94913,0.94543,0.952858
20000,0.079714,0.319972,0.940015,0.94449,0.958182,0.931184
30000,0.06005,0.294245,0.943727,0.94854,0.950735,0.946356
40000,0.051904,0.312163,0.943727,0.948666,0.948537,0.948794
50000,0.042097,0.320658,0.942836,0.947569,0.952629,0.942563
60000,0.041617,0.293014,0.944321,0.949221,0.948836,0.949607
70000,0.028881,0.396054,0.942242,0.947764,0.939563,0.956109
80000,0.017275,0.44307,0.941797,0.947184,0.942107,0.952316
90000,0.008014,0.480969,0.944024,0.949226,0.94376,0.954755
100000,0.003313,0.521173,0.944024,0.948964,0.948323,0.949607


{'eval_loss': 0.2766284644603729, 'eval_accuracy': 0.9440237564959169, 'eval_f1': 0.9491296721090271, 'eval_precision': 0.9454301075268817, 'eval_recall': 0.9528583039826605, 'epoch': 32.99241174529858}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Loading cached split indices for dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-e21c50a49f2c027a.arrow and C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-f230ca07febc706e.arrow
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-a33a9b97bac9636e.arrow





Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
10000,0.336297,0.152509,0.950705,0.95422,0.971637,0.937415
20000,0.167413,0.182129,0.954863,0.959206,0.950279,0.968301
30000,0.113621,0.209561,0.954566,0.958827,0.952419,0.965321
40000,0.098432,0.241176,0.955902,0.959859,0.957659,0.96207
50000,0.083618,0.24063,0.955902,0.959924,0.956183,0.963695
60000,0.082648,0.210261,0.957981,0.961523,0.965066,0.958006
70000,0.063428,0.264043,0.951893,0.956777,0.942444,0.971552
80000,0.041135,0.272859,0.957535,0.961362,0.958771,0.963966
90000,0.03141,0.268824,0.957832,0.961486,0.962531,0.960444
100000,0.025219,0.275968,0.958872,0.962613,0.959118,0.966134


{'eval_loss': 0.15250948071479797, 'eval_accuracy': 0.9507052709725315, 'eval_f1': 0.9542195256480971, 'eval_precision': 0.9716371805672563, 'eval_recall': 0.93741533459767, 'epoch': 29.69121140142518}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Loading cached split indices for dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-e21c50a49f2c027a.arrow and C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-f230ca07febc706e.arrow
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-a33a9b97bac9636e.arrow





Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
10000,0.458328,0.643919,0.683148,0.731302,0.683133,0.786779
20000,0.585505,0.694376,0.451967,0.0,0.0,0.0
30000,0.691916,0.691182,0.548478,0.708129,0.548306,0.999458
40000,0.691581,0.690658,0.548033,0.708038,0.548033,1.0
50000,0.693505,0.697118,0.451967,0.0,0.0,0.0
60000,0.693311,0.695263,0.451967,0.0,0.0,0.0
70000,0.693312,0.693813,0.451967,0.0,0.0,0.0
80000,0.693266,0.692625,0.548033,0.708038,0.548033,1.0
90000,0.693203,0.693687,0.451967,0.0,0.0,0.0
100000,0.6932,0.693402,0.451967,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.643919050693512, 'eval_accuracy': 0.6831477357089829, 'eval_f1': 0.7313019390581718, 'eval_precision': 0.6831333803810868, 'eval_recall': 0.7867786507721485, 'epoch': 29.69121140142518}
