# Unbiased Pronoun Resolution

## Setup

### Imports

In [1]:
# NOTE(review): wildcard imports. Names used by the cells below (`torch`, `np`,
# `set_seed`, `GAPDataModule`, `GAPCorefClassifier`, `train`) are never imported
# explicitly in the visible notebook, so they presumably come from these project
# modules - confirm, and consider importing them by name so each symbol's origin
# is visible and a change in `src.*` cannot silently shadow another name.
from src.utils import *
from src.data_utils import *
from src.models import *
from src.train import *
import os  # used by the config cell to set environment variables

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


### Set config

In [2]:
# --- process environment ------------------------------------------------------
# These are set *before* anything below touches CUDA, so they are already in the
# process environment when the CUDA runtime is first queried or initialised.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # disable HF tokenizer parallelism
# Debugging aid: makes CUDA kernel launches synchronous so errors surface at the
# offending call. NOTE: this slows training - remove once debugging is finished.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# --- reproducibility ----------------------------------------------------------
set_seed(42)

# --- device selection ---------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'using device: {device}')

# --- numerics -----------------------------------------------------------------
# Lower the internal precision allowed for float32 matmuls ('medium') in
# exchange for speed; see torch.set_float32_matmul_precision for the trade-off.
torch.set_float32_matmul_precision('medium')

using device: cuda


## Task: Finetune & eval LM on dataset(s)

### GAP

#### Load data

In [3]:
# Build the GAP data module from the dataset directory with a fixed batch size.
root_dir = 'data/gap/'  # root dir of GAP dataset
bsz = 32                # batch size
data_module = GAPDataModule(root_dir, bsz=bsz)

# Report how many examples each split holds.
print(f'size of train set: {len(data_module.train_dataset)} | val set: {len(data_module.val_dataset)} | test set: {len(data_module.test_dataset)}')

# One loader per split, created in train -> val -> test order.
train_loader = data_module.train_dataloader()
val_loader = data_module.val_dataloader()
test_loader = data_module.test_dataloader()

# Optional label-distribution check (disabled):
# get_value_counts(data_module)

size of train set: 2000 | val set: 454 | test set: 2000


#### Train (finetune) LM

In [4]:
# %%script false --no-raise-error
# (Uncomment the line above to skip this cell without raising an error.)

# Release cached GPU memory left over from earlier runs in this kernel.
torch.cuda.empty_cache()

# Pretrained LM checkpoint name, wrapped by the project's GAP classifier.
lm_name = 'nielsr/coref-roberta-large'
model = GAPCorefClassifier(lm_name)

# Training arguments handed to the project's `train` helper.
# NOTE(review): the per-key meanings below are inferred from the key names and
# the training log; confirm against `src.train.train`.
args = {
    'lm_name': lm_name,
    'num_epochs': 10,
    'precision': '16-mixed',    # log reports 16bit Automatic Mixed Precision
    'patience': 3,              # log stops after 3 non-improving val_loss records
    'ckpt_name': 'ckpt_best',   # the test cell restores `ckpt_best.ckpt`
    'resume_ckpt': None,
    'grad_batches': 4,
    'save_top_k': 1,
}

# Fine-tune; returns the trained model together with the trainer used.
model, trainer = train(model, train_loader, val_loader, args)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type               | Params
---------------------------------------------
0 | model | RobertaModel       | 355 M 
1 | fc    | Linear             | 3.1 K 
2 | loss  | CrossEntropyLoss   | 0     
3 | acc   | MulticlassAccuracy | 0     
4 | f1    | MulticlassF1Score  | 0     
---------------------------------------------
355 M     Trainable params
0         Non-trainable params
355 M     Total params
1,421.451 Total estimated model params size (MB)


Epoch 0: 100%|██████████| 63/63 [00:23<00:00,  2.73it/s, v_num=last]       

Metric val_loss improved. New best score: 1.155


Epoch 1: 100%|██████████| 63/63 [00:22<00:00,  2.74it/s, v_num=last]

Metric val_loss improved by 0.108 >= min_delta = 0.0. New best score: 1.047


Epoch 2: 100%|██████████| 63/63 [00:22<00:00,  2.74it/s, v_num=last]

Metric val_loss improved by 0.037 >= min_delta = 0.0. New best score: 1.010


Epoch 5: 100%|██████████| 63/63 [00:22<00:00,  2.75it/s, v_num=last]

Monitored metric val_loss did not improve in the last 3 records. Best score: 1.010. Signaling Trainer to stop.


Epoch 5: 100%|██████████| 63/63 [00:22<00:00,  2.75it/s, v_num=last]


#### Eval

In [5]:
# Evaluate on the test split using the best checkpoint saved during training.
# `ckpt_path='best'` is passed explicitly: without a model or a checkpoint path
# the trainer has to pick the checkpoint implicitly. This makes the choice of
# evaluated weights visible in the notebook.
trainer.test(dataloaders=test_loader, ckpt_path='best')

  rank_zero_warn(
Restoring states from the checkpoint path at /home/neil/dl4nlp-pr/model_ckpts/GAPCorefClassifier/nielsr/coref-roberta-large/ckpt_best.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/neil/dl4nlp-pr/model_ckpts/GAPCorefClassifier/nielsr/coref-roberta-large/ckpt_best.ckpt


Testing DataLoader 0: 100%|██████████| 63/63 [00:06<00:00,  9.17it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Runningstage.testing metric      DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.3346666991710663
         test_f1            0.19895586371421814
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_acc': 0.3346666991710663, 'test_f1': 0.19895586371421814}]

In [None]:
# Get test predictions from the best checkpoint (explicit, so the weights used
# here match the ones evaluated by `trainer.test`).
test_out = trainer.predict(dataloaders=test_loader, ckpt_path='best')

# Stack the per-batch outputs into flat NumPy arrays.
# NOTE(review): assumes each predict-step output is a (labels, preds) pair -
# confirm against GAPCorefClassifier.predict_step.
test_labels = torch.cat([x[0] for x in test_out]).cpu().numpy()
# Convert predictions the same way as the labels so both `np.unique` calls
# below operate on NumPy arrays rather than on a mix of array and tensor.
test_preds = torch.cat([x[1] for x in test_out]).cpu().numpy()

# Count how often each class id occurs in the labels and in the predictions
# (the task is multiclass, so there may be more than two distinct ids).
unique_labels, counts_labels = np.unique(test_labels, return_counts=True)
unique_preds, counts_preds = np.unique(test_preds, return_counts=True)

# print the counts
print(f"counts in test_labels: {dict(zip(unique_labels, counts_labels))}")
print(f"counts in test_preds: {dict(zip(unique_preds, counts_preds))}")