# Unbiased Pronoun Resolution

## Setup

### Imports

In [8]:
from src.utils import *
from src.data_utils import *
from src.models import *
from src.train import *
from src.zs_utils import *
import os

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Set config

In [2]:
# Process-wide environment switches go first: CUDA_LAUNCH_BLOCKING is only
# guaranteed to be honoured when it is set before the first CUDA call, and
# torch.cuda.is_available() below may already initialise CUDA.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # disable HF tokenizer parallelism
# for debugging cuda errors (makes kernel launches synchronous, so it slows
# training down -- drop it once debugging is done)
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# os.environ['TORCH_USE_CUDA_DSA'] = 'true'

# fix the seed so that runs are repeatable
set_seed(42)

# use the GPU when one is visible, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'using device: {device}')

# set torch matmul precision
torch.set_float32_matmul_precision('medium')

using device: cuda


## Task: Finetune & eval LM on dataset(s)

### GAP

#### Load data

In [3]:
# GAP data module, built from the dataset root directory and the batch size
bsz = 32                # batch size
root_dir = 'data/gap/'  # root dir of GAP dataset
data_module = GAPDataModule(root_dir, bsz=bsz)
print(
    f'size of train set: {len(data_module.train_dataset)}'
    f' | val set: {len(data_module.val_dataset)}'
    f' | test set: {len(data_module.test_dataset)}'
)

# one loader per split (created in train / val / test order)
train_loader = data_module.train_dataloader()
val_loader = data_module.val_dataloader()
test_loader = data_module.test_dataloader()

# optional: inspect the label distribution
# get_value_counts(data_module)

size of train set: 2000 | val set: 454 | test set: 2000


#### Train (finetune) LM

In [4]:
# %%script false --no-raise-error
# (un-comment the magic above to skip this cell)

# release cached GPU memory before a new model is built
torch.cuda.empty_cache()

# classifier built on top of the pretrained LM
lm_name = 'nielsr/coref-roberta-large'
model = GAPCorefClassifier(lm_name)

# settings handed to train()
args = dict(
    lm_name=lm_name,
    num_epochs=10,
    precision='16-mixed',
    patience=3,
    ckpt_name='ckpt_best',
    resume_ckpt=None,
    grad_batches=4,
    save_top_k=1,
)

# finetune; train() returns the model together with the trainer used
model, trainer = train(model, train_loader, val_loader, args)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type               | Params
---------------------------------------------
0 | model | RobertaModel       | 355 M 
1 | fc    | Linear             | 3.1 K 
2 | loss  | CrossEntropyLoss   | 0     
3 | acc   | MulticlassAccuracy | 0     
4 | f1    | MulticlassF1Score  | 0     
---------------------------------------------
355 M     Trainable params
0         Non-trainable params
355 M     Total params
1,421.451 Total estimated model params size (MB)


Epoch 0: 100%|██████████| 63/63 [00:23<00:00,  2.73it/s, v_num=last]       

Metric val_loss improved. New best score: 1.155


Epoch 1: 100%|██████████| 63/63 [00:22<00:00,  2.74it/s, v_num=last]

Metric val_loss improved by 0.108 >= min_delta = 0.0. New best score: 1.047


Epoch 2: 100%|██████████| 63/63 [00:22<00:00,  2.74it/s, v_num=last]

Metric val_loss improved by 0.037 >= min_delta = 0.0. New best score: 1.010


Epoch 5: 100%|██████████| 63/63 [00:22<00:00,  2.75it/s, v_num=last]

Monitored metric val_loss did not improve in the last 3 records. Best score: 1.010. Signaling Trainer to stop.


Epoch 5: 100%|██████████| 63/63 [00:22<00:00,  2.75it/s, v_num=last]


#### Eval

In [5]:
# Evaluate on the GAP test split.
# ckpt_path='best' makes the evaluated checkpoint explicit: the recorded run
# of this cell (called without ckpt_path) emitted a warning and then restored
# ckpt_best.ckpt, so this keeps the same weights without the implicit choice.
trainer.test(dataloaders=test_loader, ckpt_path='best')

  rank_zero_warn(
Restoring states from the checkpoint path at /home/neil/dl4nlp-pr/model_ckpts/GAPCorefClassifier/nielsr/coref-roberta-large/ckpt_best.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/neil/dl4nlp-pr/model_ckpts/GAPCorefClassifier/nielsr/coref-roberta-large/ckpt_best.ckpt


Testing DataLoader 0: 100%|██████████| 63/63 [00:06<00:00,  9.17it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Runningstage.testing metric      DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.3346666991710663
         test_f1            0.19895586371421814
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_acc': 0.3346666991710663, 'test_f1': 0.19895586371421814}]

In [None]:
# get test predictions: one output per batch, indexed below as (labels, preds)
test_out = trainer.predict(dataloaders=test_loader)

# stack the per-batch outputs; convert BOTH to numpy arrays on the host so
# np.unique gets plain ndarrays whatever device the tensors live on
test_labels = torch.cat([batch_out[0] for batch_out in test_out]).cpu().numpy()
test_preds = torch.cat([batch_out[1] for batch_out in test_out]).cpu().numpy()

# value counts of each distinct value in test_labels / test_preds
unique_labels, counts_labels = np.unique(test_labels, return_counts=True)
unique_preds, counts_preds = np.unique(test_preds, return_counts=True)

# print the counts (tolist() yields plain Python numbers for a clean dict repr)
print(f"counts in test_labels: {dict(zip(unique_labels.tolist(), counts_labels.tolist()))}")
print(f"counts in test_preds: {dict(zip(unique_preds.tolist(), counts_preds.tolist()))}")

## Task: Zero-shot coreference resolution (CR) using LLMs

### GAP

In [3]:
# zero-shot classification of GAP test samples using gpt3
# NOTE(review): the recorded run printed 30 responses for num_samples = 10
# (every answer appears three times) -- confirm this is what
# classify_coref_zs is meant to return.
num_samples = 10
data_path = 'data/gap/gap-test.tsv'
responses = classify_coref_zs(data_path, num_samples)

# show every response together with its position in the returned list
for idx, resp in enumerate(responses):
    print(f'response {idx}: {resp}')

response 0: Dehner
response 1: Dehner
response 2: Dehner
response 3: Alonso
response 4: Alonso
response 5: Alonso
response 6: Ali Aladhadh
response 7: Ali Aladhadh
response 8: Ali Aladhadh
response 9: Pisciotta
response 10: Pisciotta
response 11: Pisciotta
response 12: Eddie
response 13: Eddie
response 14: Eddie
response 15: Jewel Staite
response 16: Jewel Staite
response 17: Jewel Staite
response 18: Allison
response 19: Allison
response 20: Allison
response 21: Jeni
response 22: Jeni
response 23: Jeni
response 24: Malave
response 25: Malave
response 26: Malave
response 27: Lorrie Morgan
response 28: Lorrie Morgan
response 29: Lorrie Morgan


## Coref-mt5

In [5]:
# NOTE(review): wildcard import placed mid-notebook; pred_cr_mt5 and
# annotate_cr used below presumably come from src.coref_mt5 -- confirm.
from src.coref_mt5 import *

# Demo input: a title and a five-line excerpt that is passed to pred_cr_mt5.
# @title sample document - from wikipedia
doc_title = "Eiffel Tower Wiki"
doc = """The Eiffel Tower (French: tour Eiffel) is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built the tower.
Locally nicknamed "La dame de fer" (French for "Iron Lady"), it was constructed from 1887 to 1889 as the centerpiece of the 1889 World's Fair.
Although initially criticised by some of France's leading artists and intellectuals for its design, it has since become a global cultural icon of France and one of the most recognisable structures in the world.
The Eiffel Tower is the most visited monument with an entrance fee in the world: 6.91 million people ascended it in 2015.
It was designated a monument historique in 1964, and was named part of a UNESCO World Heritage Site ("Paris, Banks of the Seine") in 1991."""

# Hard-coded prediction strings handed to pred_cr_mt5 through `emulated_preds`
# (used together with emulate_preds=True). Each inner list holds one string,
# written as adjacent literals that Python concatenates.
# NOTE(review): presumably these replace real mT5 generations so the cell can
# run without the model -- confirm in src.coref_mt5.
emulated_preds = [
      [
          'It ## is named after -> The Eiffel Tower ( French : tour Eiffel ) ##'
          ' is a wrought ;; the tower ## . ** _ -> It ## is named after ;;'
      ],
      ['it ## was constructed from -> [1 ;;'],
      [
          'its ## design , it -> [1 ;; it ## has since become -> its ## design'
          " , it ;; France ' s ## leading artists and -> France ## . [1 It ;;"
          " France ## and one of -> France ' s ## leading artists and ;;"
      ],
      [
          'The Eiffel Tower ## is the most -> [1 ;; it ## in 2015 . -> The'
          ' Eiffel Tower ## is the most ;; the world ## : 6 . -> the world ## .'
          ' | _ ;;'
      ],
      [
          'It ## was designated a -> [1 ;; Paris ## , Banks of -> Paris , [2'
          ' France ## ] . [1 ;;'
      ],
]

# predict coref using mt5 (here with the emulated predictions above and debug output on)
states_dict = pred_cr_mt5(doc, doc_title, emulate_preds=True, emulated_preds=emulated_preds, debug=True, expand_only=False)

# print coref result as annotated text
annotate_cr(states_dict)

[nltk_data] Downloading package stopwords to /home/neil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processing documents: {'Eiffel Tower Wiki'}
input batch[0]:  ['coref: w | _ The Eiffel Tower ( French : tour Eiffel ) is a wrought - iron lattice tower on the Champ de Mars in Paris , France . It is named after the engineer Gustave Eiffel , whose company designed and built the tower . ** _ Locally nicknamed " La dame de fer " ( French for " Iron Lady "), it was constructed from 1887 to 1889 as the centerpiece of the 1889 World \' s Fair . _ Although initially criticised by some of France \' s leading artists and intellectuals for its design , it has since become a global cultural icon of France and one of the most recognisable structures in the world . _ The Eiffel Tower is the most visited monument with an entrance fee in the world : 6 . 91 million people ascended it in 2015 . _ It was designated a monument historique in 1964 , and was named part of a UNESCO World Heritage Site (" Paris , Banks of the Seine ") in 1991 .']
mt5 output:      ['It ## is named after -> The Eiffel Tower ( F

## Fast Coref

In [6]:
# classify gap test set zs using fcoref
data_path = 'data/gap/gap-test.tsv'  # path to gap test set
num_samples = None  # number of samples to classify (None was used for the recorded full 2000-sample run)
model_name = 'lmcoref'  # either 'fcoref' or 'lmcoref'

# init fcoref classifier
fcr_classifier = FCorefClassifier(model_name, data_path, num_samples)
# get cluster preds
# NOTE(review): cluster_preds is not used again in this cell, and the recorded
# log shows tokenisation + inference over the 2000 texts running twice --
# presumably pred_cr_labels repeats the clustering; confirm before removing.
cluster_preds = fcr_classifier.pred_cr_clusters()
# get label preds
labels, preds = fcr_classifier.pred_cr_labels(verbose=False)

# eval
metrics = ['acc', 'f1']  # metrics to eval
res_file_path = None  # path to save results
# describe the sample count readably: num_samples=None used to print
# "evaluating metrics on None ..."
sample_desc = 'all' if num_samples is None else num_samples
print(f'evaluating metrics on {sample_desc} samples for {model_name} on GAP test...')
eval_res = eval_preds(labels, preds, metrics)
for metric, val in eval_res.items():
    print(f'{metric}: {val:.3f}')

10/12/2023 14:50:44 - INFO - 	 missing_keys: []
10/12/2023 14:50:44 - INFO - 	 unexpected_keys: []
10/12/2023 14:50:44 - INFO - 	 mismatched_keys: []
10/12/2023 14:50:44 - INFO - 	 error_msgs: []
10/12/2023 14:50:44 - INFO - 	 Model Parameters: 590.0M, Transformer: 434.6M, Coref head: 155.4M
10/12/2023 14:50:44 - INFO - 	 Tokenize 2000 inputs...
Map: 100%|██████████| 2000/2000 [00:07<00:00, 270.48 examples/s]
10/12/2023 14:50:51 - INFO - 	 ***** Running Inference on 2000 texts *****
Inference: 100%|██████████| 2000/2000 [00:42<00:00, 47.53it/s]
10/12/2023 14:51:34 - INFO - 	 Tokenize 2000 inputs...
Map: 100%|██████████| 2000/2000 [00:06<00:00, 314.91 examples/s]
10/12/2023 14:51:40 - INFO - 	 ***** Running Inference on 2000 texts *****
Inference: 100%|██████████| 2000/2000 [00:41<00:00, 47.90it/s]


predicting labels for 2000 samples using LingMessCoref...


2000it [00:00, 13341.68it/s]

done.
evaluating metrics on None for lmcoref on GAP test...
acc: 0.707
f1: 0.674





### Train fastcoref on GAP

In [36]:
# inputs for building the GAP cluster file (GAP development split -> jsonlines)
num_samples = None
data_path = 'data/gap/gap-development.tsv'
output_path = 'data/gap/gap-development-with-clusters.jsonlines'
# the conversion itself is currently disabled:
# convert_gap_to_jsonlines(data_path, output_path, num_samples)

In [38]:
from fastcoref import TrainingArgs, CorefTrainer

# NOTE(review): `args` and `trainer` overwrite the objects of the same name
# created by the finetuning cell earlier in this notebook.
# NOTE(review): the recorded run of this cell stopped while mapping the train
# split (272/2000 examples) with
# "AttributeError: 'NoneType' object has no attribute 'start'" -- the input
# jsonlines file presumably needs checking before this cell can complete.

# fastcoref training configuration; other arguments (such as the learning
# rate of the head) can be controlled here as well
args = TrainingArgs(
    model_name_or_path='distilroberta-base',
    output_dir='test-trainer',
    overwrite_output_dir=True,
    device=device,
    epochs=100,
    logging_steps=100,
    eval_steps=100,
)

# trainer over the GAP cluster files; no dev file and no custom spacy
# pipeline are passed
trainer = CorefTrainer(
    args=args,
    train_file='data/gap/gap-development-with-clusters.jsonlines',
    test_file='data/gap/gap-test-with-clusters.jsonlines',
    # dev_file='data/gap/gap-validation-with-clusters.jsonlines',
    # nlp=nlp # optional, for custom nlp class from spacy
)

# fit, then score on the test file
trainer.train()
trainer.evaluate(test=True)



10/12/2023 16:35:15 - INFO - 	 Loading FCoref model with underlying transformer distilroberta-base
10/12/2023 16:35:17 - INFO - 	 FCoref Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
Generating train split: 2000 examples [00:00, 16570.42 examples/s]
Map:  14%|█▎        | 272/2000 [00:03<00:22, 76.46 examples/s]


AttributeError: 'NoneType' object has no attribute 'start'