In [1]:
!pip install pytorch-crf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from crf import load_data, make_labels2i

# Locations of the train / dev splits (relative to the notebook's working directory).
train_filepath, dev_filepath = "./NER_TRAIN_PREAMBLE.json", "./NER_DEV_PREAMBLE.json"

# load_data yields a pair per split; it is unpacked here as (sentences, tag sequences).
(train_sents, train_tag_sents), (dev_sents, dev_tag_sents) = (
    load_data(train_filepath),
    load_data(dev_filepath),
)

# Mapping from label string to integer index.
labels2i = make_labels2i()

# Quick sanity check: number of training sentences, a blank line, then the label map.
print(len(train_sents), end="\n\n")
print("labels2i", labels2i)

 
 

       ..." with entities "[(0, 37, 'COURT'), (315, 354, 'PETITIONER'), (371,...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
     ..." with entities "[(7, 45, 'COURT'), (190, 201, 'JUDGE'), (330, 348,...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

       ..." with entities "[(7, 41, 'COURT'), (232, 244, 'JUDGE'), (342, 432,...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

    D..." with entities "[(7, 43, 'COURT'), (138, 159, 'JUDGE'), (291, 307,...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

                             ..." with e

1560

labels2i {'<PAD>': 0, 'B-PRECEDENT': 1, 'B-RESPONDENT': 2, 'B-COURT': 3, 'B-PETITIONER': 4, 'B-PROVISION': 5, 'B-LAWYER': 6, 'B-STATUTE': 7, 'B-CASE_NUMBER': 8, 'B-DATE': 9, 'B-OTHER_PERSON': 10, 'B-JUDGE': 11, 'B-ORG': 12, 'B-GPE': 13, 'B-WITNESS': 14, 'I-PRECEDENT': 15, 'I-RESPONDENT': 16, 'I-COURT': 17, 'I-PETITIONER': 18, 'I-PROVISION': 19, 'I-LAWYER': 20, 'I-STATUTE': 21, 'I-CASE_NUMBER': 22, 'I-DATE': 23, 'I-OTHER_PERSON': 24, 'I-JUDGE': 25, 'I-ORG': 26, 'I-GPE': 27, 'I-WITNESS': 28, 'O': 29}




In [3]:
# Reference assignment 3 : https://github.com/csci5832-f22/assignment_3
from typing import List
def make_features(text: List[str], sent_tags: List[str]) -> List[List[str]]:
    """Build the string features for every token of one sentence.

    For the token at position ``i`` six features are emitted, in this order:
    the word itself, its tag, the previous word and tag, and the next word and
    tag. Positions before the first token and after the last token both use
    the placeholder ``<s>``.

    Args:
        text: The tokens of the sentence.
        sent_tags: One tag per token, indexed in parallel with ``text``.

    Returns:
        A list with one entry per token; each entry is that token's list of
        ``name=value`` feature strings.
    """
    boundary = "<s>"  # shared placeholder for "no previous" / "no next" neighbour
    last_index = len(text) - 1

    feature_lists = []
    for i, token in enumerate(text):
        has_prev = i > 0
        has_next = i < last_index

        prev_word = text[i - 1] if has_prev else boundary
        prev_pos = sent_tags[i - 1] if has_prev else boundary
        next_word = text[i + 1] if has_next else boundary
        next_pos = sent_tags[i + 1] if has_next else boundary

        feature_lists.append([
            f"word={token}",
            f"pos={sent_tags[i]}",
            f"prev_word={prev_word}",
            f"prev_pos={prev_pos}",
            f"next_word={next_word}",
            f"next_pos={next_pos}",
        ])
    return feature_lists

In [4]:
import spacy
def featurize(sents: List[List[str]], nlp=None) -> List[List[List[str]]]:
    """Turn tokenized sentences into per-token feature strings.

    Every word is POS-tagged on its own with spaCy, and the words plus their
    tags are handed to ``make_features``.

    Args:
        sents: Tokenized sentences, one list of word strings per sentence.
        nlp: Optional, already loaded spaCy pipeline. When omitted,
            ``en_core_web_sm`` is loaded inside this call.

    Returns:
        One entry per sentence; each entry holds one feature list per word.
    """
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    feats = []
    for sent in sents:
        # Exactly one tag per input word keeps `sent_tags` index-aligned with
        # `sent`. spaCy may split a single word into several tokens (or return
        # none for an empty string); collecting every token's tag would shift
        # the tags of all following words. The first token's tag stands for the
        # word, and "X" is used when spaCy produced no token at all.
        sent_tags = [
            doc[0].pos_ if len(doc) > 0 else "X"
            for doc in nlp.pipe(sent)  # batched equivalent of nlp(word) per word
        ]
        feats.append(make_features(sent, sent_tags))

    return feats

In [5]:
import torch
from crf import f1_score, predict, PAD_SYMBOL, pad_features, pad_labels
from tqdm.autonotebook import tqdm
import random

def training_loop(
    num_epochs,
    batch_size,
    train_features,
    train_labels,
    dev_features,
    dev_labels,
    optimizer,
    model,
    labels2i,
    pad_feature_idx
):
    """Train ``model`` on mini-batches and print the dev F1 after every epoch.

    Args:
        num_epochs: Number of passes over the training samples.
        batch_size: Number of (features, labels) samples per mini-batch.
        train_features: Encoded training features, one entry per sentence.
        train_labels: Encoded training labels, parallel to ``train_features``.
        dev_features: Encoded dev features passed to ``predict``.
        dev_labels: Encoded dev labels passed to ``f1_score``.
        optimizer: Optimizer whose ``zero_grad`` / ``step`` drive the updates.
        model: Callable as ``model(features, labels, mask=mask)``; the result
            is negated and used as the loss.
        labels2i: Label-to-index mapping; must contain ``PAD_SYMBOL`` and ``'O'``.
        pad_feature_idx: Index used to pad feature sequences within a batch.

    Returns:
        The same ``model`` object after training.
    """
    samples = list(zip(train_features, train_labels))

    print("Training...")
    for i in range(num_epochs):
        # Re-shuffle and re-batch on every epoch so that batch composition and
        # order differ between epochs instead of being fixed once up front.
        random.shuffle(samples)
        batches = [
            samples[start:start + batch_size]
            for start in range(0, len(samples), batch_size)
        ]

        losses = []
        for batch in tqdm(batches):
            features, labels = zip(*batch)
            pad_label_idx = labels2i[PAD_SYMBOL]

            # Pad every sequence in the batch and stack into one tensor each.
            features = torch.stack(pad_features(features, pad_feature_idx))
            labels = torch.stack(pad_labels(labels, pad_label_idx))
            # True for real tokens, False for padded positions.
            mask = (labels != pad_label_idx)

            optimizer.zero_grad()

            log_likelihood = model(features, labels, mask=mask)

            # Minimise the negative of the model's output.
            negative_log_likelihood = -log_likelihood
            negative_log_likelihood.backward()

            optimizer.step()

            losses.append(negative_log_likelihood.item())

        # With no training samples there are no batches; report NaN rather
        # than raising ZeroDivisionError.
        mean_loss = sum(losses) / len(losses) if losses else float("nan")
        print(f"epoch {i}, loss: {mean_loss}")

        dev_f1 = f1_score(predict(model, dev_features), dev_labels, labels2i['O'])
        print(f"Dev F1 {dev_f1}")

    return model

In [6]:
from crf import build_features_set
from crf import make_features_dict
from crf import encode_features, encode_labels
from crf import NERTagger

# Featurize both splits (train first, then dev).
train_features, dev_features = (
    featurize(split_sents) for split_sents in (train_sents, dev_sents)
)

# The feature vocabulary is built from the training split only.
all_features = build_features_set(train_features)
features_dict = make_features_dict(all_features)

# The tagger is sized by the feature vocabulary and the label inventory.
model = NERTagger(len(features_dict), len(labels2i))

# Encode features with the feature vocabulary ...
encoded_train_features, encoded_dev_features = (
    encode_features(split_features, features_dict)
    for split_features in (train_features, dev_features)
)
# ... and tag sequences with the label mapping.
encoded_train_labels, encoded_dev_labels = (
    encode_labels(split_tags, labels2i)
    for split_tags in (train_tag_sents, dev_tag_sents)
)

Building features set!


100%|██████████| 1560/1560 [00:00<00:00, 4107.06it/s]


Found 76260 features


In [7]:
# Training hyperparameters: epochs, mini-batch size and SGD learning rate.
num_epochs, batch_size, LR = 30, 16, 0.05

# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=LR)

# Train on the encoded training split; dev F1 is printed after each epoch.
model = training_loop(
    num_epochs=num_epochs,
    batch_size=batch_size,
    train_features=encoded_train_features,
    train_labels=encoded_train_labels,
    dev_features=encoded_dev_features,
    dev_labels=encoded_dev_labels,
    optimizer=optimizer,
    model=model,
    labels2i=labels2i,
    pad_feature_idx=features_dict[PAD_SYMBOL],
)

Training...


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 0, loss: 126.45675990046287


  score = torch.where(mask[i].unsqueeze(1), next_score, score)


Dev F1 tensor([0.3504])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 1, loss: 70.98768251769397
Dev F1 tensor([0.4218])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 2, loss: 55.25939635841214
Dev F1 tensor([0.4510])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 3, loss: 46.03580825182856
Dev F1 tensor([0.4838])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 4, loss: 53.39538502206608
Dev F1 tensor([0.4947])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 5, loss: 41.598842251057526
Dev F1 tensor([0.5095])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 6, loss: 40.36826441239337
Dev F1 tensor([0.5205])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 7, loss: 35.50864622544269
Dev F1 tensor([0.5332])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 8, loss: 34.01345064202133
Dev F1 tensor([0.5374])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 9, loss: 36.269710998145904
Dev F1 tensor([0.5388])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 10, loss: 32.71673069194872
Dev F1 tensor([0.5542])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 11, loss: 31.23404128210885
Dev F1 tensor([0.5572])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 12, loss: 31.24874307671372
Dev F1 tensor([0.5512])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 13, loss: 29.086858058462337
Dev F1 tensor([0.5554])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 14, loss: 27.944413584105824
Dev F1 tensor([0.5620])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 15, loss: 27.728808062417166
Dev F1 tensor([0.5600])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 16, loss: 29.939846087475214
Dev F1 tensor([0.5622])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 17, loss: 28.432824572738337
Dev F1 tensor([0.5665])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 18, loss: 25.669189988350382
Dev F1 tensor([0.5687])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 19, loss: 29.155021735600062
Dev F1 tensor([0.5700])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 20, loss: 27.635842196795405
Dev F1 tensor([0.5722])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 21, loss: 27.847754750932967
Dev F1 tensor([0.5706])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 22, loss: 23.935424395969935
Dev F1 tensor([0.5717])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 23, loss: 25.71626227242606
Dev F1 tensor([0.5758])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 24, loss: 23.309661641412852
Dev F1 tensor([0.5738])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 25, loss: 23.381238888721075
Dev F1 tensor([0.5832])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 26, loss: 23.185208855843058
Dev F1 tensor([0.5874])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 27, loss: 22.60916212626866
Dev F1 tensor([0.5861])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 28, loss: 21.864125349083725
Dev F1 tensor([0.5861])


  0%|          | 0/98 [00:00<?, ?it/s]

epoch 29, loss: 21.86725328406509
Dev F1 tensor([0.5857])
