### Download dataset

In [None]:
import urllib.request
import re

url = "https://raw.githubusercontent.com/datamade/probablepeople/master/name_data/labeled/person_labeled.xml"
urllib.request.urlretrieve(url, "person_labeled.xml")

('person_labeled.xml', <http.client.HTTPMessage at 0x7f4ef40eefb0>)

In [None]:
with open("person_labeled.xml", 'r') as f:
    xml_text = f.read()

In [None]:
tags = set([tag[1:-1] for tag in re.findall(r'<\w+>', xml_text)])

In [None]:
tags

{'And',
 'FirstInitial',
 'GivenName',
 'LastInitial',
 'MiddleInitial',
 'MiddleName',
 'Name',
 'NameCollection',
 'Nickname',
 'PrefixMarital',
 'PrefixOther',
 'SuffixGenerational',
 'SuffixOther',
 'Surname'}

### Calculating statistic of tags appearances

In [None]:
tags_stat = dict()
for tag in re.findall(r'<\w+>', xml_text):
  if tag in tags_stat:
    tags_stat[tag] += 1
  else:
    tags_stat[tag] = 0
tags_stat

{'<NameCollection>': 0,
 '<Name>': 2891,
 '<GivenName>': 2349,
 '<Surname>': 2337,
 '<FirstInitial>': 69,
 '<LastInitial>': 45,
 '<SuffixGenerational>': 164,
 '<MiddleName>': 189,
 '<MiddleInitial>': 386,
 '<Nickname>': 96,
 '<And>': 85,
 '<SuffixOther>': 114,
 '<PrefixOther>': 62,
 '<PrefixMarital>': 102}

### Remove unnecessary labels

In [None]:
tags.remove('Name')
tags.remove('NameCollection')

In [None]:
from copy import deepcopy

all_tags = deepcopy(tags)

### Creating input data for dataset consisted of words and tags

In [None]:
import xml.etree.ElementTree as ET

tree = ET.ElementTree(ET.fromstring(xml_text))
root = tree.getroot()

sentences, tags = [], []
for name in root.findall('Name'):
    sentence, tag = [], []
    for child in name:
        sentence.append(child.text.strip())
        tag.append(child.tag)
    sentences.append(sentence)
    tags.append(tag)


In [None]:
sentences[10:20], tags[10:20]

([['Jianxiong', 'Xiao'],
  ['B.', 'Cloer'],
  ['FRANCO'],
  ['MEDINA'],
  ['Russell,', 'Herman'],
  ['WILKERSON'],
  ['Elizabeth', 'B'],
  ['ROCHELL'],
  ['BIRD'],
  ['CASE']],
 [['GivenName', 'Surname'],
  ['FirstInitial', 'Surname'],
  ['Surname'],
  ['Surname'],
  ['Surname', 'GivenName'],
  ['Surname'],
  ['GivenName', 'LastInitial'],
  ['GivenName'],
  ['Surname'],
  ['Surname']])

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m118.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
import torch
from torchtext import data
from transformers import BertTokenizer, BertForTokenClassification, BertModel
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
all_tags

{'And',
 'FirstInitial',
 'GivenName',
 'LastInitial',
 'MiddleInitial',
 'MiddleName',
 'Nickname',
 'PrefixMarital',
 'PrefixOther',
 'SuffixGenerational',
 'SuffixOther',
 'Surname'}

In [None]:
all_tags = list(all_tags)
tag2idx = {all_tags[i]: i for i in range(len(all_tags))}


### Creating space token and pad token. Space token I use for better distinguishing words in inference

In [None]:
tag2idx['space'] = 12
tag2idx['pad'] = 13
tag2idx

{'GivenName': 1,
 'MiddleInitial': 0,
 'FirstInitial': 2,
 'And': 3,
 'SuffixGenerational': 4,
 'Nickname': 5,
 'PrefixOther': 7,
 'Surname': 6,
 'MiddleName': 8,
 'PrefixMarital': 9,
 'SuffixOther': 10,
 'LastInitial': 11,
 'space': 12,
 'pad': 13}

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

### Create NER Dataset

In [None]:
class NERDataset(Dataset):
    def __init__(self, sentences, tags):
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        tags = self.tags[idx]
        sentence = self.sentences[idx]

        tokens = []
        tag_labels = []
        for word, tag in zip(sentence, tags):
            word_tokens = self.tokenizer.tokenize(word)
            tokens.extend(word_tokens + ['%'])
            tag_labels.extend([tag] * len(word_tokens) + ['space'])
        tokens.pop()
        tag_labels.pop()

        encoded_input = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))
        attention_mask = torch.tensor([1] * len(encoded_input))
        encoded_tags = torch.tensor(list(map(lambda x: tag2idx[x], tag_labels)))
        tags = list(map(lambda x: tag2idx[x], tags))
        return encoded_input, attention_mask, encoded_tags, tags


### Create dataloader and batch it according lengths of sequence so model could learn more effectively

In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # Separate the batch into input_ids, attention_masks, tags and raw_tags
    input_ids, attention_masks, tags, raw_tags = zip(*batch)
    input_ids = pad_sequence([input_ids[0].T, input_ids[1].T, input_ids[2].T, input_ids[3].T], batch_first=True, padding_value=0)
    attention_masks = pad_sequence(attention_masks, batch_first=True)
    tags = pad_sequence(tags, batch_first=True, padding_value=13)

    return input_ids, attention_masks, torch.tensor(tags), raw_tags

# Create the dataloader with the custom collate_fn
dataset = NERDataset(sentences, tags)
sorted_dataset = sorted(dataset, key=lambda x: len(x[0]))
dataloader = DataLoader(sorted_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

### Creating simple model with biLSTM and Linear on top of bert embeddings

In [None]:
class NERModel(nn.Module):
    def __init__(self, num_tags):
        super(NERModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.lstm = nn.LSTM(768, 128, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(256, num_tags)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs['last_hidden_state']
        lstm_output, _ = self.lstm(sequence_output)
        lstm_output = self.dropout(lstm_output)
        logits = self.fc(lstm_output)

        return logits

In [None]:
model = NERModel(num_tags=len(tag2idx))

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from torch.nn.functional import cross_entropy

### Define cross entropy loss with logits

In [None]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-6)

# Training loop (you can modify this based on your specific needs)
num_epochs = 1000
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

NERModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [None]:
device

device(type='cuda')

### Setting requires_grad = False for bert layers so only last layers make to train

In [None]:
for name, para in model.named_parameters():
    if name.startswith('bert.'):
        para.requires_grad = False

In [None]:
for param in model.parameters():
    print(param.requires_grad)

In [None]:
num_classes = len(tag2idx)

In [None]:
import numpy as np

### Trained several times with num_epochs=1000,400,200. First time trained on sorted dataset, next time on randomly batched dataset

In [None]:
for epoch in range(num_epochs):
    i = 0
    losses = []
    for batch in dataloader:
        i += 1
        input_ids, attention_mask, tags_, _ = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        tags_ = tags_.to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits.reshape(-1, num_classes), tags_.reshape(-1))
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {np.mean(losses):.4f}")

  return input_ids, attention_masks, torch.tensor(tags), raw_tags


Epoch 1/200 | Loss: 0.8136
Epoch 2/200 | Loss: 0.8141
Epoch 3/200 | Loss: 0.8129
Epoch 4/200 | Loss: 0.8128
Epoch 5/200 | Loss: 0.8122
Epoch 6/200 | Loss: 0.8103
Epoch 7/200 | Loss: 0.8097
Epoch 8/200 | Loss: 0.8090
Epoch 9/200 | Loss: 0.8090
Epoch 10/200 | Loss: 0.8063
Epoch 11/200 | Loss: 0.8071
Epoch 12/200 | Loss: 0.8059
Epoch 13/200 | Loss: 0.8047
Epoch 14/200 | Loss: 0.8041
Epoch 15/200 | Loss: 0.8022
Epoch 16/200 | Loss: 0.8025
Epoch 17/200 | Loss: 0.8011
Epoch 18/200 | Loss: 0.8016
Epoch 19/200 | Loss: 0.7993
Epoch 20/200 | Loss: 0.7991
Epoch 21/200 | Loss: 0.7979
Epoch 22/200 | Loss: 0.7962
Epoch 23/200 | Loss: 0.7954
Epoch 24/200 | Loss: 0.7960
Epoch 25/200 | Loss: 0.7959
Epoch 26/200 | Loss: 0.7948
Epoch 27/200 | Loss: 0.7938
Epoch 28/200 | Loss: 0.7924
Epoch 29/200 | Loss: 0.7926
Epoch 30/200 | Loss: 0.7915
Epoch 31/200 | Loss: 0.7894
Epoch 32/200 | Loss: 0.7883
Epoch 33/200 | Loss: 0.7885
Epoch 34/200 | Loss: 0.7878
Epoch 35/200 | Loss: 0.7881
Epoch 36/200 | Loss: 0.7848
E

In [None]:
torch.save(model, 'ner_model2.pt')

In [None]:
model2 = torch.load('drive/MyDrive/ner_model2.pt') #, map_location=torch.device('cpu'))

In [None]:
!cp ner_model2.pt drive/MyDrive

In [None]:
num_classes = len(tag2idx)

In [None]:
num_classes

14

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### As model predicts tag for each token we need to make prediction for word consisted of several tokens. We do it by averaging up probabilities of tokens and taking argmax of it

In [None]:
def get_word_predictions(logits, separation_symbol=12):
    _, predicted_labels = torch.max(logits, dim=2)
    # print(predicted_labels)
    word_predictions = []
    batch_word_predictions = []

    for batch in range(logits.size(0)):
        word = []
        word_target = []
        word_length = 0
        for token in range(logits.size(1)):
            label = predicted_labels[batch, token].item()
            if label != separation_symbol:
                word.append(label)
                word_target.append(logits[batch, token])
                word_length += 1
            else:
                if word:
                    # Choose one target from multiple targets for each token
                    averaged_target = sum(word_target) / word_length
                    word_predictions.append(torch.argmax(averaged_target).item())
                    word_target = []
                    word = []
                    word_length = 0

        if word:
            averaged_target = sum(word_target) / word_length
            word_predictions.append(torch.argmax(averaged_target).item())
        batch_word_predictions.append(word_predictions)
        word_predictions = []

    return batch_word_predictions

### Function for calculation precision recall for each class

In [None]:
def calculate_precision_recall(model):
    true_positives = torch.zeros(num_classes)
    false_positives = torch.zeros(num_classes)
    false_negatives = torch.zeros(num_classes)
    model.eval()
    for batch in dataloader:
        input_ids, attention_mask, encoded_target, raw_target = batch
        with torch.no_grad():
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            logits = model(input_ids, attention_mask)
        predicted_targets = get_word_predictions(logits)
        for i in range(len(raw_target)):
            for true_label, predicted_label in zip(raw_target[i], predicted_targets[i]):
                if predicted_label == true_label:
                    true_positives[true_label] += 1
                else:
                    false_positives[predicted_label] += 1
                    false_negatives[true_label] += 1

    precisions = true_positives / (true_positives + false_positives)
    recalls = true_positives / (true_positives + false_negatives)

    return precisions, recalls

### Calculate precision and recall on train dataset

In [None]:
precisions, recalls = calculate_precision_recall(model2)

  return input_ids, attention_masks, torch.tensor(tags), raw_tags


In [None]:
precisions

tensor([   nan, 0.6023, 0.0060, 0.0000,    nan,    nan, 0.6832,    nan,    nan,
           nan,    nan,    nan,    nan, 0.0000])

In [None]:
recalls

tensor([0.0000, 0.7082, 0.0053, 0.0000, 0.0000, 0.0000, 0.8199, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000,    nan,    nan])

In [None]:
import pandas as pd

In [None]:
results = pd.DataFrame({'GivenName': [0.60, 0.71],
                        'Surname': [0.68, 0.82]},
                        index=['precision', 'recall'])
results

Unnamed: 0,GivenName,Surname
precision,0.6,0.68
recall,0.71,0.82


### Conclusions and what else can be done:

1. Model fit relatively good on the biggest classes
2. We can train longer, because loss continued to decline
3. Split on train/val dataset to estimate metrics accuratly
4. Hardcode labels for some classes for example for "And" class, we will get in general very good metrics for this class
5. Hardcode rules for some classes, for example for class "Surname", if word ends on 'ov' for russian surnames then it is surname
6. Merge some two classes in one class and then create classificator between these 2 classes for example for prefixes and suffixes
7. Experiment with current architecture, optimizer, learning_rate
8. Try CRF on top of the model layers