### Generating randon set of data for training and dev

In [None]:
!pip install faker

In [None]:
import json
import random
import string
from faker import Faker

# Initialize Faker for Indian names/locations
fake = Faker('en_IN')

def get_spoken_digits(ph_str):
    digit_map = {'0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine'}
    # fifty percent chance to write as words otherwise numbers
    if random.choice([True, False]):
        return " ".join([digit_map[d] for d in ph_str])
    return " ".join(ph_str) 

def introduce_noise(text, noise_level=0.05):
    if not text: # Handle empty string case
        return text

    noisy_text = list(text)
    alphabet = string.ascii_lowercase + string.digits + ' '

    # we either insert , delete or substitute random letters
    for i in range(len(noisy_text) - 1, -1, -1):
        if random.random() < noise_level:
            action = random.choice(['insert', 'delete', 'substitute'])

            if action == 'insert':
                noisy_text.insert(i, random.choice(alphabet))
            elif action == 'delete':
                if len(noisy_text) > 1: # Don't delete if it's the last character
                    noisy_text.pop(i)
            elif action == 'substitute':
                noisy_text[i] = random.choice(alphabet)
    return "".join(noisy_text)

def generate_entry(entry_id, noise_percentage=0.1):
    # we generate patterns to mirror the dataset
    patterns = [
        "email", "phone", "card", "mixed_travel", "simple_city"
    ]
    pattern = random.choice(patterns)

    text = ""
    entities = []

    if pattern == "email":
        fname = fake.first_name().lower()
        lname = fake.last_name().lower()
        domain = random.choice(["gmail", "outlook", "yahoo", "hotmail"])

        intro = random.choice(["my email is ", "email id is ", "contact at "])
        p_name = f"{fname} dot {lname}"
        email_domain = f"{domain} dot com"

        # concatenate the random entities that are generated
        full_str = f"{intro}{p_name} at {email_domain}"

        p_start = len(intro)
        p_end = p_start + len(p_name)

        e_start = p_end + 4
        e_end = e_start + len(email_domain)

        text = full_str

        entities.append({"start": p_start, "end": p_end, "label": "PERSON_NAME"})
        entities.append({"start": e_start, "end": e_end, "label": "EMAIL"})

    elif pattern == "phone":
        ph = str(random.randint(6000000000, 9999999999))
        ph_text = get_spoken_digits(ph)
        intro = random.choice(["my number is ", "call me on ", "phone "])

        text = intro + ph_text
        entities.append({"start": len(intro), "end": len(text), "label": "PHONE"})

    elif pattern == "card":
        c1 = str(random.randint(1000,9999))
        c2 = str(random.randint(1000,9999))
        c3 = str(random.randint(1000,9999))
        c4 = str(random.randint(1000,9999))
        card_str = f"{c1} {c2} {c3} {c4}"
        intro = random.choice(["card number is ", "my credit card is ", "card "])

        text = intro + card_str
        entities.append({"start": len(intro), "end": len(text), "label": "CREDIT_CARD"})

    elif pattern == "mixed_travel":
        city = fake.city().lower()
        date_obj = fake.future_date()
        date_str = date_obj.strftime("%d %m %Y")

        intro = "i will travel to "
        mid = " on "

        text = f"{intro}{city}{mid}{date_str}"

        c_start = len(intro)
        c_end = c_start + len(city)
        d_start = c_end + len(mid)
        d_end = d_start + len(date_str)

        entities.append({"start": c_start, "end": c_end, "label": "CITY"})
        entities.append({"start": d_start, "end": d_end, "label": "DATE"})

    elif pattern == "simple_city":
        city = fake.city().lower()
        intro = random.choice(["i live in ", "location is ", "from "])
        text = intro + city
        entities.append({"start": len(intro), "end": len(text), "label": "CITY"})

    # Introduce noise for a certain percentage of entries
    if random.random() < noise_percentage:
        original_text = text
        text = introduce_noise(text)

        entities = [] 

    return json.dumps({"id": entry_id, "text": text, "entities": entities})

# Generate Files
print("Generating train.jsonl (1000 lines)...")
with open("../data/train.jsonl", "w") as f:
    for i in range(1000):
        # Pass noise_percentage to generate_entry
        f.write(generate_entry(f"utt_{i:04d}", noise_percentage=0.3) + "\n") # 30% noisy data

print("Generating dev.jsonl (200 lines)...")
with open("../data/dev.jsonl", "w") as f:
    for i in range(200):
        f.write(generate_entry(f"utt_{1000+i:04d}", noise_percentage=0.3) + "\n") # 30% noisy data

print("Done! Files saved.")

### Running the training and evaluation pipeline using a larger model (BERT-base-NER) with the same noisy dataset

In [4]:
!python train.py --model_name dslim/bert-base-NER --train ../data/train.jsonl --dev ../data/dev.jsonl --out_dir ../out

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([15]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768])

In [5]:
!python predict.py --model_dir ../out --input ../data/dev.jsonl --output ../out/dev_pred.json

Wrote predictions for 200 utterances to ../out/dev_pred.json


## We can clearly observe that the larger model gives a higher precision on the same dev data for the same hyperparameters

In [6]:
!python eval_span_f1.py --gold ../data/dev.jsonl --pred ../out/dev_pred.json

Per-entity metrics:
CITY            P=0.850 R=0.962 F1=0.903
CREDIT_CARD     P=0.857 R=1.000 F1=0.923
DATE            P=0.926 R=1.000 F1=0.962
EMAIL           P=0.909 R=1.000 F1=0.952
PERSON_NAME     P=0.909 R=1.000 F1=0.952
PHONE           P=0.800 R=1.000 F1=0.889

Macro-F1: 0.930

PII-only metrics: P=0.884 R=1.000 F1=0.938
Non-PII metrics: P=0.850 R=0.962 F1=0.903


### But the tradeoff can be clearly seen in the latency which is higher than 20ms for p95

In [7]:
!python measure_latency.py --model_dir ../out --input ../data/dev.jsonl --runs 50

Latency over 50 runs (batch_size=1):
  p50: 28.73 ms
  p95: 39.60 ms


### Here we train the default model with the same train dataset 

In [8]:
!python train.py --model_name distilbert-base-uncased --train ../data/train.jsonl --dev ../data/dev.jsonl --out_dir ../out

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████████████████████████| 125/125 [00:13<00:00,  8.97it/s]
Epoch 1 average loss: 0.8492
Epoch 2/3: 100%|██████████████████████████████| 125/125 [00:10<00:00, 12.27it/s]
Epoch 2 average loss: 0.2047
Epoch 3/3: 100%|██████████████████████████████| 125/125 [00:10<00:00, 11.48it/s]
Epoch 3 average loss: 0.1298
Saved model + tokenizer to ../out


In [9]:
!python predict.py --model_dir ../out --input ../data/dev.jsonl --output ../out/dev_pred.json

Wrote predictions for 200 utterances to ../out/dev_pred.json


### The precision drops for the same hyperparameter configuration

In [10]:
!python eval_span_f1.py --gold ../data/dev.jsonl --pred ../out/dev_pred.json

Per-entity metrics:
CITY            P=0.714 R=0.849 F1=0.776
CREDIT_CARD     P=0.821 R=0.958 F1=0.885
DATE            P=0.926 R=1.000 F1=0.962
EMAIL           P=0.938 R=1.000 F1=0.968
PERSON_NAME     P=0.882 R=1.000 F1=0.938
PHONE           P=0.833 R=1.000 F1=0.909

Macro-F1: 0.906

PII-only metrics: P=0.883 R=0.992 F1=0.934
Non-PII metrics: P=0.714 R=0.849 F1=0.776


### With a relatively similar precision, the latency is extremely low for the smaller bert model

In [11]:
!python measure_latency.py --model_dir ../out --input ../data/dev.jsonl --runs 50

Latency over 50 runs (batch_size=1):
  p50: 12.52 ms
  p95: 14.62 ms


### Conclusion:

We need to tradeoff between the precision and inference latency. If we need better precision over the same test set, we use a slighlty larger model. But, if we need faster inference, we use a smaller model while losing some precision on the same task.