In [None]:
%load_ext autoreload
%autoreload 2

# Getting data
Only focusing on Danish data 

In [None]:
from datasets import load_dataset
import plotly.express as px
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

In [None]:
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("DDSC/nordic-embedding-training-data")

In [None]:
ds

In [None]:
# Keep only the Danish samples; the comparison itself is the boolean that `filter` expects.
dsdk = ds.filter(lambda sample: sample["language"] == "danish")
# Pool the queries, positives and negatives of the train split into one sequence of texts.
texts = dsdk["train"]["query"] + dsdk["train"]["positive"] + dsdk["train"]["negative"]

In [None]:
type(dsdk["train"]["positive"])

In [None]:
texts[7912]

In [None]:
len(texts)

# Tokenizing dataset
We want a small model. For static models, the size of the model scales with the number of tokens. To reduce the number of tokens we will therefore add a normalizer that lower-cases. 

In [None]:
import regex
from collections import Counter
from tqdm import tqdm
import pandas as pd
from plotly import express as px

In [None]:
# A "word" is a maximal run of word characters; punctuation and whitespace are dropped.
tokenizer_regex = regex.compile(r"\w+")

# Collect the lower-cased words of every non-empty text into one flat list.
tokens = []
for text in tqdm(texts, desc="Tokenizing texts"):
    if not text:
        # Skip missing or empty entries.
        continue
    tokens += tokenizer_regex.findall(text.lower())

# Number of occurrences of each distinct lower-cased word in the corpus.
token_counts = Counter(tokens)

In [None]:
len(token_counts)

In [None]:
# One row per word, most frequent first; the positional index becomes the
# frequency rank (0 = most common word).
counts_df = (
    pd.DataFrame(token_counts.most_common(), columns=["word", "counts"])
    .reset_index()
    .rename(columns={"index": "rank"})
)

In [None]:
counts_df["prob"] = counts_df["counts"] / counts_df["counts"].sum()

In [None]:
# Plot each of the first 1000 ranks and then every 1000th rank to keep the figure light.
# The head is capped at the frame length so `take` cannot go out of bounds on a small corpus.
n_rows = counts_df.shape[0]
vis_idx = list(range(min(1000, n_rows))) + list(range(1000, n_rows, 1000))
px.line(counts_df.take(vis_idx), x="rank", y="counts", hover_data=["word"])

In [None]:
counts_df

In [None]:
n_vertical_rows = 100
fig = px.line(
    counts_df.take(vis_idx), x="rank", y="counts", hover_data=["word"], log_x=True
)

# For every distinct count value take the first row that has it, then keep the
# `n_vertical_rows` entries with the largest rank (i.e. the lowest counts).
first_row_per_count = counts_df.groupby("counts").first()
boundary_ranks = first_row_per_count.sort_values(by="rank", ascending=False)[
    "rank"
].iloc[:n_vertical_rows]

# Draw one vertical line at each rank where the count value changes.
for boundary_rank in boundary_ranks:
    fig.add_vline(x=boundary_rank)

fig.show()

From around rank 260k onwards, every word occurs only once.  
From around rank 190k onwards, every word occurs at most twice.

# Understanding weighting in model2vec

The distillation in model2vec includes a post-processing step which requires the size of the final token space. This includes built-in tokens and the added vocabulary. To assess the weighting, we need to create the total token space first. 

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
from model2vec.tokenizer import (
    clean_and_create_vocabulary,
    replace_vocabulary,
    turn_tokens_into_ids,
)
from model2vec.distill.inference import (
    PCADimType,
    create_embeddings,
    post_process_embeddings,
)
from tokenizers import normalizers
from typing import Optional, cast
import numpy as np

In [None]:
tokenizer = AutoTokenizer.from_pretrained("jealk/llm2vec-scandi-mntp-v2")

We are adding the lower-case vocabulary, so we need to add a normaliser to the tokenizer. 

In [None]:
print("Type of tokenizer", type(tokenizer))
print("Type of backend tokenizer", type(tokenizer.backend_tokenizer))

In [None]:
# Check current normalizer
print("Current backend normalizer:", tokenizer.backend_tokenizer.normalizer)

The current model does not have a normaliser, but we want to add one that ignores casing. 

In [None]:
# Add lowercase normalization
# Add lowercase normalization
existing_normalizer = tokenizer.backend_tokenizer.normalizer
lowercase = normalizers.Lowercase()

if existing_normalizer is None:
    # Nothing to preserve: lowercasing becomes the only normalization step.
    tokenizer.backend_tokenizer.normalizer = lowercase
else:
    # Run the existing normalizer first, then lowercase its output.
    tokenizer.backend_tokenizer.normalizer = normalizers.Sequence(
        [existing_normalizer, lowercase]
    )

print("New normalizer:", tokenizer.backend_tokenizer.normalizer)

In [None]:
type(tokenizer)

In [None]:
from dkmodel2vec.vocab import add_instruction_tokenizer

# NOTE(review): DANISH_INSTRUCTION is not defined or imported in any cell visible above,
# so this call raises NameError on a fresh kernel; import or define it before this line.
# NOTE(review): the project root is only appended to sys.path in a later cell; if
# dkmodel2vec is not installed, the import above presumably fails on Restart & Run All.
t2 = add_instruction_tokenizer(tokenizer, instruction=DANISH_INSTRUCTION)

In [None]:
# `add_special_tokens` is an `encode` option (it controls whether BOS/EOS etc. are added);
# passed to `decode` it has no effect, so hand it to `encode` where it applies.
# Assumes t2 exposes the Hugging Face tokenizer API; verify against add_instruction_tokenizer.
t2.decode(t2.encode("fjaldf", add_special_tokens=True))

The prepend method allows us to add instructions. 

In [None]:
# What preprocessing-related attributes exist?
# What preprocessing-related attributes exist?
keywords = ("preprocess", "prefix", "template")
# Keep every attribute name that contains one of the keywords, case-insensitively.
attrs = [
    name
    for name in dir(tokenizer)
    if any(keyword in name.lower() for keyword in keywords)
]
print(attrs)

In [None]:
print("New normalizer:", tokenizer.backend_tokenizer.normalizer)

In [None]:
# Test with mixed case text
# Test with mixed case text
test_texts = ["Hej Sverige!", "DETTE ER DANSK TEKST", "Mixed CaSe TeXt"]

print("Testing lowercase normalization:")
for text in test_texts:
    # Use a dedicated name so the corpus-wide `tokens` list built earlier in the
    # notebook is not silently overwritten by this small smoke test.
    sample_tokens = tokenizer.tokenize(text)
    # Encode and decode again to see the text after normalization.
    round_trip = tokenizer.decode(tokenizer.encode(text), skip_special_tokens=False)
    print(f"Input: '{text}'")
    print(f"Tokens: {sample_tokens}")
    print(f"Decoded: '{round_trip}'")
    print()

It works!

In the following we go through the different steps in the distillation script in model2vec/distillation.py: 

In [None]:
backend_tokenizer = tokenizer.backend_tokenizer

In [None]:
vocabulary = counts_df["word"].tolist()  # vocabulary is sorted by popularity

counts_df

In [None]:
n_tokens_before = len(vocabulary)

print(f"{n_tokens_before} individual words")

In [None]:
token_remove_regex = None
all_tokens, backend_tokenizer_new_normalizer = clean_and_create_vocabulary(
    tokenizer, vocabulary, token_remove_regex=token_remove_regex
)

In [None]:
backend_tokenizer.model

The BPE tokenizer is a byte-pair encoder which allows tokenization of all strings. 

In [None]:
backend_tokenizer_new_normalizer.model

In [None]:
import model2vec

model2vec.__version__

In [None]:
len(all_tokens)

In [None]:
# Count the tokens that are not flagged as internal, i.e. those added from our vocabulary.
n_tokens_after = sum(1 for token in all_tokens if not token.is_internal)
print(f"{n_tokens_after} external tokens (added from vocabulary)")

There are still more than 600k tokens, so this is way too big.

In [None]:
tokenizer.special_tokens_map

In [None]:
[token for token in tokenizer.get_vocab() if "." == token]

In [None]:
# Read the special tokens declared by the original tokenizer; either may be missing.
unk_token = cast(Optional[str], tokenizer.special_tokens_map.get("unk_token"))
pad_token = cast(Optional[str], tokenizer.special_tokens_map.get("pad_token"))

# Explicit None checks (rather than truthiness) to satisfy mypy.

if unk_token is None:
    # Fall back to "_" only when it really is in the vocabulary. Indexing a
    # filtered list with [0] would raise IndexError when the token is absent.
    if "_" in tokenizer.get_vocab():
        unk_token = "_"
        print(
            "The unknown token is not set. Setting it to the '_' token. This is a workaround to allow encoding of more texts without error."
        )
if pad_token is None:
    if unk_token is not None:
        pad_token = unk_token
        print(
            "The pad token is not set. Setting it to the unk token. This is a workaround for models that don't have a pad token."
        )
    else:
        # No unk token available either: use the first token of the new vocabulary.
        pad_token = all_tokens[0].form
        print(
            "The pad token is not set. Setting it to the first token in the vocabulary. This is a workaround for models that don't have a pad token."
        )

In [None]:
print(f"unk token: '{unk_token}', pad_token: '{pad_token}'")

unk_token is None by default because this is a BPE tokenizer, which can tokenize any string and therefore does not require an unk token. 

In [None]:
"_" in tokenizer.get_vocab()

In [None]:
from model2vec.tokenizer.datamodels import Token

unk_token_obj = Token(
    form="[UNK]", normalized_form="[UNK]", is_subword=False, is_internal=False
)
all_tokens_with_unk_token = all_tokens + [unk_token_obj]

In [None]:
# Replace the vocabulary in the tokenizer with the new vocabulary.
backend_tokenizer_replaced_vocab = replace_vocabulary(
    backend_tokenizer_new_normalizer,
    all_tokens_with_unk_token,
    unk_token="[UNK]",
    pad_token=pad_token,
)

In [None]:
backend_tokenizer_replaced_vocab.model
# backend_tokenizer_replaced_vocab.model

In [None]:
backend_tokenizer_replaced_vocab.encode(texts[12167])

In [None]:
# Split the vocabulary by origin. `is_internal` is a boolean flag, so test it
# directly instead of comparing against True / False.
internal_tokens = [token for token in all_tokens if token.is_internal]
external_tokens = [token for token in all_tokens if not token.is_internal]
print(
    f"Internal tokens: {len(internal_tokens)}, External tokens: {len(external_tokens)}"
)

In [None]:
len(all_tokens_with_unk_token)

In [None]:
# Convert tokens to IDs
token_ids = turn_tokens_into_ids(all_tokens_with_unk_token, tokenizer, unk_token)

In [None]:
token_ids[300_000:300_050]

We need to check that the token IDs decode properly. 

In [None]:
tokenizer.decode(token_ids[-2])  # last vocabulary token; [UNK] was appended to the list last, so it presumably sits at index -1

In [None]:
all_tokens_with_unk_token[-2].form

In [None]:
assert tokenizer.decode(token_ids[-2]) == tokenizer.decode(
    tokenizer.encode(all_tokens_with_unk_token[-2].form), skip_special_tokens=False
)

Looks good!

In [None]:
import sys
import os

# Make the parent of the working directory importable. The membership check keeps
# the cell idempotent: re-running it does not pile up duplicate sys.path entries.
project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
from dkmodel2vec.distillation import estimate_token_frequencies

In [None]:
estimate_token_frequencies(
    backend_tokenizer=backend_tokenizer_replaced_vocab,
    corpus_texts=["list fjkl", "Hej "],
)

In [None]:
backend_tokenizer_replaced_vocab.encode(
    'Analyser denleichtige metaforiske anvendelse af lyset i H.C. Andersens eventyr "Den lille pige med svovlstikkerne" og "HyacintensSystematik_".'
)

In [None]:
texts[7912]

In [None]:
texts[12167]

In [None]:
backend_tokenizer_replaced_vocab.encode(".").ids

In [None]:
backend_tokenizer_replaced_vocab.encode(texts[12167])

In [None]:
# NOTE: this rebinds `token_counts`. Earlier in the notebook the name held the
# word-level Counter built with the regex tokenizer; from here on it holds the
# frequencies estimated with the replaced-vocabulary tokenizer over the full corpus
# (later cells map its keys back to tokens via `id_to_token`).
token_counts = estimate_token_frequencies(
    backend_tokenizer=backend_tokenizer_replaced_vocab,
    corpus_texts=texts,
    batch_size=1000,  # texts handed to the estimator per batch
)

We are creating a new tokenizer and accompanying embeddings.

As long as we are guaranteed that the correct embeddings have been created for the tokens in the new tokenizer, we are safe. I have added a new normalizer; that ought not to change anything. 



In [None]:
token_counts

In [None]:
# Number of distinct token ids observed. The following cells use the Counter API
# (`.total()`, `.most_common()`), and a Counter has no `.size` attribute, so use len().
len(token_counts)

In [None]:
from dkmodel2vec.distillation import weigh_by_freq

# Inverse relative frequency per token, ordered from most to least frequent:
# weight = (total number of counted tokens) / (count of this token).
total = token_counts.total()
weights = np.array(
    [total / occurrences for _token_id, occurrences in token_counts.most_common()]
)

In [None]:
weights

In [None]:
weights / np.abs(np.max(weights))

In [None]:
backend_tokenizer_replaced_vocab.id_to_token(601)

In [None]:
token_counts.total()

In [None]:
# (token string, token id, count) triples, most frequent first.
counted = [
    (backend_tokenizer_replaced_vocab.id_to_token(token_id), token_id, occurrences)
    for token_id, occurrences in token_counts.most_common()
]

In [None]:
# `counted` already arrives most-frequent-first. A stable sort keeps that order among
# ties, and `ignore_index=True` renumbers the rows 0..n-1, so "rank" equals the row
# position. The default (unstable) sort without an index reset could leave "rank"
# out of step with the positional `take` / `iloc` lookups used further down.
counts_token = pd.DataFrame(counted, columns=["token", "id", "count"]).sort_values(
    by="count", ascending=False, kind="stable", ignore_index=True
)
counts_token["rank"] = counts_token.index

In [None]:
counts_token.shape

In [None]:
counts_token

In [None]:
# Row positions to plot: each of the first 1000 ranks, then every 1000th rank.
head_positions = list(range(1000))
sparse_positions = list(range(1000, counts_token.shape[0], 1000))
vis_idx = head_positions + sparse_positions

# number of vertical lines
n_vertical_rows = 100
fig = px.line(
    counts_token.take(vis_idx),
    x="rank",
    y="count",
    hover_data=["token", "id", "rank"],
    log_x=True,
)

# For every distinct count value take the first row that has it, then keep the
# `n_vertical_rows` entries with the largest rank (i.e. the lowest counts).
first_row_per_count = counts_token.groupby("count").first()
boundary_ranks = first_row_per_count.sort_values(by="rank", ascending=False)[
    "rank"
].iloc[:n_vertical_rows]

# Draw one vertical line at each rank where the count value changes.
for boundary_rank in boundary_ranks:
    fig.add_vline(x=boundary_rank)

fig.show()

In [None]:
# Ids of the tokens that occurred in the corpus, most frequent first.
seen_token_ids = [token_id for token_id, _count in token_counts.most_common()]
seen_set = set(seen_token_ids)

# Ordering is seen -> unseen: first the observed tokens by descending frequency, then
# the never-observed ones in their original vocabulary order.
# NOTE(review): assumes token id i corresponds to position i in
# all_tokens_with_unk_token; confirm against replace_vocabulary.
seen_tokens = [all_tokens_with_unk_token[token_id] for token_id in seen_token_ids]
unseen_tokens = [
    token
    for position, token in enumerate(all_tokens_with_unk_token)
    if position not in seen_set
]
sorted_tokens = seen_tokens + unseen_tokens

In [None]:
# Peek at the tail of the ordering only. Rendering 300k Token objects as cell
# output bloats the notebook without adding information.
sorted_tokens[-20:]

In [None]:
backend_tokenizer_replaced_vocab.encode("Hej,  Jeg hedder anders").tokens

Looks good...

Model2vec uses the following approach to weigh the embedding of each token:

In [None]:
def get_weights(token_size: int, sif_coefficient: float):
    """Return one SIF weight per token rank, assuming Zipf-distributed frequencies.

    The token at position ``i`` (0-based) is given a probability proportional to
    ``1 / (i + 2)``, normalised so that the probabilities sum to one. The weight is
    then ``sif_coefficient / (sif_coefficient + probability)``.

    Args:
        token_size: Number of tokens, i.e. the length of the returned array.
        sif_coefficient: The SIF smoothing constant.

    Returns:
        A NumPy array of length ``token_size`` with the weight for each rank.
    """
    # Ranks start at 2, so the most frequent token gets 1/2 before normalisation.
    ranks = np.arange(2, token_size + 2)
    reciprocal_ranks = 1 / ranks
    zipf_proba = reciprocal_ranks / reciprocal_ranks.sum()
    return sif_coefficient / (zipf_proba + sif_coefficient)


sif_coefficient = 1e-4

# SIF weight for each rank under the Zipf assumption implemented in get_weights.
counts_token["estimated_weight"] = get_weights(
    token_size=counts_token.shape[0], sif_coefficient=sif_coefficient
)
# Invert w = a / (a + p) to recover the assumed probability: p = a * (1 / w - 1).
# Taking 1 / w alone gives (a + p) / a, which is not proportional to p.
counts_token["estimated_prob"] = sif_coefficient * (
    1 / counts_token["estimated_weight"] - 1
)
# Normalise by the column's own sum (not the sum of the weights) so the estimated
# probabilities add up to one and are comparable with the empirical "prob" column.
counts_token["estimated_prob"] = (
    counts_token["estimated_prob"] / counts_token["estimated_prob"].sum()
)

# Empirical probability of each token in the corpus.
counts_token["prob"] = counts_token["count"] / counts_token["count"].sum()

# Ratio above 1: the token is more frequent in the corpus than the Zipf estimate assumes.
counts_token["prob/estimated_prob"] = (
    counts_token["prob"] / counts_token["estimated_prob"]
)

In [None]:
counts_token

In [None]:
px.line(
    counts_token.iloc[vis_idx],
    x="rank",
    y=["prob", "estimated_prob"],
    hover_data=["token", "count"],
    log_x=True,
    log_y=True,
)

Hmmm... For this token_size it looks like it underestimates the probability of frequent words and underestimates the frequency of rare words. Let's instead just use the weights from the actual dataset. This means we set the SIF coefficient to zero and implement our own post-processing and distillation script.