In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import mlflow
import plotly.express as px
import pandas as pd

In [None]:
# Address of the MLflow tracking server; passed to mlflow.set_tracking_uri below.
tracking_uri = "http://localhost:8000"
# Name of the experiment under study.
# NOTE(review): not referenced by the cells shown here -- the run query
# searches all experiments; confirm whether it should filter on this name.
experiment_name = "llm2model2vec"

In [None]:

# Point the MLflow client at the tracking server configured above.
mlflow.set_tracking_uri(tracking_uri)

In [None]:
# Fetch every run from every experiment on the tracking server.
runs = mlflow.search_runs(search_all_experiments=True)
# Keep only top-level runs, i.e. rows without a parent run id. Take an
# explicit copy: later cells add columns to `runs`, and assigning into a
# filtered slice triggers pandas' SettingWithCopyWarning.
runs = runs[runs["tags.mlflow.parentRunId"].isna()].copy()
runs

In [None]:
# Numeric version of the output dimension, used for ordering.
# errors='coerce' turns every non-numeric entry (including the literal string
# 'None') into NaN, so no separate replace('None', None) step is needed. That
# call is also version-dependent: on older pandas releases an explicit
# value=None makes replace() forward-fill instead of inserting missing values.
runs['output_dim_numeric'] = pd.to_numeric(
    runs['params.output_dim'],
    errors='coerce',
    downcast="integer",
)


# Comparing the different hypotheses

### Dimension reduction hurts the performance but is required

In [None]:
# Run names that make up the output-dimension sweep.
run_names = [
    "dimNone",
    "dim4096",
    "dim2048",
    "dim1024",
    "dim512",
    "dim256",
]
# Restrict the runs table to the rows whose run name is part of the sweep.
in_sweep = runs['tags.mlflow.runName'].isin(run_names)
rs = runs.loc[in_sweep]
rs

In [None]:
# Count the non-null entries per (output_dim, accuracy) pair to see how many
# runs share the same configuration and score.
rs_groupby = rs.groupby(
    by=["params.output_dim", "metrics.raw_accuracy"],
).count()
rs_groupby

All of the runs with output_dim of 256 have the same performance, so we will pick the first. 

In [None]:
# Collapse duplicate runs: keep the first entry of every
# (output_dim, accuracy) group.
rs = rs.groupby(
    by=["params.output_dim", "metrics.raw_accuracy"],
).first()

In [None]:

# Turn the group keys back into ordinary columns, then order the rows by the
# numeric output dimension with missing values at the end.
rs = rs.reset_index()
rs = rs.sort_values(by='output_dim_numeric', na_position='last')

In [None]:
# Compact view of dimension versus accuracy, ordered by the numeric dimension.
rs.loc[
    :, ["params.output_dim", "metrics.raw_accuracy", "output_dim_numeric"]
].sort_values(by="output_dim_numeric")

In [None]:
# Bar chart of raw accuracy for each output dimension.
px.bar(
    data_frame=rs,
    x="params.output_dim",
    y="metrics.raw_accuracy",
)

Apparently, going from no PCA to PCA with the same number of dimensions gives a performance boost.

Otherwise, the fewer the dimensions, the lower the accuracy (as expected). We will stick to 256 because we want a small model.

In [None]:
# List the columns available in the runs table.
runs.columns

### Normalization of final embedding improves performance

In [None]:
# Select the runs whose name mentions "normalize_embeddings" plus the
# "dim256" baseline, and keep one row per run name.
is_normalize_run = runs['tags.mlflow.runName'].str.contains("normalize_embeddings")
is_baseline = runs['tags.mlflow.runName'] == "dim256"
rs = runs[is_normalize_run | is_baseline].groupby("tags.mlflow.runName").first()
# Runs that never logged the parameter are treated as "not normalized".
# MLflow reports params as strings, so fill with the string "False" rather
# than the boolean: this keeps the column (and the plot axis built from it)
# a single type instead of a mix of str and bool.
rs['params.normalize_embeddings'] = rs['params.normalize_embeddings'].fillna("False")

In [None]:
# Inspect the normalization flag of each selected run.
rs['params.normalize_embeddings']

In [None]:
# Bar chart of raw accuracy with and without embedding normalization.
px.bar(
    data_frame=rs,
    x="params.normalize_embeddings",
    y="metrics.raw_accuracy",
)

Normalizing the embeddings gives a pretty big performance improvement, so this is definitely a good idea.

### There is a sweet spot for SIF coefficients

In [None]:
# Select the SIF runs (name contains "sif") together with the "dim256"
# baseline, then keep one row per run name.
is_sif_run = runs['tags.mlflow.runName'].str.contains("sif")
is_baseline = runs['tags.mlflow.runName'].eq("dim256")
rs = runs.loc[is_sif_run | is_baseline]
rs = rs.groupby(by="tags.mlflow.runName").first()

In [None]:
# Display the selected runs, one row per run name.
rs

In [None]:
# Order the bars by the numeric value of the coefficient. MLflow returns
# params as strings, so a plain sort is lexicographic and only matches the
# numeric order by accident (e.g. "1e-05" would sort after "0.01").
# Entries that cannot be parsed as numbers are placed last.
rs_by_sif = rs.sort_values(
    by="params.sif_coefficient",
    key=lambda col: pd.to_numeric(col, errors="coerce"),
)
# The x values stay as the original strings so each coefficient gets its own bar.
px.bar(rs_by_sif, x="params.sif_coefficient", y="metrics.raw_accuracy")

It looks like we get the highest performance for sif_coefficient 0.005. But we should experiment with normalized embeddings. 

### Larger added vocabulary increases performance

In [None]:
# Select the vocabulary experiments (name contains "vocab" or
# "ignore_external") together with the "dim256" baseline, one row per run name.
is_vocab_run = runs['tags.mlflow.runName'].str.contains("vocab")
is_ignore_external_run = runs['tags.mlflow.runName'].str.contains("ignore_external")
is_baseline = runs['tags.mlflow.runName'].eq("dim256")
rs = runs.loc[is_vocab_run | is_ignore_external_run | is_baseline]
rs = rs.groupby(by="tags.mlflow.runName").first()

In [None]:
# Accuracy per vocabulary size, one colour per run; bars that share a vocab
# size are drawn side by side instead of stacked.
fig = px.bar(
    data_frame=rs.reset_index(),
    x="params.vocab_size",
    y="metrics.raw_accuracy",
    color="tags.mlflow.runName",
)
fig.update_layout(barmode="group")

Adding external tokens seems to increase performance. Phew. But it looks like performance increases slowly with vocab size. Here, vocab size is the number of added tokens, which does not include internal tokens.

### Unused internal tokens hurt performance

Here we investigate the effect of stripping internal tokens. The tokenizer contains an internal vocabulary of ~150k tokens, most of which are never used. For example, tokens that are upper case cannot be used because the pre-tokenizer removes them. During dimension reduction these tokens are treated equally to all other tokens despite not being important, and this likely reduces the performance.

Likewise, tokens containing exotic characters are very infrequently used and we investigate the effect of neglecting them. 

Finally, since a lot of tokens are very infrequently used (if used at all), we investigate the effect of neglecting their contribution to the variance when doing the dimension reduction. That way, only tokens seen in the training data are used to perform the dimension reduction.


In [None]:
# Select the token-stripping runs (name contains "strip") together with the
# "dim256" baseline, then keep one row per run name.
is_strip_run = runs['tags.mlflow.runName'].str.contains("strip")
is_baseline = runs['tags.mlflow.runName'].eq("dim256")
rs = runs.loc[is_strip_run | is_baseline]
rs = rs.groupby(by="tags.mlflow.runName").first()



In [None]:
# One bar per run name showing its raw accuracy.
per_run = rs.reset_index()
px.bar(per_run, x="tags.mlflow.runName", y="metrics.raw_accuracy")

It looks like all these initiatives tend to improve the performance. 

### Prenormalization treats tokens more fairly but hurts performance

The raw output of the embeddings is not normalized, and since the embeddings span a large space, the tokens whose embeddings contribute to the variance are likely also the ones that are "long". In an effort to reduce this effect, it was also investigated how normalization of the individual token embeddings affects the performance. By normalizing, we "remove" information but we treat the tokens more "fairly".

In [None]:
# Select the prenormalization runs (name contains "pre") together with the
# "dim256" baseline, then keep one row per run name.
is_pre_run = runs['tags.mlflow.runName'].str.contains("pre")
is_baseline = runs['tags.mlflow.runName'].eq("dim256")
rs = runs.loc[is_pre_run | is_baseline]
rs = rs.groupby(by="tags.mlflow.runName").first()



In [None]:
# One bar per run name showing its raw accuracy.
per_run = rs.reset_index()
px.bar(per_run, x="tags.mlflow.runName", y="metrics.raw_accuracy")

It looks like all prenormalization hurts the performance. 

In [None]:

# Remove upper-case tokens from output tokenizer vocab
#python scripts/hyperparams.py --output-dim 256 --strip-upper-case

# Remove both upper-case tokens and exotic tokens from tokenizer vocab
#python scripts/hyperparams.py --output-dim 256 --strip-upper-case --strip-exotic

# Focus dimension reduction (PCA) on the embeddings space that is represented in the corpus.
#python scripts/hyperparams.py --output-dim 256 --strip-upper-case --strip-exotic --focus-pca

In [None]:
# Run experiments with different SIF coefficients
#python scripts/hyperparams.py --output-dim 256 --sif-coefficient 0.01
#python scripts/hyperparams.py --output-dim 256 --sif-coefficient 0.005
#python scripts/hyperparams.py --output-dim 256 --sif-coefficient 0.001
#python scripts/hyperparams.py --output-dim 256 --sif-coefficient 0.0005
#python scripts/hyperparams.py --output-dim 256 --sif-coefficient 0.0001