# Note

In Hugging Face, each model is trained with its own tokenizer and vocabulary, so even pre-split words may be split further into sub-word tokens.
[Source](https://discuss.huggingface.co/t/tokenizer-splits-up-pre-split-tokens/2078)

# Setup

## Imports

In [1]:
# General
import numpy as np
import pandas as pd

from datasets import Dataset

import torch
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModel
from transformers.pipelines.pt_utils import KeyDataset

In [2]:
import importlib
import os
import sys

# Add the directory two levels up (where the local `utils` and `config` modules live)
# to the import path. Guarded so re-running the cell does not append duplicates.
project_root = os.path.abspath('../..')
if project_root not in sys.path:
    sys.path.append(project_root)

# Custom modules.
# Reload the modules BEFORE binding names from them: `from module import name`
# copies the current object into this namespace, so reloading afterwards would
# leave the names below pointing at the stale, pre-reload definitions.
import utils
import config
importlib.reload(utils)
importlib.reload(config)
from utils import memory_usage, load_json, process_parquet_in_chunks, file_exists
from config import run_config, PROCESSED_DATA_PATH, FEATURES_PATH, MODELS_PATH

<module 'config' from 'e:\\College\\4- Senior 2\\Semester 1\\NLP\\Project\\config.py'>

In [3]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

Device: cuda


## Config

In [4]:
# Apply the project-level configuration (run_config is imported from the local config module).
run_config()

# Load Data

In [5]:
# Read the training inputs from disk.
# allow_pickle=True lets NumPy read object arrays, but unpickling can execute
# arbitrary code, so only load files from a trusted source.
x_train_file = PROCESSED_DATA_PATH + "/X_train.npy"
X_train = np.load(x_train_file, allow_pickle=True)

In [6]:
# Peek at the first training example.
X_train[0]

array(['three', 'pizzas', 'no', 'american', 'cheese', 'and', 'a', 'water',
       'and', 'one', 'ginger', 'ale', 'and', 'a', 'san', 'pellegrino'],
      dtype=object)

In [7]:
# Inspect the shape of the loaded training array.
X_train.shape

(2100467,)

In [8]:
# Length of the longest sequence in the training data.
max_length = max(map(len, X_train))
max_length

32

# jina.ai

jina-embeddings-v2 returns **sentence** embeddings (one vector per input sentence), not per-token embeddings.

In [14]:
# model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-small-en', trust_remote_code=True) # trust_remote_code is needed to use the encode method
# embeddings = model.encode(['How is the weather today?'.split(), 'What is the current weather like today?'.split()], padding=True, truncation=True, max_length=128, return_tensors='pt', )

In [22]:
# Build a Hugging Face feature-extraction pipeline for jina-embeddings-v3 and run it on two sample sentences.
# NOTE(review): the recorded output of this cell warns that `trust_remote_code` "has no effect here and is
# ignored" and that many checkpoint weights were not used when initializing XLMRobertaModel, so these
# features may not reflect the published model; confirm before relying on them.
feature_extraction = pipeline('feature-extraction', model="jinaai/jina-embeddings-v3", tokenizer="jinaai/jina-embeddings-v3", device=device, trust_remote_code=True)
features = feature_extraction(["I am a longer sentence more than 5 tokens", "I am a short sentence"], padding=True, truncation=True, max_length=128, return_tensors='pt')

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Some weights of the model checkpoint at jinaai/jina-embeddings-v3 were not used when initializing XLMRobertaModel: ['roberta.emb_ln.bias', 'roberta.emb_ln.weight', 'roberta.embeddings.token_type_embeddings.parametrizations.weight.0.lora_A', 'roberta.embeddings.token_type_embeddings.parametrizations.weight.0.lora_B', 'roberta.embeddings.token_type_embeddings.parametrizations.weight.original', 'roberta.embeddings.word_embeddings.parametrizations.weight.0.lora_A', 'roberta.embeddings.word_embeddings.parametrizations.weight.0.lora_B', 'roberta.embeddings.word_embeddings.parametrizations.weight.original', 'roberta.encoder.layers.0.mixer.Wqkv.bias', 'roberta.encoder.layers.0.mixer.Wqkv.parametrizations.weight.0.lora_A', 'roberta.encoder.layers.0.mixer.Wqkv.parametrizations.weight.0.lora_B', 'roberta.encoder.layers.0.mixer.Wqkv.parametrizations.weight.original', 'roberta.encoder.layers.0.mi

# Load RoBERTa model

- RoBERTa: It follows the same approach as BERT for input representations but uses **byte pair encoding** (BPE) with a larger vocabulary size (up to 50,000 tokens). [Source](https://dsstream.com/roberta-vs-bert-exploring-the-evolution-of-transformer-models/#:~:text=RoBERTa%3A%20It%20follows%20the%20same,Next%20Sentence%20Prediction%20(NSP))
- [Best Embeddings Models](https://www.reddit.com/r/LocalLLaMA/comments/18j39qt/what_embedding_models_are_you_using_for_rag/)
- [Pipeline Reference](https://huggingface.co/docs/transformers/main_classes/pipelines)
- BPE will help overcome spelling mistakes (in my opinion).

In [None]:
# Compute (or load from cache) token-level contextual embeddings for X_train.
# Set update_feature to True to force recomputation even when the cache file exists.
update_feature = False
embeddings_path = MODELS_PATH + "/contextual_embeddings.parquet"
if update_feature or not file_exists(embeddings_path):
    tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True)
    pipe = pipeline("feature-extraction", model="jinaai/jina-embeddings-v3", trust_remote_code=True, device=device)
    pipe.model.eval()

    # Sentences per forward pass.
    batch_size = 8

    # One entry per sentence: a list of per-token vectors.
    # NOTE(review): holding every token vector for the whole of X_train in memory
    # may be infeasible at full scale; consider writing the output in chunks.
    all_embeddings = []

    # The feature-extraction pipeline expects raw text, so it cannot be fed
    # `input_ids`. Tokenize the pre-split words ourselves and call the underlying
    # model directly instead.
    for start in tqdm(range(0, len(X_train), batch_size)):
        batch_words = [list(words) for words in X_train[start:start + batch_size]]
        # is_split_into_words=True keeps each word list as ONE sequence rather than
        # treating every word as a separate sentence.
        encoded = tokenizer(
            batch_words,
            is_split_into_words=True,
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(device)  # inputs must live on the same device as the model

        with torch.no_grad():
            hidden_states = pipe.model(**encoded).last_hidden_state

        # Drop padding positions so each sentence keeps only its real tokens.
        for sentence_states, mask in zip(hidden_states, encoded["attention_mask"]):
            all_embeddings.append(sentence_states[mask.bool()].float().cpu().numpy().tolist())

    # A named column is used because parquet needs string column names.
    embeddings = pd.DataFrame({"embeddings": all_embeddings})

    # Save the embeddings to a parquet file
    embeddings.to_parquet(embeddings_path)
else:
    print("Loading the model")
    embeddings = pd.read_parquet(embeddings_path)


# MiniLM

In [87]:
sentences = ["This is an example sentence", "Each sentence is converted word unqiue"]

# Fetch the MiniLM tokenizer and encoder from the Hugging Face Hub.
minilm_checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(minilm_checkpoint)
model = AutoModel.from_pretrained(minilm_checkpoint)

# Encode the first training example, which is already split into words.
first_example_words = list(X_train[0])
encoded_input = tokenizer(
    first_example_words,
    is_split_into_words=True,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Show the token ids, the decoded text, and the sub-word pieces.
first_sequence_ids = encoded_input.input_ids[0]
print(encoded_input.input_ids)
print(tokenizer.decode(first_sequence_ids))
print(tokenizer.convert_ids_to_tokens(first_sequence_ids))

# Forward pass without gradient tracking.
with torch.no_grad():
    model_output = model(**encoded_input)


tensor([[  101,  2093, 10733,  2015,  2053,  2137,  8808,  1998,  1037,  2300,
          1998,  2028, 14580, 15669,  1998,  1037,  2624, 21877,  6216, 24860,
          2080,   102]])
[CLS] three pizzas no american cheese and a water and one ginger ale and a san pellegrino [SEP]
['[CLS]', 'three', 'pizza', '##s', 'no', 'american', 'cheese', 'and', 'a', 'water', 'and', 'one', 'ginger', 'ale', 'and', 'a', 'san', 'pe', '##lle', '##grin', '##o', '[SEP]']


In [80]:
# Shape of the first sequence's slice of the first element of the model output.
# NOTE(review): this cell's execution count (80) predates the cell that defines model_output (87),
# so the recorded shape may be stale; re-run to confirm.
model_output[0][0].shape

torch.Size([10, 384])

In [None]:
# Display the last hidden state from the model output.
# NOTE(review): the recorded output is `tensor(True)`, which does not look like a hidden-state tensor;
# it is presumably left over from a different expression. Re-run to confirm.
model_output.last_hidden_state

tensor(True)

## Word Vectorization

Related Resources:
- TF-IDF Matrix -> https://openclassrooms.com/en/courses/6532301-introduction-to-natural-language-processing/8081363-apply-the-tf-idf-vectorization-approach

In [None]:
def vectorize_words(row):
    """Attach per-token TF-IDF and word2vec features to a row.

    Reads the token list from ``row["tokenized"]`` and, for each token, looks up

    - the token's column of the module-level ``tfidf_features`` matrix (located
      through ``vectorizer.vocabulary_``), flattened to 1-D, and
    - the token's vector in the module-level ``word_embeddings`` mapping.

    Out-of-vocabulary tokens get an all-zero vector for both feature types.
    Falling back to vocabulary index 0 would instead hand unknown tokens the
    TF-IDF column of whichever real term happens to own index 0.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a ``"tokenized"`` entry.

    Returns:
        The same ``row`` with ``'tfidf_features'`` and ``'word2vec_features'`` set
        to lists holding one vector per token.
    """
    tokens: list[str] = row["tokenized"]
    # A real TF-IDF column has one value per matrix row; the zero fallback matches that length.
    n_rows = tfidf_features.shape[0]
    sentence_tfidfs = []
    sentence_word2vec = []
    for token in tokens:
        tfidf_index = vectorizer.vocabulary_.get(token)
        if tfidf_index is None:
            # Unknown token: zero vector, consistent with the word2vec default below.
            sentence_tfidfs.append(np.zeros(n_rows, dtype=tfidf_features.dtype))
        else:
            sentence_tfidfs.append(tfidf_features[:, tfidf_index].toarray().reshape(-1))
        sentence_word2vec.append(word_embeddings.get(token, [0] * 100))  # Default zero-vector
    row['tfidf_features'], row['word2vec_features'] = sentence_tfidfs, sentence_word2vec
    return row

In [None]:
# Apply vectorize_words to every training row with a progress bar, then preview the first row.
# NOTE(review): neither `df_train` nor the pandas `progress_apply` hook is set up in the visible cells;
# presumably both come from an earlier session. Confirm on a fresh kernel.
# NOTE(review): the recorded progress bar shows about 1.37 s per row with an ETA of over 900 hours,
# so this is not practical on the full training set as written.
df_train = df_train.progress_apply(vectorize_words, axis=1)
df_train.head(1)

  0%|          | 58/2456446 [01:25<934:18:24,  1.37s/it] 

In [None]:
# Apply vectorize_words to every dev row with a progress bar, then preview the first row.
# NOTE(review): neither `df_dev` nor the pandas `progress_apply` hook is set up in the visible cells;
# confirm on a fresh kernel.
df_dev = df_dev.progress_apply(vectorize_words, axis=1)
df_dev.head(1)

100%|██████████| 348/348 [00:00<00:00, 499.51it/s]


Unnamed: 0,src,top,tokenized,tfidf_features,word2vec_features
0,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ),"[i, want, to, order, two, medium, pizzas, with, sausage, and, black, olives, and, two, medium, pizzas, with, pepperoni, and, extra, cheese, and, three, large, pizzas, with, pepperoni, and, sausage]","[[0.08430143723922251, 0.0, 0.13337969831862775, 0.0, 0.0, 0.10034713760427152, 0.17396375337864922, 0.17958770538428326, 0.14606531916873786, 0.17721922092796344, 0.19713448714781656, 0.0, 0.17288347347903418, 0.0, 0.0, 0.0, 0.1366724335010161, 0.0, 0.0, 0.1562794386914836, 0.19391225920436742, 0.0, 0.11286164475205294, 0.0, 0.12416543512769322, 0.0, 0.0, 0.19234760746279847, 0.12248119173361699, 0.0, 0.20664361011193244, 0.0, 0.18995951017233198, 0.2045777012433944, 0.1168537902697436, 0.0, 0.16890313266495663, 0.0, 0.0, 0.1335421575549344, 0.0, 0.19944260753166945, 0.0, 0.18473089869790738, 0.0, 0.19133428377261583, 0.20978968793267175, 0.14310616380645452, 0.17265659900612262, 0.0, 0.16817054006660384, 0.0, 0.14150333339868498, 0.17275472419101204, 0.22930989128613138, 0.0, 0.0, 0.20155267717081113, 0.1546082459604938, 0.15693159662581818, 0.1832160787506764, 0.0, 0.18729363892005363, 0.0, 0.20773045712657492, 0.0, 0.19346051503038109, 0.0, 0.16041910398343542, 0.09303061188182...","[[0.0677469, -0.041739315, -0.08083352, 0.12261514, 0.16721858, -0.05736017, 0.09708657, 0.28409472, -0.118705854, -0.027471012, -0.057421274, -0.14376749, 0.0473368, 0.14283921, -0.030394064, -0.031858526, 0.052781112, 0.07924095, 0.0069334647, -0.2780961, 
0.09244968, -0.032641098, -0.026583068, 0.036451172, 0.03341323, -0.025228117, -0.07656543, -0.118221, -0.10734222, 0.03260397, 0.15275688, 0.05852104, 0.106173486, -0.05881508, -0.082549885, 0.1174997, 0.06174134, -0.08423522, -0.055344716, -0.16970553, -0.06645013, 0.006280372, -0.11624798, -0.00490635, 0.1332938, -0.05824645, -0.00758351, -0.1255734, 0.039873965, 0.07710664, 0.045426518, -0.18234527, -0.024939088, -0.10234514, 0.08368705, -0.06852132, -0.07534842, -0.15418817, -0.14619642, -0.003937893, -0.101910375, -0.030932281, 0.1487361, -0.090863645, -0.320359, 0.09530128, -0.022416687, 0.22749043, -0.17629756, 0.21058744, -0.03136628, 0.1173821, 0.12045498, 0.03843337, 0.11156459, 0.058297314, 0.104958236, -0.11261302, ..."


In [None]:
# Persist the vectorized train and dev frames under FEATURES_PATH.
# NOTE(review): `save_pickle` is not imported in the visible import cells; presumably it lives in the
# project's utils module. Confirm and import it, otherwise this raises NameError on a fresh kernel.
save_pickle(FEATURES_PATH + "/df_train.pkl", df_train)
save_pickle(FEATURES_PATH + "/df_dev.pkl", df_dev)

In [None]:
# Call the project's memory_usage helper (imported from the local utils module).
memory_usage()