In [2]:
import pandas as pd
import os
splits = {}
for split_name in ['train', 'validation', 'test']:
    csv_path = f"{split_name}.csv"
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        splits[split_name] = {
            'index': df['index'].tolist(),
            'title': df['title'].tolist(),
            'content': df['content'].tolist(),
            'starRating': df['starRating'].tolist(),
        }

In [None]:
import torch
from tqdm import tqdm
import unicodedata
from enum import Enum

MODEL_NORMALIZATION = {
    "faur-ai/LLMic": True
}

def remove_diacritics(text):
    return ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')

class EmbeddingExtractor:
    def __init__(self, model, tokenizer, model_name, device=None, pooling="classical-avg"):
        self.model = model.to(device or ("cuda" if torch.cuda.is_available() else "cpu")).eval()
        self.tokenizer = tokenizer
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.pooling = pooling
        self.model_name = model_name
        self.model_needs_normalization = MODEL_NORMALIZATION.get(self.model_name, False)
        self.max_length = getattr(self.model.config, "max_position_embeddings")
        
    def _build_prompt(self, text, strategy):
        if strategy == "echo":
            prompt = f"Rescrie recenzia: {text}. Recenzia rescrisă: {text}."
        elif strategy == "summary":
            prompt = f"Rezumă recenzia: {text}. Răspunde doar cu un cuvânt:"
        else:
            prompt = f"Scrie recenzia: {text}."

        if self.model_needs_normalization:
          return remove_diacritics(prompt).lower()
        return prompt

    def _apply_pooling(self, hidden_states, inputs, text, strategy, pooling_method):
        input_ids = inputs["input_ids"][0]
        full_text = self.tokenizer.decode(input_ids, skip_special_tokens=True).strip()

        if self.model_needs_normalization:
            full_text = remove_diacritics(full_text).lower()

        # print(f"\n[DEBUG] Full Decoded Prompt:\n{full_text}\n")

        if strategy == "echo":
            instruction = "recenzia rescrisa:" if self.model_needs_normalization else "Recenzia rescrisă:"
            second_occurrence_idx = full_text.rfind(text)
            selected_text = full_text[second_occurrence_idx:]
            # print(f"[DEBUG] Selected (echo second occurrence):\n{selected_text}\n")
            second_tokens = self.tokenizer(selected_text, return_tensors="pt", truncation=True).to(self.device)
            length = second_tokens["input_ids"].shape[1] - 2
            selected = hidden_states[:, -length:, :]

        elif strategy == "summary":
            generated_ids = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=10,
                do_sample=False,
                pad_token_id=self.tokenizer.eos_token_id
            )
            generated_text = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            completion = generated_text[len(full_text):].strip()
            # print(f"[DEBUG] Generated Text (summary):\n{completion}\n")
            first_word = completion.split()[0] if completion else ""
            # print(f"[DEBUG] Generated Text First Word (summary):\n{first_word}\n")
            if self.model_name == "faur-ai/LLMic":
                first_word = next((w for w in completion.split() if w.isalnum() and len(w) >= 3), "")
                # print(f"[DEBUG] Generated Text First Word LLMIC (summary):\n{first_word}\n")
            if not completion:
                print("[WARNING] Model did not generate anything after summary instruction.")

            summary_tokens = self.tokenizer(first_word, return_tensors="pt", truncation=True).to(self.device)
            length = summary_tokens["input_ids"].shape[1] - 2
            selected = hidden_states[:, -length:, :] if length > 0 else hidden_states[:, -1:, :]

        else:
            instruction = "scrie recenzia:" if self.model_needs_normalization else "Scrie recenzia:"
            idx = full_text.find(instruction)
            if idx == -1:
                raise ValueError(f"Failed to find classical instruction '{instruction}' in the prompt text.")
            after_instruction = full_text[idx + len(instruction):].strip()
            # print(f"[DEBUG] Selected (classical real input):\n{after_instruction}\n")
            text_tokens = self.tokenizer(after_instruction, return_tensors="pt", truncation=True).to(self.device)
            length = text_tokens["input_ids"].shape[1] - 2
            selected = hidden_states[:, -length:, :]

        if pooling_method == "avg":
            return selected.mean(dim=1).squeeze()
        elif pooling_method == "last":
            return selected[:, -1, :].squeeze()
        else:
            raise ValueError(f"Unknown pooling method: {pooling_method}")


    def extract_single(self, text):
        strategy, pooling_method = self.pooling.split("-")
        prompt = self._build_prompt(text, strategy)
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True,  max_length=self.max_length).to(self.device)
        
        # print("[DEBUG] Decoded back:", self.tokenizer.decode(inputs["input_ids"][0])) # see how model internally tokenizes data (lowercase etc) llmic fara diactritice si lowercase
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)
            hidden_states = outputs.hidden_states[-1]
        text = remove_diacritics(text).lower() if self.model_needs_normalization else text
        return self._apply_pooling(hidden_states, inputs, text, strategy, pooling_method)

    def extract_batch(self, texts, save_path=None, save_format="pt"):
        embeddings = []

        for text in tqdm(texts, desc=f"Extracting ({self.pooling})"):
            emb = self.extract_single(text)
            embeddings.append(emb.cpu())

        stacked = torch.stack(embeddings)

        if save_path:
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            if save_format == "pt":
                torch.save(stacked, save_path)
            elif save_format == "npy":
                import numpy as np
                np.save(save_path, stacked.numpy())

        return stacked

    def extract_in_chunks(extractor, texts, save_path, save_format="pt", batch_size=64):
        all_embeddings = []
    
        total = len(texts)
        with tqdm(total=total, desc=f"Extracting ({extractor.pooling})") as pbar:
            for i in range(0, total, batch_size):
                batch = texts[i:i + batch_size]
                batch_emb = extractor.extract_batch(batch)  
                all_embeddings.append(batch_emb)
                pbar.update(len(batch))
    
        stacked = torch.cat(all_embeddings)
        return stacked

In [8]:
import os
import numpy as np

class EmbeddingExtractionRunner:
    def __init__(self, model, tokenizer, model_name, device=None):
        self.model = model
        self.tokenizer = tokenizer
        self.model_name = model_name
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    def run(self, splits, save_root):
        strategies = ["classical-avg", "classical-last", "echo-avg", "summary-avg"]
        fields = ["title", "content", "title+content"]

        for strategy in strategies:
            for field in fields:
                extractor = EmbeddingExtractor(
                    model=self.model,
                    tokenizer=self.tokenizer,
                    model_name=self.model_name,
                    device=self.device,
                    pooling=strategy
                )

                for split_name, split_data in splits.items():
                    print(f"field={field}, split={split_name}")
                    if field == "title":
                        texts = split_data['title']
                    elif field == "content":
                        texts = split_data['content']
                    elif field == "title+content":
                        texts = [f"{t} {c}" for t, c in zip(split_data['title'], split_data['content'])]
                    else:
                        raise ValueError(f"Unsupported field type: {field}")

                    base_dir = os.path.join(
                        save_root,
                        self.model_name,
                        strategy,
                        field,
                        split_name
                    )
                    os.makedirs(base_dir, exist_ok=True)

                    pt_path = os.path.join(base_dir, "embeddings.pt")
                    npy_path = os.path.join(base_dir, "embeddings.npy")

                    embeddings = extractor.extract_batch(texts) 
                    torch.save(embeddings, pt_path)
                    np.save(npy_path, embeddings.cpu().numpy())

                    del embeddings
                    torch.cuda.empty_cache()
                    import gc
                    gc.collect()

RoLlama

In [42]:
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_NAME = "OpenLLM-Ro/RoLlama3.1-8b-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

Loading checkpoint shards: 100% 4/4 [00:01<00:00,  2.33it/s]


In [57]:
extractor = EmbeddingExtractor(
    model=model,
    tokenizer=tokenizer,
    model_name=MODEL_NAME,
    device="cuda",
    pooling="summary-avg"
)

text = splits["train"]["content"][3]

embedding = extractor.extract_single(text)

print(f"Embedding shape: {embedding.shape}")
print(f"Embedding preview: {embedding[:10]}")



[DEBUG] Full Decoded Prompt:
Rezumă recenzia: un produs excelent care mi-a depasit asteptarile.  se incarca "fast" si incarca "fast" la randul lui.  recomand cu caldura.. Răspunde doar cu un cuvânt:

[DEBUG] Generated Text First Word (summary):
excelent. Răspunde doRecenz

[DEBUG] Generated Text First Word (summary):
excelent.

Embedding shape: torch.Size([4096])
Embedding preview: tensor([-3.0418,  0.7847,  5.7454,  0.3592,  1.6239, -2.5949, -0.8607, -1.4430,
        -3.5565,  1.5602], device='cuda:0')


In [None]:
runner = EmbeddingExtractionRunner(model, tokenizer, model_name="OpenLLM-Ro/RoLlama3.1-8b-Instruct")
runner.run(splits, save_root="outputs_embeddings")

MGPT

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_NAME = "ai-forever/mGPT-1.3B-romanian"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

In [29]:
extractor = EmbeddingExtractor(
    model=model,
    tokenizer=tokenizer,
    model_name=MODEL_NAME,
    device="cuda",
    pooling="summary-avg"
)

text = splits["train"]["content"][3]

embedding = extractor.extract_single(text)

print(f"Embedding shape: {embedding.shape}")
print(f"Embedding preview: {embedding[:10]}")

[DEBUG] Generated Text (summary):
"fast".

- Nu-mi place

[DEBUG] Generated Text First Word (summary):
"fast".

Embedding shape: torch.Size([2048])
Embedding preview: tensor([ 0.1807, -1.0960, -0.1395,  0.5721,  0.5812, -0.0932,  0.7749,  1.6887,
        -0.1019,  0.6627], device='cuda:0')


In [None]:
runner = EmbeddingExtractionRunner(model, tokenizer, model_name="ai-forever/mGPT-1.3B-romanian")
runner.run(splits, save_root="outputs_embeddings")

Llmic

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_NAME = "faur-ai/LLMic"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
extractor = EmbeddingExtractor(
    model=model,
    tokenizer=tokenizer,
    model_name=MODEL_NAME,
    device="cuda",
    pooling="summary-avg"
)

text = splits["train"]["content"][3]

embedding = extractor.extract_single(text)

print(f"Embedding shape: {embedding.shape}")
print(f"Embedding preview: {embedding[:10]}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Embedding shape: torch.Size([2560])
Embedding preview: tensor([-7.0801, -4.2104, -2.0157, 13.6697,  9.5080, -7.7106, -6.3963, -5.4862,
         3.0037, 33.4825], device='cuda:0')


It generates the same thing no matter what.

In [None]:
runner = EmbeddingExtractionRunner(model, tokenizer, model_name="faur-ai/LLMic")
runner.run(splits, save_root="outputs_embeddings")

LLama3

In [None]:
pip install ipywidgets

In [None]:
from huggingface_hub import interpreter_login

interpreter_login()

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
extractor = EmbeddingExtractor(
    model=model,
    tokenizer=tokenizer,
    model_name=MODEL_NAME,
    device="cuda",
    pooling="summary-avg"
)

text = splits["train"]["content"][3]

embedding = extractor.extract_single(text)

print(f"Embedding shape: {embedding.shape}")
print(f"Embedding preview: {embedding[:10]}")



Embedding shape: torch.Size([4096])
Embedding preview: tensor([-3.0598,  1.6368,  4.4468,  0.6540,  2.1140, -3.9921, -0.7091, -0.9007,
        -2.7611,  0.9963], device='cuda:0')


In [None]:
runner = EmbeddingExtractionRunner(model, tokenizer, model_name="meta-llama/Llama-3.1-8B-Instruct")
runner.run(splits, save_root="outputs_embeddings")