In [1]:
!pip install -U bitsandbytes



In [2]:
from huggingface_hub import login
from google.colab import userdata

# Read the Hugging Face access token from the Colab secret named "hf_token"
# and authenticate this session against the Hub with it.
token_value = userdata.get("hf_token")
login(token_value)

In [3]:
import json
import warnings
from typing import Optional, Tuple

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def select_best_layer(
    activations: np.ndarray,
    labels: np.ndarray,
    n_splits: int = 5,
    random_state: int = 0,
):
    """Pick the layer whose activations best separate the two classes.

    For every layer a logistic-regression probe is trained on that layer's
    activations and scored with ROC AUC. The score is computed on held-out
    predictions from stratified k-fold cross-validation, not on the training
    samples: a probe evaluated on the data it was fitted on can reach a
    near-perfect AUC on every layer when the feature dimension is large,
    which makes the layer ranking meaningless.

    Args:
        activations: array of shape (n_samples, n_layers, hidden_size).
        labels: array of shape (n_samples,) holding exactly two distinct values.
        n_splits: requested number of cross-validation folds. It is reduced to
            the size of the smallest class when that class has fewer samples.
        random_state: seed for fold shuffling; the same folds are used for
            every layer so the scores are comparable.

    Returns:
        (best_layer, layer_aucs): the index of the layer with the highest AUC
        (the lowest index wins ties) and a dict mapping layer index -> AUC.

    Raises:
        ValueError: if the inputs have inconsistent shapes, there are no
            layers, or ``labels`` does not contain exactly two classes.
    """
    # Imported locally so this function carries its own model-selection dependency.
    from sklearn.model_selection import StratifiedKFold, cross_val_predict

    activations = np.asarray(activations)
    labels = np.asarray(labels)

    if activations.ndim != 3:
        raise ValueError(
            "activations must have shape (n_samples, n_layers, hidden_size), "
            f"got {activations.shape}"
        )
    n_samples, n_layers, _ = activations.shape
    if n_layers == 0:
        raise ValueError("activations contains no layers to evaluate")
    if labels.ndim != 1 or labels.shape[0] != n_samples:
        raise ValueError(
            f"labels must have shape ({n_samples},), got {labels.shape}"
        )

    _, class_counts = np.unique(labels, return_counts=True)
    if class_counts.size != 2:
        raise ValueError(
            f"labels must contain exactly two classes, found {class_counts.size}"
        )

    # Stratified folds need at least `n_folds` samples of every class.
    n_folds = min(n_splits, int(class_counts.min()))
    if n_folds >= 2:
        splitter = StratifiedKFold(
            n_splits=n_folds, shuffle=True, random_state=random_state
        )
    else:
        splitter = None
        warnings.warn(
            "Cannot build at least 2 stratified folds "
            f"(n_splits={n_splits}, smallest class has {int(class_counts.min())} "
            "sample(s)); falling back to in-sample AUC, which is optimistically biased.",
            RuntimeWarning,
            stacklevel=2,
        )

    layer_aucs = {}
    for layer in range(n_layers):
        # Feature matrix of the current layer: (n_samples, hidden_size)
        X_layer = activations[:, layer, :]
        clf = LogisticRegression(max_iter=1000)

        if splitter is None:
            probs = clf.fit(X_layer, labels).predict_proba(X_layer)[:, 1]
        else:
            # Every sample is scored by a probe that never saw it during training.
            probs = cross_val_predict(
                clf, X_layer, labels, cv=splitter, method="predict_proba"
            )[:, 1]

        layer_aucs[layer] = roc_auc_score(labels, probs)

    # Layer with the highest AUC
    best_layer = max(layer_aucs, key=layer_aucs.get)
    return best_layer, layer_aucs

In [None]:
def collect_activations_batched(
    model,
    tokenizer,
    dataset_path: str,
    generation_args: dict,
    batch_size: int = 8,
    max_samples: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray]:
    """Collect mean-pooled hidden states of every layer for neutral and traited prompts.

    Each dataset record yields two prompts: a neutral one built from
    ``alpaca_instruction`` / ``alpaca_input`` (label 0) and the record's
    ``full_prompt`` (label 1). Prompts are run through the model in batches
    and each layer's hidden states are averaged over the non-padding tokens.

    Args:
        model: model called as ``model(**inputs, output_hidden_states=True)``;
            must expose ``.device``. If ``model.config.max_position_embeddings``
            exists it caps the tokenization length.
        tokenizer: tokenizer matching ``model``. If it has no pad token, its
            EOS token is assigned as the pad token (modified in place), because
            batched tokenization with ``padding=True`` requires one.
        dataset_path: path to a JSON file holding a list of records with the
            keys ``alpaca_instruction``, ``alpaca_input`` and ``full_prompt``.
        generation_args: kept for interface compatibility; unused, since only
            a forward pass is run and no text is generated.
        batch_size: number of prompts per forward pass.
        max_samples: if given, only the first ``max_samples`` records are used.

    Returns:
        ``(activations, labels)``: ``activations`` is a float32 array of shape
        ``(2 * n_records, n_layers, hidden_size)`` and ``labels`` is an array
        of 0/1 values in the same order (0 = neutral, 1 = traited).

    Raises:
        ValueError: if ``batch_size`` is not positive or the dataset yields
            no prompts.
    """
    # Progress bars are a convenience only; run without them if tqdm is missing.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable, **kwargs):
            return iterable

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Load the dataset
    with open(dataset_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if max_samples is not None:
        data = data[:max_samples]

    # Build every prompt together with its label
    prompts = []
    prompt_labels = []
    for sample in tqdm(data, desc="Preparing prompts"):
        instruction = sample["alpaca_instruction"]
        input_text = sample["alpaca_input"]

        if input_text:
            user_prompt = f"{instruction}\n\nInput: {input_text}"
        else:
            user_prompt = instruction

        neutral_prompt = f"<|user|>\n{user_prompt}\n<|assistant|>\n"
        traited_prompt = sample["full_prompt"]

        prompts.extend([neutral_prompt, traited_prompt])
        prompt_labels.extend([0, 1])  # 0 = neutral, 1 = traited

    if not prompts:
        raise ValueError(
            f"No samples found in {dataset_path!r}; cannot collect activations."
        )

    # padding=True needs a pad token; reuse EOS when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # model_max_length may be a huge "no limit" sentinel, which would disable
    # truncation; cap it by the model's positional limit when that is known.
    max_length = tokenizer.model_max_length
    position_limit = getattr(
        getattr(model, "config", None), "max_position_embeddings", None
    )
    if position_limit is not None:
        max_length = min(max_length, position_limit)

    all_activations = []
    for start in tqdm(range(0, len(prompts), batch_size), desc="Processing batches"):
        current_batch = prompts[start:start + batch_size]

        # Tokenize the batch
        inputs = tokenizer(
            current_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            return_token_type_ids=False
        ).to(model.device)

        # Forward pass, keeping the hidden states of every layer
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)

        # outputs.hidden_states holds (n_layers + 1) tensors of shape
        # (batch_size, seq_len, hidden_size); the first one (input embeddings)
        # is skipped.
        attention_mask = inputs.attention_mask
        pooled_layers = []
        for layer_states in outputs.hidden_states[1:]:
            # Pool layer by layer: avoids stacking all layers at once and works
            # when layers live on different devices. float32 keeps the sum over
            # tokens from losing precision in half-precision models.
            layer_mask = attention_mask.to(
                device=layer_states.device, dtype=torch.float32
            ).unsqueeze(-1)  # (batch_size, seq_len, 1)
            token_counts = layer_mask.sum(dim=1).clamp(min=1.0)  # (batch_size, 1)
            pooled = (layer_states.float() * layer_mask).sum(dim=1) / token_counts
            pooled_layers.append(pooled.cpu())  # (batch_size, hidden_size)

        # (batch_size, n_layers, hidden_size)
        all_activations.append(torch.stack(pooled_layers, dim=1).numpy())

    # Merge all batches
    all_activations = np.concatenate(all_activations, axis=0)
    all_labels = np.array(prompt_labels)

    return all_activations, all_labels

In [4]:
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig
import torch

def load_llama8bit(model_name: str = "meta-llama/Llama-2-7b-hf"):
    """
    Load a LLaMA checkpoint quantized to 8-bit together with its tokenizer.

    Args:
        model_name: model identifier on the Hugging Face Hub.

    Returns:
        model: LlamaForCausalLM in eval mode, loaded with
            output_hidden_states=True.
        tokenizer: LlamaTokenizer with a pad token guaranteed to be set
            (the EOS token is used when the checkpoint defines none).
    """
    torch.cuda.empty_cache()
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    model = LlamaForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        output_hidden_states=True
    )
    model.eval()

    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    # Batched tokenization with padding=True requires a pad token; reuse EOS
    # when the checkpoint does not define one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


# Load the default checkpoint once; `model` and `tokenizer` are reused by the
# activation-collection cell below.
model, tokenizer = load_llama8bit()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


RuntimeError: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend

In [None]:
# Sampling settings handed to the collector as its `generation_args` argument.
generation_args = dict(
    max_new_tokens=250,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)

# Run every prompt of the dataset through the model, four prompts per batch,
# and keep the per-layer activations together with their labels.
all_activations, all_labels = collect_activations_batched(
    model=model,
    tokenizer=tokenizer,
    dataset_path="trait_combined_dataset.json",
    generation_args=generation_args,
    batch_size=4,
)

In [None]:
# Score every layer with a probe and remember which one came out on top.
best_layer, layer_aucs = select_best_layer(all_activations, all_labels)

# Per-layer report, in the order the scores were stored.
print("\nLayers AUC scores:")
for layer in layer_aucs:
    auc = layer_aucs[layer]
    print(f"Layer {layer}: AUC = {auc:.4f}")

print(f"\nBest layer: {best_layer} with AUC = {layer_aucs[best_layer]:.4f}")