In [None]:
!pip install -U bitsandbytes sentencepiece protobuf

In [2]:
from huggingface_hub import login
from google.colab import userdata

# Fetch the value stored under 'hf_token' through google.colab's userdata
# helper and hand it to huggingface_hub.login to authenticate this session.
token_value = userdata.get('hf_token')
login(token=token_value)

In [1]:
from sklearn.metrics.pairwise import cosine_distances
import numpy as np

def select_best_layer(activations: np.ndarray, labels: np.ndarray):
    """
    Pick the layer whose neutral-vs-traited cosine distance grows the most
    compared with the previous layer.

    Samples are compared pairwise: the i-th row with label 0 is matched with
    the i-th row with label 1, in the order they appear in ``activations``.

    Parameters:
        activations: np.ndarray, shape (n_samples, n_layers, hidden_size)
        labels: array-like, shape (n_samples,), where 0 = neutral, 1 = traited

    Returns:
        best_layer: index of the layer with the largest signed increase of the
            mean paired cosine distance over the previous layer (layer 0 has a
            delta of 0 by definition; ties resolve to the lowest index)
        layer_distances: dict {layer_index: mean paired cosine distance}
        layer_deltas: dict {layer_index: change of distance vs. previous layer}

    Raises:
        AssertionError: if the number of neutral and traited samples differs.
        ValueError: if ``activations`` is not 3-dimensional or there are no
            neutral/traited pairs to compare.
    """
    labels = np.asarray(labels)
    # Unpacking keeps the implicit "must be 3-D" check (ValueError otherwise).
    _, n_layers, _ = activations.shape

    # Split activations into the neutral and the traited group.
    neutral_acts = activations[labels == 0]
    traited_acts = activations[labels == 1]

    # Pairs are formed positionally, so both groups must have the same size.
    assert len(neutral_acts) == len(traited_acts), "Количество neutral и traited примеров должно совпадать"
    if len(neutral_acts) == 0:
        raise ValueError("No neutral/traited pairs found: labels must contain 0 and 1")

    layer_distances = {}
    for layer in range(n_layers):
        # Work in float64 so half-precision activations do not lose accuracy.
        neutral_layer = np.asarray(neutral_acts[:, layer, :], dtype=np.float64)
        traited_layer = np.asarray(traited_acts[:, layer, :], dtype=np.float64)

        neutral_norms = np.sqrt((neutral_layer * neutral_layer).sum(axis=1))
        traited_norms = np.sqrt((traited_layer * traited_layer).sum(axis=1))
        # A zero vector gets norm 1, i.e. similarity 0 / distance 1 instead of NaN.
        neutral_norms = np.where(neutral_norms == 0, 1.0, neutral_norms)
        traited_norms = np.where(traited_norms == 0, 1.0, traited_norms)

        # Row-wise (paired) cosine distance: only matching pairs are needed,
        # so the full n x n pairwise matrix is never built.
        similarity = (neutral_layer * traited_layer).sum(axis=1) / (neutral_norms * traited_norms)
        distances = np.clip(1.0 - similarity, 0.0, 2.0)
        layer_distances[layer] = np.mean(distances)

    # Change of the mean distance from one layer to the next.
    layer_deltas = {0: 0.0}
    for layer in range(1, n_layers):
        layer_deltas[layer] = layer_distances[layer] - layer_distances[layer - 1]

    # Layer with the largest increase.
    best_layer = max(layer_deltas, key=layer_deltas.get)

    return best_layer, layer_distances, layer_deltas

In [32]:
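# NOTE: disabled alternative implementation of select_best_layer (per-layer
# logistic-regression AUC). It returns 2 values instead of 3, so re-enabling it
# would break callers that unpack three results.
# from sklearn.linear_model import LogisticRegression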
# from sklearn.metrics import roc_auc_score
# import numpy as np
# 
# def select_best_layer(activations: np.ndarray, labels: np.ndarray):
# 
#     n_samples, n_layers, hidden_size = activations.shape
#     layer_aucs = {}
# 
#     # Перебір по кожному шару
#     for layer in range(n_layers):
#         # Формуємо матрицю X для поточного шару: (n_samples, hidden_size)
#         X_layer = activations[:, layer, :]
# 
#         # Навчання логістичної регресії
#         clf = LogisticRegression(max_iter=1000)
#         clf.fit(X_layer, labels)
# 
#         # Прогнозування й обчислення AUC
#         probs = clf.predict_proba(X_layer)[:, 1]
#         auc = roc_auc_score(labels, probs)
#         layer_aucs[layer] = auc
# 
#     # Вибір шару з максимальною AUC
#     best_layer = max(layer_aucs, key=layer_aucs.get)
#     return best_layer, layer_aucs

In [2]:
import json
import numpy as np
import torch
from tqdm import tqdm
from typing import Tuple

def _mean_pool_hidden_states(hidden_states, attention_mask) -> np.ndarray:
    """
    Mean-pool every layer's token states over the non-padding positions.

    Parameters:
        hidden_states: sequence of per-layer tensors, each indexed as
            (batch, seq_len, hidden_size)
        attention_mask: tensor (batch, seq_len), 1 for real tokens, 0 for padding

    Returns:
        float32 np.ndarray of shape (batch, n_layers, hidden_size)
    """
    pooled_layers = []
    for layer_states in hidden_states:
        # The mask follows each layer's device in case the model is sharded.
        mask = attention_mask.to(layer_states.device).unsqueeze(-1).to(torch.float32)
        # clamp avoids 0/0 -> NaN for a row that is entirely padding.
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        # Accumulate in float32: half-precision sums over many tokens can
        # overflow, and bfloat16 tensors cannot be converted to numpy directly.
        summed = (layer_states.to(torch.float32) * mask).sum(dim=1)
        pooled_layers.append((summed / token_counts).cpu())
    return torch.stack(pooled_layers, dim=1).numpy()


def collect_activations_batched(
    model,
    tokenizer,
    dataset_path: str,
    generation_args: dict,
    model_type: str = "llama",  # "llama", "mistral" or "flan_t5"
    batch_size: int = 8,
    max_samples: int = None
) -> Tuple[np.ndarray, np.ndarray, list]:
    """
    Collect mean-pooled activations, labels and generated responses for every
    prompt in the dataset.

    Each dataset entry contributes two prompts, in this order: the neutral one
    (label 0) and the traited one (label 1).

    Parameters:
        model: model called once with ``output_hidden_states=True`` and once
            through ``generate`` for every batch
        tokenizer: matching tokenizer; its ``pad_token`` is set to the EOS
            token when it is missing (the object is modified in place)
        dataset_path: path to a JSON file holding a list of samples with the
            keys "prompts", "trait", "alpaca_instruction", "alpaca_input" and
            "system_prompt"
        generation_args: keyword arguments forwarded to ``model.generate``;
            a "pad_token_id" entry here overrides the EOS default
        model_type: prompt format to read from ``sample["prompts"]``
            ("llama", "mistral" or "flan_t5")
        batch_size: number of prompts per forward/generate call
        max_samples: if given, only the first ``max_samples`` dataset entries
            are used

    Returns:
        all_activations: float32 np.ndarray, shape (n_prompts, n_layers, hidden_size);
            the embedding output is skipped. Shape (0, 0, 0) for an empty dataset.
        all_labels: np.ndarray, shape (n_prompts,)
        all_responses: list[dict] with the prompt, the generated text and
            sample metadata. For decoder-only models the prompt tokens are
            removed from the generated text.

    NOTE(review): encoder-decoder checkpoints (the "flan_t5" option) may not
    accept this plain forward call or expose ``outputs.hidden_states`` —
    confirm before relying on that path.
    """
    # Load the dataset.
    with open(dataset_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if max_samples is not None:
        data = data[:max_samples]

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Flatten the dataset into parallel lists of prompts, labels and samples.
    all_prompts = []
    all_prompt_labels = []
    all_prompt_samples = []  # source sample of every prompt
    for sample in tqdm(data, desc="Preparing prompts"):
        # Prompt format depends on the model family.
        prompts = sample["prompts"][model_type]
        all_prompts.extend([prompts["neutral"], prompts["traited"]])
        all_prompt_labels.extend([0, 1])  # 0 = neutral, 1 = traited
        all_prompt_samples.extend([sample, sample])

    if not all_prompts:
        # Nothing to process; np.concatenate would fail on an empty list.
        return np.empty((0, 0, 0), dtype=np.float32), np.array([], dtype=int), []

    # Merging lets the caller override pad_token_id without a duplicate-keyword error.
    generate_kwargs = {"pad_token_id": tokenizer.eos_token_id, **generation_args}
    is_encoder_decoder = bool(
        getattr(getattr(model, "config", None), "is_encoder_decoder", False)
    )

    all_activations = []
    all_labels = []
    all_responses = []

    for start in tqdm(range(0, len(all_prompts), batch_size), desc="Processing batches"):
        current_batch = all_prompts[start:start + batch_size]
        current_labels = all_prompt_labels[start:start + batch_size]
        current_samples = all_prompt_samples[start:start + batch_size]

        inputs = tokenizer(
            current_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=tokenizer.model_max_length,
            return_token_type_ids=False
        ).to(model.device)

        # Hidden states for the prompt, then the sampled continuation.
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
            generated_ids = model.generate(**inputs, **generate_kwargs)
            # [1:] skips the embedding output.
            batch_activations = _mean_pool_hidden_states(
                outputs.hidden_states[1:], inputs["attention_mask"]
            )
        del outputs  # release the per-token states before the next batch

        if not is_encoder_decoder:
            # Decoder-only generate() returns prompt + continuation; keep only
            # the newly generated tokens so "response" excludes the prompt.
            generated_ids = generated_ids[:, inputs["input_ids"].shape[1]:]

        responses = [
            {
                "model_type": model_type,
                "prompt": prompt,
                "response": tokenizer.decode(ids, skip_special_tokens=True),
                "label": label,
                "trait": sample["trait"],
                "instruction": sample["alpaca_instruction"],
                "input": sample["alpaca_input"],
                "is_neutral": label == 0,
                "system_prompt": sample["system_prompt"] if label == 1 else None
            }
            for prompt, ids, label, sample in zip(
                current_batch,
                generated_ids,
                current_labels,
                current_samples
            )
        ]

        # For FLAN-T5 drop the "Output:" prefix from the answer.
        if model_type == "flan_t5":
            for r in responses:
                r["response"] = r["response"].replace("Output:", "").strip()

        all_responses.extend(responses)
        all_activations.append(batch_activations)
        all_labels.extend(current_labels)

    # Merge all batches.
    all_activations = np.concatenate(all_activations, axis=0)
    all_labels = np.array(all_labels, dtype=int)

    return all_activations, all_labels, all_responses

In [3]:
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig
import torch

def load_llama8bit(model_name: str = "meta-llama/Llama-2-7b-hf"):
    """
    Load a bitsandbytes-quantized LLaMA checkpoint together with its tokenizer.

    NOTE: despite the function name, the quantization config built here
    requests 4-bit weights (``load_in_4bit=True``) with float16 compute.

    Parameters:
        model_name: model identifier passed to ``from_pretrained``

    Returns:
        model: LlamaForCausalLM loaded with output_hidden_states=True and
            switched to eval mode
        tokenizer: LlamaTokenizer with the EOS token reused as pad token and
            padding applied on the left
    """
    # Release cached GPU memory before loading a new model.
    torch.cuda.empty_cache()

    quant_kwargs = dict(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
    model = LlamaForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(**quant_kwargs),
        device_map="auto",
        output_hidden_states=True,
    )
    model.eval()

    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

# Load the model and tokenizer with the default checkpoint name.
model, tokenizer = load_llama8bit()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Sampling settings forwarded to model.generate for every prompt.
generation_args = {
    "max_new_tokens": 512,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9
}

# Collect activations, labels and generated responses for the whole dataset;
# model_type and max_samples keep their defaults ("llama", no limit).
# NOTE: in the recorded run this was interrupted after 6 of 2275 batches at
# roughly 156 s per batch — pass max_samples to try it on a subset first.
all_activations, all_labels, all_responses = collect_activations_batched(
    model, tokenizer, "trait_combined_dataset.json",
    generation_args, batch_size=8
)

Preparing prompts: 100%|██████████| 9100/9100 [00:00<00:00, 1187448.79it/s]
Processing batches:   0%|          | 6/2275 [15:37<98:26:15, 156.18s/it]


KeyboardInterrupt: 

In [30]:
# Sanity check: shapes of the collected activations and labels.
all_activations.shape, all_labels.shape

((2, 32, 4096), (2,))

In [42]:
# Rank the layers; select_best_layer returns (best layer, per-layer mean
# distances, per-layer deltas), in that order.
best_layer, layer_scores, layers_deltas = select_best_layer(all_activations, all_labels)

# Per-layer report: mean distance ("score") and its change vs. the previous layer.
print("\nLayers scores:")
for layer, score in layer_scores.items():
    print(f"Layer {layer}: score = {score:.4f}, delta = {layers_deltas[layer]:.4f}")

print(f"\nBest layer: {best_layer} with score = {layer_scores[best_layer]:.4f} delta = {layers_deltas[best_layer]:.4f}")


Layers scores:
Layer 0: score = 0.0153, delta = 0.0000
Layer 1: score = 0.0001, delta = -0.0152
Layer 2: score = 0.0002, delta = 0.0001
Layer 3: score = 0.0003, delta = 0.0001
Layer 4: score = 0.0005, delta = 0.0002
Layer 5: score = 0.0008, delta = 0.0002
Layer 6: score = 0.0011, delta = 0.0003
Layer 7: score = 0.0017, delta = 0.0006
Layer 8: score = 0.0021, delta = 0.0005
Layer 9: score = 0.0027, delta = 0.0006
Layer 10: score = 0.0033, delta = 0.0006
Layer 11: score = 0.0041, delta = 0.0009
Layer 12: score = 0.0050, delta = 0.0009
Layer 13: score = 0.0066, delta = 0.0016
Layer 14: score = 0.0080, delta = 0.0014
Layer 15: score = 0.0104, delta = 0.0023
Layer 16: score = 0.0160, delta = 0.0057
Layer 17: score = 0.0197, delta = 0.0037
Layer 18: score = 0.0268, delta = 0.0071
Layer 19: score = 0.0342, delta = 0.0074
Layer 20: score = 0.0464, delta = 0.0122
Layer 21: score = 0.0542, delta = 0.0078
Layer 22: score = 0.0681, delta = 0.0139
Layer 23: score = 0.0726, delta = 0.0045
Layer 24: