In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, time, json
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm
import spacy

import sys

sys.path.append("../../")
import os

import logging
from src.utils import logging_utils
from src.utils import env_utils
from src import functional
from datasets import load_dataset

logger = logging.getLogger(__name__)

logging.basicConfig(
    level=logging.DEBUG,
    format=logging_utils.DEFAULT_FORMAT,
    datefmt=logging_utils.DEFAULT_DATEFMT,
    stream=sys.stdout,
)

import torch
import transformers

logger.info(f"{torch.__version__=}, {torch.version.cuda=}")
logger.info(
    f"{torch.cuda.is_available()=}, {torch.cuda.device_count()=}, {torch.cuda.get_device_name()=}"
)
logger.info(f"{transformers.__version__=}")

  from .autonotebook import tqdm as notebook_tqdm


2024-08-15 13:11:54 __main__ INFO     torch.__version__='2.3.1', torch.version.cuda='12.1'
2024-08-15 13:11:54 __main__ INFO     torch.cuda.is_available()=True, torch.cuda.device_count()=1, torch.cuda.get_device_name()='NVIDIA RTX A6000'
2024-08-15 13:11:54 __main__ INFO     transformers.__version__='4.43.3'


In [3]:
import torch

from nnsight import LanguageModel
from src.models import ModelandTokenizer

# model_key = "meta-llama/Meta-Llama-3-8B"
model_key = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_key = "google/gemma-2-9b-it"
# model_key = "google/gemma-2-27b-it"
# model_key = "Qwen/Qwen2-7B"

mt = ModelandTokenizer(
    model_key=model_key,
    torch_dtype=torch.float16,
)

2024-08-15 13:11:54 accelerate.utils.modeling INFO     We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.35s/it]

2024-08-15 13:12:00 src.models INFO     loaded model </home/local_arnab/Codes/00_MODEL/meta-llama/Meta-Llama-3-8B-Instruct> | size: 15316.516 MB | dtype: torch.float16 | device: cuda:0





In [4]:
ds = load_dataset("Salesforce/wikitext", "wikitext-103-v1")

2024-08-15 13:12:00 urllib3.connectionpool DEBUG    Starting new HTTPS connection (1): huggingface.co:443
2024-08-15 13:12:00 urllib3.connectionpool DEBUG    https://huggingface.co:443 "GET /api/datasets/Salesforce/wikitext HTTP/11" 200 4982
2024-08-15 13:12:00 urllib3.connectionpool DEBUG    Starting new HTTPS connection (1): s3.amazonaws.com:443
2024-08-15 13:12:00 urllib3.connectionpool DEBUG    https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/datasets/Salesforce/wikitext/Salesforce/wikitext.py HTTP/11" 404 0
2024-08-15 13:12:00 urllib3.connectionpool DEBUG    https://huggingface.co:443 "GET /api/datasets/Salesforce/wikitext HTTP/11" 200 4982
2024-08-15 13:12:00 urllib3.connectionpool DEBUG    Starting new HTTPS connection (1): huggingface.co:443
2024-08-15 13:12:00 urllib3.connectionpool DEBUG    https://huggingface.co:443 "HEAD /datasets/Salesforce/wikitext/resolve/b08601e04326c79dfdd32d625aee71d232d685c3/README.md HTTP/11" 200 0
2024-08-15 13:12:00 urllib3.con

In [5]:
import numpy as np
from src.utils import experiment_utils

experiment_utils.set_seed(321)

num_docs = 300
doc_indices = np.random.choice(len(ds["train"]), num_docs, replace=False)

2024-08-15 13:12:01 src.utils.experiment_utils INFO     setting all seeds to 321


In [6]:
cache_dir = os.path.join(
    env_utils.DEFAULT_RESULTS_DIR,
    "cache_states",
)

os.makedirs(cache_dir, exist_ok=True)

from src.models import prepare_input
from src.functional import get_module_nnsight, free_gpu_cache

limit = 1000
for doc_index in tqdm(doc_indices):
    doc = ds["train"][int(doc_index)]["text"]
    inputs = prepare_input(prompts=doc, tokenizer=mt)
    if inputs["input_ids"].shape[1] > limit:
        inputs["input_ids"] = inputs["input_ids"][:, :limit]
        inputs["attention_mask"] = inputs["attention_mask"][:, :limit]

    doc_cache: dict[int, torch.Tensor] = {}

    with mt.trace(inputs, scan=False, validate=False) as trace:
        for layer in mt.layer_names:
            module = get_module_nnsight(mt, layer)
            doc_cache[layer] = module.output[0].save()

    for layer in mt.layer_names:
        doc_cache[layer] = doc_cache[layer].detach().cpu().numpy().astype(np.float32)

    cache_path = os.path.join(cache_dir, f"{doc_index}")
    np.savez_compressed(cache_path, **doc_cache)

    free_gpu_cache()

  0%|          | 0/300 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 300/300 [19:19<00:00,  3.86s/it]


In [7]:
# file = np.load(cache_path + ".npz")

In [8]:
# file["model.layers.0"]