In this notebook I tried to speed up the `get_activations` function by tokenizing all texts once, before the loop starts. I don't think it actually made anything faster. Extracting the activations now also shows a progress bar (via tqdm).

In [None]:
!pip install torch
!pip install transformers
!pip install numpy pandas scikit-learn
!pip install datasets

In [None]:
!pip install opencv-python-headless

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import psutil, os
import h5py


In [None]:
# Pick the compute device: a CUDA GPU if present, otherwise Apple's MPS
# backend, otherwise the CPU.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

print(device)

In [None]:
# Load the train / validation / test splits.
# The three CSV files have to be put into the Colab folder beforehand.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data_train.csv", "data_val.csv", "data_test.csv")
)

# Peek at the first rows of the training split
print(train_df.head())

In [None]:
# Report the number of rows in each split, one line per split
print(
    f"Train instances: {train_df.shape[0]}",
    f"Val instances: {val_df.shape[0]}",
    f"Test instances: {test_df.shape[0]}",
    sep="\n",
)

In [None]:
# #get the activations for each layer (length: 13) and save them to a file
# def get_activations(encodings, out_file, batch_size=16, max_len=256, log_every=100):
#     """
#     Args:
#       encodings (dict): pre-tokenized inputs, tensors keyed by name (e.g. "input_ids")
#       out_file (str): file to save activations to
#     Returns:
#       str: path of the npz file with activations for each layer
#     """
#     print("Starting get_activations…")
#     process = psutil.Process(os.getpid())
#     n_layers = 13
#     all_layers = [[] for _ in range(n_layers)]

#     with tqdm(total=encodings["input_ids"].size(0), desc="Texts") as pbar:
#         for i in range(0, encodings["input_ids"].size(0), batch_size):
#             batch_inputs = {k: v[i:i+batch_size].to(device) for k, v in encodings.items()}
#             batch_size_actual = batch_inputs['input_ids'].size(0)
            
#             with torch.no_grad():
#                 outputs = model(**batch_inputs)
#                 hidden_states = outputs.hidden_states  #tuple of 13 [batch size, sequence length, hidden size 768]
    
#             #take [CLS] token (index 0) from each layer
#             for layer_idx, layer_hid in enumerate(hidden_states):
#                 all_layers[layer_idx].append(layer_hid[:, 0, :].cpu().numpy())    
    
#             # free up GPU memory
#             del outputs, hidden_states
#             torch.cuda.empty_cache()
    
#             pbar.update(batch_size_actual)

#             # monitoring RAM
#             if ( i // batch_size) % log_every == 0:
#                 mem_gb = process.memory_info().rss / 1e9
#                 pbar.set_postfix({"RAM (GB)": f"{mem_gb:.2f}"})

#         all_layers = [np.concatenate(layer_batches, axis=0) for layer_batches in all_layers]
#         np.savez_compressed(out_file, **{f"layer{idx}": arr for idx, arr in enumerate(all_layers)})
            
#     print(f"Saved activations to {out_file}")
#     return out_file

In [None]:
# Load the pretrained BERT checkpoint and its fast tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# output_hidden_states=True is passed so the model returns the hidden states
# of all layers, not only the last one
model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
model = model.to(device)

# Put the model into evaluation mode
model.eval()

In [None]:
# Sanity check before the long extraction run: is CUDA visible, how many GPUs
# does torch see, and on which device do the model parameters actually live?
print("CUDA available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())
print("Model device:", next(model.parameters()).device)

In [None]:
# tokenize(train_df["text"], tokenizer, "train_encodings.npz")

In [None]:
def get_activations_faster(texts, out_file, batch_size=16, max_len=256, log_every=100):
    """
    Extract the first-token ([CLS]) activation of every hidden-state layer and
    save them to a compressed .npz file.

    All texts are tokenized in one call before the batch loop, the per-batch
    results are collected in memory, and the file is written once at the end.
    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
      texts (iterable of str): texts to encode
      out_file (str): path of the .npz file to write
      batch_size (int): number of texts per forward pass
      max_len (int): texts are truncated to at most this many tokens
      log_every (int): refresh the RAM reading on the progress bar every this
        many batches; 0 or None disables the reading
    Returns:
      str: ``out_file``. The archive holds one array per layer under the keys
      "layer0", "layer1", ..., each with one row per text.
    Raises:
      ValueError: if ``texts`` is empty
    """
    # Materialize the input so any iterable (e.g. a pandas Series) is accepted
    # and len() below is well defined.
    texts = list(texts)
    if not texts:
        raise ValueError("texts is empty, there is nothing to encode")

    device = next(model.parameters()).device
    process = psutil.Process()
    print("Starting get_activations…", flush=True)

    # Tokenize once outside the loop.
    # NOTE: with padding=True every text is padded to the longest text of the
    # whole input (capped at max_len), so each batch carries that full width.
    encodings = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len
    )

    # One list of per-batch arrays per layer. It is sized from the first
    # forward pass instead of assuming a fixed number of layers.
    all_layers = []

    n_texts = len(texts)
    with tqdm(total=n_texts, desc="Texts") as pbar:
        for i in range(0, n_texts, batch_size):
            batch_inputs = {k: v[i:i+batch_size].to(device) for k, v in encodings.items()}
            batch_size_actual = batch_inputs['input_ids'].size(0)

            with torch.no_grad():
                # Request the hidden states explicitly so the function does not
                # depend on how the model was configured when it was loaded.
                outputs = model(**batch_inputs, output_hidden_states=True)
                hidden_states = outputs.hidden_states  # one tensor per layer

            if not all_layers:
                all_layers = [[] for _ in hidden_states]

            # Take CLS token (index 0) for each layer
            for layer_batches, layer_hid in zip(all_layers, hidden_states):
                layer_batches.append(layer_hid[:, 0, :].cpu().numpy())

            # Free GPU memory (only meaningful when running on CUDA)
            del outputs, hidden_states
            if device.type == "cuda":
                torch.cuda.empty_cache()

            # batch_size_actual is already an int: the last batch may be
            # smaller than batch_size, so advance by the real count.
            pbar.update(batch_size_actual)

            if log_every and (i // batch_size) % log_every == 0:
                mem_gb = process.memory_info().rss / 1e9
                pbar.set_postfix({"RAM (GB)": f"{mem_gb:.2f}"})

    # Concatenate all batches per layer
    all_layers = [np.concatenate(layer_batches, axis=0) for layer_batches in all_layers]

    # Save all layers at once
    np.savez_compressed(out_file, **{f"layer{idx}": arr for idx, arr in enumerate(all_layers)})
    print(f"Saved activations to {out_file}")
    return out_file

In [None]:
# Build a plain Python list of strings from the text column:
# missing values become "" and everything else is cast to str.
train_texts = list(
    train_df["text"]
    .fillna("")
    .astype(str)
)

In [None]:
# Extract the per-layer activations for the training texts and write them to disk
get_activations_faster(texts=train_texts, out_file="train_layers.npz")