In [None]:
!pip install -U bitsandbytes

In [None]:
!rm -rf /content/lm-evaluation-harness
!git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
!cd /content/lm-evaluation-harness && pip install -e .

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# Baseline: load facebook/opt-350m without any quantization config.
# Precision and device placement are both left to transformers ("auto").
model_16bit = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",
    dtype="auto",
    device_map="auto",
)
# Cell output: dtype of the final layer-norm weight in the last decoder layer.
model_16bit.model.decoder.layers[-1].final_layer_norm.weight.dtype

In [None]:
def print_parameters(model):
    """Print one line per parameter of ``model`` showing its name and dtype.

    ``model`` only needs a ``named_parameters()`` method yielding
    ``(name, parameter)`` pairs whose parameter exposes ``.dtype``.
    """
    for param_name, param in model.named_parameters():
        print("Paramters name:", param_name, " | ", "dtype: ", param.dtype)

In [None]:
#Conversion to 8bit using LLM.int8() method

In [None]:
# Request 8-bit loading through bitsandbytes, then load the same checkpoint
# as the baseline with that quantization config attached.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model_8bit = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",
    quantization_config=quantization_config,
    dtype="auto",
    device_map="auto",
)

In [None]:
#Eval fp16 model

In [None]:
# Baseline: run the lm_eval CLI on the hellaswag task for the original
# facebook/opt-350m checkpoint, on cuda:0 with automatic batch sizing.
!lm_eval --model hf \
    --model_args pretrained=facebook/opt-350m \
    --tasks hellaswag \
    --device cuda:0 \
    --batch_size auto

2025-10-12 09:52:03.539895: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760262723.597926    1290 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760262723.607841    1290 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1760262723.647697    1290 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760262723.647730    1290 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760262723.647735    1290 computation_placer.cc:177] computation placer alr

In [None]:
from transformers import AutoTokenizer

# Load the unmodified OPT-350m tokenizer and upload it to the Hub repository
# that later receives the partially-fp16 8-bit model.
# NOTE(review): this push runs before the notebook_login() cell further down;
# on a fresh kernel it presumably needs credentials to be available already.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
tokenizer.push_to_hub("YuvrajSingh9886/facebook-opt-350m-8bit-llm.int8-llmhead-fp162")

In [None]:
#eval 8bit model

In [None]:
# Run the lm_eval CLI on hellaswag for the 8-bit checkpoint hosted on the Hub.
# NOTE(review): no cell in this notebook pushes a model or tokenizer to
# "YuvrajSingh9886/facebook-opt-350m-8bit-llm.int8"; it presumably has to
# exist on the Hub already for this cell to run.
!lm_eval --model hf \
    --model_args pretrained=YuvrajSingh9886/facebook-opt-350m-8bit-llm.int8 \
    --tasks hellaswag \
    --device cuda:0 \
    --batch_size auto

In [None]:
#Skipping int8 for the output projection (decoder.project_out): keep it in fp16

In [None]:
import torch
# Swap decoder.project_out for a plain fp16 nn.Linear holding the same weights.
old_layer = model_8bit.model.decoder.project_out
old_weight = old_layer.weight

if old_weight.dtype == torch.int8:
    # An int8-quantized weight stores integer codes plus one scale per output
    # row (SCB). Casting the codes alone to fp16 would give values in
    # [-127, 127] instead of the real weights, so rescale them first:
    #     W ~= codes * SCB / 127
    scales = getattr(old_weight, "SCB", None)
    if scales is None:
        # Some bitsandbytes versions keep the scales on the layer's state
        # object instead of on the parameter (e.g. after a forward pass).
        scales = getattr(getattr(old_layer, "state", None), "SCB", None)
    if scales is None:
        raise RuntimeError(
            "project_out holds int8 weights but no SCB scales were found; "
            "cannot dequantize it to fp16"
        )
    fp16_weight = (
        old_weight.data.float() * scales.float().view(-1, 1) / 127.0
    ).to(torch.float16)
else:
    # Already a floating-point layer (e.g. this cell was re-run): plain cast.
    fp16_weight = old_weight.data.clone().to(torch.float16)

# Create a new fp16 Linear layer with the same shape, on the weight's device
new_layer = torch.nn.Linear(
    old_layer.in_features,
    old_layer.out_features,
    bias=old_layer.bias is not None,
    device=fp16_weight.device,
    dtype=torch.float16,
)

# Copy weights (dequantized above) and, when present, the bias
new_layer.weight.data = fp16_weight
print(new_layer.weight.data.dtype)
if old_layer.bias is not None:
    new_layer.bias.data = old_layer.bias.data.clone().to(torch.float16)

# Replace in model
model_8bit.model.decoder.project_out = new_layer

# model_8bit.model.decoder.project_out.weight.data = model_8bit.model.decoder.project_out.weight.data.to(torch.float16).clone()


In [None]:
# quantization_config = BitsAndBytesConfig(load_in_8bit=True)
# model_8bit.model.decoder.project_out = model_8bit.model.decoder.project_out.clone().to(torch.float16)  # won't work because the Linear8bitLt layer does not allow the dtype conversion to succeed

In [None]:
# List every parameter's dtype to check which layers are int8 and which are fp16.
print_parameters(model_8bit)

In [None]:
!pip install huggingface_hub -q

### Checking the memory before and after quantization


In [None]:
# Footprint of the unquantized baseline, divided by 1e9
# (i.e. GB, assuming get_memory_footprint() reports bytes — TODO confirm).
mem_16bit = model_16bit.get_memory_footprint() / 1e9

In [None]:
# Footprint of the 8-bit model, divided by 1e9 (same unit as mem_16bit).
# NOTE: at this point decoder.project_out has already been swapped for an
# fp16 nn.Linear by the earlier cell, so that layer is counted at fp16 size.
mem_8bit = model_8bit.get_memory_footprint() / 1e9

In [None]:
#Memory footprint ratio: 8-bit model as a percentage of the 16-bit model

In [None]:
# Size of the 8-bit model expressed as a percentage of the 16-bit model.
# This is the fraction that remains, not the amount saved (saved = 100 - this).
(mem_8bit / mem_16bit)*100

### Uploading to the Hugging Face Hub


In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login, needed before the push_to_hub call below.
# NOTE(review): tokenizer.push_to_hub(...) appears in an earlier cell, i.e.
# before this login when the notebook is run top to bottom.
notebook_login()

In [None]:
# Use the fully qualified repo id so the weights land in the same repository
# the tokenizer was pushed to and that the lm_eval cell below loads from.
model_8bit.push_to_hub("YuvrajSingh9886/facebook-opt-350m-8bit-llm.int8-llmhead-fp162")

### Evals (contd)

In [None]:
#Evaluate the model! (8-bit, with decoder.project_out kept in fp16)

In [None]:
# Run the lm_eval CLI on hellaswag for the checkpoint pushed above
# (8-bit model whose decoder.project_out was replaced by an fp16 layer).
!lm_eval --model hf \
    --model_args pretrained=YuvrajSingh9886/facebook-opt-350m-8bit-llm.int8-llmhead-fp162 \
    --tasks hellaswag \
    --device cuda:0 \
    --batch_size auto

In [None]:
# Save this script as: analyze_weights.py
import torch
from transformers import AutoModelForCausalLM
import matplotlib.pyplot as plt
import numpy as np

# --- Configuration ---
MODEL_ID = "facebook/opt-350m"  # Hub model id analysed when this cell/script runs
DEVICE = "cpu"  # device the loaded model is moved to
DEFAULT_THRESHOLD = 6.0  # x-position of the red dashed reference line on each histogram

def analyze_model_weights(model_id):
    """
    Load a causal LM and save a grid of per-layer weight plots.

    For every ``torch.nn.Linear`` module in the model, one grid row is drawn:
    a log-scale histogram of the weight values (with a dashed line at
    ``DEFAULT_THRESHOLD``) on the left and a heatmap of the weight matrix on
    the right. The grid is written to
    ``<model_id with '/' replaced by '_'>_WEIGHT_analysis.png``.

    Args:
        model_id: Model identifier passed to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        None. Progress is printed; nothing is plotted or saved when the model
        contains no linear layers.
    """
    print(f"--- Starting Part 1: Weight Analysis for {model_id} ---")
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(DEVICE)
    print("Model loaded.")

    linear_layers = [
        (name, module)
        for name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
    ]
    num_layers = len(linear_layers)
    print(f"Found {num_layers} linear layers.")
    if num_layers == 0:
        # plt.subplots cannot build a grid with zero rows.
        print("No linear layers to plot; skipping figure generation.")
        return

    # squeeze=False keeps `axes` two-dimensional even for a single layer, so
    # the axes[i, col] indexing below works for any layer count.
    # NOTE(review): the figure is 5 inches tall per layer; at dpi=150 a model
    # with many linear layers may exceed the backend's maximum image size.
    fig, axes = plt.subplots(num_layers, 2, figsize=(15, num_layers * 5), squeeze=False)
    fig.suptitle(f'Static Weight Analysis for {model_id}', fontsize=20, y=1.0)

    for i, (name, layer) in enumerate(linear_layers):
        weights = layer.weight.data.cpu().numpy()

        # Plot Histogram
        ax_hist = axes[i, 0]
        ax_hist.hist(weights.flatten(), bins=500, log=True, color='darkblue')
        ax_hist.axvline(x=DEFAULT_THRESHOLD, color='r', linestyle='--', label=f'Threshold = {DEFAULT_THRESHOLD}')
        ax_hist.set_title(f"{name}\nHistogram of Weights")
        ax_hist.set_ylabel("Frequency (Log)")
        ax_hist.legend()

        # Plot Heatmap
        ax_heatmap = axes[i, 1]
        ax_heatmap.imshow(weights, aspect='auto', cmap='viridis')
        ax_heatmap.set_title(f"{name}\nHeatmap of Weights")

    # Leave a thin strip at the top for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.99])
    output_filename = f"{model_id.replace('/', '_')}_WEIGHT_analysis.png"
    fig.savefig(output_filename, dpi=150)
    print(f"\nWeight analysis complete. Grid saved to: {output_filename}")

if __name__ == '__main__':
    analyze_model_weights(MODEL_ID)

In [None]:
# Save this script as: analyze_activations.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

# --- Configuration ---
MODEL_ID = "facebook/opt-350m"  # Hub id used for both the model and its tokenizer
DATASET_ID = "wikitext"  # dataset name passed to load_dataset
DATASET_CONFIG = "wikitext-2-raw-v1"  # dataset configuration passed to load_dataset
NUM_SAMPLES = 50  # Number of dataset rows visited; rows with empty text are skipped
MAX_LENGTH = 512  # Sequence length (tokenizer truncation limit)
DEVICE = "cpu"  # device the model and tokenized inputs are moved to
DEFAULT_THRESHOLD = 6.0  # x-position of the red dashed reference line on each histogram

# Filled by the hooks below: layer name -> most recently captured input tensor
activation_storage = {}

def get_hook(name):
    """
    Build a forward hook that records a module's input under ``name``.

    The returned callable takes the first element of the module's input tuple
    (the hidden-state tensor feeding the layer), detaches it, and stores it in
    ``activation_storage[name]``. Each invocation replaces the previous entry
    for that name.
    """
    def _capture(module, inputs, output):
        # Only the first positional input is kept; the output is ignored.
        first_input = inputs[0]
        activation_storage[name] = first_input.detach()
    return _capture

def analyze_model_activations(model_id):
    """
    Capture and plot the input activations of every linear layer of a model.

    Forward hooks (see ``get_hook``) are registered on each ``torch.nn.Linear``
    module, up to ``NUM_SAMPLES`` rows of the configured dataset are run
    through the model, and one grid row per layer is drawn: a log-scale
    histogram of the captured activation values on the left and a heatmap of
    their absolute values on the right. Because each hook call overwrites the
    previous capture, both plots show the last sample that was processed.
    The grid is written to
    ``<model_id with '/' replaced by '_'>_ACTIVATION_analysis.png``.

    Args:
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        None. Progress is printed; nothing is plotted or saved when the model
        contains no linear layers.
    """
    print(f"--- Starting Part 2: Activation Analysis for {model_id} ---")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(DEVICE)
    print("Model and tokenizer loaded.")

    # Drop captures left over from an earlier call so stale tensors can never
    # be plotted for this model.
    activation_storage.clear()

    # --- 1. Register hooks on all linear layers ---
    linear_layers = []
    hook_handles = []
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            linear_layers.append((name, module))
            hook_handles.append(module.register_forward_hook(get_hook(name)))
    num_layers = len(linear_layers)
    print(f"Registered forward hooks on {num_layers} linear layers.")
    if num_layers == 0:
        # plt.subplots cannot build a grid with zero rows.
        print("No linear layers to analyze; skipping figure generation.")
        return

    # --- 2. Load data and run a forward pass to trigger hooks ---
    dataset = load_dataset(DATASET_ID, DATASET_CONFIG, split="test")
    model.eval() # Set model to evaluation mode
    try:
        with torch.no_grad():
            # Never index past the end of the dataset.
            for i in tqdm(range(min(NUM_SAMPLES, len(dataset))), desc="Running forward passes"):
                text = dataset[i]['text']
                if not text:
                    continue
                inputs = tokenizer(text, return_tensors="pt", max_length=MAX_LENGTH, truncation=True).to(DEVICE)
                model(**inputs) # This forward pass triggers the hooks
    finally:
        # Detach the hooks once capturing is done (or has failed).
        for handle in hook_handles:
            handle.remove()
    print("Forward passes complete. Activations have been captured.")

    # --- 3. Create the plot grid ---
    # squeeze=False keeps `axes` two-dimensional even for a single layer, so
    # the axes[i, col] indexing below works for any layer count.
    fig, axes = plt.subplots(num_layers, 2, figsize=(15, num_layers * 5), squeeze=False)
    fig.suptitle(f'Dynamic Activation Analysis for {model_id}', fontsize=20, y=1.0)

    for i, (name, _) in enumerate(linear_layers):
        if name not in activation_storage:
            print(f"Warning: No activation captured for layer {name}")
            continue

        # The hook overwrites its entry on every forward pass, so this is the
        # capture from the last processed sample only.
        activations = activation_storage[name]
        # Signed values, flattened; reshape (unlike view) also accepts
        # non-contiguous tensors.
        flat_activations = activations.reshape(-1).cpu().numpy()

        # Plot Histogram
        ax_hist = axes[i, 0]
        ax_hist.hist(flat_activations, bins=500, log=True, color='green')
        ax_hist.axvline(x=DEFAULT_THRESHOLD, color='r', linestyle='--', label=f'Threshold = {DEFAULT_THRESHOLD}')
        ax_hist.set_title(f"{name}\nHistogram of Activations")
        ax_hist.set_ylabel("Frequency (Log)")
        ax_hist.legend()

        # Plot Heatmap
        # We visualize the absolute activations from the last sample
        last_sample_activations = torch.abs(activations).cpu().numpy()
        # Collapse every leading dimension so rows are token positions and
        # columns are features.
        last_sample_activations_2d = last_sample_activations.reshape(-1, last_sample_activations.shape[-1])
        ax_heatmap = axes[i, 1]
        ax_heatmap.imshow(last_sample_activations_2d, aspect='auto', cmap='plasma')
        ax_heatmap.set_title(f"{name}\nHeatmap of Activations (Last Sample)")
        ax_heatmap.set_xlabel("Feature Dimension")
        ax_heatmap.set_ylabel("Token Position (Flattened)")

    # Leave a thin strip at the top for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.99])
    output_filename = f"{model_id.replace('/', '_')}_ACTIVATION_analysis.png"
    fig.savefig(output_filename, dpi=150)
    print(f"\nActivation analysis complete. Grid saved to: {output_filename}")


if __name__ == '__main__':
    analyze_model_activations(MODEL_ID)

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_id = "bigscience/bloom-1b7"

# load_in_8bit=True is what selects the LLM.int8() path; without it the
# llm_int8_* options below do not describe how the model is actually loaded
# and `model_8bit` would not be an 8-bit model.
# NOTE(review): a threshold of 0.0 presumably disables the outlier
# decomposition altogether — confirm this is the intended experiment.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=0.0,
    llm_int8_enable_fp32_cpu_offload=True,
)

# NOTE: this rebinds `model_8bit` and `quantization_config`, which earlier
# cells used for the OPT-350m model.
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype="auto",
    device_map="auto",
    quantization_config=quantization_config,
)