In [None]:
from huggingface_hub import login

import os
from getpass import getpass

# Never commit access tokens to a notebook. The token is read from the HF_TOKEN
# environment variable, falling back to an interactive prompt when it is unset.
# NOTE(review): the token that used to be hardcoded on this line is exposed in
# the notebook history and should be revoked.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "), add_to_git_credential=True)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import matplotlib.pyplot as plt

def analyze_layers_in_range(model, lower_bound, upper_bound):
    """Classify every trainable parameter tensor by whether it fits a value range.

    Args:
        model: Object exposing ``named_parameters()`` yielding ``(name, tensor)``.
        lower_bound: Inclusive lower limit for parameter values.
        upper_bound: Inclusive upper limit for parameter values.

    Returns:
        Tuple ``(layers_in_range, layers_out_of_range, all_layers)`` where the
        first two are lists of parameter names and ``all_layers`` is a list of
        ``(name, in_range)`` pairs in iteration order. Parameters with
        ``requires_grad == False`` are skipped entirely.
    """
    all_layers = []

    progress = tqdm(model.named_parameters(), desc="Analyzing parameters")
    for name, param in progress:
        if not param.requires_grad:
            continue

        # Work on a detached float32 CPU copy so the comparison is independent
        # of the parameter's device and storage dtype.
        values = param.detach().cpu().float()
        within = (values >= lower_bound).logical_and(values <= upper_bound)
        all_layers.append((name, within.all().item()))

    layers_in_range = [name for name, ok in all_layers if ok]
    layers_out_of_range = [name for name, ok in all_layers if not ok]
    return layers_in_range, layers_out_of_range, all_layers

def print_layer_inlier_outlier(all_layers):
    """Print one status line per layer, preserving the given order.

    Args:
        all_layers: Iterable of ``(layer_name, in_range)`` pairs.
    """
    verdicts = {
        True: "Inlier (All parameters within range)",
        False: "Outlier (Some parameters out of range)",
    }

    print("\nLayer Status Report:")
    print("=" * 40)
    for layer_name, in_range in all_layers:
        print(f"{layer_name}: {verdicts[bool(in_range)]}")

def plot_layer_status(all_layers, output_path):
    """Save a bar chart with one unit-height bar per layer (green = in range, red = not).

    Args:
        all_layers: Sequence of ``(layer_name, in_range)`` pairs.
        output_path: File path the figure is written to.
    """
    n_layers = len(all_layers)
    positions = range(n_layers)
    bar_colors = ['green' if in_range else 'red' for _, in_range in all_layers]

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.bar(positions, [1] * n_layers, color=bar_colors, width=0.8)

    # Label only every 10th layer index to keep the axis readable.
    tick_positions = positions[::10]
    plt.xticks(tick_positions, list(tick_positions), rotation=90)

    ax.set_xlabel('Layer Index')
    ax.set_ylabel('Status')
    ax.set_title('Layer Parameter Status')

    # Hide the bottom/left spines and all tick marks.
    for side in ('bottom', 'left'):
        ax.spines[side].set_visible(False)
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')

    fig.tight_layout()
    fig.savefig(output_path, bbox_inches='tight')
    plt.close(fig)  # Release the figure's memory

# Load model and tokenizer.
# The model is loaded in float16 and moved to the GPU when one is available.
# The tokenizer is loaded here but not used by the analysis below.
model_name = "stabilityai/japanese-stablelm-base-beta-7b"  # Update with the correct model name
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(device)

# Inclusive value range every parameter of a layer must satisfy to count as an inlier
lower_bound, upper_bound = -1, 1

layers_in_range, layers_out_of_range, all_layers = analyze_layers_in_range(model, lower_bound, upper_bound)

print(f"Layers with all parameters in range: {len(layers_in_range)}")
print(f"Layers with any parameters out of range: {len(layers_out_of_range)}")

# Print the layer status report in the order encountered
print_layer_inlier_outlier(all_layers)

# Define the output path (relative to the notebook's working directory)
output_path = "layer_status_plot_stablelm.png"  # Path to save the plot

# Plot the layer status bar graph and save it
plot_layer_status(all_layers, output_path)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import matplotlib.pyplot as plt

def analyze_layers_in_range(model, lower_bound, upper_bound):
    """Classify every trainable parameter tensor by whether it fits a value range.

    Args:
        model: Object exposing ``named_parameters()`` yielding ``(name, tensor)``.
        lower_bound: Inclusive lower limit for parameter values.
        upper_bound: Inclusive upper limit for parameter values.

    Returns:
        Tuple ``(layers_in_range, layers_out_of_range, all_layers)`` where the
        first two are lists of parameter names and ``all_layers`` is a list of
        ``(name, in_range)`` pairs in iteration order. Parameters with
        ``requires_grad == False`` are skipped entirely.
    """
    all_layers = []

    progress = tqdm(model.named_parameters(), desc="Analyzing parameters")
    for name, param in progress:
        if not param.requires_grad:
            continue

        # Work on a detached float32 CPU copy so the comparison is independent
        # of the parameter's device and storage dtype.
        values = param.detach().cpu().float()
        within = (values >= lower_bound).logical_and(values <= upper_bound)
        all_layers.append((name, within.all().item()))

    layers_in_range = [name for name, ok in all_layers if ok]
    layers_out_of_range = [name for name, ok in all_layers if not ok]
    return layers_in_range, layers_out_of_range, all_layers

def print_layer_inlier_outlier(all_layers):
    """Print one status line per layer, preserving the given order.

    Args:
        all_layers: Iterable of ``(layer_name, in_range)`` pairs.
    """
    verdicts = {
        True: "Inlier (All parameters within range)",
        False: "Outlier (Some parameters out of range)",
    }

    print("\nLayer Status Report:")
    print("=" * 40)
    for layer_name, in_range in all_layers:
        print(f"{layer_name}: {verdicts[bool(in_range)]}")

def plot_layer_status(all_layers, output_path):
    """Save a bar chart with one unit-height bar per layer (green = in range, red = not).

    Args:
        all_layers: Sequence of ``(layer_name, in_range)`` pairs.
        output_path: File path the figure is written to.
    """
    n_layers = len(all_layers)
    positions = range(n_layers)
    bar_colors = ['green' if in_range else 'red' for _, in_range in all_layers]

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.bar(positions, [1] * n_layers, color=bar_colors, width=0.8)

    # Label only every 10th layer index to keep the axis readable.
    tick_positions = positions[::10]
    plt.xticks(tick_positions, list(tick_positions), rotation=90)

    ax.set_xlabel('Layer Index')
    ax.set_ylabel('Status')
    ax.set_title('Layer Parameter Status')

    # Hide the bottom/left spines and all tick marks.
    for side in ('bottom', 'left'):
        ax.spines[side].set_visible(False)
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')

    fig.tight_layout()
    fig.savefig(output_path, bbox_inches='tight')
    plt.close(fig)  # Release the figure's memory

# Load model and tokenizer.
# NOTE(review): this checkpoint may need `trust_remote_code=True` and/or a
# different Auto class to load — confirm against the model card.
model_name = "openbmb/MiniCPM-V-2"  # Update with the correct model name
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(device)

# Inclusive value range every parameter of a layer must satisfy to count as an inlier
lower_bound, upper_bound = -1, 1

layers_in_range, layers_out_of_range, all_layers = analyze_layers_in_range(model, lower_bound, upper_bound)

print(f"Layers with all parameters in range: {len(layers_in_range)}")
print(f"Layers with any parameters out of range: {len(layers_out_of_range)}")

# Print the layer status report in the order encountered
print_layer_inlier_outlier(all_layers)

# Use a model-specific file name so this plot does not overwrite the one
# written for a different model elsewhere in the notebook.
output_path = "layer_status_plot_minicpm_v2.png"  # Path to save the plot

# Plot the layer status bar graph and save it
plot_layer_status(all_layers, output_path)


In [None]:
!pip install opencv-python ultralytics

In [None]:
!git clone https://github.com/WongKinYiu/yolov7.git
!pip install -r yolov7/requirements.txt

In [None]:
!wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-tiny.pt -P yolov7
!wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt -P yolov7
!wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7x.pt -P yolov7
!wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-w6.pt -P yolov7
!wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-e6.pt -P yolov7
!wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-d6.pt -P yolov7
!wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-e6e.pt -P yolov7

## VERIFY IF MODEL IS QUANTIZED FIRST

In [None]:
import torch

SF16_MAX = 0.999969482421875
SF16_MIN = -0.999969482421875
SF8_MAX = 0.9921875
SF8_MIN = -0.9921875

def is_in_sf16_range(tensor):
    """Return a 0-dim bool tensor: True when every element is in [SF16_MIN, SF16_MAX]."""
    at_least_min = tensor >= SF16_MIN
    at_most_max = tensor <= SF16_MAX
    return (at_least_min & at_most_max).all()

def quantize_to_sf16(tensor):
    """Round a tensor onto the SF16 grid (sign bit + 15-bit fractional magnitude).

    Representable values are ``k / 2**15`` for integer ``k`` in
    ``[-(2**15 - 1), 2**15 - 1]``, so zero and every multiple of ``2**-15`` up
    to +/-0.999969482421875 are preserved exactly. Values outside that range
    saturate to the nearest end of the grid.

    Args:
        tensor: Floating-point tensor to quantize.

    Returns:
        A new tensor of the same shape and dtype holding the quantized values.
    """
    scale = 1 << 15        # number of magnitude steps per unit
    max_code = scale - 1   # max_code / scale == 0.999969482421875

    # Convert to integer codes, saturate, and map back to real values.
    codes = torch.round(tensor * scale)
    codes = torch.clamp(codes, -max_code, max_code)
    return codes / scale

def apply_selective_sf16_quantization(model):
    """Quantize in place every parameter tensor that already fits the SF16 range.

    Tensors with any value outside the range are left untouched and are
    counted at 32 bits per element in the memory estimate; quantized tensors
    are counted at 16 bits per element.

    Returns:
        Tuple ``(total_params, quantized_params, memory_usage_mb)``.
    """
    bytes_sf16 = 16 / 8
    bytes_fp32 = 32 / 8

    total_params = 0
    quantized_params = 0
    memory_usage = 0

    for _, param in model.named_parameters():
        count = param.numel()
        total_params += count

        if is_in_sf16_range(param.data):
            param.data = quantize_to_sf16(param.data)
            quantized_params += count
            memory_usage += count * bytes_sf16
        else:
            memory_usage += count * bytes_fp32

    return total_params, quantized_params, memory_usage / (1024 * 1024)

def verify_selective_sf16_quantization(model):
    """Print, per parameter tensor, whether it looks SF16-quantized.

    A tensor inside the SF16 range is reported as quantized unless it holds
    more than 65536 distinct values; tensors outside the range are reported as
    not quantized.
    """
    for name, param in model.named_parameters():
        if not is_in_sf16_range(param.data):
            print(f"Layer {name} is not quantized (outside SF16 range)")
            continue

        distinct = torch.unique(param).numel()
        if distinct > 65536:
            print(f"Warning: Quantized parameter {name} has more than 65536 unique values")
        else:
            print(f"Layer {name} is correctly quantized to SF16")
            
def is_in_sf8_range(tensor):
    """Return a 0-dim bool tensor: True when every element is in [SF8_MIN, SF8_MAX]."""
    at_least_min = tensor >= SF8_MIN
    at_most_max = tensor <= SF8_MAX
    return (at_least_min & at_most_max).all()

def quantize_to_sf8(tensor):
    """Round a tensor onto the SF8 grid (sign bit + 7-bit fractional magnitude).

    Representable values are ``k / 2**7`` for integer ``k`` in
    ``[-(2**7 - 1), 2**7 - 1]``, so zero and every multiple of ``2**-7`` up to
    +/-0.9921875 are preserved exactly. Values outside that range saturate to
    the nearest end of the grid.

    Args:
        tensor: Floating-point tensor to quantize.

    Returns:
        A new tensor of the same shape and dtype holding the quantized values.
    """
    scale = 1 << 7         # number of magnitude steps per unit
    max_code = scale - 1   # max_code / scale == 0.9921875

    # Convert to integer codes, saturate, and map back to real values.
    codes = torch.round(tensor * scale)
    codes = torch.clamp(codes, -max_code, max_code)
    return codes / scale

def apply_selective_sf8_quantization(model):
    """Quantize in place every parameter tensor that already fits the SF8 range.

    Tensors with any value outside the range are left untouched and are
    counted at 32 bits per element in the memory estimate; quantized tensors
    are counted at 8 bits per element.

    Returns:
        Tuple ``(total_params, quantized_params, memory_usage_mb)``.
    """
    bytes_sf8 = 8 / 8
    bytes_fp32 = 32 / 8

    total_params = 0
    quantized_params = 0
    memory_usage = 0

    for _, param in model.named_parameters():
        count = param.numel()
        total_params += count

        if is_in_sf8_range(param.data):
            param.data = quantize_to_sf8(param.data)
            quantized_params += count
            memory_usage += count * bytes_sf8
        else:
            memory_usage += count * bytes_fp32

    return total_params, quantized_params, memory_usage / (1024 * 1024)

def verify_selective_sf8_quantization(model):
    """Print, per parameter tensor, whether it looks SF8-quantized.

    A tensor inside the SF8 range is reported as quantized unless it holds
    more than 255 distinct values; tensors outside the range are reported as
    not quantized.
    """
    for name, param in model.named_parameters():
        if not is_in_sf8_range(param.data):
            print(f"Layer {name} is not quantized (outside SF8 range)")
            continue

        distinct = torch.unique(param).numel()
        if distinct > 255:
            print(f"Warning: Quantized parameter {name} has more than 255 unique values")
        else:
            print(f"Layer {name} is correctly quantized to SF8")

# Usage in your main script.
# Loads the same YOLOv7 checkpoint twice from the local `yolov7` checkout: one
# copy is quantized to SF16 and the other to SF8, and a parameter/memory
# summary is printed for each.
model_name = '/kaggle/working/yolov7/yolov7-d6.pt'
sf16_model = torch.hub.load('yolov7', 'custom', model_name, source='local', force_reload=True)
sf16_model.eval()
total_params, quantized_params, quantized_memory = apply_selective_sf16_quantization(sf16_model)

print("SF16 Quantized")
print(f"Total parameters: {total_params}")
print(f"Quantized parameters: {quantized_params}")
print(f"Quantization percentage: {quantized_params / total_params * 100:.2f}%")
print(f"Estimated memory usage: {quantized_memory:.2f} MB")

print("\n")

# A fresh copy is loaded because the SF16 pass above modified sf16_model in place.
sf8_model = torch.hub.load('yolov7', 'custom', model_name, source='local', force_reload=True)
sf8_model.eval()
# The summary variables are reused, so the SF16 figures are overwritten here.
total_params, quantized_params, quantized_memory = apply_selective_sf8_quantization(sf8_model)

print("SF8 Quantized")
print(f"Total parameters: {total_params}")
print(f"Quantized parameters: {quantized_params}")
print(f"Quantization percentage: {quantized_params / total_params * 100:.2f}%")
print(f"Estimated memory usage: {quantized_memory:.2f} MB")

# Verify quantization (disabled; enable to print a per-layer report)
# print("\nVerifying Selective SF16 Quantization:")
# verify_selective_sf16_quantization(sf16_model)

## RUN BENCHMARK COMPARISONS

In [None]:
# Download image
!wget https://upload.wikimedia.org/wikipedia/commons/6/64/Cat_and_dog_standoff_%283926784260%29.jpg

In [None]:
import torch
import cv2
import psutil
import time
import numpy as np
from torchvision.ops import box_iou

def calculate_memory_fp32(model):
    """Return the size in MiB of all model parameters at 4 bytes per element."""
    bytes_per_param = 4
    n_params = 0
    for tensor in model.parameters():
        n_params += tensor.numel()
    return n_params * bytes_per_param / (1024 * 1024)

def estimate_complexity(model):
    """Return the total number of parameter elements, used as a complexity proxy."""
    count = 0
    for tensor in model.parameters():
        count += tensor.numel()
    return count

def detect_image(model, image_path, is_sf16=False, is_sf8=False):
    """Run one detection pass on an image and collect timing and size statistics.

    Args:
        model: Callable detection model; its result must expose
            ``.pandas().xyxy[0]``.
        image_path: Path of the image file to read with OpenCV.
        is_sf16: Unused; kept for call-site compatibility.
        is_sf8: Unused; kept for call-site compatibility.

    Returns:
        Dict with keys ``time`` (seconds spent in the model call),
        ``memory_used`` (process RSS growth in MiB over the whole call),
        ``total_params``, ``complexity`` and ``pred_boxes``.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    process = psutil.Process()
    initial_memory = process.memory_info().rss

    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv2.resize(img, (640, 640))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # perf_counter is monotonic and higher resolution than time.time().
    start_time = time.perf_counter()
    results = model(img)
    elapsed_time = time.perf_counter() - start_time
    pred_boxes = results.pandas().xyxy[0]  # Get predictions

    complexity = estimate_complexity(model)

    final_memory = process.memory_info().rss
    memory_used = (final_memory - initial_memory) / (1024 * 1024)

    total_params = sum(p.numel() for p in model.parameters())

    return {
        "time": elapsed_time,
        "memory_used": memory_used,
        "total_params": total_params,
        "complexity": complexity,
        "pred_boxes": pred_boxes  # Return predictions
    }

def compare_predictions(pred_boxes_normal, pred_boxes_quantized):
    """Compare two sets of predicted boxes by best-match IoU and corner distance.

    For every box of the reference (normal) model the best-matching box of the
    quantized model is selected; the reported means are taken over those best
    matches, so unrelated box pairs do not dilute the scores.

    Args:
        pred_boxes_normal: DataFrame with ``xmin, ymin, xmax, ymax`` columns.
        pred_boxes_quantized: DataFrame with the same columns.

    Returns:
        Dict with ``mean_iou`` (0 when either set is empty), ``mean_distance``
        (NaN when there are no reference boxes, infinity when there are
        reference boxes but no quantized boxes), ``num_predictions_normal`` and
        ``num_predictions_quantized``.
    """
    box_columns = ['xmin', 'ymin', 'xmax', 'ymax']
    normal_np = pred_boxes_normal[box_columns].to_numpy(dtype=np.float64)
    quantized_np = pred_boxes_quantized[box_columns].to_numpy(dtype=np.float64)

    if len(normal_np) == 0:
        # Nothing to match against: no IoU and no defined distance.
        mean_iou = 0
        mean_distance = float('nan')
    elif len(quantized_np) == 0:
        # Reference boxes exist but none can be matched.
        mean_iou = 0
        mean_distance = float('inf')
    else:
        ious = box_iou(
            torch.tensor(normal_np, dtype=torch.float32),
            torch.tensor(quantized_np, dtype=torch.float32),
        )
        # Best IoU per reference box, then averaged.
        mean_iou = ious.max(dim=1).values.mean().item()

        # Pairwise Euclidean distance between 4-vectors, nearest per reference box.
        diffs = normal_np[:, None, :] - quantized_np[None, :, :]
        mean_distance = np.linalg.norm(diffs, axis=2).min(axis=1).mean()

    return {
        "mean_iou": mean_iou,
        "mean_distance": mean_distance,
        "num_predictions_normal": len(pred_boxes_normal),
        "num_predictions_quantized": len(pred_boxes_quantized)
    }

# Path to your image
image_path = '/kaggle/working/Cat_and_dog_standoff_(3926784260).jpg'

# Load models.
# `model_name` is the YOLOv7 checkpoint path assigned in the quantization cell.
normal_model = torch.hub.load('yolov7', 'custom', model_name, source='local', force_reload=True)
normal_model.eval()
normal_results = detect_image(normal_model, image_path)
# Measure the baseline footprint of the model that was just benchmarked,
# not of an unrelated `model` variable left over from another cell.
memory_model = calculate_memory_fp32(normal_model)
print(f"Normal Predictions: {normal_results['pred_boxes']}")

sf16_model = torch.hub.load('yolov7', 'custom', model_name, source='local', force_reload=True)
sf16_model.eval()
total_params, sf16_params, sf16_memory = apply_selective_sf16_quantization(sf16_model)
sf16_results = detect_image(sf16_model, image_path, is_sf16=True)
print(f"SF16 Quantized Predictions: {sf16_results['pred_boxes']}")

# Compare sf16 predictions
sf16_comparison_results = compare_predictions(normal_results['pred_boxes'], sf16_results['pred_boxes'])

sf8_model = torch.hub.load('yolov7', 'custom', model_name, source='local', force_reload=True)
sf8_model.eval()
total_params, sf8_params, sf8_memory = apply_selective_sf8_quantization(sf8_model)
sf8_results = detect_image(sf8_model, image_path, is_sf8=True)
print(f"SF8 Quantized Predictions: {sf8_results['pred_boxes']}")

# Compare sf8 predictions
sf8_comparison_results = compare_predictions(normal_results['pred_boxes'], sf8_results['pred_boxes'])

# Print comparison results
print("Normal YOLOv7:")
print(f"Processing time: {normal_results['time']:.2f} seconds")
# `.2f` (fixed-point) rather than `.2`, which would print two significant
# digits in general/scientific notation.
print(f"Memory used by model: {memory_model:.2f} MB")
print(f"Estimated complexity: {normal_results['complexity']:,} operations")
print(f"Total parameters: {total_params:,}")

print("\nSelectively Quantized YOLOv7 (SF16):")
print(f"Processing time: {sf16_results['time']:.2f} seconds")
print(f"Memory used by model: {sf16_memory:.2f} MB")
print(f"Estimated complexity: {sf16_results['complexity']:,} operations")
print(f"Total parameters: {total_params:,}")
print(f"Quantized parameters: {sf16_params:,}")
print(f"Percentage of parameters quantized: {(sf16_params / total_params) * 100:.2f}%")

# Relative improvement versus the unquantized baseline, in percent
sf16_time_diff = (normal_results['time'] - sf16_results['time']) / normal_results['time'] * 100
sf16_memory_diff = (memory_model - sf16_memory) / memory_model * 100

print(f"\nTime improvement: {sf16_time_diff:.2f}%")
print(f"Memory improvement: {sf16_memory_diff:.2f}%")

print("Comparison Results:")
print(f"Mean IoU: {100*sf16_comparison_results['mean_iou']:.2f}%")
print(f"Mean Distance between predictions: {sf16_comparison_results['mean_distance']:.2f}")
print(f"Number of predictions (Normal): {sf16_comparison_results['num_predictions_normal']}")
print(f"Number of predictions (Quantized): {sf16_comparison_results['num_predictions_quantized']}")

print("\nSelectively Quantized YOLOv7 (SF8):")
print(f"Processing time: {sf8_results['time']:.2f} seconds")
print(f"Memory used by model: {sf8_memory:.2f} MB")
print(f"Estimated complexity: {sf8_results['complexity']:,} operations")
print(f"Total parameters: {total_params:,}")
print(f"Quantized parameters: {sf8_params:,}")
print(f"Percentage of parameters quantized: {(sf8_params / total_params) * 100:.2f}%")

# Relative improvement versus the unquantized baseline, in percent
sf8_time_diff = (normal_results['time'] - sf8_results['time']) / normal_results['time'] * 100
sf8_memory_diff = (memory_model - sf8_memory) / memory_model * 100

print(f"\nTime improvement: {sf8_time_diff:.2f}%")
print(f"Memory improvement: {sf8_memory_diff:.2f}%")

print("Comparison Results:")
print(f"Mean IoU: {100*sf8_comparison_results['mean_iou']:.2f}%")
print(f"Mean Distance between predictions: {sf8_comparison_results['mean_distance']:.2f}")
print(f"Number of predictions (Normal): {sf8_comparison_results['num_predictions_normal']}")
print(f"Number of predictions (Quantized): {sf8_comparison_results['num_predictions_quantized']}")

## TESTING ON LARGE LANGUAGE MODELS

In [1]:
from huggingface_hub import login
import os
from getpass import getpass

# Never commit access tokens to a notebook. The token is read from the HF_TOKEN
# environment variable, falling back to an interactive prompt when it is unset.
# NOTE(review): the token that used to be hardcoded on this line is exposed in
# the notebook history and should be revoked.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))
!pip install --upgrade transformers

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.w

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import psutil

# Constants for SF16 and SF8
SF16_MAX = 0.999969482421875
SF16_MIN = -0.999969482421875
SF8_MAX = 0.9921875
SF8_MIN = -0.9921875

def is_in_sf16_range(tensor):
    """Return a 0-dim bool tensor: True when every element is in [SF16_MIN, SF16_MAX]."""
    at_least_min = tensor >= SF16_MIN
    at_most_max = tensor <= SF16_MAX
    return (at_least_min & at_most_max).all()

def is_in_sf8_range(tensor):
    """Return a 0-dim bool tensor: True when every element is in [SF8_MIN, SF8_MAX]."""
    at_least_min = tensor >= SF8_MIN
    at_most_max = tensor <= SF8_MAX
    return (at_least_min & at_most_max).all()

def quantize_to_sf16(tensor):
    """Round a tensor onto the SF16 grid (sign bit + 15-bit fractional magnitude).

    Representable values are ``k / 2**15`` for integer ``k`` in
    ``[-(2**15 - 1), 2**15 - 1]``, so zero and every multiple of ``2**-15`` up
    to +/-0.999969482421875 are preserved exactly. Values outside that range
    saturate to the nearest end of the grid.

    Args:
        tensor: Floating-point tensor to quantize.

    Returns:
        A new tensor of the same shape and dtype holding the quantized values.
    """
    scale = 1 << 15        # number of magnitude steps per unit
    max_code = scale - 1   # max_code / scale == 0.999969482421875

    # Convert to integer codes, saturate, and map back to real values.
    codes = torch.round(tensor * scale)
    codes = torch.clamp(codes, -max_code, max_code)
    return codes / scale

def quantize_to_sf8(tensor):
    """Round a tensor onto the SF8 grid (sign bit + 7-bit fractional magnitude).

    Representable values are ``k / 2**7`` for integer ``k`` in
    ``[-(2**7 - 1), 2**7 - 1]``, so zero and every multiple of ``2**-7`` up to
    +/-0.9921875 are preserved exactly. Values outside that range saturate to
    the nearest end of the grid.

    Args:
        tensor: Floating-point tensor to quantize.

    Returns:
        A new tensor of the same shape and dtype holding the quantized values.
    """
    scale = 1 << 7         # number of magnitude steps per unit
    max_code = scale - 1   # max_code / scale == 0.9921875

    # Convert to integer codes, saturate, and map back to real values.
    codes = torch.round(tensor * scale)
    codes = torch.clamp(codes, -max_code, max_code)
    return codes / scale

def apply_selective_quantization(model, quantize_func, is_in_range_func, quantized_bits=None):
    """Quantize in place every parameter tensor accepted by ``is_in_range_func``.

    Args:
        model: Object exposing ``named_parameters()``.
        quantize_func: Callable mapping a tensor to its quantized version.
        is_in_range_func: Callable returning a truthy value when a tensor may
            be quantized.
        quantized_bits: Storage width in bits assumed for quantized tensors in
            the memory estimate. When omitted, 16 is used if ``quantize_func``
            is ``quantize_to_sf16`` and 8 otherwise.

    Returns:
        Tuple ``(total_params, quantized_params, memory_usage_mb)``; tensors
        that are not quantized are counted at 32 bits per element.
    """
    if quantized_bits is None:
        # Backward-compatible default: infer the width from the quantizer used.
        quantized_bits = 16 if quantize_func == quantize_to_sf16 else 8

    total_params = 0
    quantized_params = 0
    memory_usage = 0

    for name, param in model.named_parameters():
        count = param.numel()
        total_params += count
        if is_in_range_func(param.data):
            param.data = quantize_func(param.data)
            quantized_params += count
            memory_usage += count * quantized_bits / 8
        else:
            memory_usage += count * 32 / 8

    memory_usage_mb = memory_usage / (1024 * 1024)
    return total_params, quantized_params, memory_usage_mb

def calculate_memory_fp32(model):
    """Return the size in MiB of all model parameters at 4 bytes per element."""
    bytes_per_param = 4
    n_params = 0
    for tensor in model.parameters():
        n_params += tensor.numel()
    return n_params * bytes_per_param / (1024 * 1024)

def estimate_complexity(model):
    """Return the total number of parameter elements, used as a complexity proxy."""
    count = 0
    for tensor in model.parameters():
        count += tensor.numel()
    return count

def evaluate_model(model, tokenizer, prompt):
    """Generate up to 50 new tokens for ``prompt`` and report timing and memory.

    Returns:
        Dict with ``time`` (seconds spent in ``model.generate``),
        ``memory_used`` (process RSS growth in MiB across the call),
        ``complexity`` (parameter count) and ``generated_text``.
    """
    proc = psutil.Process()
    rss_before = proc.memory_info().rss

    encoded = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        t_start = time.time()
        generated = model.generate(**encoded, max_new_tokens=50)
        t_end = time.time()

    text = tokenizer.decode(generated[0], skip_special_tokens=True)
    rss_after = proc.memory_info().rss

    return {
        "time": t_end - t_start,
        "memory_used": (rss_after - rss_before) / (1024 * 1024),
        "complexity": estimate_complexity(model),
        "generated_text": text
    }

# Load models.
# Three independent copies of the same checkpoint are held at once: an
# unmodified baseline, one quantized to SF16 and one quantized to SF8.
model_name = "google/gemma-2-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Loading normal model...")
normal_model = AutoModelForCausalLM.from_pretrained(model_name)
normal_model.eval()

print("Loading and quantizing SF16 model...")
sf16_model = AutoModelForCausalLM.from_pretrained(model_name)
sf16_model.eval()
total_params, sf16_params, sf16_memory = apply_selective_quantization(sf16_model, quantize_to_sf16, is_in_sf16_range)

print("Loading and quantizing SF8 model...")
sf8_model = AutoModelForCausalLM.from_pretrained(model_name)
sf8_model.eval()
# The total parameter count is discarded here; the value from the SF16 pass is reused below.
_, sf8_params, sf8_memory = apply_selective_quantization(sf8_model, quantize_to_sf8, is_in_sf8_range)

# Evaluation: the same prompt is run through each of the three models
prompt = "Near them, on the sand, Half sunk a shattered visage lies,"

print("\nEvaluating normal model...")
normal_results = evaluate_model(normal_model, tokenizer, prompt)
memory_model = calculate_memory_fp32(normal_model)

print("\nEvaluating SF16 quantized model...")
sf16_results = evaluate_model(sf16_model, tokenizer, prompt)

print("\nEvaluating SF8 quantized model...")
sf8_results = evaluate_model(sf8_model, tokenizer, prompt)

# Print results
print("\nNormal Gemma-2:")
print(f"Processing time: {normal_results['time']:.2f} seconds")
print(f"Memory used by model: {memory_model:.2f} MB")
print(f"Estimated complexity: {normal_results['complexity']:,} operations")
print(f"Generated text: {normal_results['generated_text']}")

print("\nSF16 Quantized Gemma-2:")
print(f"Processing time: {sf16_results['time']:.2f} seconds")
print(f"Memory used by model: {sf16_memory:.2f} MB")
print(f"Estimated complexity: {sf16_results['complexity']:,} operations")
print(f"Total parameters: {total_params:,}")
print(f"Quantized parameters: {sf16_params:,}")
print(f"Percentage of parameters quantized: {(sf16_params / total_params) * 100:.2f}%")
print(f"Generated text: {sf16_results['generated_text']}")

# Relative improvement versus the unquantized baseline, in percent
sf16_time_diff = (normal_results['time'] - sf16_results['time']) / normal_results['time'] * 100
sf16_memory_diff = (memory_model - sf16_memory) / memory_model * 100

print(f"\nTime improvement: {sf16_time_diff:.2f}%")
print(f"Memory improvement: {sf16_memory_diff:.2f}%")

print("\nSF8 Gemma-2:")
print(f"Processing time: {sf8_results['time']:.2f} seconds")
print(f"Memory used by model: {sf8_memory:.2f} MB")
print(f"Estimated complexity: {sf8_results['complexity']:,} operations")
print(f"Total parameters: {total_params:,}")
print(f"Quantized parameters: {sf8_params:,}")
print(f"Percentage of parameters quantized: {(sf8_params / total_params) * 100:.2f}%")
print(f"Generated text: {sf8_results['generated_text']}")

# Relative improvement versus the unquantized baseline, in percent
sf8_time_diff = (normal_results['time'] - sf8_results['time']) / normal_results['time'] * 100
sf8_memory_diff = (memory_model - sf8_memory) / memory_model * 100

print(f"\nTime improvement: {sf8_time_diff:.2f}%")
print(f"Memory improvement: {sf8_memory_diff:.2f}%")

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Loading normal model...


config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Loading and quantizing SF16 model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading and quantizing SF8 model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


Evaluating normal model...


The 'max_batch_size' argument of HybridCache is deprecated and will be removed in v4.46. Use the more precisely named 'batch_size' argument instead.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)



Evaluating SF16 quantized model...

Evaluating SF8 quantized model...

Normal Gemma-2:
Processing time: 27.07 seconds
Memory used by model: 9972.92 MB
Estimated complexity: 2,614,341,888 operations
Generated text: Near them, on the sand, Half sunk a shattered visage lies, whose frown, And wrinkled lip, and sneer of cold command, Tell that its sculptor well those passions read Which yet survive, stamped on these lifeless things, The hand that mocked them, and the heart that fed.

The hand that mocked them,

SF16 Quantized Gemma-2:
Processing time: 25.38 seconds
Memory used by model: 6111.86 MB
Estimated complexity: 2,614,341,888 operations
Total parameters: 2,614,341,888
Quantized parameters: 2,024,308,224
Percentage of parameters quantized: 77.43%
Generated text: Near them, on the sand, Half sunk a shattered visage lies, whose frown, And wrinkled lip, and sneer of cold command, Tell that its sculptor well those passions read Which yet survive, stamped on these lifeless things, The han

In [1]:
class SuperFloat:
    """16-bit sign-magnitude fixed-point number in the open interval (-1, 1).

    The value is stored as a 16-character binary string: one sign bit
    followed by a 15-bit magnitude interpreted as ``magnitude / 2**15``.
    Arithmetic is carried out on Python floats and the result is saturated to
    the largest representable magnitude before being re-encoded.
    """

    _SCALE = 1 << 15              # magnitude steps per unit
    _MAX_VALUE = 1 - 1 / _SCALE   # largest representable magnitude

    def __init__(self, binary_repr):
        """Wrap a 16-character string of '0'/'1'; raises ValueError otherwise."""
        if len(binary_repr) != 16 or any(bit not in {'0', '1'} for bit in binary_repr):
            raise ValueError("Binary representation must be a 16-bit binary string.")
        self.binary_repr = binary_repr

    @property
    def sign_bit(self):
        """Leading bit: '1' for negative, '0' otherwise."""
        return self.binary_repr[0]

    @property
    def mantissa_bits(self):
        """The 15 magnitude bits following the sign bit."""
        return self.binary_repr[1:]

    def to_float(self):
        """Decode to a Python float."""
        # Convert binary mantissa to fractional value
        mantissa = int(self.mantissa_bits, 2) / self._SCALE
        return -mantissa if self.sign_bit == '1' else mantissa

    @classmethod
    def from_float(cls, value):
        """Encode a float strictly between -1 and 1, truncating toward zero.

        Raises:
            ValueError: If ``value`` is not in the open interval (-1, 1).
        """
        if not (-1 < value < 1):
            raise ValueError("Value must be between -1 and 1 (exclusive).")
        sign_bit = '1' if value < 0 else '0'
        mantissa = int(abs(value) * cls._SCALE)
        mantissa_bits = f"{mantissa:015b}"
        return cls(sign_bit + mantissa_bits)

    @classmethod
    def _clamp(cls, value):
        """Saturate ``value`` to the largest representable magnitude."""
        if value >= 1:
            return cls._MAX_VALUE
        if value <= -1:
            return -cls._MAX_VALUE
        return value

    def __add__(self, other):
        # Returning NotImplemented lets Python raise a standard TypeError for
        # operands that are not SuperFloat.
        if not isinstance(other, SuperFloat):
            return NotImplemented
        return SuperFloat.from_float(self._clamp(self.to_float() + other.to_float()))

    def __sub__(self, other):
        if not isinstance(other, SuperFloat):
            return NotImplemented
        return SuperFloat.from_float(self._clamp(self.to_float() - other.to_float()))

    def __mul__(self, other):
        if not isinstance(other, SuperFloat):
            return NotImplemented
        # No clamping needed: the product of two magnitudes below 1 is below 1.
        return SuperFloat.from_float(self.to_float() * other.to_float())

    def __truediv__(self, other):
        if not isinstance(other, SuperFloat):
            return NotImplemented
        if other.to_float() == 0:
            raise ZeroDivisionError("Division by zero is not allowed.")
        return SuperFloat.from_float(self._clamp(self.to_float() / other.to_float()))

    def __repr__(self):
        return f"SuperFloat(binary={self.binary_repr}, float={self.to_float()})"

# Example usage: build two values and show each arithmetic operator's result.
a = SuperFloat.from_float(0.5)
b = SuperFloat.from_float(-0.25)

# Sum, difference, product and quotient of a and b
c, d, e, f = a + b, a - b, a * b, a / b

for label, value in (
    ("a", a),
    ("b", b),
    ("a + b", c),
    ("a - b", d),
    ("a * b", e),
    ("a / b", f),
):
    print(f"{label} = {value}")

a = SuperFloat(binary=0100000000000000, float=0.5)
b = SuperFloat(binary=1010000000000000, float=-0.25)
a + b = SuperFloat(binary=0010000000000000, float=0.25)
a - b = SuperFloat(binary=0110000000000000, float=0.75)
a * b = SuperFloat(binary=1001000000000000, float=-0.125)
a / b = SuperFloat(binary=1111111111111111, float=-0.999969482421875)


In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np

# Set device: GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Custom SuperFloat quantization with variable bitwidth
def superfloat_quantize(tensor, bitwidth):
    """Quantize a tensor to a sign-magnitude fixed-point grid inside (-1, 1).

    With ``m = bitwidth - 1`` magnitude bits, representable values are
    ``k / 2**m`` for integer ``k`` in ``[-(2**m - 1), 2**m - 1]``. Values
    outside that range saturate to the nearest end of the grid, so the result
    never reaches +/-1.

    Args:
        tensor: Floating-point tensor to quantize.
        bitwidth: Total bits per value (1 sign bit + magnitude bits), 2..16.

    Returns:
        A new tensor of the same shape holding the quantized values.

    Raises:
        ValueError: If ``bitwidth`` is outside 2..16. A width of 1 leaves no
            magnitude bits and therefore cannot represent any step.
    """
    if bitwidth < 2 or bitwidth > 16:
        raise ValueError("Bitwidth must be between 2 and 16.")

    mantissa_bits = bitwidth - 1  # 1 bit for the sign
    scale = 1 << mantissa_bits    # magnitude steps per unit
    max_code = scale - 1          # largest magnitude code

    # Round to integer codes, saturate, and map back to real values.
    codes = torch.round(tensor * scale)
    codes = torch.clamp(codes, -max_code, max_code)
    return codes / scale

# Quantize model weights
def quantize_model_weights(model, bitwidth):
    """Quantize, in place, every parameter whose name contains "weight".

    Parameters without "weight" in their name (e.g. biases) are left
    untouched. The same ``model`` object is returned for convenience.
    """
    with torch.no_grad():
        weight_params = (
            param for name, param in model.named_parameters() if "weight" in name
        )
        for param in weight_params:
            param.data = superfloat_quantize(param.data, bitwidth)
    return model

# Calculate perplexity
def calculate_perplexity(model, tokenizer, text):
    """Return exp(loss) of ``model`` on ``text``, using the input ids as labels."""
    encoded = tokenizer(text, return_tensors="pt")

    # Move every input tensor to the module-level `device` (GPU if available).
    batch = {}
    for key, value in encoded.items():
        batch[key] = value.to(device)

    with torch.no_grad():
        loss = model(**batch, labels=batch["input_ids"]).loss
    return torch.exp(loss).item()

# Load pretrained model and tokenizer
model_name = "openai-community/gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Original model perplexity.
# This must run before quantization: quantize_model_weights modifies `model`
# in place, so the unquantized weights are gone afterwards.
text = "Hello, how are you?"
original_perplexity = calculate_perplexity(model, tokenizer, text)
print(f"Original Model Perplexity: {original_perplexity}")

# Quantize model weights with variable bitwidth.
# `quantized_model` is the same object as `model`, not a copy.
bitwidth = 16  # Example: 16-bit quantization
quantized_model = quantize_model_weights(model, bitwidth)

# Quantized model perplexity
quantized_perplexity = calculate_perplexity(quantized_model, tokenizer, text)
print(f"{bitwidth}-bit Quantized Model Perplexity: {quantized_perplexity}")