In [1]:
# Tell jupyter notebook to reload all modules before running each cell,thereby if 
# a function is modified, the changes are reflected immediately
%load_ext autoreload
%autoreload 2

In [2]:
# Hyperparameter Sensitivity Analysis
# This notebook explores the sensitivity of the model's performance to various hyperparameters using a grid search approach.
import os
print(os.getcwd())
os.chdir("..")
print(os.getcwd())

/home/alex/projects/ocr/scripts
/home/alex/projects/ocr


In [3]:

import itertools
import pandas as pd
from datasets import load_from_disk
from transformers import Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration, AutoProcessor
import torch
from qwen_vl_utils import process_vision_info
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor
import torch
import time
import os

from src.train_test import *
from src.train_test import  run_inference_and_calculate_cer
from src.qwen_finetune import train_and_validate


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load dataset
dataset_p40 = load_from_disk("data/processed/dataset_p40")

In [5]:

# Define hyperparameter grid
learning_rates = [1e-5, 3e-5, 5e-5]
batch_sizes = [1]
train_select_end = len(dataset_p40)
accumulation_steps = [1, 2]
image_factors = [28]
max_steps_list = [5]

# Create combinations of hyperparameters
param_grid = list(itertools.product(learning_rates, batch_sizes, accumulation_steps, image_factors, max_steps_list))


# Testing

In [6]:

# Select the first set of parameters
params = param_grid[0]
lr, batch_size, acc_steps, img_factor, max_steps = params
# Set up output directory with modified learning rate format
output_dir = f'results/models/lr_{lr:.0e}'.replace('-', '_').replace('e-', 'e_') + f'_bs_{batch_size}_acc_{acc_steps}_img_{img_factor}_max_{max_steps}'

output_dir
os.makedirs(output_dir, exist_ok=True)




In [7]:

torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()



In [6]:
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
print(f"Allocated memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MiB")
print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MiB")


Allocated memory: 0.00 MiB
Max memory allocated: 0.00 MiB




In [None]:
#dir()

In [16]:
params = param_grid[0]
params

(1e-05, 1, 1, 28, 1000, 5)

In [19]:
lr, batch_size, acc_steps, img_factor,  max_steps = params

In [None]:

# Start timing
start_time = time.time()

# Train model
try:
    train_and_validate(
        model_name='Qwen/Qwen2-VL-2B-Instruct',
        output_dir=output_dir,
        dataset_name='culturalheritagenus/Gongguan-OCR-p40',
        image_column='image',
        text_column='text',
        device='cuda:0' ,
        min_pixel=256,
        max_pixel=384,
        image_factor=img_factor,
        num_accumulation_steps=2,
        max_steps=10,
        train_select_end = 10,
        train_batch_size=batch_size,
        val_batch_size=1,
        lr = lr
    )
except Exception as e:
    print(f"Error during training: {e}")


In [21]:
# Evaluate model
message = "Convert this image to text"
small_test_dataset = dataset_p40.select(range(10))
# Evaluate model
df_results = run_inference_and_calculate_cer(f"{output_dir}/final", message, small_test_dataset) 

In [None]:

# Calculate metrics
mean_cer = df_results['CER'].mean()
median_cer = df_results['CER'].median()
memory_usage = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0
#speed = max_steps / (time.time() - start_time)

print(f"Mean CER: {mean_cer}")
print(f"Median CER: {median_cer}")
print(f"Memory Usage: {memory_usage}")
#print(f"Speed: {speed} steps/sec")
df_results['CER']

In [7]:

# Function to train and evaluate a model with given hyperparameters
def train_and_evaluate(params, message, dataset, output_path):

    lr, batch_size, acc_steps, img_factor, train_select_end, max_steps = params
    #output_dir = f'models/lr_{lr}_bs_{batch_size}_acc_{acc_    test_name = f'lr_{lrteps}_range_{train_start}_{train_end}_img_{img_factor}_eval_{eval_steps}_max_{max_steps}'
    test_name = f'lr_{lr:.0e}'.replace('-', '_').replace('e-', 'e_') + f'_bs_{batch_size}_acc_{acc_steps}_trainend_{train_select_end}_img_{img_factor}_max_{max_steps}'
    output_dir = f'{output_path}/{test_name}'
    os.makedirs(output_dir, exist_ok=True)

    # start timer
    start_time = time.time() 
    
    # Train model
    train_and_validate(
        model_name='Qwen/Qwen2-VL-2B-Instruct',
        output_dir=output_dir,
        dataset_name='culturalheritagenus/Gongguan-OCR-p40',
        image_column='image',
        text_column='text',
        device='cuda:0', 
        min_pixel=256,
        max_pixel=384,
        image_factor=img_factor,
        num_accumulation_steps=acc_steps,
        train_select_end = train_select_end,
        max_steps=max_steps,
        train_batch_size=batch_size,
        val_batch_size=1,
        lr = lr
    )

    # Evaluate model
    model_path = f"{output_dir}/final"
    results = run_inference_and_calculate_cer(model_path, message, dataset)
    
    # Save predictions to CSV
    results.to_csv(f'results/predictions/{test_name}.csv', index=False)

    # Calculate metrics
    mean_cer = results['CER'].mean()
    median_cer = results['CER'].median()
    memory_usage = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0
    speed = (time.time() - start_time)
    
    return {
        'params': params,
        'mean_cer': mean_cer,
        'median_cer': median_cer,
        'memory_usage': memory_usage,
        "speed": speed
    }


# Run function

Try for one row

In [8]:
message = "Convert this image to text"
output_path = "results/models"

In [30]:
small_test_dataset = dataset_p40.select(range(10))

In [None]:
train_and_evaluate(params, message, small_test_dataset, output_path)

Delete old model, processor and inputs:

In [9]:
import gc
def list_tensors_on_gpu(min_size_mb=10):
    print(f"Tensors on GPU larger than {min_size_mb}MB:\n")
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
                if obj.is_cuda:
                    size_mb = obj.element_size() * obj.nelement() / 1024**2
                    if size_mb > min_size_mb:
                        print(f"Type: {type(obj)}, Size: {size_mb:.2f} MB, Shape: {tuple(obj.shape)}")
        except:
            pass

list_tensors_on_gpu()

Tensors on GPU larger than 10MB:



  return isinstance(obj, torch.Tensor)
  if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):


In [10]:
# Step 1: Run garbage collection
import gc
gc.collect()

# Step 2: Empty the PyTorch CUDA cache
import torch
torch.cuda.empty_cache()


In [11]:
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
print(f"Allocated memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MiB")
print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MiB")


Allocated memory: 0.00 MiB
Max memory allocated: 0.00 MiB


Try doing it the "old" way with for loops

In [12]:
# Define hyperparameter grid
learning_rates = [1e-3, 1e-5, 1e-7] # 
batch_sizes = [1, 2]
accumulation_steps = [1, 5]
image_factors = [28]
train_select_end = [round(len(dataset_p40) / 2), len(dataset_p40)]
max_steps_list = [100000]

# Create combinations of hyperparameters
param_grid = list(itertools.product(learning_rates, batch_sizes, accumulation_steps, image_factors, train_select_end, max_steps_list))


In [13]:
len(param_grid)

24

In [14]:
param_grid

[(0.001, 1, 1, 28, 330, 100000),
 (0.001, 1, 1, 28, 661, 100000),
 (0.001, 1, 5, 28, 330, 100000),
 (0.001, 1, 5, 28, 661, 100000),
 (0.001, 2, 1, 28, 330, 100000),
 (0.001, 2, 1, 28, 661, 100000),
 (0.001, 2, 5, 28, 330, 100000),
 (0.001, 2, 5, 28, 661, 100000),
 (1e-05, 1, 1, 28, 330, 100000),
 (1e-05, 1, 1, 28, 661, 100000),
 (1e-05, 1, 5, 28, 330, 100000),
 (1e-05, 1, 5, 28, 661, 100000),
 (1e-05, 2, 1, 28, 330, 100000),
 (1e-05, 2, 1, 28, 661, 100000),
 (1e-05, 2, 5, 28, 330, 100000),
 (1e-05, 2, 5, 28, 661, 100000),
 (1e-07, 1, 1, 28, 330, 100000),
 (1e-07, 1, 1, 28, 661, 100000),
 (1e-07, 1, 5, 28, 330, 100000),
 (1e-07, 1, 5, 28, 661, 100000),
 (1e-07, 2, 1, 28, 330, 100000),
 (1e-07, 2, 1, 28, 661, 100000),
 (1e-07, 2, 5, 28, 330, 100000),
 (1e-07, 2, 5, 28, 661, 100000)]

In [15]:
all_results = []
for i, params in enumerate(param_grid):
    print("starting line ",i,"out of", len(param_grid))
    
    all_results.append({
    'params': params,
    'results': train_and_evaluate(params, message, dataset_p40, output_path)
    })

starting line  0 out of 24


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.42it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Training: 100%|██████████| 100000/100000 [7:54:27<00:00,  3.51it/s, loss=1.38]   
/pytorch/aten/src/ATen/native/cuda/TensorCompare.cu:110: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [55]:
start = time.time()
print(f"Time taken: {time.time() - start} seconds")

Time taken: 2.0742416381835938e-05 seconds


In [65]:
all_results[0]["results"]["speed"] / 60

7.6129380861918134

In [68]:
speeds = [r['results']['speed'] for r in all_results]
speeds

[456.7762851715088, 484.6063802242279, 319.8828670978546, 503.1054666042328]

In [72]:
df = pd.DataFrame([
    {**dict(zip(param_names, r['params'])), **r['results']} for r in all_results
])


NameError: name 'param_names' is not defined