## Thanks to [boristown](https://www.kaggle.com/code/boristown/agi-compressarc) for the work

### In this version, redundant parameters have been removed, the code structure has been changed and improved, and execution has been significantly accelerated. A more compact and faster pipeline allows for more iterations within 12 hours, which increases the likelihood of finding the right solutions.

In [1]:
import os
import sys
import time
import json
import torch
import random
import importlib
import numpy as np
import multiprocessing

# Make the modules stored in the `arc-inp` input directory importable by name.
sys.path.append('/kaggle/input/arc-inp')

# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded as an attribute; import it explicitly so the calls below
# cannot fail with AttributeError in a fresh interpreter.
import importlib.util

# Load preprocessing.py from its explicit file path instead of relying on the
# module search order.
path = "/kaggle/input/arc-inp/preprocessing.py"
preprocessing_spec = importlib.util.spec_from_file_location("preprocessing", path)
preprocessing = importlib.util.module_from_spec(preprocessing_spec)
# Register the module before executing it, so any later `import preprocessing`
# resolves to this exact module object.
sys.modules["preprocessing"] = preprocessing
preprocessing_spec.loader.exec_module(preprocessing)

# Import project modules
import train
import layers
import solve_task
import initializers
import arc_compressor
import solution_selection
import multitensor_systems

# True unless KAGGLE_IS_COMPETITION_RERUN is set to a non-empty value.
# Presumably Kaggle sets it only during the scored rerun - TODO confirm.
fake_mode = not os.environ.get('KAGGLE_IS_COMPETITION_RERUN')

# Process-wide torch defaults: float32 tensors created on CUDA, with TF32
# matmuls and cuDNN autotuning switched on.
torch.set_default_dtype(torch.float32)
torch.set_default_device('cuda')
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True

# Worker processes are started with 'spawn'; force=True overrides any start
# method that was already selected.
multiprocessing.set_start_method('spawn', force=True)

if __name__ == '__main__':
    # Wall-clock deadline: five hours after the script starts. `end` is
    # forwarded to every worker by parallelize_runs.
    start = time.time()
    end = start + 5 * 3600

    # Resources available for scheduling.
    n_gpus = torch.cuda.device_count()
    n_cpus = multiprocessing.cpu_count()

    # Outside the competition rerun the evaluation split is used instead of
    # the test split.
    if fake_mode:
        split = "evaluation"
    else:
        split = "test"

    # Only the task identifiers are kept; the parsed challenge data is not
    # retained in this process.
    with open(f'../input/arc-prize-2025/arc-agi_{split}_challenges.json', 'r') as f:
        task_names = list(json.load(f).keys())

    n_tasks = len(task_names)

def parallelize_runs(gpu_quotas, task_usages, n_iterations, poll_interval=0.01):
    """Run ``solve_task.solve_task`` once per task, scheduled across the GPUs.

    Every task runs in its own process. A task is started on a GPU only while
    that GPU's remaining quota is strictly greater than the task's usage and
    fewer than ``n_cpus`` tasks are running. The usage is subtracted from the
    GPU's quota at launch and added back once the process has exited.

    Reads the module-level names ``n_tasks``, ``n_gpus``, ``n_cpus``,
    ``task_names``, ``split`` and ``end``; ``task_usages[i]`` must describe
    ``task_names[i]``.

    Args:
        gpu_quotas: Per-GPU budget, in the same unit as ``task_usages``. The
            caller's sequence is not modified.
        task_usages: Cost of each task, indexed like ``task_names``.
        n_iterations: Forwarded unchanged to ``solve_task.solve_task``.
        poll_interval: Seconds to sleep between scheduling passes so the
            parent does not spin on a CPU core that the workers could use.
            Pass ``0`` to poll continuously.

    Returns:
        A tuple ``(memory_dict, solutions_dict, time_taken)``: plain-dict
        snapshots of the two shared dicts handed to the workers, and the
        elapsed wall-clock time in seconds.

    Raises:
        ValueError: A worker put a message on the shared error queue; the
            message is used as the exception text.
        RuntimeError: No task is running and none of the remaining tasks can
            be started on any GPU, so the loop could never finish.
    """
    gpu_quotas = list(gpu_quotas)  # private copy; callers reuse their list
    t = time.time()

    tasks_started = [False] * n_tasks
    tasks_finished = [False] * n_tasks
    processes = [None] * n_tasks
    process_gpu_ids = [None] * n_tasks
    # Started-but-not-finished count, maintained incrementally instead of
    # being recomputed for every (gpu, task) candidate.
    n_running = 0

    with multiprocessing.Manager() as manager:
        memory_dict = manager.dict()
        solutions_dict = manager.dict()
        error_queue = manager.Queue()
        try:
            while not all(tasks_finished):
                if not error_queue.empty():
                    raise ValueError(error_queue.get())

                # Reap exited workers and hand their quota back to the GPU.
                for i in range(n_tasks):
                    if tasks_started[i] and not tasks_finished[i]:
                        processes[i].join(timeout=0)
                        if not processes[i].is_alive():
                            tasks_finished[i] = True
                            n_running -= 1
                            gpu_quotas[process_gpu_ids[i]] += task_usages[i]

                # Launch every pending task that fits, GPU by GPU.
                for gpu_id in range(n_gpus):
                    for i in range(n_tasks):
                        if n_running >= n_cpus:
                            # CPU slots only fill up during this pass, so
                            # nothing further can start until a worker exits.
                            break
                        if tasks_started[i]:
                            continue
                        # Strict comparison: a task whose usage equals the
                        # remaining quota is not started.
                        if not gpu_quotas[gpu_id] > task_usages[i]:
                            continue
                        gpu_quotas[gpu_id] -= task_usages[i]
                        args = (task_names[i], split, end, n_iterations, gpu_id,
                                memory_dict, solutions_dict, error_queue)
                        p = multiprocessing.Process(target=solve_task.solve_task, args=args)
                        p.start()
                        processes[i] = p
                        tasks_started[i] = True
                        process_gpu_ids[i] = gpu_id
                        n_running += 1

                # With nothing running, all quotas are back at their initial
                # values; a task that still could not start never will.
                if n_running == 0 and not all(tasks_finished):
                    if not error_queue.empty():
                        raise ValueError(error_queue.get())
                    raise RuntimeError(
                        "parallelize_runs stalled: remaining tasks do not fit "
                        "the quota of any GPU"
                    )

                if poll_interval > 0:
                    time.sleep(poll_interval)

            if not error_queue.empty():
                raise ValueError(error_queue.get())

            # Snapshot the shared dicts before the manager shuts down.
            memory_dict = dict(memory_dict)
            solutions_dict = dict(solutions_dict)
        finally:
            # On the normal path every worker has already exited. On an error
            # path, stop the survivors so they do not keep holding GPU memory.
            for p in processes:
                if p is not None and p.is_alive():
                    p.terminate()
                    p.join(timeout=5)

    time_taken = time.time() - t

    return memory_dict, solutions_dict, time_taken

if __name__ == '__main__':
    # First element of torch.cuda.mem_get_info() for each visible GPU
    # (the free memory in bytes).
    gpu_memory_quotas = [torch.cuda.mem_get_info(gpu)[0] for gpu in range(n_gpus)]

    # Pass 1: two iterations per task, only to collect memory_dict.
    # Each GPU gets one slot per 4 GiB of its quota; every task costs one slot.
    gpu_task_quotas = [int(free_bytes // (4 * 1024**3)) for free_bytes in gpu_memory_quotas]
    task_usages = [1] * n_tasks
    memory_dict, _, _ = parallelize_runs(gpu_task_quotas, task_usages, 2)

    # Order tasks by recorded value, largest first, and keep names and
    # values as two aligned tuples.
    tasks = sorted(memory_dict.items(), key=lambda entry: entry[1], reverse=True)
    task_names, task_memory_usages = zip(*tasks)

    # Pass 2: a short timed run, scheduled by the recorded per-task values,
    # with 6 GiB held back on every GPU.
    if fake_mode:
        test_steps = 5
    else:
        test_steps = 20
    mq = [free_bytes - 6 * 1024**3 for free_bytes in gpu_memory_quotas]
    _, _, time_taken = parallelize_runs(mq, task_memory_usages, test_steps)

    # Pass 3: spend the remaining budget, sized from the measured time per step.
    # NOTE(review): n_steps can be <= 0 once the deadline has passed - confirm
    # that solve_task tolerates that.
    time_per_step = time_taken / test_steps
    time_left = end - time.time()
    if fake_mode:
        n_steps = 5
    else:
        n_steps = int(time_left // time_per_step)
    _, solutions_dict, time_taken = parallelize_runs(mq, task_memory_usages, n_steps)

    with open('submission.json', 'w') as f:
        json.dump(solutions_dict, f, indent=4)