In [4]:
import os
import random
import struct
import time

Создание файла

In [2]:
file_size = 2 * 1024 * 1024 * 1024
file_name = "random_integers.bin"
num_integers = file_size // 4

with open(file_name, "wb") as f:
    for _ in range(num_integers):
        random_int = random.randint(0, 0xFFFFFFFF)
        f.write(struct.pack('>I', random_int))

Простое последовательное чтение

In [5]:
def process_binary_file(file_name):
    total_sum, min_value, max_value = 0, 2 ** 32 - 1, 0
    with open(file_name, 'rb') as f:
        while chunk := f.read(4):
            x = struct.unpack('>I', chunk)[0]
            total_sum += x
            min_value, max_value = min(x, min_value), max(x, max_value)
    return total_sum, min_value, max_value

In [6]:
start_time = time.time()
total_sum, min_value, max_value = process_binary_file(file_name)
end_time = time.time()
print(f'Сумма: {total_sum}, Минимум: {min_value}, Максимум: {max_value}')
print(f"Время выполнения: {end_time - start_time} секунд")

Сумма: 1152907175663991962, Минимум: 12, Максимум: 4294967276
Время выполнения: 421.2669394016266 секунд


Многопоточная + memory-mapped files. Сравните время работы.

In [7]:
import mmap
import threading

def process_chunk(data, results, index):
    total_sum = 0
    min_value = 2**32 - 1
    max_value = 0

    for i in range(0, len(data), 4):
        x = struct.unpack('>I', data[i:i+4])[0]
        total_sum += x
        min_value, mx = min(x, min_value), max(x, max_value)

    results[index] = (total_sum, min_value, max_value)

def process_binary_file_parallel(file_name):
    with open(file_name, 'rb') as f:
        file_size = os.path.getsize(file_name)
        chunk_size = file_size // os.cpu_count()

        with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mm:
            threads = []
            results = [None] * os.cpu_count()

            for i in range(os.cpu_count()):
                start = i * chunk_size
                end = start + chunk_size if i != os.cpu_count() - 1 else file_size
                thread = threading.Thread(target=process_chunk, args=(mm[start:end], results, i))
                threads.append(thread)
                thread.start()

            for thread in threads:
                thread.join()

    total_sum = sum(result[0] for result in results)
    min_value = min(result[1] for result in results)
    max_value = max(result[2] for result in results)

    return total_sum, min_value, max_value

In [8]:
start_time = time.time()
total_sum, min_value, max_value = process_binary_file_parallel(file_name)
end_time = time.time()
print(f'Сумма: {total_sum}, Минимум: {min_value}, Максимум: {max_value}')
print(f"Время выполнения: {end_time - start_time} секунд")

Сумма: 1152907175663991962, Минимум: 12, Максимум: 0
Время выполнения: 412.33127069473267 секунд
