In [1]:
import csv
import math
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
import os
import time
import uproot

def get_total_size(path, n_files):
    """Return the combined size, in GiB, of the first ``n_files`` entries of ``path``.

    Directory entries are taken in sorted name order.

    Parameters
    ----------
    path : str
        Directory containing the files. A trailing separator is optional.
    n_files : int
        Number of leading (sorted) directory entries to add up.

    Returns
    -------
    float
        Total size in GiB, i.e. bytes divided by ``2**30``.

    Raises
    ------
    IndexError
        If ``n_files`` is larger than the number of entries in ``path``.
    """
    filenames = sorted(os.listdir(path))
    # os.path.join (rather than ``path + name``) keeps the lookup correct
    # whether or not ``path`` ends with a separator.
    total_bytes = sum(
        os.path.getsize(os.path.join(path, filenames[i])) for i in range(n_files)
    )
    return total_bytes / (2**30)

def col_average(data):
    """Average every column of ``data`` over all rows except the first.

    Row 0 is left out of the sums and the divisor is ``len(data) - 1``.
    The number of columns is taken from the first row.

    Returns a list holding one mean per column. Raises ``ZeroDivisionError``
    when ``data`` has a single row and at least one column.
    """
    row_count = len(data)
    width = len(data[0])
    body_rows = range(1, row_count)
    means = []
    for col in range(width):
        column_total = sum(data[row][col] for row in body_rows)
        means.append(column_total / (row_count - 1))
    return means

def col_standard_deviation(data):
    """Standard deviation of every column of ``data``, ignoring the first row.

    For each column the squared distance to the column mean (as returned by
    ``col_average``) is summed over rows ``1 .. len(data) - 1``, divided by
    ``len(data) - 1`` and square-rooted.

    Returns a list holding one value per column of the first row.
    """
    row_count = len(data)
    width = len(data[0])
    means = col_average(data)
    deviations = []
    for col in range(width):
        squared_error = sum(
            (data[row][col] - means[col])**2 for row in range(1, row_count)
        )
        deviations.append((squared_error / (row_count - 1))**0.5)
    return deviations

    
def partition_helper(slice_entries, file_entries, file_curr, entry_curr):
    """Find where a slice of ``slice_entries`` entries ends.

    Starting at entry ``entry_curr`` of file ``file_curr``, entries are
    consumed file by file (``file_entries[k]`` is the entry count of file
    ``k``) until the slice is full.

    Returns ``[file index, entry stop]``: the file in which the slice ends
    and the stop offset inside that file. When the remaining files hold
    fewer entries than requested, the result is the last file index together
    with ``file_entries[-1]``.
    """
    remaining = slice_entries
    file_idx = file_curr
    offset = entry_curr
    last_idx = len(file_entries) - 1
    while True:
        # The slice fits into what is left of the current file.
        if remaining <= file_entries[file_idx] - offset:
            return [file_idx, remaining + offset]
        # Out of files: stop at the end of the last one.
        if file_idx == last_idx:
            return [file_idx, file_entries[-1]]
        # Use up the rest of this file and continue at the top of the next.
        remaining = remaining - file_entries[file_idx] + offset
        file_idx += 1
        offset = 0

def partition(files, n_processes):
    """Split the entries of ``files`` into ``n_processes`` consecutive slices.

    Each slice is sized ``ceil(total entries / n_processes)`` and begins
    where the previous one stopped; ``partition_helper`` locates each end.

    Parameters
    ----------
    files : sequence
        Objects exposing a ``num_entries`` attribute.
    n_processes : int
        Number of slices to produce.

    Returns
    -------
    list
        One ``[file_start, entry_start, file_stop, entry_stop]`` list per slice.
    """
    entries_per_file = [tree.num_entries for tree in files]
    per_slice = math.ceil(sum(entries_per_file) / n_processes)
    boundaries = []
    begin_file, begin_entry = 0, 0
    for _ in range(n_processes):
        end_file, end_entry = partition_helper(per_slice, entries_per_file, begin_file, begin_entry)
        boundaries.append([begin_file, begin_entry, end_file, end_entry])
        # The next slice starts exactly where this one ended.
        begin_file, begin_entry = end_file, end_entry
    return boundaries

def read_slice(files, slices, index, data):
    """Read the ``index``-th slice and append its selected masses to ``data``.

    ``slices[index]`` is ``[first file, first entry, last file, entry stop]``.
    Every file from the first to the last one (inclusive) is read; the entry
    window is only restricted at the start of the first file and at the end
    of the last file. The ``candidate_vMass`` arrays passing the selection
    are concatenated and appended to ``data`` as one array.
    """
    bounds = slices[index]
    # The clauses of the cut are separated by a fixed run of 42 blanks in the
    # text handed to ``arrays``.
    clause_gap = " " * 42
    selection = clause_gap.join((
        "(candidate_charge == 0)",
        "& (candidate_cosAlpha > 0.99)",
        "& (candidate_lxy / candidate_lxyErr > 3.0)",
        "& (candidate_vProb > 0.05)",
        "& (ditrack_mass > 1.014) & (ditrack_mass < 1.024)",
        "& (candidate_vMass > 5.33) & (candidate_vMass < 5.4)",
    ))
    chunks = []
    for file_idx in range(bounds[0], bounds[2] + 1):
        window_start = bounds[1] if file_idx == bounds[0] else None
        window_stop = bounds[3] if file_idx == bounds[2] else None
        branches = files[file_idx].arrays(
            "candidate_vMass",
            selection,
            entry_start=window_start,
            entry_stop=window_stop,
            array_cache=None,
            library="np",
        )
        chunks.append(branches["candidate_vMass"])
    data.append(np.concatenate(tuple(chunks)))

def runtime_measure_mp(path, n_files, n_processes):
    """Time a multiprocess read of the first ``n_files`` files in ``path``.

    The files (sorted by name) are opened in the parent process, their
    entries are split into ``n_processes`` slices with ``partition`` and one
    worker process per slice runs ``read_slice``. The measured interval runs
    from just before the files are opened until the per-slice arrays have
    been concatenated in the parent.

    Parameters
    ----------
    path : str
        Directory with the input files. A trailing separator is optional.
    n_files : int
        Number of files to read; ``0`` returns ``0`` immediately.
    n_processes : int
        Number of worker processes; ``0`` falls back to ``runtime_measure``.

    Returns
    -------
    float or int
        Elapsed wall-clock seconds (``0`` when ``n_files == 0``).
    """
    if n_files == 0: return 0
    if n_processes == 0: return runtime_measure(path, n_files)
    start = time.time()
    files = [uproot.open(path=os.path.join(path, filename) + ":rootuple/CandidateTree", object_cache=None, array_cache=None) for filename in sorted(os.listdir(path))[:n_files]]
    try:
        slices = partition(files, n_processes)
        # The context manager shuts the manager's server process down
        # deterministically instead of leaving that to garbage collection.
        with multiprocessing.Manager() as manager:
            data = manager.list()
            processes = []
            for i in range(n_processes):
                p = multiprocessing.Process(target=read_slice, args=[files, slices, i, data])
                p.start()
                processes.append(p)

            for p in processes:
                p.join()

            # The merge is part of the measured work; its result is not needed.
            np.concatenate(tuple(data))

            # Stop the clock before any cleanup so teardown is not timed.
            elapsed = time.time() - start
    finally:
        # Release the file handles opened above; repeated benchmark calls
        # would otherwise keep accumulating open files.
        for tree in files:
            tree.close()

    return elapsed

def runtime_measure(path, n_files):
    """Time a single-process read of the first ``n_files`` files in ``path``.

    All files (sorted by name) are opened first, then the selected
    ``candidate_vMass`` values are read from each one and the per-file
    arrays are concatenated. The measured interval covers opening, reading
    and concatenating.

    Parameters
    ----------
    path : str
        Directory with the input files. A trailing separator is optional.
    n_files : int
        Number of files to read; ``0`` returns ``0`` immediately.

    Returns
    -------
    float or int
        Elapsed wall-clock seconds (``0`` when ``n_files == 0``).
    """
    if n_files == 0: return 0
    start = time.time()
    files = [uproot.open(path=os.path.join(path, filename) + ":rootuple/CandidateTree", object_cache=None, array_cache=None) for filename in sorted(os.listdir(path))[:n_files]]
    # The clauses of the cut are separated by a fixed run of 30 blanks in the
    # text handed to ``arrays``.
    clause_gap = " " * 30
    selection = clause_gap.join((
        "(candidate_charge == 0)",
        "& (candidate_cosAlpha > 0.99)",
        "& (candidate_lxy / candidate_lxyErr > 3.0)",
        "& (candidate_vProb > 0.05)",
        "& (ditrack_mass > 1.014) & (ditrack_mass < 1.024)",
        "& (candidate_vMass > 5.33) & (candidate_vMass < 5.4)",
    ))
    try:
        data = []
        for file in files:
            data.append(file.arrays("candidate_vMass",
                                    selection,
                                    array_cache=None,
                                    library="np")["candidate_vMass"])

        # The merge is part of the measured work; its result is not needed.
        np.concatenate(tuple(data))

        # Stop the clock before cleanup so closing files is not timed.
        elapsed = time.time() - start
    finally:
        # Release the file handles opened above; repeated benchmark calls
        # would otherwise keep accumulating open files.
        for file in files:
            file.close()

    return elapsed

def runtime_vs_variable(path, target_dir, measure_function, variable, step, n_loops, var_max, constant=None):
    """Sweep one benchmark variable and write the measured runtimes to a CSV file.

    The swept values are ``range(0, var_max + step, step)``. The first CSV
    row holds the x values: the swept values themselves, or, when
    ``variable`` contains ``"size"``, ``get_total_size(path, value)`` for
    each of them. Each of the ``n_loops`` following rows holds one
    measurement per swept value and is appended as soon as it is complete.

    NOTE: when ``var_max`` is not a multiple of ``step`` the last swept
    value is larger than ``var_max``.

    Parameters
    ----------
    path : str
        Passed through to ``measure_function`` (and ``get_total_size``).
    target_dir : str
        Directory for the CSV file; created if missing.
    measure_function : callable
        Called as ``measure_function(path, value)`` without ``constant``.
        With ``constant`` it is called as
        ``measure_function(path, value, constant)`` when ``variable``
        contains ``"size"`` and as
        ``measure_function(path, constant, value)`` otherwise.
    variable : str
        Name of the swept variable; used in the file name.
    step, var_max : int
        Define the swept values (see above).
    n_loops : int
        Number of measurement rows to record.
    constant : int, optional
        Fixed value for the other argument of ``measure_function``. ``0`` is
        a valid value; only ``None`` means "no constant".
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(target_dir, exist_ok=True)

    # Compare against None: a constant of 0 must still select the
    # three-argument call and the file name that includes the constant.
    has_constant = constant is not None
    sweeps_size = "size" in variable

    if has_constant:
        result_path = "%s/runtime_vs_%s_%d_%d_%d_%d.csv" % (target_dir, variable, constant, var_max, step, n_loops)
    else:
        result_path = "%s/runtime_vs_%s_%d_%d_%d.csv" % (target_dir, variable, var_max, step, n_loops)

    values = list(range(0, var_max + step, step))
    x = [get_total_size(path, value) for value in values] if sweeps_size else values

    with open(result_path, "w", newline="") as f:
        csv.writer(f).writerow(x)
    for _ in range(n_loops):
        y = []
        for value in values:
            if not has_constant:
                args = (path, value)
            elif sweeps_size:
                args = (path, value, constant)
            else:
                args = (path, constant, value)
            y.append(measure_function(*args))
        # Append after every loop so finished rows survive an interrupted run.
        with open(result_path, "a", newline="") as f:
            csv.writer(f).writerow(y)
    



In [2]:
# Directory holding the input ROOT files (note the trailing separator).
path = "../data/128_files/"
# Output directory for benchmark results; presumably passed as ``target_dir``
# to runtime_vs_variable -- confirm against the cells that run the sweeps.
target_dir = "runtime_tests_uproot/test"

In [3]:
def get_num_entries(path, n_files):
    """Return the total number of ``rootuple/CandidateTree`` entries in the first ``n_files`` files.

    Files are taken from ``path`` in sorted name order.

    Parameters
    ----------
    path : str
        Directory with the input files. A trailing separator is optional.
    n_files : int
        Number of leading (sorted) files to include.

    Returns
    -------
    int
        Sum of ``num_entries`` over the selected files.

    Raises
    ------
    IndexError
        If ``n_files`` is larger than the number of entries in ``path``.
    """
    filenames = sorted(os.listdir(path))
    num_entries = 0
    for i in range(n_files):
        # Open each file in a ``with`` block so its handle is released as soon
        # as the count has been read, instead of keeping n_files files open.
        with uproot.open(os.path.join(path, filenames[i]) + ":rootuple/CandidateTree") as tree:
            num_entries += tree.num_entries
    return num_entries

In [4]:
# Total number of CandidateTree entries across the first 128 files (sorted by name).
get_num_entries(path, 128)

457860912

In [6]:
# Combined on-disk size, in GiB, of the first 128 files (sorted by name).
get_total_size(path, 128)

126.77721692249179

In [5]:
def partition_files(path, n_files, n_processes):
    """Distribute the first ``n_files`` files of ``path`` over ``n_processes`` groups.

    Files are taken in sorted name order and handed out in contiguous runs.
    Every group gets ``n_files // n_processes`` files and the first
    ``n_files % n_processes`` groups get one extra. Groups that would be
    empty (more processes than files) are not emitted.

    Parameters
    ----------
    path : str
        Directory with the input files. A trailing separator is optional.
    n_files : int
        Number of leading (sorted) files to distribute.
    n_processes : int
        Maximum number of groups.

    Returns
    -------
    list of list of str
        One list of file paths per non-empty group.

    Raises
    ------
    IndexError
        If ``n_files`` is larger than the number of entries in ``path``.
    """
    filenames = sorted(os.listdir(path))
    if n_processes <= 0:
        # No groups requested (also keeps the division below safe).
        return []
    base, extra = divmod(n_files, n_processes)
    partitions = []
    cursor = 0
    for i in range(n_processes):
        if i >= n_files: break
        group_size = base + 1 if i < extra else base
        # os.path.join (rather than ``path + name``) keeps the paths valid
        # whether or not ``path`` ends with a separator.
        partitions.append([os.path.join(path, filenames[cursor + j]) for j in range(group_size)])
        cursor += group_size
    return partitions

In [6]:
# partition_files(path, 128, 64)

In [14]:
def read_slice_detailed(files, slices, index, data, process_runtimes):
    """Read the ``index``-th slice and record how long the read took.

    Works like ``read_slice``: ``slices[index]`` is
    ``[first file, first entry, last file, entry stop]``, every file in that
    range is read with the entry window restricted only at the two ends, and
    the concatenated ``candidate_vMass`` values are appended to ``data``.
    In addition, the elapsed wall-clock seconds are stored in
    ``process_runtimes`` under the key ``"P<index>"``.
    """
    label = "P%d" % (index)
    # The clauses of the cut are separated by a fixed run of 42 blanks in the
    # text handed to ``arrays``.
    clause_gap = " " * 42
    selection = clause_gap.join((
        "(candidate_charge == 0)",
        "& (candidate_cosAlpha > 0.99)",
        "& (candidate_lxy / candidate_lxyErr > 3.0)",
        "& (candidate_vProb > 0.05)",
        "& (ditrack_mass > 1.014) & (ditrack_mass < 1.024)",
        "& (candidate_vMass > 5.33) & (candidate_vMass < 5.4)",
    ))
    began = time.time()
    bounds = slices[index]
    chunks = []
    for file_idx in range(bounds[0], bounds[2] + 1):
        window_start = bounds[1] if file_idx == bounds[0] else None
        window_stop = bounds[3] if file_idx == bounds[2] else None
        branches = files[file_idx].arrays(
            "candidate_vMass",
            selection,
            entry_start=window_start,
            entry_stop=window_stop,
            array_cache=None,
            library="np",
        )
        chunks.append(branches["candidate_vMass"])
    data.append(np.concatenate(tuple(chunks)))
    # The timed span includes handing the result over to ``data``.
    process_runtimes[label] = time.time() - began

In [15]:
def runtime_measure_mp_detailed(path, n_files, n_processes):
    """Time a multiprocess read and report a breakdown of where the time went.

    Same procedure as ``runtime_measure_mp`` (open the first ``n_files``
    sorted files, ``partition`` their entries, one worker per slice), but
    each worker runs ``read_slice_detailed`` and so reports its own runtime.

    Parameters
    ----------
    path : str
        Directory with the input files. A trailing separator is optional.
    n_files : int
        Number of files to read; ``0`` returns ``(0, 0, {})`` immediately,
        mirroring the early return of ``runtime_measure_mp``.
    n_processes : int
        Number of worker processes. Must be positive: ``partition`` divides
        by it.

    Returns
    -------
    tuple
        ``(runtime, runtime2, process_runtimes)``: total elapsed seconds,
        seconds spent opening the files, and a plain dict mapping
        ``"P<index>"`` to the seconds each worker reported.
    """
    if n_files == 0: return 0, 0, {}

    start_time = time.time()

    start_time2 = time.time()
    files = [uproot.open(path=os.path.join(path, filename) + ":rootuple/CandidateTree", object_cache=None, array_cache=None) for filename in sorted(os.listdir(path))[:n_files]]
    runtime2 = time.time() - start_time2

    try:
        slices = partition(files, n_processes)

        # One manager serves both shared containers (a second server process
        # is pure overhead), and the context manager shuts it down on exit.
        with multiprocessing.Manager() as manager:
            data = manager.list()
            process_runtimes = manager.dict()

            processes = []
            for i in range(n_processes):
                p = multiprocessing.Process(target=read_slice_detailed, args=[files, slices, i, data, process_runtimes])
                p.start()
                processes.append(p)

            for p in processes:
                p.join()

            # The merge is part of the measured work; its result is not needed.
            np.concatenate(tuple(data))

            runtime = time.time() - start_time

            # Copy out of the proxy while the manager is still running.
            per_process = dict(process_runtimes)
    finally:
        # Release the file handles opened above.
        for tree in files:
            tree.close()

    return runtime, runtime2, per_process

In [16]:
# Read all 128 files with 64 worker processes; the result is
# (total runtime, time spent opening the files, per-process runtimes).
runtime_measure_mp_detailed(path, 128, 64)

(16.877119302749634,
 8.633413076400757,
 {'P0': 2.3061718940734863,
  'P2': 3.4635045528411865,
  'P1': 3.560978651046753,
  'P11': 3.4315545558929443,
  'P9': 3.452666997909546,
  'P10': 3.482663154602051,
  'P3': 3.7990365028381348,
  'P13': 3.357825517654419,
  'P8': 3.8803114891052246,
  'P4': 3.9433705806732178,
  'P18': 3.3946096897125244,
  'P12': 3.7330987453460693,
  'P19': 3.470576047897339,
  'P15': 3.5607736110687256,
  'P14': 3.622859239578247,
  'P17': 3.722782611846924,
  'P20': 3.6657941341400146,
  'P22': 3.6409919261932373,
  'P21': 3.662630081176758,
  'P5': 4.456023216247559,
  'P31': 3.5433173179626465,
  'P36': 3.47725248336792,
  'P6': 4.542925834655762,
  'P25': 3.8495779037475586,
  'P33': 3.6438939571380615,
  'P34': 3.686098337173462,
  'P7': 4.714405536651611,
  'P27': 3.9191792011260986,
  'P35': 3.76322603225708,
  'P24': 4.036141395568848,
  'P37': 3.7718703746795654,
  'P28': 4.018613815307617,
  'P26': 4.089170694351196,
  'P39': 3.79947829246521,
  'P

In [17]:
# Same measurement with 32 worker processes; the result is
# (total runtime, time spent opening the files, per-process runtimes).
runtime_measure_mp_detailed(path, 128, 32)

(15.391176700592041,
 9.195393323898315,
 {'P0': 4.074832201004028,
  'P31': 4.035709619522095,
  'P3': 4.826854944229126,
  'P1': 4.847188711166382,
  'P6': 4.850580453872681,
  'P2': 4.8972742557525635,
  'P4': 4.8943421840667725,
  'P5': 4.906720161437988,
  'P9': 4.908140659332275,
  'P7': 4.941329717636108,
  'P11': 4.930885076522827,
  'P10': 4.960983037948608,
  'P14': 4.997405290603638,
  'P19': 4.968256711959839,
  'P12': 5.030791282653809,
  'P21': 4.97875714302063,
  'P20': 4.9986937046051025,
  'P17': 5.030205011367798,
  'P13': 5.075993299484253,
  'P18': 5.039189577102661,
  'P15': 5.067828416824341,
  'P16': 5.081832647323608,
  'P28': 4.989311695098877,
  'P22': 5.048276662826538,
  'P26': 5.028354167938232,
  'P23': 5.069641351699829,
  'P24': 5.064610958099365,
  'P27': 5.046087980270386,
  'P29': 5.042645692825317,
  'P30': 5.045025110244751,
  'P25': 5.090800046920776,
  'P8': 5.949615240097046})

In [25]:
# Number of logical CPUs reported by the OS, for comparison with the worker counts used in the runs.
os.cpu_count()

32