In [110]:
import csv
import math
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
import os
import time
import uproot

def get_total_size(path, n_files):
    """Return the combined size, in GiB, of the first ``n_files`` files in ``path``.

    Files are taken in the order given by ``sorted(os.listdir(path))``, i.e.
    lexicographic name order.

    Args:
        path: Directory to scan. A trailing path separator is optional.
        n_files: How many leading entries of the sorted listing to include.

    Returns:
        The summed file size divided by 2**30 (``0.0`` when ``n_files`` is 0).

    Raises:
        IndexError: If ``n_files`` exceeds the number of directory entries.
    """
    filenames = sorted(os.listdir(path))
    # os.path.join instead of ``path + name`` so the lookup also works when
    # ``path`` is given without a trailing separator.
    total_bytes = sum(
        os.path.getsize(os.path.join(path, filenames[i])) for i in range(n_files)
    )
    return total_bytes / (2**30)

def col_average(data):
    """Compute the mean of every column of ``data``, leaving out the first row.

    Args:
        data: Row-major table (sequence of equally long rows). Row 0 is not
            part of the average (presumably a header / x-axis row — confirm
            against the caller).

    Returns:
        A list with one mean per column of ``data[0]``.

    Raises:
        ZeroDivisionError: If ``data`` has a single row and at least one column.
    """
    body_rows = len(data) - 1
    width = len(data[0])
    means = []
    for col in range(width):
        # Gather the column values of rows 1..end, then average them.
        column = [data[row][col] for row in range(1, body_rows + 1)]
        means.append(sum(column) / body_rows)
    return means

def col_standard_deviation(data):
    """Compute the standard deviation of every column, leaving out the first row.

    The squared deviations from the column mean (as given by ``col_average``)
    are divided by the number of rows used, i.e. this is the population form.

    Args:
        data: Row-major table (sequence of equally long rows); row 0 is skipped.

    Returns:
        A list with one standard deviation per column of ``data[0]``.
    """
    body_rows = len(data) - 1
    width = len(data[0])
    centre = col_average(data)
    deviations = []
    for col in range(width):
        squared = [(data[row][col] - centre[col])**2 for row in range(1, body_rows + 1)]
        deviations.append((sum(squared) / body_rows)**0.5)
    return deviations

    
def partition_helper(slice_entries, file_entries, file_curr, entry_curr):
    """Find where a slice of ``slice_entries`` entries ends.

    Starting at entry ``entry_curr`` of file ``file_curr``, walk forward through
    the files until the requested number of entries has been consumed.

    Args:
        slice_entries: Number of entries the slice should contain.
        file_entries: Entry count of every file, in file order.
        file_curr: Index of the file in which the slice starts.
        entry_curr: Entry offset inside ``file_curr`` at which the slice starts.

    Returns:
        ``[file_index, entry_index]`` marking the end of the slice. If the files
        run out first, the end of the last file is returned.
    """
    last_file = len(file_entries) - 1
    remaining = slice_entries
    file_idx = file_curr
    offset = entry_curr
    while True:
        if remaining <= file_entries[file_idx] - offset:
            # The slice ends inside the current file.
            return [file_idx, remaining + offset]
        if file_idx == last_file:
            # Out of files: clamp to the end of the last one.
            return [file_idx, file_entries[-1]]
        # Consume the rest of this file and continue at the start of the next.
        remaining = remaining - file_entries[file_idx] + offset
        file_idx += 1
        offset = 0

def partition(files, n_processes):
    """Split the entries of ``files`` into ``n_processes`` consecutive slices.

    Every slice is sized ``ceil(total_entries / n_processes)`` and begins
    exactly where the previous slice ended.

    Args:
        files: Sequence of objects exposing a ``num_entries`` attribute.
        n_processes: Number of slices to produce.

    Returns:
        A list of ``[file_start, entry_start, file_stop, entry_stop]`` lists,
        one per process.
    """
    entries_per_file = [tree.num_entries for tree in files]
    per_slice = math.ceil(sum(entries_per_file) / n_processes)
    slices = []
    file_pos, entry_pos = 0, 0
    for _ in range(n_processes):
        end = partition_helper(per_slice, entries_per_file, file_pos, entry_pos)
        slices.append([file_pos, entry_pos] + end)
        # The next slice starts where this one stopped.
        file_pos, entry_pos = end[-2], end[-1]
    return slices

def read_slice(files, slices, index, data):
    """Read the ``candidate_vMass`` values of one slice and append them to ``data``.

    Args:
        files: Indexable collection of tree objects providing ``arrays(...)``.
        slices: List of ``[file_start, entry_start, file_stop, entry_stop]``
            lists, as built by ``partition``.
        index: Which slice of ``slices`` to read.
        data: List-like collector; receives one concatenated NumPy array.
    """
    bounds = slices[index]
    first_file, last_file = bounds[0], bounds[2]
    # Cut expression handed to ``arrays`` as its second positional argument.
    # The indentation of the continuation lines is part of the string literal
    # and is deliberately left exactly as it was.
    selection = "(candidate_charge == 0)\
                                          & (candidate_cosAlpha > 0.99)\
                                          & (candidate_lxy / candidate_lxyErr > 3.0)\
                                          & (candidate_vProb > 0.05)\
                                          & (ditrack_mass > 1.014) & (ditrack_mass < 1.024)\
                                          & (candidate_vMass > 5.33) & (candidate_vMass < 5.4)"
    pieces = []
    for file_no in range(first_file, last_file + 1):
        # Only the first/last file of the slice is read partially.
        start = bounds[1] if file_no == first_file else None
        stop = bounds[3] if file_no == last_file else None
        selected = files[file_no].arrays(
            "candidate_vMass",
            selection,
            entry_start=start,
            entry_stop=stop,
            array_cache=None,
            library="np",
        )
        pieces.append(selected["candidate_vMass"])
    data.append(np.concatenate(tuple(pieces)))

def runtime_measure_mp(path, n_files, n_processes):
    """Time a multi-process read of the first ``n_files`` files in ``path``.

    The measured span covers opening the files, partitioning the entries,
    running one ``read_slice`` worker per slice and concatenating the results.

    Args:
        path: Directory containing the input files (trailing separator optional).
        n_files: Number of files (in sorted name order) to read.
        n_processes: Number of worker processes. ``0`` falls back to the
            single-process ``runtime_measure``.

    Returns:
        Elapsed wall-clock time in seconds, or ``0`` when ``n_files`` is 0.
    """
    if n_files == 0:
        return 0
    if n_processes == 0:
        return runtime_measure(path, n_files)

    start = time.time()
    files = []
    try:
        for filename in sorted(os.listdir(path))[:n_files]:
            files.append(uproot.open(path=os.path.join(path, filename) + ":rootuple/CandidateTree",
                                     object_cache=None, array_cache=None))
        slices = partition(files, n_processes)

        # Context-managed so the manager's server process is shut down
        # deterministically instead of whenever the proxy is garbage collected.
        with multiprocessing.Manager() as manager:
            data = manager.list()
            processes = []
            for i in range(n_processes):
                p = multiprocessing.Process(target=read_slice, args=[files, slices, i, data])
                p.start()
                processes.append(p)

            for p in processes:
                p.join()

            # Result is discarded on purpose: only the cost of building it is measured.
            np.concatenate(tuple(data))

            # Stop the clock before the manager shutdown and the file clean-up.
            elapsed = time.time() - start
        return elapsed
    finally:
        # Release the file handles; repeated benchmark calls otherwise keep
        # every opened file alive.
        for tree in files:
            tree.close()

def runtime_measure(path, n_files):
    """Time a single-process read of the first ``n_files`` files in ``path``.

    The measured span covers opening the files, reading ``candidate_vMass``
    with the cut applied from every file, and concatenating the results.

    Args:
        path: Directory containing the input files (trailing separator optional).
        n_files: Number of files (in sorted name order) to read.

    Returns:
        Elapsed wall-clock time in seconds, or ``0`` when ``n_files`` is 0.
    """
    if n_files == 0:
        return 0

    # Cut expression handed to ``arrays`` as its second positional argument.
    # The indentation of the continuation lines is part of the string literal
    # and is deliberately left exactly as it was.
    selection = "(candidate_charge == 0)\
                              & (candidate_cosAlpha > 0.99)\
                              & (candidate_lxy / candidate_lxyErr > 3.0)\
                              & (candidate_vProb > 0.05)\
                              & (ditrack_mass > 1.014) & (ditrack_mass < 1.024)\
                              & (candidate_vMass > 5.33) & (candidate_vMass < 5.4)"

    start = time.time()
    files = []
    try:
        for filename in sorted(os.listdir(path))[:n_files]:
            files.append(uproot.open(path=os.path.join(path, filename) + ":rootuple/CandidateTree",
                                     object_cache=None, array_cache=None))
        data = []
        for file in files:
            data.append(file.arrays("candidate_vMass",
                                    selection,
                                    array_cache=None,
                                    library="np")["candidate_vMass"])

        # Result is discarded on purpose: only the cost of building it is measured.
        np.concatenate(tuple(data))

        # Evaluated before the ``finally`` clause runs, so closing is not timed.
        return time.time() - start
    finally:
        # Release the file handles; repeated benchmark calls otherwise keep
        # every opened file alive.
        for tree in files:
            tree.close()

def runtime_vs_variable(path, target_dir, measure_function, variable, step, n_loops, var_max, constant=None):
    """Sweep one variable of a measurement function and store the runtimes as CSV.

    The first CSV row holds the x values, followed by one row of measurements
    per loop. The swept values are ``range(0, var_max + step, step)``; note that
    the last value exceeds ``var_max`` when ``var_max`` is not a multiple of
    ``step``.

    Args:
        path: Input directory forwarded to ``measure_function``.
        target_dir: Directory for the result file; created if missing.
        measure_function: Callable invoked as ``f(path, value)`` when
            ``constant`` is None, as ``f(path, value, constant)`` when
            ``"size"`` occurs in ``variable``, and as
            ``f(path, constant, value)`` otherwise.
        variable: Name of the swept variable; used in the file name. If it
            contains ``"size"`` the x row holds ``get_total_size(path, value)``
            instead of the raw values.
        step: Increment between swept values.
        n_loops: Number of measurement rows to record.
        var_max: Upper bound of the sweep (see the note above).
        constant: Fixed value of the other measurement argument, or None when
            ``measure_function`` takes a single variable. ``0`` is a valid
            constant.
    """
    os.makedirs(target_dir, exist_ok=True)

    # ``is not None`` rather than truthiness: a constant of 0 (for example zero
    # worker processes) must still be forwarded and appear in the file name.
    has_constant = constant is not None
    by_size = "size" in variable
    values = list(range(0, var_max + step, step))

    if has_constant:
        result_path = "%s/runtime_vs_%s_%d_%d_%d_%d.csv" % (target_dir, variable, constant, var_max, step, n_loops)
    else:
        result_path = "%s/runtime_vs_%s_%d_%d_%d.csv" % (target_dir, variable, var_max, step, n_loops)

    x = [get_total_size(path, a) for a in values] if by_size else values

    def measure(value):
        # Place the swept value in the argument slot that matches ``variable``.
        if not has_constant:
            return measure_function(path, value)
        if by_size:
            return measure_function(path, value, constant)
        return measure_function(path, constant, value)

    with open(result_path, "w+", newline="") as f:
        csv.writer(f).writerow(x)
    for n in range(n_loops):
        y = [measure(i) for i in values]
        # Appended after every loop so finished rows survive a later failure.
        with open(result_path, "a+", newline="") as f:
            csv.writer(f).writerow(y)
    



In [None]:
# Input directory with the ROOT files to benchmark. The trailing slash matters
# wherever a file name is appended to this string by plain concatenation.
path = "../data/128_files/"
# Output directory for result files (presumably the `target_dir` argument of
# runtime_vs_variable — no such call is visible in this part of the notebook).
target_dir = "runtime_tests_uproot/test"

In [92]:
def partition_files(path, n_files, n_processes):
    """Distribute the first ``n_files`` files of ``path`` over ``n_processes`` groups.

    Files are taken in ``sorted(os.listdir(path))`` order (lexicographic, so
    ``file10`` sorts before ``file2``). Group sizes differ by at most one, with
    the larger groups first. When there are more processes than files, only
    ``n_files`` single-file groups are returned.

    Args:
        path: Directory containing the files (trailing separator optional).
        n_files: Number of files to distribute.
        n_processes: Number of groups requested.

    Returns:
        A list of lists of file paths; empty if ``n_files`` or ``n_processes``
        is not positive.

    Raises:
        IndexError: If ``n_files`` exceeds the number of directory entries.
    """
    filenames = sorted(os.listdir(path))
    partitions = []
    if n_processes <= 0:
        return partitions

    base, extra = divmod(n_files, n_processes)
    cursor = 0
    for i in range(min(n_processes, n_files)):
        # The first ``extra`` groups take one additional file each.
        size = base + 1 if i < extra else base
        # os.path.join instead of ``path + name`` so a missing trailing
        # separator on ``path`` no longer yields broken paths.
        partitions.append([os.path.join(path, filenames[k]) for k in range(cursor, cursor + size)])
        cursor += size
    return partitions

In [93]:
# Quick check: 128 files over 64 groups should give two files per group.
# File names come back in lexicographic order (file10 directly follows file1).
partition_files(path, 128, 64)

[['../data/128_files/file0.root', '../data/128_files/file1.root'],
 ['../data/128_files/file10.root', '../data/128_files/file100.root'],
 ['../data/128_files/file101.root', '../data/128_files/file102.root'],
 ['../data/128_files/file103.root', '../data/128_files/file104.root'],
 ['../data/128_files/file105.root', '../data/128_files/file106.root'],
 ['../data/128_files/file107.root', '../data/128_files/file108.root'],
 ['../data/128_files/file109.root', '../data/128_files/file11.root'],
 ['../data/128_files/file110.root', '../data/128_files/file111.root'],
 ['../data/128_files/file112.root', '../data/128_files/file113.root'],
 ['../data/128_files/file114.root', '../data/128_files/file115.root'],
 ['../data/128_files/file116.root', '../data/128_files/file117.root'],
 ['../data/128_files/file118.root', '../data/128_files/file119.root'],
 ['../data/128_files/file12.root', '../data/128_files/file120.root'],
 ['../data/128_files/file121.root', '../data/128_files/file122.root'],
 ['../data/12

In [171]:
def open_partition(partition, files):
    """Open the ``rootuple/CandidateTree`` object of each file and collect it.

    Args:
        partition: Iterable of file paths to open.
        files: List-like collector; every opened object is appended to it.
    """
    tree_suffix = ":rootuple/CandidateTree"
    for filename in partition:
        tree = uproot.open(path=filename + tree_suffix)
        files.append(tree)

In [172]:
def open_mp(path, n_files, n_processes):
    """Open the first ``n_files`` files of ``path`` using parallel worker processes.

    Args:
        path: Directory containing the files.
        n_files: Number of files to open.
        n_processes: Upper bound on the number of worker processes; one worker
            is started per group returned by ``partition_files``.

    Returns:
        A ``multiprocessing.Manager`` list proxy holding the opened objects.
        No ordering is imposed on the entries: each worker appends on its own.
    """
    partitions = partition_files(path, n_files, n_processes)

    processes = []
    files = multiprocessing.Manager().list()

    # Iterate over the groups actually produced: partition_files returns fewer
    # than n_processes groups when there are more processes than files, and
    # indexing partitions[i] for every i < n_processes raised IndexError.
    for file_group in partitions:
        p = multiprocessing.Process(target=open_partition, args=[file_group, files])
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    return files

In [173]:
def read_slice_detailed(files, slices, index, data, process_runtimes):
    """Read one slice like ``read_slice`` and also record how long it took.

    Args:
        files: Indexable collection of tree objects providing ``arrays(...)``.
        slices: List of ``[file_start, entry_start, file_stop, entry_stop]``
            lists, as built by ``partition``.
        index: Which slice of ``slices`` to read.
        data: List-like collector; receives one concatenated NumPy array.
        process_runtimes: Mapping that receives the elapsed seconds under the
            key ``"P<index>"``.
    """
    label = "P%d" % (index)
    began = time.time()

    bounds = slices[index]
    first_file, last_file = bounds[0], bounds[2]
    # Cut expression handed to ``arrays`` as its second positional argument.
    # The indentation of the continuation lines is part of the string literal
    # and is deliberately left exactly as it was.
    selection = "(candidate_charge == 0)\
                                          & (candidate_cosAlpha > 0.99)\
                                          & (candidate_lxy / candidate_lxyErr > 3.0)\
                                          & (candidate_vProb > 0.05)\
                                          & (ditrack_mass > 1.014) & (ditrack_mass < 1.024)\
                                          & (candidate_vMass > 5.33) & (candidate_vMass < 5.4)"
    pieces = []
    for file_no in range(first_file, last_file + 1):
        # Only the first/last file of the slice is read partially.
        start = bounds[1] if file_no == first_file else None
        stop = bounds[3] if file_no == last_file else None
        selected = files[file_no].arrays(
            "candidate_vMass",
            selection,
            entry_start=start,
            entry_stop=stop,
            array_cache=None,
            library="np",
        )
        pieces.append(selected["candidate_vMass"])
    data.append(np.concatenate(tuple(pieces)))

    # The timing includes the concatenation and the append to ``data``.
    process_runtimes[label] = time.time() - began

In [176]:
def runtime_measure_mp_detailed(path, n_files, n_processes):
    """Time a multi-process read and break the result down by phase and worker.

    Args:
        path: Directory containing the input files (trailing separator optional).
        n_files: Number of files (in sorted name order) to read.
        n_processes: Number of ``read_slice_detailed`` worker processes.

    Returns:
        A tuple ``(runtime, open_runtime, process_runtimes)``: total elapsed
        seconds, seconds spent opening the files, and a dict mapping
        ``"P<index>"`` to the seconds each worker reported.
    """
    start_time = time.time()
    files = []
    try:
        # Files are opened serially here; open_mp is the parallel alternative.
        open_start = time.time()
        for filename in sorted(os.listdir(path))[:n_files]:
            files.append(uproot.open(path=os.path.join(path, filename) + ":rootuple/CandidateTree",
                                     object_cache=None, array_cache=None))
        open_runtime = time.time() - open_start
        slices = partition(files, n_processes)

        # One manager serves both shared containers; starting a second manager
        # process would only add start-up cost to the measured runtime.
        with multiprocessing.Manager() as manager:
            data = manager.list()
            process_runtimes = manager.dict()

            processes = []
            for i in range(n_processes):
                p = multiprocessing.Process(target=read_slice_detailed,
                                            args=[files, slices, i, data, process_runtimes])
                p.start()
                processes.append(p)

            for p in processes:
                p.join()

            # Result is discarded on purpose: only the cost of building it is measured.
            np.concatenate(tuple(data))

            runtime = time.time() - start_time
            # Copy out of the proxy while the manager is still running.
            per_process = dict(process_runtimes)

        return runtime, open_runtime, per_process
    finally:
        # Release the file handles once the measurement is over.
        for tree in files:
            tree.close()

In [181]:
# 64 files read by 128 processes. The result is (total runtime, time spent
# opening the files, per-process read time keyed "P<index>"), all in seconds.
runtime_measure_mp_detailed("../data/64_files/", 64, 128)

(18.007267951965332,
 6.581730604171753,
 {'P0': 2.220172166824341,
  'P10': 2.928131341934204,
  'P14': 3.3595879077911377,
  'P20': 3.2188687324523926,
  'P18': 3.3525984287261963,
  'P28': 3.1237401962280273,
  'P60': 2.441986560821533,
  'P22': 3.387251615524292,
  'P38': 3.0715949535369873,
  'P40': 3.0440187454223633,
  'P32': 3.2606210708618164,
  'P44': 3.0073225498199463,
  'P42': 3.062052011489868,
  'P68': 2.465517520904541,
  'P52': 2.864816188812256,
  'P34': 3.3766918182373047,
  'P36': 3.3515076637268066,
  'P64': 3.3847885131835938,
  'P3': 5.277414321899414,
  'P56': 4.102231979370117,
  'P84': 3.4493367671966553,
  'P33': 4.937404155731201,
  'P5': 5.710078001022339,
  'P17': 5.481387615203857,
  'P21': 5.409633159637451,
  'P29': 5.268514394760132,
  'P62': 4.509320259094238,
  'P31': 5.30276894569397,
  'P35': 5.251354455947876,
  'P1': 6.175487518310547,
  'P39': 5.249784708023071,
  'P45': 5.131251335144043,
  'P37': 5.360960483551025,
  'P69': 4.644705772399902,


In [156]:
# Same measurement for 128 files from `path`, read by 32 processes.
runtime_measure_mp_detailed(path, 128, 32)

(22.489723443984985,
 6.02438497543335,
 {'P6': 6.227224826812744,
  'P1': 6.51708722114563,
  'P0': 6.715727806091309,
  'P12': 6.511242151260376,
  'P7': 6.755060434341431,
  'P10': 6.717219114303589,
  'P16': 6.620693206787109,
  'P3': 6.939175128936768,
  'P31': 6.344878196716309,
  'P4': 7.097124099731445,
  'P2': 7.1530842781066895,
  'P9': 7.081815481185913,
  'P28': 6.612681150436401,
  'P11': 7.140588283538818,
  'P20': 6.920468091964722,
  'P25': 6.847743034362793,
  'P22': 7.042839050292969,
  'P21': 7.065276145935059,
  'P14': 7.3866355419158936,
  'P13': 7.500291347503662,
  'P23': 7.303681373596191,
  'P19': 7.614015579223633,
  'P24': 7.540797472000122,
  'P26': 7.454562664031982,
  'P8': 7.992625713348389,
  'P18': 7.7703938484191895,
  'P17': 7.79569149017334,
  'P5': 8.078985214233398,
  'P29': 7.525783061981201,
  'P15': 7.882227420806885,
  'P27': 8.29872727394104,
  'P30': 8.479958057403564})