In [39]:
import csv
import math
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
import os
import time
import uproot

def get_total_size(path, n_files):
    """Return the combined size, in GiB, of the first ``n_files`` files in ``path``.

    Files are taken in the order produced by ``sorted(os.listdir(path))``,
    i.e. lexicographic order of their names.

    Parameters
    ----------
    path : str
        Directory containing the files. A trailing separator is optional.
    n_files : int
        Number of entries, counted from the start of the sorted listing, to sum.

    Returns
    -------
    float
        Total size in bytes divided by 2**30.

    Raises
    ------
    IndexError
        If ``n_files`` is larger than the number of entries in ``path``.
    """
    filenames = sorted(os.listdir(path))
    # os.path.join (instead of `path + name`) gives a valid file path whether
    # or not `path` ends with a separator.
    total_bytes = sum(os.path.getsize(os.path.join(path, filenames[i])) for i in range(n_files))
    return total_bytes / (2**30)

def col_average(data):
    """Return the per-column mean of ``data``, ignoring its first row.

    Row 0 is skipped (the loop starts at index 1) and the divisor is the
    number of remaining rows. The number of columns is taken from row 0.

    Parameters
    ----------
    data : sequence of sequences of numbers
        Table indexed as ``data[row][column]``.

    Returns
    -------
    list
        One mean per column.
    """
    n_data_rows = len(data) - 1
    averages = []
    for col in range(len(data[0])):
        column_total = sum(data[row][col] for row in range(1, n_data_rows + 1))
        averages.append(column_total / n_data_rows)
    return averages

def col_standard_deviation(data):
    """Return the per-column standard deviation of ``data``, ignoring its first row.

    Row 0 is skipped, matching ``col_average``. The squared deviations are
    divided by the number of remaining rows (no ``N - 1`` correction).

    Parameters
    ----------
    data : sequence of sequences of numbers
        Table indexed as ``data[row][column]``.

    Returns
    -------
    list
        One standard deviation per column.
    """
    n_data_rows = len(data) - 1
    n_cols = len(data[0])
    column_means = col_average(data)
    deviations = []
    for col in range(n_cols):
        squared_error = sum((data[row][col] - column_means[col])**2 for row in range(1, n_data_rows + 1))
        deviations.append((squared_error / n_data_rows)**0.5)
    return deviations

    
def partition_helper(slice_entries, file_entries, file_curr, entry_curr):
    """Find where a slice of ``slice_entries`` entries ends.

    Starting at entry ``entry_curr`` of file ``file_curr``, walk forward through
    the files until ``slice_entries`` entries have been consumed or the last
    file is exhausted.

    Implemented as a loop rather than recursion, so a slice spanning many
    files cannot hit Python's recursion limit.

    Parameters
    ----------
    slice_entries : int
        Number of entries the slice should contain.
    file_entries : sequence of int
        Number of entries in each file.
    file_curr : int
        Index of the file in which the slice starts.
    entry_curr : int
        Entry offset inside ``file_curr`` at which the slice starts.

    Returns
    -------
    list
        ``[file_index, entry_stop]`` marking the end of the slice. When the
        files run out first, this is the last file and its entry count.
    """
    last_file = len(file_entries) - 1
    remaining = slice_entries
    while True:
        available = file_entries[file_curr] - entry_curr
        if remaining <= available:
            # The slice ends inside the current file.
            return [file_curr, remaining + entry_curr]
        if file_curr == last_file:
            # Not enough entries left: clamp to the end of the last file.
            return [file_curr, file_entries[-1]]
        # Consume the rest of this file and continue at the start of the next.
        remaining -= available
        file_curr += 1
        entry_curr = 0

def partition(files, n_processes):
    """Split the entries of ``files`` into ``n_processes`` consecutive slices.

    Each slice targets ``ceil(total_entries / n_processes)`` entries and starts
    where the previous one stopped.

    Parameters
    ----------
    files : sequence
        Objects exposing a ``num_entries`` attribute.
    n_processes : int
        Number of slices to create.

    Returns
    -------
    list of list
        One ``[file_start, entry_start, file_stop, entry_stop]`` per slice.
    """
    entries_per_file = [tree.num_entries for tree in files]
    entries_per_slice = math.ceil(sum(entries_per_file) / n_processes)

    boundaries = []
    start_file, start_entry = 0, 0
    for _ in range(n_processes):
        stop = partition_helper(entries_per_slice, entries_per_file, start_file, start_entry)
        boundaries.append([start_file, start_entry] + stop)
        # The next slice begins exactly where this one ended.
        start_file = boundaries[-1][-2]
        start_entry = boundaries[-1][-1]
    return boundaries

def read_slice(files, slices, index, data):
    """Read the selected ``candidate_vMass`` values of one slice and append them to ``data``.

    ``slices[index]`` is read as ``[file_start, entry_start, file_stop, entry_stop]``:
    every file from ``file_start`` to ``file_stop`` (inclusive) is read, with
    ``entry_start`` applied only to the first file and ``entry_stop`` only to
    the last one.

    Parameters
    ----------
    files : sequence
        Opened trees; each must provide ``arrays(...)``.
    slices : sequence
        Slice boundaries, e.g. as returned by ``partition``.
    index : int
        Which slice to read.
    data : list-like
        Receives one concatenated NumPy array via ``append``.
    """
    bounds = slices[index]
    first_file = bounds[0]
    last_file = bounds[2]
    # The continuation lines below are part of the string literal, so their
    # leading whitespace is deliberately left exactly as it was.
    selection = "(candidate_charge == 0)\
                                          & (candidate_cosAlpha > 0.99)\
                                          & (candidate_lxy / candidate_lxyErr > 3.0)\
                                          & (candidate_vProb > 0.05)\
                                          & (ditrack_mass > 1.014) & (ditrack_mass < 1.024)\
                                          & (candidate_vMass > 5.33) & (candidate_vMass < 5.4)"

    chunks = []
    for file_index in range(first_file, last_file + 1):
        # Only the first/last file of the slice is restricted; None means "no limit".
        start = slices[index][1] if file_index == first_file else None
        stop = slices[index][3] if file_index == last_file else None
        selected = files[file_index].arrays("candidate_vMass",
                                            selection,
                                            entry_start=start,
                                            entry_stop=stop,
                                            array_cache=None,
                                            library="np")
        chunks.append(selected["candidate_vMass"])
    data.append(np.concatenate(tuple(chunks)))

def runtime_measure_mp(path, n_files, n_processes):
    """Time a multi-process read of the first ``n_files`` files in ``path``.

    The measured interval covers opening the files, partitioning their entries,
    starting one process per slice (each running ``read_slice``), joining them
    and concatenating the collected arrays.

    Parameters
    ----------
    path : str
        Directory containing the input files. A trailing separator is optional.
    n_files : int
        Number of files, in sorted name order, to read.
    n_processes : int
        Number of worker processes. ``0`` falls back to the sequential
        ``runtime_measure``.

    Returns
    -------
    float or int
        Elapsed seconds, or ``0`` when ``n_files`` is 0.
    """
    if n_files == 0:
        return 0
    if n_processes == 0:
        return runtime_measure(path, n_files)

    # perf_counter is monotonic and high resolution; time.time() can jump when
    # the system clock is adjusted.
    start = time.perf_counter()
    files = [
        uproot.open(path=os.path.join(path, filename) + ":rootuple/CandidateTree", object_cache=None, array_cache=None)
        for filename in sorted(os.listdir(path))[:n_files]
    ]
    slices = partition(files, n_processes)

    # The context manager shuts the manager process down deterministically,
    # even if a worker or the join is interrupted.
    with multiprocessing.Manager() as manager:
        data = manager.list()
        processes = []
        for i in range(n_processes):
            p = multiprocessing.Process(target=read_slice, args=[files, slices, i, data])
            p.start()
            processes.append(p)

        for p in processes:
            p.join()

        # The result is discarded: the concatenation is only part of the timed work.
        np.concatenate(tuple(data))

        # Stop the clock before the manager shuts down, so shutdown is not timed.
        elapsed = time.perf_counter() - start

    return elapsed

def runtime_measure(path, n_files):
    """Time a sequential read of the first ``n_files`` files in ``path``.

    The measured interval covers opening the files, reading the selected
    ``candidate_vMass`` values from each of them and concatenating the arrays.

    Parameters
    ----------
    path : str
        Directory containing the input files. A trailing separator is optional.
    n_files : int
        Number of files, in sorted name order, to read.

    Returns
    -------
    float or int
        Elapsed seconds, or ``0`` when ``n_files`` is 0.
    """
    if n_files == 0:
        return 0

    # perf_counter is monotonic and high resolution; time.time() can jump when
    # the system clock is adjusted.
    start = time.perf_counter()
    files = [
        uproot.open(path=os.path.join(path, filename) + ":rootuple/CandidateTree", object_cache=None, array_cache=None)
        for filename in sorted(os.listdir(path))[:n_files]
    ]
    # The continuation lines below are part of the string literal, so their
    # leading whitespace is deliberately left exactly as it was.
    selection = "(candidate_charge == 0)\
                              & (candidate_cosAlpha > 0.99)\
                              & (candidate_lxy / candidate_lxyErr > 3.0)\
                              & (candidate_vProb > 0.05)\
                              & (ditrack_mass > 1.014) & (ditrack_mass < 1.024)\
                              & (candidate_vMass > 5.33) & (candidate_vMass < 5.4)"
    data = []
    for file in files:
        data.append(file.arrays("candidate_vMass",
                                selection,
                                array_cache=None,
                                library="np")["candidate_vMass"])

    # The result is discarded: the concatenation is only part of the timed work.
    np.concatenate(tuple(data))

    return time.perf_counter() - start

def runtime_vs_variable(path, target_dir, measure_function, variable, step, n_loops, var_max, constant=None):
    """Measure runtime while sweeping one variable and store the results as CSV.

    The swept values are ``0, step, 2*step, ...`` up to and including
    ``var_max`` (never beyond it). The first CSV row holds the x values, and
    each of the following ``n_loops`` rows holds one full sweep of measurements.
    Rows are appended after every sweep, so finished sweeps are kept if a later
    one fails.

    Parameters
    ----------
    path : str
        Input directory, forwarded to ``measure_function``.
    target_dir : str
        Directory for the CSV file; created if it does not exist.
    measure_function : callable
        Called as ``f(path, value)`` when ``constant`` is None. Otherwise called
        as ``f(path, value, constant)`` if ``"size"`` is in ``variable``, else
        ``f(path, constant, value)``.
    variable : str
        Name of the swept quantity, used in the file name. If it contains
        ``"size"``, the x values are total file sizes from ``get_total_size``.
    step : int
        Increment between swept values.
    n_loops : int
        Number of repeated sweeps.
    var_max : int
        Largest swept value.
    constant : int, optional
        Fixed second argument for ``measure_function``. ``0`` is a valid value;
        only ``None`` means "no constant".
    """
    os.makedirs(target_dir, exist_ok=True)

    # `is not None`, not truthiness: constant=0 must still be passed through.
    has_constant = constant is not None
    sweep_is_size = "size" in variable
    # var_max + 1 includes var_max itself without overshooting it when var_max
    # is not a multiple of step.
    values = list(range(0, var_max + 1, step))

    if has_constant:
        filename = "runtime_vs_%s_%d_%d_%d_%d.csv" % (variable, constant, var_max, step, n_loops)
    else:
        filename = "runtime_vs_%s_%d_%d_%d.csv" % (variable, var_max, step, n_loops)
    result_path = os.path.join(target_dir, filename)

    x = [get_total_size(path, value) for value in values] if sweep_is_size else values

    def measure(value):
        # Argument order depends on whether the swept quantity is the file count or the constant's partner.
        if not has_constant:
            return measure_function(path, value)
        if sweep_is_size:
            return measure_function(path, value, constant)
        return measure_function(path, constant, value)

    with open(result_path, "w", newline="") as f:
        csv.writer(f).writerow(x)
    for _ in range(n_loops):
        y = [measure(value) for value in values]
        with open(result_path, "a", newline="") as f:
            csv.writer(f).writerow(y)
    



In [2]:
# Directory holding the input .root files. The trailing slash is kept so that
# plain `path + filename` concatenation also yields a valid file path.
path = "../data/128_files/"
# Directory used as `target_dir` for runtime_vs_variable(), which writes its CSV results there.
target_dir = "runtime_tests_uproot/test"

In [3]:
def partition_files(path, n_files, n_processes):
    """Distribute the first ``n_files`` files in ``path`` over at most ``n_processes`` groups.

    Files are taken in ``sorted(os.listdir(path))`` order (lexicographic, so
    ``file10`` sorts before ``file2``) and assigned in contiguous runs. Group
    sizes differ by at most one, with the larger groups first. No empty groups
    are produced: when ``n_processes > n_files`` only ``n_files`` groups are
    returned.

    Parameters
    ----------
    path : str
        Directory containing the files. A trailing separator is optional.
    n_files : int
        Number of files to distribute.
    n_processes : int
        Maximum number of groups.

    Returns
    -------
    list of list of str
        File paths per group.

    Raises
    ------
    IndexError
        If ``n_files`` exceeds the number of entries in ``path``.
    """
    filenames = sorted(os.listdir(path))
    partitions = []
    if n_processes <= 0:
        return partitions

    base_size, n_larger = divmod(n_files, n_processes)
    curr = 0
    for i in range(min(n_processes, n_files)):
        # The first `n_larger` groups take one extra file to absorb the remainder.
        group_size = base_size + 1 if i < n_larger else base_size
        # os.path.join (instead of `path + name`) works with or without a trailing separator.
        partitions.append([os.path.join(path, filenames[k]) for k in range(curr, curr + group_size)])
        curr += group_size
    return partitions

In [4]:
# Inspect how 128 files are spread over 64 processes: two files per group.
# Names are sorted lexicographically, so "file10" comes right after "file1".
partition_files(path, 128, 64)

[['../data/128_files/file0.root', '../data/128_files/file1.root'],
 ['../data/128_files/file10.root', '../data/128_files/file100.root'],
 ['../data/128_files/file101.root', '../data/128_files/file102.root'],
 ['../data/128_files/file103.root', '../data/128_files/file104.root'],
 ['../data/128_files/file105.root', '../data/128_files/file106.root'],
 ['../data/128_files/file107.root', '../data/128_files/file108.root'],
 ['../data/128_files/file109.root', '../data/128_files/file11.root'],
 ['../data/128_files/file110.root', '../data/128_files/file111.root'],
 ['../data/128_files/file112.root', '../data/128_files/file113.root'],
 ['../data/128_files/file114.root', '../data/128_files/file115.root'],
 ['../data/128_files/file116.root', '../data/128_files/file117.root'],
 ['../data/128_files/file118.root', '../data/128_files/file119.root'],
 ['../data/128_files/file12.root', '../data/128_files/file120.root'],
 ['../data/128_files/file121.root', '../data/128_files/file122.root'],
 ['../data/12

In [5]:
def open_partition_mt(partition, files):
    """Open every file of ``partition`` with ``open_mt`` and add the results to ``files``.

    Parameters
    ----------
    partition : iterable of str
        File paths to open.
    files : list-like
        Extended in place (``+=``) with the opened objects.
    """
    opened = open_mt(partition)
    files += opened

In [6]:
def open_file(file, files):
    """Open ``rootuple/CandidateTree`` in ``file`` with uproot and append it to ``files``.

    Parameters
    ----------
    file : str
        Path of the file to open.
    files : list-like
        Receives the opened object via ``append``.
    """
    tree_path = file + ":rootuple/CandidateTree"
    opened = uproot.open(path=tree_path)
    files.append(opened)

In [7]:
import threading
def open_mt(partition):
    """Open all files of ``partition`` concurrently, one thread per file.

    Each thread runs ``open_file`` and appends its result to a shared list, so
    the order of the returned objects follows thread completion rather than
    the order of ``partition``.

    Parameters
    ----------
    partition : iterable of str
        File paths to open.

    Returns
    -------
    list
        The opened objects.
    """
    opened = []
    workers = [threading.Thread(target=open_file, args=[filename, opened]) for filename in partition]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    return opened

In [8]:
def open_mp_mt(path, n_files, n_processes):
    """Open the first ``n_files`` files in ``path`` using processes that each spawn threads.

    The files are grouped with ``partition_files``; one process per group runs
    ``open_partition_mt``, which opens that group's files in threads.

    Parameters
    ----------
    path : str
        Directory containing the files.
    n_files : int
        Number of files to open.
    n_processes : int
        Maximum number of worker processes.

    Returns
    -------
    multiprocessing.managers.ListProxy
        Manager-backed list filled by the workers; order is not guaranteed.
    """
    partitions = partition_files(path, n_files, n_processes)

    processes = []
    files = multiprocessing.Manager().list()

    # Iterate over the groups actually produced: partition_files() returns
    # fewer than n_processes groups when n_processes > n_files, so indexing
    # partitions[i] for i in range(n_processes) raised IndexError.
    for file_group in partitions:
        p = multiprocessing.Process(target=open_partition_mt, args=[file_group, files])
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
        p.close()

    return files

In [12]:
def open_partition(partition, files):
    """Open ``rootuple/CandidateTree`` in each file of ``partition``, one after another.

    Parameters
    ----------
    partition : iterable of str
        File paths to open.
    files : list-like
        Receives each opened object via ``append``, in ``partition`` order.
    """
    for filename in partition:
        opened = uproot.open(path=filename + ":rootuple/CandidateTree")
        files.append(opened)

In [13]:
def open_mp(path, n_files, n_processes):
    """Open the first ``n_files`` files in ``path`` using several processes.

    The files are grouped with ``partition_files``; one process per group runs
    ``open_partition``, which opens that group's files sequentially.

    Parameters
    ----------
    path : str
        Directory containing the files.
    n_files : int
        Number of files to open.
    n_processes : int
        Maximum number of worker processes.

    Returns
    -------
    multiprocessing.managers.ListProxy
        Manager-backed list filled by the workers; order is not guaranteed.
    """
    partitions = partition_files(path, n_files, n_processes)

    processes = []
    files = multiprocessing.Manager().list()

    # Iterate over the groups actually produced: partition_files() returns
    # fewer than n_processes groups when n_processes > n_files, so indexing
    # partitions[i] for i in range(n_processes) raised IndexError.
    for file_group in partitions:
        p = multiprocessing.Process(target=open_partition, args=[file_group, files])
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    return files

In [40]:
def read_slice_detailed(files, slices, index, data, process_runtimes):
    """Read one slice like ``read_slice`` and also record how long it took.

    ``slices[index]`` is read as ``[file_start, entry_start, file_stop, entry_stop]``.
    The elapsed wall-clock time of the read and concatenation is stored in
    ``process_runtimes`` under the key ``"P<index>"``.

    Parameters
    ----------
    files : sequence
        Opened trees; each must provide ``arrays(...)``.
    slices : sequence
        Slice boundaries, e.g. as returned by ``partition``.
    index : int
        Which slice to read.
    data : list-like
        Receives one concatenated NumPy array via ``append``.
    process_runtimes : dict-like
        Receives the runtime in seconds for this slice.
    """
    key = "P%d" % (index)
    start_time = time.time()

    bounds = slices[index]
    first_file = bounds[0]
    last_file = bounds[2]
    # The continuation lines below are part of the string literal, so their
    # leading whitespace is deliberately left exactly as it was.
    selection = "(candidate_charge == 0)\
                                          & (candidate_cosAlpha > 0.99)\
                                          & (candidate_lxy / candidate_lxyErr > 3.0)\
                                          & (candidate_vProb > 0.05)\
                                          & (ditrack_mass > 1.014) & (ditrack_mass < 1.024)\
                                          & (candidate_vMass > 5.33) & (candidate_vMass < 5.4)"

    chunks = []
    for file_index in range(first_file, last_file + 1):
        # Only the first/last file of the slice is restricted; None means "no limit".
        start = slices[index][1] if file_index == first_file else None
        stop = slices[index][3] if file_index == last_file else None
        selected = files[file_index].arrays("candidate_vMass",
                                            selection,
                                            entry_start=start,
                                            entry_stop=stop,
                                            array_cache=None,
                                            library="np")
        chunks.append(selected["candidate_vMass"])
    data.append(np.concatenate(tuple(chunks)))

    process_runtimes[key] = time.time() - start_time

In [41]:
def runtime_measure_mp_detailed(path, n_files, n_processes):
    """Time a multi-process read and break the result down by phase and by process.

    Works like ``runtime_measure_mp`` but uses ``read_slice_detailed`` as the
    worker, so each process reports its own read time.

    Parameters
    ----------
    path : str
        Directory containing the input files. A trailing separator is optional.
    n_files : int
        Number of files, in sorted name order, to read.
    n_processes : int
        Number of worker processes (one per slice).

    Returns
    -------
    tuple
        ``(runtime, open_runtime, process_runtimes)``: total elapsed seconds,
        seconds spent opening the files, and a dict mapping ``"P<index>"`` to
        the seconds each worker reported. ``(0, 0, {})`` when ``n_files`` is 0,
        matching the early return of ``runtime_measure_mp``.
    """
    if n_files == 0:
        # Nothing to read; without this guard partition() fails on an empty file list.
        return 0, 0, {}

    # perf_counter is monotonic and high resolution; time.time() can jump when
    # the system clock is adjusted.
    start_time = time.perf_counter()
    files = [
        uproot.open(path=os.path.join(path, filename) + ":rootuple/CandidateTree", object_cache=None, array_cache=None)
        for filename in sorted(os.listdir(path))[:n_files]
    ]
    open_runtime = time.perf_counter() - start_time
    slices = partition(files, n_processes)

    # One manager serves both shared containers (the original started two
    # manager processes); the context manager shuts it down deterministically.
    with multiprocessing.Manager() as manager:
        data = manager.list()
        process_runtimes = manager.dict()

        processes = []
        for i in range(n_processes):
            p = multiprocessing.Process(target=read_slice_detailed, args=[files, slices, i, data, process_runtimes])
            p.start()
            processes.append(p)

        for p in processes:
            p.join()

        # The result is discarded: the concatenation is only part of the timed work.
        np.concatenate(tuple(data))

        runtime = time.perf_counter() - start_time
        # Copy out of the proxy while the manager is still alive.
        per_process = dict(process_runtimes)

    return runtime, open_runtime, per_process

In [42]:
# Detailed timing for 128 files read by 64 processes.
# Result: (total runtime, time spent opening the files, per-process read times), all in seconds.
runtime_measure_mp_detailed(path, 128, 64)

(16.260297298431396,
 11.630126237869263,
 {'P0': 2.3633735179901123,
  'P1': 3.0905444622039795,
  'P3': 3.0708401203155518,
  'P5': 3.0549070835113525,
  'P7': 3.1040942668914795,
  'P2': 3.185908317565918,
  'P12': 3.0586369037628174,
  'P9': 3.134207248687744,
  'P4': 3.216351270675659,
  'P13': 3.1436984539031982,
  'P63': 2.3431081771850586,
  'P10': 3.2060048580169678,
  'P14': 3.1521453857421875,
  'P11': 3.2175374031066895,
  'P15': 3.170427083969116,
  'P8': 3.3060684204101562,
  'P6': 3.3497281074523926,
  'P19': 3.159247636795044,
  'P17': 3.213467597961426,
  'P21': 3.182271718978882,
  'P20': 3.224862813949585,
  'P18': 3.290818214416504,
  'P23': 3.212789297103882,
  'P29': 3.135934829711914,
  'P22': 3.2604284286499023,
  'P24': 3.2509756088256836,
  'P26': 3.2447385787963867,
  'P27': 3.2335150241851807,
  'P31': 3.170880079269409,
  'P25': 3.308627128601074,
  'P28': 3.3201732635498047,
  'P36': 3.196148157119751,
  'P32': 3.282304286956787,
  'P34': 3.303463459014892

In [44]:
# Same measurement with 32 processes, for comparison with the 64-process run.
# In the recorded output, opening the files takes about 10 s of the ~16 s total.
runtime_measure_mp_detailed(path, 128, 32)

(16.295186281204224,
 10.012354373931885,
 {'P0': 4.112326383590698,
  'P31': 3.9439151287078857,
  'P1': 4.870685815811157,
  'P4': 4.892577886581421,
  'P5': 4.90136194229126,
  'P7': 4.877399682998657,
  'P2': 4.956524610519409,
  'P3': 4.972720623016357,
  'P6': 4.93184757232666,
  'P12': 4.892596483230591,
  'P13': 4.901821851730347,
  'P10': 4.964687347412109,
  'P9': 4.994876146316528,
  'P11': 4.97310471534729,
  'P15': 4.934511661529541,
  'P16': 4.938214540481567,
  'P14': 4.968052625656128,
  'P18': 4.912725448608398,
  'P23': 4.856794357299805,
  'P19': 4.9336371421813965,
  'P17': 4.967451572418213,
  'P25': 4.889923572540283,
  'P21': 4.970155954360962,
  'P29': 4.85200047492981,
  'P20': 4.9906041622161865,
  'P22': 4.994075059890747,
  'P30': 4.9078967571258545,
  'P24': 5.0012736320495605,
  'P26': 5.009260177612305,
  'P27': 5.078582048416138,
  'P28': 5.118932485580444,
  'P8': 5.910770893096924})