In [1]:
import csv
import math
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
import os
import time
import uproot

def get_total_size(path, n_files):
    """Return the combined size, in GiB, of the first ``n_files`` files in ``path``.

    Files are taken in sorted filename order.

    Parameters
    ----------
    path : str
        Directory containing the files. A trailing separator is optional.
    n_files : int
        Number of files (from the start of the sorted listing) to include.

    Returns
    -------
    float
        Total size in GiB, i.e. bytes divided by ``2**30``.

    Raises
    ------
    IndexError
        If ``n_files`` exceeds the number of entries in ``path``.
    """
    filenames = sorted(os.listdir(path))
    # os.path.join instead of ``path + name`` so that a directory passed
    # without a trailing separator still resolves to files inside it.
    total_size = sum(os.path.getsize(os.path.join(path, filenames[i])) for i in range(n_files))
    return total_size / (2**30)

def col_average(data):
    """Return the per-column mean of ``data``, ignoring its first row.

    Row 0 is skipped and only used to determine the number of columns; the
    mean is taken over rows ``1 .. len(data) - 1``.

    Parameters
    ----------
    data : sequence of sequences
        Row-major table of numbers.

    Returns
    -------
    list
        One mean per column of row 0.
    """
    body = data[1:]
    n_body = len(data) - 1
    width = len(data[0])
    return [sum([row[col] for row in body]) / n_body for col in range(width)]

def col_standard_deviation(data):
    """Return the per-column standard deviation of ``data``, ignoring its first row.

    The deviation is computed around the means from ``col_average`` and is
    divided by the number of rows used (``len(data) - 1``), with no further
    degrees-of-freedom correction.

    Parameters
    ----------
    data : sequence of sequences
        Row-major table of numbers; row 0 is skipped.

    Returns
    -------
    list
        One standard deviation per column of row 0.
    """
    n_body = len(data) - 1
    width = len(data[0])
    means = col_average(data)
    deviations = []
    for col in range(width):
        squared = sum([(row[col] - means[col])**2 for row in data[1:]])
        deviations.append((squared / n_body)**0.5)
    return deviations

    
def partition_helper(slice_entries, file_entries, file_curr, entry_curr):
    """Find where a slice of ``slice_entries`` entries ends.

    Starting at entry ``entry_curr`` of file ``file_curr``, walk forward
    through the files until the requested number of entries is consumed or
    the last file is exhausted.

    Parameters
    ----------
    slice_entries : int
        Number of entries the slice should contain.
    file_entries : sequence of int
        Entry count of each file.
    file_curr : int
        Index of the file in which the slice starts.
    entry_curr : int
        Entry offset inside ``file_curr`` at which the slice starts.

    Returns
    -------
    list
        ``[file_index, entry_stop]`` marking the end of the slice. If the
        files run out first, this is the last file and its full entry count.
    """
    remaining = slice_entries
    idx = file_curr
    offset = entry_curr
    last = len(file_entries) - 1
    while True:
        # Enough entries left in this file: the slice ends here.
        if remaining <= file_entries[idx] - offset:
            return [idx, remaining + offset]
        # Out of files: clamp to the end of the final file.
        if idx == last:
            return [idx, file_entries[-1]]
        # Consume the rest of this file and continue at the start of the next.
        remaining = remaining - file_entries[idx] + offset
        idx += 1
        offset = 0

def partition(files, n_processes):
    """Split the entries of ``files`` into ``n_processes`` consecutive slices.

    Each slice is sized ``ceil(total_entries / n_processes)`` and starts where
    the previous one ended; slice boundaries may fall inside a file.

    Parameters
    ----------
    files : sequence
        Objects exposing a ``num_entries`` attribute.
    n_processes : int
        Number of slices to produce.

    Returns
    -------
    list
        One ``[file_start, entry_start, file_stop, entry_stop]`` list per slice.
    """
    entry_counts = [tree.num_entries for tree in files]
    per_slice = math.ceil(sum(entry_counts) / n_processes)
    result = []
    file_idx, entry_idx = 0, 0
    for _ in range(n_processes):
        stop = partition_helper(per_slice, entry_counts, file_idx, entry_idx)
        result.append([file_idx, entry_idx] + stop)
        # The next slice begins exactly where this one stopped.
        file_idx, entry_idx = result[-1][-2], result[-1][-1]
    return result

def read_slice(files, slices, index, data):
    """Read one slice of filtered ``candidate_vMass`` values and append it to ``data``.

    Parameters
    ----------
    files : sequence
        Objects exposing ``arrays(...)``, indexed by file position (the callers
        in this file pass trees opened with ``uproot.open``).
    slices : sequence
        One ``[file_start, entry_start, file_stop, entry_stop]`` item per
        slice, in the layout produced by ``partition``.
    index : int
        Which item of ``slices`` to read.
    data : list-like
        Receives one concatenated NumPy array through ``append``.
    """
    data_slice = []
    # Visit every file the slice touches, first and last inclusive.
    for i in range(slices[index][0], slices[index][2] + 1):
        # NOTE: the cut string below uses backslash continuations, so the
        # indentation of its continuation lines is part of the literal.
        data_slice.append(files[i].arrays("candidate_vMass", 
                                          "(candidate_charge == 0)\
                                          & (candidate_cosAlpha > 0.99)\
                                          & (candidate_lxy / candidate_lxyErr > 3.0)\
                                          & (candidate_vProb > 0.05)\
                                          & (ditrack_mass > 1.014) & (ditrack_mass < 1.024)\
                                          & (candidate_vMass > 5.33) & (candidate_vMass < 5.4)",
                                          # Only the first file of the slice starts at an offset.
                                          entry_start=slices[index][1] if i == slices[index][0] else None,
                                          # Only the last file of the slice stops early.
                                          entry_stop=slices[index][3] if i == slices[index][2] else None,
                                          array_cache=None,
                                          library="np",
                                         )["candidate_vMass"])
    # Merge the per-file arrays so the slice contributes a single item to ``data``.
    data.append(np.concatenate(tuple(data_slice)))

def runtime_measure_mp(path, n_files, n_processes):
    """Time a multi-process read of the first ``n_files`` files in ``path``.

    The measured interval covers opening the files, partitioning the entries,
    running one ``read_slice`` process per slice, and concatenating the
    collected arrays. The concatenated result is discarded.

    Parameters
    ----------
    path : str
        Directory prefix; file names are appended to it directly, so it must
        end with a path separator.
    n_files : int
        Number of files (in sorted name order) to read.
    n_processes : int
        Number of worker processes; ``0`` delegates to ``runtime_measure``.

    Returns
    -------
    float or int
        Elapsed wall-clock seconds, or ``0`` when ``n_files`` is ``0``.
    """
    if n_files == 0:
        return 0
    if n_processes == 0:
        return runtime_measure(path, n_files)

    t0 = time.time()
    selected = sorted(os.listdir(path))[:n_files]
    trees = []
    for name in selected:
        trees.append(uproot.open(path=path + name + ":rootuple/CandidateTree", object_cache=None, array_cache=None))
    slices = partition(trees, n_processes)
    # Manager-backed list so the worker processes can hand their arrays back.
    shared = multiprocessing.Manager().list()
    workers = []
    for rank in range(n_processes):
        worker = multiprocessing.Process(target=read_slice, args=[trees, slices, rank, shared])
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()

    # Executed only so that its cost is part of the timing.
    np.concatenate(tuple(shared))

    return time.time() - t0

def runtime_measure(path, n_files):
    """Time a single-process read of the first ``n_files`` files in ``path``.

    The measured interval covers opening the files, reading the filtered
    ``candidate_vMass`` arrays, and concatenating them. The concatenated
    result is discarded.

    Parameters
    ----------
    path : str
        Directory prefix; file names are appended to it directly, so it must
        end with a path separator.
    n_files : int
        Number of files (in sorted name order) to read.

    Returns
    -------
    float or int
        Elapsed wall-clock seconds, or ``0`` when ``n_files`` is ``0``.
    """
    if n_files == 0: return 0
    start = time.time()
    files = [uproot.open(path=path + filename + ":rootuple/CandidateTree", object_cache=None, array_cache=None) for filename in sorted(os.listdir(path))[:n_files]]
    data = []
    for file in files:
        # NOTE: the cut string below uses backslash continuations, so the
        # indentation of its continuation lines is part of the literal.
        data.append(file.arrays("candidate_vMass", 
                              "(candidate_charge == 0)\
                              & (candidate_cosAlpha > 0.99)\
                              & (candidate_lxy / candidate_lxyErr > 3.0)\
                              & (candidate_vProb > 0.05)\
                              & (ditrack_mass > 1.014) & (ditrack_mass < 1.024)\
                              & (candidate_vMass > 5.33) & (candidate_vMass < 5.4)",
                              array_cache=None,
                              library="np")["candidate_vMass"])
        
    # Executed only so that its cost is part of the timing.
    np.concatenate(tuple(data))
    
    return time.time() - start

def runtime_vs_variable(path, target_dir, measure_function, variable, step, n_loops, var_max, constant=None):
    """Sweep one variable, time ``measure_function`` at each value, and log to CSV.

    The first CSV row holds the x values; each later row holds one full sweep
    of measurements. A row is appended after every sweep, so partial results
    survive an interrupted run.

    Parameters
    ----------
    path : str
        Data directory handed to ``measure_function`` (and to
        ``get_total_size`` when ``variable`` contains ``"size"``).
    target_dir : str
        Directory for the CSV file; created if missing.
    measure_function : callable
        Called as ``measure_function(path, value)`` when ``constant`` is
        ``None``; otherwise as ``measure_function(path, value, constant)`` if
        ``variable`` contains ``"size"``, else
        ``measure_function(path, constant, value)``.
    variable : str
        Name of the swept variable, used in the file name. If it contains
        ``"size"``, the x row holds total file sizes instead of raw values.
    step : int
        Increment between swept values.
    n_loops : int
        Number of sweeps (measurement rows) to record.
    var_max : int
        The sweep covers ``range(0, var_max + step, step)``. Note that this
        overshoots ``var_max`` when it is not a multiple of ``step``.
    constant : int, optional
        Fixed value for the other argument of ``measure_function``. ``0`` is a
        valid value and is distinct from ``None``.

    Returns
    -------
    None
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(target_dir, exist_ok=True)

    # Compare against None explicitly: a constant of 0 is a real setting and
    # must not be mistaken for "no constant given".
    has_constant = constant is not None
    varies_size = "size" in variable
    sweep = range(0, var_max + step, step)

    if has_constant:
        result_path = "%s/runtime_vs_%s_%d_%d_%d_%d.csv" % (target_dir, variable, constant, var_max, step, n_loops)
    else:
        result_path = "%s/runtime_vs_%s_%d_%d_%d.csv" % (target_dir, variable, var_max, step, n_loops)

    if varies_size:
        x = [get_total_size(path, a) for a in sweep]
    else:
        x = list(sweep)

    with open(result_path, "w+", newline="") as f:
        csv.writer(f).writerow(x)

    def _measure(value):
        # Place the swept value and the constant in the positions described above.
        if not has_constant:
            return measure_function(path, value)
        if varies_size:
            return measure_function(path, value, constant)
        return measure_function(path, constant, value)

    for _ in range(n_loops):
        y = [_measure(value) for value in sweep]
        with open(result_path, "a+", newline="") as f:
            csv.writer(f).writerow(y)



In [2]:
# Directory holding the input files. The trailing "/" matters: the helpers
# in this notebook build file paths by appending the file name to this string.
path = "../data/128_files/"
# Output directory for benchmark results — presumably passed as `target_dir`
# to runtime_vs_variable; it is not used by the cells shown here.
target_dir = "runtime_tests_uproot/test"

In [8]:
def get_num_entries(path, n_files):
    """Return the total ``CandidateTree`` entry count of the first ``n_files`` files.

    Files are taken in sorted filename order, and each is closed again as soon
    as its entry count has been read.

    Parameters
    ----------
    path : str
        Directory prefix; file names are appended to it directly, so it must
        end with a path separator.
    n_files : int
        Number of files to include.

    Returns
    -------
    int
        Sum of ``num_entries`` over the selected files (``0`` for no files).

    Raises
    ------
    IndexError
        If ``n_files`` exceeds the number of entries in ``path``.
    """
    filenames = sorted(os.listdir(path))
    num_entries = 0
    for i in range(n_files):
        # The context manager closes the underlying file; without it every
        # opened file would stay open after its count was read.
        with uproot.open(path + filenames[i] + ":rootuple/CandidateTree") as tree:
            num_entries += tree.num_entries
    return num_entries

In [12]:
# Total number of CandidateTree entries in the first two files (sorted by name).
get_num_entries(path, 2)

7154078

In [5]:
def partition_files(path, n_files, n_processes):
    """Split the first ``n_files`` file paths into contiguous groups.

    At most ``n_processes`` groups are produced, and never more than
    ``n_files``. Group sizes differ by at most one, with the larger groups
    coming first.

    Parameters
    ----------
    path : str
        Directory prefix; file names are appended to it directly.
    n_files : int
        Number of files (in sorted name order) to distribute.
    n_processes : int
        Maximum number of groups.

    Returns
    -------
    list
        A list of lists of file paths.
    """
    names = sorted(os.listdir(path))
    groups = []
    offset = 0
    for rank in range(min(n_processes, n_files)):
        size = n_files // n_processes
        # The first ``n_files % n_processes`` groups take one extra file.
        if rank < n_files % n_processes:
            size += 1
        groups.append([path + names[offset + k] for k in range(size)])
        offset += size
    return groups

In [6]:
# partition_files(path, 128, 64)

In [7]:
def open_partition_mt(partition, files):
    """Open every file of ``partition`` with ``open_mt`` and add the results to ``files``.

    Parameters
    ----------
    partition : iterable of str
        File paths to open.
    files : list-like
        Extended in place with the opened objects.
    """
    opened = open_mt(partition)
    files += opened

In [8]:
def open_file(file, files):
    """Open ``rootuple/CandidateTree`` inside ``file`` and append it to ``files``.

    Parameters
    ----------
    file : str
        Path of the file to open.
    files : list-like
        Receives the opened object through ``append``.
    """
    tree = uproot.open(path=file + ":rootuple/CandidateTree")
    files.append(tree)

In [9]:
import threading
def open_mt(partition):
    """Open all files of ``partition`` concurrently, one thread per file.

    Parameters
    ----------
    partition : iterable of str
        File paths to open.

    Returns
    -------
    list
        The objects appended by ``open_file``, in the order the threads
        finished appending (not necessarily the order of ``partition``).
    """
    opened = []
    workers = []
    for file_path in partition:
        worker = threading.Thread(target=open_file, args=[file_path, opened])
        worker.start()
        workers.append(worker)

    # Wait for every thread before handing the list back.
    for worker in workers:
        worker.join()

    return opened

In [10]:
def open_mp_mt(path, n_files, n_processes):
    """Open the first ``n_files`` files with one process per file group, threaded inside.

    ``partition_files`` splits the files into groups, and each group is opened
    in its own process by ``open_partition_mt``.

    Parameters
    ----------
    path : str
        Directory prefix handed to ``partition_files``.
    n_files : int
        Number of files to open.
    n_processes : int
        Maximum number of worker processes. Fewer are started when there are
        fewer files than processes.

    Returns
    -------
    multiprocessing.managers.ListProxy
        Manager-backed list that the workers extended with the opened objects.
    """
    partitions = partition_files(path, n_files, n_processes)

    processes = []
    files = multiprocessing.Manager().list()

    # Iterate over the groups actually produced. partition_files returns fewer
    # than n_processes groups when n_processes > n_files, so indexing by
    # range(n_processes) would raise IndexError.
    for file_group in partitions:
        p = multiprocessing.Process(target=open_partition_mt, args=[file_group, files])
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
        # Release the resources of the finished process right away.
        p.close()

    return files

In [11]:
def open_partition(partition, files):
    """Open ``rootuple/CandidateTree`` of each file in ``partition``, one after another.

    Parameters
    ----------
    partition : iterable of str
        File paths to open.
    files : list-like
        Receives each opened object through ``append``, in input order.
    """
    for file_path in partition:
        tree = uproot.open(path=file_path + ":rootuple/CandidateTree")
        files.append(tree)

In [12]:
def open_mp(path, n_files, n_processes):
    """Open the first ``n_files`` files with one process per file group.

    ``partition_files`` splits the files into groups, and each group is opened
    sequentially in its own process by ``open_partition``.

    Parameters
    ----------
    path : str
        Directory prefix handed to ``partition_files``.
    n_files : int
        Number of files to open.
    n_processes : int
        Maximum number of worker processes. Fewer are started when there are
        fewer files than processes.

    Returns
    -------
    multiprocessing.managers.ListProxy
        Manager-backed list that the workers appended the opened objects to.
    """
    partitions = partition_files(path, n_files, n_processes)

    processes = []
    files = multiprocessing.Manager().list()

    # Iterate over the groups actually produced. partition_files returns fewer
    # than n_processes groups when n_processes > n_files, so indexing by
    # range(n_processes) would raise IndexError.
    for file_group in partitions:
        p = multiprocessing.Process(target=open_partition, args=[file_group, files])
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
        # Release the resources of the finished process, as open_mp_mt does.
        p.close()

    return files

In [13]:
def read_slice_detailed(files, slices, index, data, process_runtimes):
    """Read one slice like ``read_slice`` and also record how long it took.

    Parameters
    ----------
    files : sequence
        Objects exposing ``arrays(...)``, indexed by file position.
    slices : sequence
        One ``[file_start, entry_start, file_stop, entry_stop]`` item per
        slice, in the layout produced by ``partition``.
    index : int
        Which item of ``slices`` to read; also used to build the runtime key.
    data : list-like
        Receives one concatenated NumPy array through ``append``.
    process_runtimes : dict-like
        Receives the elapsed seconds of this call under the key ``"P<index>"``.
    """
    key = "P%d" % (index)
    start_time = time.time()
    data_slice = []
    # Visit every file the slice touches, first and last inclusive.
    for i in range(slices[index][0], slices[index][2] + 1):
        # NOTE: the cut string below uses backslash continuations, so the
        # indentation of its continuation lines is part of the literal.
        data_slice.append(files[i].arrays("candidate_vMass", 
                                          "(candidate_charge == 0)\
                                          & (candidate_cosAlpha > 0.99)\
                                          & (candidate_lxy / candidate_lxyErr > 3.0)\
                                          & (candidate_vProb > 0.05)\
                                          & (ditrack_mass > 1.014) & (ditrack_mass < 1.024)\
                                          & (candidate_vMass > 5.33) & (candidate_vMass < 5.4)",
                                          # Only the first file of the slice starts at an offset.
                                          entry_start=slices[index][1] if i == slices[index][0] else None,
                                          # Only the last file of the slice stops early.
                                          entry_stop=slices[index][3] if i == slices[index][2] else None,
                                          array_cache=None,
                                          library="np",
                                         )["candidate_vMass"])
    data.append(np.concatenate(tuple(data_slice)))
    # The timing covers the reads, the concatenation and the append to ``data``.
    runtime = time.time() - start_time
    
    process_runtimes[key] = runtime

In [14]:
def runtime_measure_mp_detailed(path, n_files, n_processes):
    """Time a multi-process read and break the timing down.

    Parameters
    ----------
    path : str
        Directory prefix; file names are appended to it directly, so it must
        end with a path separator.
    n_files : int
        Number of files (in sorted name order) to read.
    n_processes : int
        Number of worker processes, one per slice.

    Returns
    -------
    tuple
        ``(total_seconds, open_seconds, per_process)`` where ``open_seconds``
        covers only opening the files and ``per_process`` maps ``"P<i>"`` to
        the seconds spent inside worker ``i``.
    """
    total_started = time.time()

    # Inner timer: just the cost of opening the files.
    open_started = time.time()
    trees = []
    for name in sorted(os.listdir(path))[:n_files]:
        trees.append(uproot.open(path=path + name + ":rootuple/CandidateTree", object_cache=None, array_cache=None))
    # Alternative kept for reference: files = open_mp(path, n_files, 8)
    open_seconds = time.time() - open_started
    slices = partition(trees, n_processes)

    workers = []
    shared_data = multiprocessing.Manager().list()
    worker_seconds = multiprocessing.Manager().dict()

    for rank in range(n_processes):
        worker = multiprocessing.Process(target=read_slice_detailed, args=[trees, slices, rank, shared_data, worker_seconds])
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()

    # Executed only so that its cost is part of the total timing.
    np.concatenate(tuple(shared_data))

    total_seconds = time.time() - total_started

    return total_seconds, open_seconds, dict(worker_seconds)

In [20]:
# 128 files read by 64 worker processes. The result is
# (total seconds, seconds spent opening files, {"P<i>": seconds inside worker i}).
runtime_measure_mp_detailed(path, 128, 64)

(27.5116183757782,
 13.680346250534058,
 {'P0': 8.814160108566284,
  'P63': 8.008486986160278,
  'P4': 9.21996283531189,
  'P14': 9.653597116470337,
  'P60': 8.812825679779053,
  'P9': 9.872052431106567,
  'P35': 9.744561195373535,
  'P55': 9.608566284179688,
  'P10': 10.727178812026978,
  'P13': 11.082900524139404,
  'P2': 11.288793563842773,
  'P3': 11.322515487670898,
  'P38': 10.815452575683594,
  'P28': 11.039220809936523,
  'P52': 10.662232875823975,
  'P17': 11.3758065700531,
  'P27': 11.254856824874878,
  'P57': 10.685828685760498,
  'P18': 11.469079732894897,
  'P47': 11.012757539749146,
  'P19': 11.554767370223999,
  'P24': 11.577905178070068,
  'P1': 11.924763202667236,
  'P31': 11.459523916244507,
  'P15': 11.761536836624146,
  'P23': 11.850965023040771,
  'P36': 11.610819339752197,
  'P6': 12.169533967971802,
  'P61': 11.126006841659546,
  'P43': 11.594748735427856,
  'P26': 12.05034327507019,
  'P48': 11.681634664535522,
  'P39': 12.123234987258911,
  'P59': 11.6559600830

In [21]:
# Same measurement with 32 worker processes instead of 64.
runtime_measure_mp_detailed(path, 128, 32)

(26.21840810775757,
 13.130914688110352,
 {'P0': 9.129749774932861,
  'P31': 9.022169351577759,
  'P18': 9.771424531936646,
  'P1': 10.143312692642212,
  'P14': 9.942578315734863,
  'P10': 10.081230640411377,
  'P7': 10.149985551834106,
  'P20': 9.979991912841797,
  'P17': 10.098279237747192,
  'P13': 10.327157258987427,
  'P6': 10.538482666015625,
  'P3': 10.678572416305542,
  'P22': 10.391110181808472,
  'P21': 10.54996371269226,
  'P29': 10.37537670135498,
  'P30': 10.411382913589478,
  'P2': 10.929961204528809,
  'P12': 10.77019715309143,
  'P19': 10.679438591003418,
  'P28': 10.522900581359863,
  'P25': 10.767223834991455,
  'P5': 11.20777153968811,
  'P16': 11.039603233337402,
  'P24': 10.978237390518188,
  'P26': 10.944135904312134,
  'P4': 11.368788242340088,
  'P9': 11.382571935653687,
  'P15': 11.374600648880005,
  'P11': 11.500488042831421,
  'P23': 11.436966896057129,
  'P27': 11.392338275909424,
  'P8': 12.76976990699768})

In [25]:
# Number of logical CPUs reported for this machine, for comparison with the
# process counts used in the measurements above.
os.cpu_count()

32