In [1]:
from dask.distributed import Client, SSHCluster
import sys
import dask
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import dask.dataframe as dd
import dask.array as da
import dask.bag as db
import matplotlib.pyplot as plt

from dask import delayed

## DASK CLUSTER

In [2]:
cluster = SSHCluster(
    ["10.67.22.39", "10.67.22.74", "10.67.22.27", "10.67.22.91", "10.67.22.60"],
    connect_options   = {"known_hosts": "/root/.ssh/known_hosts"},
    worker_options    = {"nthreads": 1, "n_workers": 4},
    scheduler_options = {"dashboard_address": ":8777"}
)

distributed.deploy.ssh - INFO - distributed.http.proxy - INFO - To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
distributed.deploy.ssh - INFO - distributed.scheduler - INFO - Clear task state
distributed.deploy.ssh - INFO - distributed.scheduler - INFO -   Scheduler at:   tcp://10.67.22.39:46487
distributed.deploy.ssh - INFO - distributed.nanny - INFO -         Start Nanny at: 'tcp://10.67.22.91:40963'
distributed.deploy.ssh - INFO - distributed.nanny - INFO -         Start Nanny at: 'tcp://10.67.22.91:37815'
distributed.deploy.ssh - INFO - distributed.nanny - INFO -         Start Nanny at: 'tcp://10.67.22.91:33075'
distributed.deploy.ssh - INFO - distributed.nanny - INFO -         Start Nanny at: 'tcp://10.67.22.91:40391'
distributed.deploy.ssh - INFO - distributed.nanny - INFO -         Start Nanny at: 'tcp://10.67.22.74:37625'
distributed.deploy.ssh - INFO - distributed.nanny - INFO -         Start Nanny at: 

In [3]:
client = Client(cluster)

## Functions

In [4]:
def load(path):
    return np.load(path, allow_pickle=True)["a"]

def R_yz(theta_rot, phi_rot):
    return np.array([[np.cos(phi_rot) * np.cos(theta_rot), -np.sin(phi_rot) * np.cos(theta_rot), np.sin(theta_rot)], 
                     [np.sin(phi_rot), np.cos(phi_rot), 0], 
                     [-np.sin(theta_rot) * np.cos(phi_rot), np.sin(theta_rot) * np.sin(phi_rot), np.cos(theta_rot)]])

def convert_pmt_ids(input_ids, conversion_ids):
    cd_ids  = np.array(conversion_ids["CdID"])
    pmt_ids = np.array(conversion_ids["PMTID"])
    mask    = np.isin(cd_ids, input_ids)
    return pmt_ids[mask]

def find_pmt_coord(pmt_positions, data_pmt_id):
    return pmt_positions[
        np.isin(pmt_positions.PMTID, data_pmt_id)
        ].loc[:, ['x', 'y', 'z']].reset_index(drop=True).to_numpy()

## Load mapping data

In [5]:
data_folder       = "/root/data/data/real/train/data/"
pmt_pos_fname     = "PMTPos_CD_LPMT.csv"
pmt_id_conv_fname = "PMT_ID_conversion.csv"
train_data_fname  = "raw_data_train_4.npz"

# test multiple files
name_list = ["raw_data_train_4.npz", "raw_data_train_5.npz"]

In [6]:
pmt_positions     = pd.read_csv(pmt_pos_fname)
pmt_id_conversion = pd.read_csv(pmt_id_conv_fname)
conversion_ids    = pd.read_csv(pmt_id_conv_fname)

## Distributed Processing - rotation distributed by event

In [47]:
def load_bag(path, Nevents):
    data_np = load(path)
    data_np = data_np[:, :Nevents]
    return [np.vstack([ data_np[:, i][j] for j in range(3)]) for i in range(data_np.shape[1])]

In [75]:
Nevents = 10
data_db = db.from_sequence(load_bag(data_folder+train_data_fname, Nevents), npartitions=32)

In [76]:
data_db

dask.bag<from_sequence, npartitions=10>

In [77]:
def rotate_ev(data):

    nonzeros_inds = data[2] != 0.0
    data_pmt_id   = convert_pmt_ids(data[0][nonzeros_inds], conversion_ids)
    pmt_coord     = find_pmt_coord(pmt_positions, data_pmt_id)

    tot_charge = sum(data[1][nonzeros_inds])
    x_cc       = sum(pmt_coord[:,0] * data[1][nonzeros_inds]) / tot_charge
    y_cc       = sum(pmt_coord[:,1] * data[1][nonzeros_inds]) / tot_charge
    z_cc       = sum(pmt_coord[:,2] * data[1][nonzeros_inds]) / tot_charge

    theta_cc   = np.arctan2(
        np.sqrt((x_cc)**2+(y_cc)**2), z_cc
    )
    phi_cc     = np.arctan2(y_cc, x_cc) 

    theta_rot = -theta_cc + np.pi/2
    phi_rot   = -phi_cc
    
    # coord_new = np.matmul(R_yz(theta_rot, phi_rot), pmt_coord.T)
    coord_new = np.matmul(
        R_yz(theta_rot, phi_rot), pmt_coord.T
    )

    R = np.sqrt(np.sum(np.power(coord_new, 2), axis=0))

    charge_hitt = np.vstack([data[1], data[2]])
    charge_hitt = charge_hitt[:,nonzeros_inds]

    rotated = np.vstack([coord_new, R, charge_hitt])
    
    del coord_new
    del charge_hitt
    del pmt_coord
    
    return rotated

In [78]:
rotated = db.map(rotate_ev, data_db)

In [79]:
rotated.compute()

[array([[-2.89144865e+03, -2.70198034e+03, -3.19063757e+03, ...,
          4.21860803e+03,  2.79846057e+03,  4.13239725e+03],
        [-5.95162994e+02,  1.94737878e+02,  8.37997156e+02, ...,
          6.96275181e+02,  4.56375798e+02, -7.66247416e+02],
        [ 1.91615892e+04,  1.91974627e+04,  1.91049414e+04, ...,
         -1.89103078e+04, -1.91791953e+04, -1.89266308e+04],
        [ 1.93876557e+04,  1.93876557e+04,  1.93876557e+04, ...,
          1.93876557e+04,  1.93876557e+04,  1.93876557e+04],
        [ 1.01690963e+00,  1.28989536e+00,  1.45731761e+00, ...,
          8.40746671e-01,  6.20022943e-01,  5.43713687e-01],
        [ 2.78543843e+02,  2.70036129e+02,  2.63317109e+02, ...,
          2.72593537e+02,  2.43800823e+02,  4.18888109e+02]]),
 array([[ 1.88295146e+04,  1.86218740e+04,  1.84410912e+04, ...,
         -1.84550681e+04, -1.83526882e+04, -1.88324052e+04],
        [ 6.82396776e+02,  9.27488257e+02,  4.74162190e+02, ...,
         -1.16047973e+03, -7.89136376e+02,  6.72077

In [74]:
data_db

dask.bag<from_sequence, npartitions=5>

In [80]:
# if dashboard is still up then run this and should go down
client.run_on_scheduler(lambda dask_scheduler=None: 
    dask_scheduler.close() & sys.exit(0))

# an error should occur but its fine 

CommClosedError: in <TCP (closed) ConnectionPool.run_function local=tcp://10.67.22.39:54382 remote=tcp://10.67.22.39:46487>: Stream is closed