# Combine

In this notebook we combine `mc_hits`, `mc_info` and `noise`.

In [106]:
import pandas as pd
import numpy as np

import math

In [43]:
# NOTE we expect the noise dataset with it's positions added (ie. output of notebooks/add-pos-to-noise.py)
noise = pd.read_csv("../data/noise.csv")

In [44]:
noise

Unnamed: 0,time,dom_id,pmt_id,tot,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,label
0,645700.0,1.0,0.0,27.0,-74.931,101.635,196.300,-0.000,-0.000,-1.000,0
1,773000.0,1.0,0.0,26.0,-74.931,101.635,196.300,-0.000,-0.000,-1.000,0
2,826492.0,1.0,0.0,26.0,-74.931,101.635,196.300,-0.000,-0.000,-1.000,0
3,958714.0,1.0,0.0,22.0,-74.931,101.635,196.300,-0.000,-0.000,-1.000,0
4,995974.0,1.0,0.0,28.0,-74.931,101.635,196.300,-0.000,-0.000,-1.000,0
...,...,...,...,...,...,...,...,...,...,...,...
45330975,56042290.0,2070.0,30.0,24.0,76.892,-77.203,38.011,0.719,-0.415,0.558,0
45330976,61925980.0,2070.0,30.0,25.0,76.892,-77.203,38.011,0.719,-0.415,0.558,0
45330977,63658656.0,2070.0,30.0,28.0,76.892,-77.203,38.011,0.719,-0.415,0.558,0
45330978,63658656.0,2070.0,30.0,25.0,76.892,-77.203,38.011,0.719,-0.415,0.558,0


## mc_hits and mc_info

First things first, align the column names of `mc_hits` with that of `noise`.

In [56]:
info = pd.read_hdf("../data/events.h5", key="/data/mc_info")
hits = pd.read_hdf("../data/events.h5", key="/data/mc_hits")

In [57]:
hits = hits.rename(columns={'h.dom_id': 'dom_id', 'h.pmt_id': 'pmt_id',
    'h.pos.x': 'pos_x', 'h.pos.y': 'pos_y', 'h.pos.z': 'pos_z', 'h.dir.x':
    'dir_x', 'h.dir.y': 'dir_y', 'h.dir.z': 'dir_z', 'h.tot': 'tot', 'h.t':
    'time'})
hits["label"] = 1

In [58]:
hits

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time,label
0,40,1231,-55.897,101.800,169.059,-0.478,0.827,0.296,28,27286567.0,1
1,93,2879,-26.344,86.850,178.511,0.000,0.830,0.558,27,27287009.0,1
2,187,5769,-74.918,65.363,139.831,0.000,0.531,-0.847,25,27289060.0,1
3,196,6071,-74.774,65.340,56.111,0.719,0.415,0.558,30,27287235.0,1
4,221,6842,-65.186,50.797,160.359,-0.478,0.827,0.296,27,27286697.0,1
...,...,...,...,...,...,...,...,...,...,...,...
489901,1934,59941,77.988,-40.148,130.541,0.000,-0.955,-0.296,18,72056356.0,1
489902,1981,61393,70.237,-59.962,196.389,0.415,-0.720,-0.556,29,72056427.0,1
489903,1995,61845,70.298,-59.901,65.511,0.719,-0.415,0.558,25,72056374.0,1
489904,2012,62366,88.385,-60.262,74.159,0.478,-0.827,0.296,24,72056345.0,1


In [62]:
info["event_id"] = info.index
hits["id"] = hits.index
info.drop_duplicates(subset='nu.hits.end', inplace=True)
info

Unnamed: 0,nu_E,type,nu_dir.x,nu.dir.y,nu.dir.z,nu.pos.x,nu.pos.y,nu.pos.z,nu.hits.start,nu.hits.end,event_id
0,15.540,-14,-0.630831,0.436518,0.641486,-85.337,154.304,35.735,0,5,0
1,11.458,14,0.070157,0.942066,-0.328009,19.081,169.073,-92.435,5,6,1
2,11.485,14,0.367514,0.049922,-0.928677,-61.314,130.036,-64.078,6,11,2
3,11.571,14,-0.177418,0.344584,0.921838,48.457,76.398,112.710,11,15,3
4,13.148,-14,-0.997387,0.065683,0.030071,138.488,-166.544,52.433,15,17,4
...,...,...,...,...,...,...,...,...,...,...,...
5730,73.038,-14,-0.119695,-0.113570,-0.986294,-102.301,10.814,259.625,489453,489455,5730
5731,79.484,14,-0.220923,0.896138,-0.384876,51.252,56.680,72.604,489455,489817,5731
5732,35.602,14,-0.665612,0.267922,0.696548,191.109,-60.389,-7.956,489817,489820,5732
5733,10.022,14,0.868500,-0.195664,0.455438,-126.753,61.647,-71.618,489820,489897,5733


Add the index of `mc_info` to all rows in `mc_hits` whose indices fall within `nu.hits.start` and `nu.hits.end`.

In [63]:
bins = pd.concat([pd.Series([0]), info["nu.hits.end"]])
hits["event_id"] = pd.cut(hits.id, bins=bins, right=False,
                          labels=info["event_id"], include_lowest=True)
hits.iloc[0:20]

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time,label,id,event_id
0,40,1231,-55.897,101.8,169.059,-0.478,0.827,0.296,28,27286567.0,1,0,0
1,93,2879,-26.344,86.85,178.511,0.0,0.83,0.558,27,27287009.0,1,1,0
2,187,5769,-74.918,65.363,139.831,0.0,0.531,-0.847,25,27289060.0,1,2,0
3,196,6071,-74.774,65.34,56.111,0.719,0.415,0.558,30,27287235.0,1,3,0
4,221,6842,-65.186,50.797,160.359,-0.478,0.827,0.296,27,27286697.0,1,4,0
5,1719,53264,2.53,104.211,121.731,0.0,-0.531,-0.847,21,66714426.0,1,5,1
6,50,1525,-55.801,101.529,73.931,0.0,-0.531,-0.847,26,73258815.0,1,6,2
7,363,11225,1.309,33.178,178.231,0.0,0.531,-0.847,26,73259354.0,1,7,2
8,439,13580,-47.706,12.601,139.831,0.46,0.266,-0.847,26,73259217.0,1,8,2
9,1578,48898,21.741,67.463,94.289,-0.831,-0.0,-0.556,24,73259075.0,1,9,2


## Convert global `pmt_id` to local `pmt_id`

The `pmt_id` in the hits dataset follows the global numbering scheme, to convert it to the local numbering scheme (ie. between 1-31), we apply the following formula: `pmt_id - 31 * (dom_id - 1)`. Additionally, ofset the `noise.pmt_id` by 1 so that all pmt_ids lie in 1-31.

In [52]:
noise.pmt_id = noise.pmt_id + 1

In [64]:
noise.pmt_id.describe()

count    4.533098e+07
mean     1.601094e+01
std      8.942334e+00
min      1.000000e+00
25%      8.000000e+00
50%      1.600000e+01
75%      2.400000e+01
max      3.100000e+01
Name: pmt_id, dtype: float64

In [65]:
hits.pmt_id = hits.pmt_id - 31 * (hits.dom_id - 1)

In [66]:
hits.pmt_id.describe()

count    489906.000000
mean         16.221165
std           8.932694
min           1.000000
25%           9.000000
50%          16.000000
75%          24.000000
max          31.000000
Name: pmt_id, dtype: float64

## Combine `noise` and `hits`

In [74]:
noise["event_id"] = np.nan
hits = hits.drop(columns=["id"])
print("shape of noise: {0}, shape of hits: {1}".format(noise.shape, hits.shape))

shape of noise: (45330980, 12), shape of hits: (489906, 12)


In [75]:
df = pd.concat([hits, noise])
print("df shape: {0}".format(df.shape))

df shape: (45820886, 12)


In [76]:
df

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time,label,event_id
0,40.0,22.0,-55.897,101.800,169.059,-0.478,0.827,0.296,28.0,27286567.0,1,0.0
1,93.0,27.0,-26.344,86.850,178.511,0.000,0.830,0.558,27.0,27287009.0,1,0.0
2,187.0,3.0,-74.918,65.363,139.831,0.000,0.531,-0.847,25.0,27289060.0,1,0.0
3,196.0,26.0,-74.774,65.340,56.111,0.719,0.415,0.558,30.0,27287235.0,1,0.0
4,221.0,22.0,-65.186,50.797,160.359,-0.478,0.827,0.296,27.0,27286697.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
45330975,2070.0,31.0,76.892,-77.203,38.011,0.719,-0.415,0.558,24.0,56042290.0,0,
45330976,2070.0,31.0,76.892,-77.203,38.011,0.719,-0.415,0.558,25.0,61925980.0,0,
45330977,2070.0,31.0,76.892,-77.203,38.011,0.719,-0.415,0.558,28.0,63658656.0,0,
45330978,2070.0,31.0,76.892,-77.203,38.011,0.719,-0.415,0.558,25.0,63658656.0,0,


In [77]:
df.isna().any().any()

True

## Drop rows and columns
- drop rows with negative `time`
- drop cols that are not required

In [98]:
df = df[df.time >= 0.0]
df = df.drop(columns=["dom_id", "pmt_id", "dir_x", "dir_y", "dir_z", "tot"])
df = df.sort_values(by=['time'])
df.shape

(45820216, 6)

In [103]:
df.describe()

Unnamed: 0,pos_x,pos_y,pos_z,time,label,event_id
count,45820220.0,45820220.0,45820220.0,45820220.0,45820220.0,489906.0
mean,0.0116844,-0.01597919,117.1977,50051580.0,0.01069192,2862.004542
std,51.25232,62.2267,48.65413,28911250.0,0.1028475,1667.611022
min,-94.627,-115.6,37.7,0.0,0.0,0.0
25%,-45.018,-57.925,74.041,25004660.0,0.0,1392.25
50%,1.309,-4.184,121.7,50042920.0,0.0,2887.0
75%,40.452,48.541,160.241,75092290.0,0.0,4304.75
max,96.243,105.024,196.611,101591400.0,1.0,5734.0


# Adding timeslices
It is valuable to visualize the dataset using timeslices. To start with, 15 microseconds (which is 15000 nanoseconds) is used.

In [107]:
# NOTE we add 15000 to the upper limit to make it larger than df.time.max()
timeslices = list(range(0, math.ceil(df.time.max())+15000, 15000))
len(timeslices)

6774

In [108]:
df['timeslice'] = pd.cut(df.time, bins=timeslices, include_lowest=True, labels=False)

In [109]:
df

Unnamed: 0,pos_x,pos_y,pos_z,time,label,event_id,timeslice
7011482,-17.661,32.245,65.231,0.0,0,,0
44964934,76.840,-77.173,186.931,0.0,0,,0
10507196,-73.403,30.509,94.511,0.0,0,,0
7967768,1.453,33.155,169.111,0.0,0,,0
32473730,49.456,47.904,140.111,0.0,0,,0
...,...,...,...,...,...,...,...
11438019,-57.230,-5.401,196.389,101502104.0,0,,6766
8460308,0.724,66.341,121.789,101516467.0,0,,6767
2062008,-26.436,86.737,160.131,101545421.0,0,,6769
16221364,-26.931,-21.994,178.511,101581891.0,0,,6772


In [110]:
df.to_csv("../data/data.csv", index=False)