# Combine

In this notebook we add position data to the `noise` dataset and combine `mc_hits` and `mc_info`.

In [58]:
import pandas as pd
import numpy as np

In [2]:
# Read the noise hits from CSV into a dataframe.
noise = pd.read_csv(filepath_or_buffer="../data/noise.csv")

## noise

First we handle the `noise` dataset. Add a column containing the index of the corresponding row in the positions array. The formula was provided to us by Roel.

In [None]:
# Row index into the positions array: 31 * (dom id - 1) + pmt id.
noise["pos_idx"] = noise["dom id"].sub(1).mul(31).add(noise["pmt id"])

Next, convert the numpy array to a pandas dataframe. **NOTE**: run `notebooks/parse_detx.py` first so that `positions` is in memory.

In [3]:
# `positions` is not created in this cell; it must already be in memory
# (it comes from notebooks/parse_detx.py).
# !python notebooks/parse_detx.py
# Wrap it in a dataframe and expose the row index as a `pos_idx` column.
pos = pd.DataFrame(positions).assign(pos_idx=lambda frame: frame.index)

Finally, combine the two, clean up and save to disk.

In [None]:
# Attach the position columns to every noise row via `pos_idx`, drop the
# helper column again, and switch to the short column names.
df = (
    noise
    .merge(pos, on='pos_idx')
    .drop(columns=['pos_idx'])
    .rename(columns={
        'dom id': 'dom_id',
        'pmt id': 'pmt_id',
        'time-over-threshold': 'tot',
        'x': 'pos_x',
        'y': 'pos_y',
        'z': 'pos_z',
        'dx': 'dir_x',
        'dy': 'dir_y',
        'dz': 'dir_z',
    })
)

## mc_hits and mc_info

First things first, align the column names of `mc_hits` with those of `noise`.

In [113]:
# Both tables are read from the same HDF5 file, each under its own key.
events_file = "../data/events.h5"
mc_info, mc_hits = (
    pd.read_hdf(events_file, key=table_key)
    for table_key in ("/data/mc_info", "/data/mc_hits")
)

In [114]:
# Map the "h."-prefixed column names of mc_hits to the short names.
hit_column_names = {
    'h.dom_id': 'dom_id',
    'h.pmt_id': 'pmt_id',
    'h.pos.x': 'pos_x',
    'h.pos.y': 'pos_y',
    'h.pos.z': 'pos_z',
    'h.dir.x': 'dir_x',
    'h.dir.y': 'dir_y',
    'h.dir.z': 'dir_z',
    'h.tot': 'tot',
    'h.t': 'time',
}
mc_hits = mc_hits.rename(columns=hit_column_names)

In [115]:
# Display mc_hits to inspect its columns.
mc_hits

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time
0,40,1231,-55.897,101.800,169.059,-0.478,0.827,0.296,28,27286567.0
1,93,2879,-26.344,86.850,178.511,0.000,0.830,0.558,27,27287009.0
2,187,5769,-74.918,65.363,139.831,0.000,0.531,-0.847,25,27289060.0
3,196,6071,-74.774,65.340,56.111,0.719,0.415,0.558,30,27287235.0
4,221,6842,-65.186,50.797,160.359,-0.478,0.827,0.296,27,27286697.0
...,...,...,...,...,...,...,...,...,...,...
489901,1934,59941,77.988,-40.148,130.541,0.000,-0.955,-0.296,18,72056356.0
489902,1981,61393,70.237,-59.962,196.389,0.415,-0.720,-0.556,29,72056427.0
489903,1995,61845,70.298,-59.901,65.511,0.719,-0.415,0.558,25,72056374.0
489904,2012,62366,88.385,-60.262,74.159,0.478,-0.827,0.296,24,72056345.0


In [124]:
# Keep each frame's row index as an explicit column.
mc_hits = mc_hits.assign(id=mc_hits.index)
mc_info = (
    mc_info
    .assign(event_id=mc_info.index)
    # keep only the first row for every distinct `nu.hits.end` value
    .drop_duplicates(subset='nu.hits.end')
)

In [126]:
# Display mc_info to inspect its columns.
mc_info

Unnamed: 0,nu_E,type,nu_dir.x,nu.dir.y,nu.dir.z,nu.pos.x,nu.pos.y,nu.pos.z,nu.hits.start,nu.hits.end,event_id
0,15.540,-14,-0.630831,0.436518,0.641486,-85.337,154.304,35.735,0,5,0
1,11.458,14,0.070157,0.942066,-0.328009,19.081,169.073,-92.435,5,6,1
2,11.485,14,0.367514,0.049922,-0.928677,-61.314,130.036,-64.078,6,11,2
3,11.571,14,-0.177418,0.344584,0.921838,48.457,76.398,112.710,11,15,3
4,13.148,-14,-0.997387,0.065683,0.030071,138.488,-166.544,52.433,15,17,4
...,...,...,...,...,...,...,...,...,...,...,...
5730,73.038,-14,-0.119695,-0.113570,-0.986294,-102.301,10.814,259.625,489453,489455,5730
5731,79.484,14,-0.220923,0.896138,-0.384876,51.252,56.680,72.604,489455,489817,5731
5732,35.602,14,-0.665612,0.267922,0.696548,191.109,-60.389,-7.956,489817,489820,5732
5733,10.022,14,0.868500,-0.195664,0.455438,-126.753,61.647,-71.618,489820,489897,5733


Add the `event_id` of the matching `mc_info` row to every row in `mc_hits` whose index falls within the half-open range [`nu.hits.start`, `nu.hits.end`).

In [133]:
# Bin edges: 0 followed by every `nu.hits.end`; each left-closed interval
# [edge_k, edge_k+1) is labelled with the `event_id` of the matching mc_info row.
first_edge = pd.Series([0])
event_ends = mc_info["nu.hits.end"]
bins = pd.concat([first_edge, event_ends])
mc_hits["event_id"] = pd.cut(
    mc_hits["id"],
    bins=bins,
    labels=mc_info["event_id"],
    right=False,
    include_lowest=True,
)

In [134]:
# Show the first 20 hits together with their assigned event_id.
mc_hits.head(20)

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time,id,event_id
0,40,1231,-55.897,101.8,169.059,-0.478,0.827,0.296,28,27286567.0,0,0
1,93,2879,-26.344,86.85,178.511,0.0,0.83,0.558,27,27287009.0,1,0
2,187,5769,-74.918,65.363,139.831,0.0,0.531,-0.847,25,27289060.0,2,0
3,196,6071,-74.774,65.34,56.111,0.719,0.415,0.558,30,27287235.0,3,0
4,221,6842,-65.186,50.797,160.359,-0.478,0.827,0.296,27,27286697.0,4,0
5,1719,53264,2.53,104.211,121.731,0.0,-0.531,-0.847,21,66714426.0,5,1
6,50,1525,-55.801,101.529,73.931,0.0,-0.531,-0.847,26,73258815.0,6,2
7,363,11225,1.309,33.178,178.231,0.0,0.531,-0.847,26,73259354.0,7,2
8,439,13580,-47.706,12.601,139.831,0.46,0.266,-0.847,26,73259217.0,8,2
9,1578,48898,21.741,67.463,94.289,-0.831,-0.0,-0.556,24,73259075.0,9,2


# Manual inspection
Make sure the distribution before and after merging the dataframes is more or less the same.

In [6]:
# NOTE(review): `hits` is not assigned in any cell visible above this one —
# confirm where it is loaded before relying on Restart & Run All.
combined_rows = hits.shape[0] + noise.shape[0]
shape_report = "hits shape: {0}, noise shape: {1}, total shape: ({2}, 11)"
print(shape_report.format(hits.shape, noise.shape, combined_rows))

hits shape: (489906, 11), noise shape: (45330980, 11), total shape: (45820886, 11)


In [5]:
# Summary statistics of the noise frame, to compare against the merged frame.
noise.describe()

Unnamed: 0,time,dom_id,pmt_id,tot,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,label
count,45330980.0,45330980.0,45330980.0,45330980.0,45330980.0,45330980.0,45330980.0,45330980.0,45330980.0,45330980.0,45330980.0
mean,50062860.0,1035.591,15.01094,26.43602,0.008150053,0.0002416813,117.176,0.0001938109,-3.343916e-05,-0.1951633,0.0
std,28909880.0,597.5733,8.942334,2.458031,51.25542,62.22462,48.66544,0.5809644,0.5810222,0.5354744,0.0
min,-2.0,1.0,0.0,12.0,-94.627,-115.6,37.7,-0.955,-0.955,-1.0,0.0
25%,25022730.0,518.0,7.0,25.0,-45.018,-57.925,74.041,-0.478,-0.478,-0.556,0.0
50%,50064290.0,1036.0,15.0,26.0,1.309,-4.184,121.7,0.0,-0.0,-0.296,0.0
75%,75099790.0,1553.0,23.0,28.0,40.452,48.541,160.241,0.478,0.478,0.296,0.0
max,101591400.0,2070.0,30.0,40.0,96.243,105.024,196.611,0.955,0.955,0.558,0.0


In [17]:
# Summary statistics of the hits frame, to compare against the merged frame.
hits.describe()

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time,label
count,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0
mean,1048.735653,32496.026419,0.34167,-1.514543,119.205209,0.000912,-0.002101,-0.182805,26.056821,48938920.0,1.0
std,592.666757,18372.66124,50.964521,62.401296,47.553276,0.582604,0.583274,0.535609,9.273521,29056000.0,0.0
min,1.0,1.0,-94.627,-115.6,37.7,-0.955,-0.955,-1.0,1.0,28449.0,1.0
25%,537.0,16622.0,-44.772,-58.113,74.211,-0.478,-0.478,-0.556,23.0,23937850.0,1.0
50%,1056.0,32727.0,1.424,-4.581,121.789,0.0,-0.0,-0.296,26.0,48101400.0,1.0
75%,1562.0,48398.75,40.478,48.48,160.241,0.478,0.478,0.296,28.0,74346300.0,1.0
max,2070.0,64170.0,96.243,105.024,196.611,0.955,0.955,0.558,209.0,99986010.0,1.0


# Convert global `pmt_id` to local `pmt_id`

The `pmt_id` in the hits dataset follows the global numbering scheme. To convert it to the local numbering scheme (i.e. between 1 and 31), we apply the following formula: `pmt_id - 31 * (dom_id - 1)`. Additionally, offset `noise.pmt_id` by 1 so that all pmt_ids lie in 1-31.

In [3]:
# Shift noise.pmt_id from the 0-based range (0-30) to the 1-based range (1-31).
# The shift is not idempotent: running this cell twice would silently produce
# ids in 2-32. Check the precondition first so a re-run fails loudly instead.
assert noise["pmt_id"].between(0, 30).all(), (
    "noise.pmt_id is not in 0-30; has this cell already been run?"
)
noise["pmt_id"] = noise["pmt_id"] + 1

In [4]:
# Check the resulting range of the noise pmt ids.
noise["pmt_id"].describe()

count    4.533098e+07
mean     1.601094e+01
std      8.942334e+00
min      1.000000e+00
25%      8.000000e+00
50%      1.600000e+01
75%      2.400000e+01
max      3.100000e+01
Name: pmt_id, dtype: float64

In [5]:
# Convert the global pmt_id of each hit to the local (per-DOM) numbering:
# local = global - 31 * (dom_id - 1).
# The conversion is not idempotent: applying it to already-local ids yields
# values <= 0 for every dom_id > 1. Compute into a temporary and validate the
# 1-31 range before overwriting, so a re-run fails without corrupting `hits`.
PMTS_PER_DOM = 31
local_pmt_id = hits["pmt_id"] - PMTS_PER_DOM * (hits["dom_id"] - 1)
assert local_pmt_id.between(1, PMTS_PER_DOM).all(), (
    "local pmt_id outside 1-31; has this cell already been run?"
)
hits["pmt_id"] = local_pmt_id

In [6]:
# Check the resulting range of the hits pmt ids.
hits["pmt_id"].describe()

count    489906.000000
mean         16.221165
std           8.932694
min           1.000000
25%           9.000000
50%          16.000000
75%          24.000000
max          31.000000
Name: pmt_id, dtype: float64

In [7]:
# Stack hits on top of noise; columns are matched by name and the original
# row labels of both frames are kept.
frames = [hits, noise]
df = pd.concat(frames)
shape_message = "df shape: {0}"
print(shape_message.format(df.shape))

df shape: (45820886, 11)


In [8]:
# True if any cell of the combined frame is missing.
df.isnull().any(axis=0).any()

False

In [9]:
# Summary statistics of the combined frame, for comparison with the
# per-frame statistics of hits and noise.
df.describe()

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time,label
count,45820890.0,45820890.0,45820890.0,45820890.0,45820890.0,45820890.0,45820890.0,45820890.0,45820890.0,45820890.0,45820890.0
mean,1035.732,16.01318,0.01171597,-0.01595403,117.1977,0.0002014941,-5.554718e-05,-0.1950312,26.43196,50050850.0,0.01069176
std,597.5226,8.942257,51.25233,62.2267,48.65414,0.580982,0.5810463,0.5354774,2.626463,28911680.0,0.1028467
min,1.0,1.0,-94.627,-115.6,37.7,-0.955,-0.955,-1.0,1.0,-2.0,0.0
25%,518.0,8.0,-45.018,-57.925,74.041,-0.478,-0.478,-0.556,25.0,25003600.0,0.0
50%,1036.0,16.0,1.309,-4.184,121.7,0.0,0.0,-0.296,26.0,50042220.0,0.0
75%,1553.0,24.0,40.452,48.541,160.241,0.478,0.478,0.296,28.0,75091900.0,0.0
max,2070.0,31.0,96.243,105.024,196.611,0.955,0.955,0.558,209.0,101591400.0,1.0


# Drop rows with negative time

In [10]:
# Keep only rows with non-negative time, ordered by time.
non_negative_time = df["time"] >= 0.0
df = df.loc[non_negative_time].sort_values(by="time")
df.shape

(45820216, 11)

In [11]:
# Distinct label values present after filtering.
df["label"].unique()

array([0, 1])

In [12]:
# Number of distinct pmt_id values (missing values, if any, count as one).
df["pmt_id"].nunique(dropna=False)

31

In [13]:
# Display the filtered, time-sorted frame.
df

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time,label
7011482,321.0,5.0,-17.661,32.245,65.231,-0.460,-0.266,-0.847,26.0,0.0,0
36188506,1653.0,23.0,11.595,85.465,65.459,-0.955,-0.000,0.296,27.0,0.0,0
6008280,275.0,9.0,-36.464,67.166,160.189,0.415,0.720,-0.556,26.0,0.0,0
36341659,1660.0,23.0,61.660,101.635,169.059,-0.955,-0.000,0.296,26.0,0.0,0
21139713,966.0,16.0,-54.510,-78.323,94.341,-0.827,0.478,-0.296,24.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
11438019,523.0,13.0,-57.230,-5.401,196.389,0.415,-0.720,-0.556,28.0,101502104.0,0
8460308,387.0,12.0,0.724,66.341,121.789,-0.415,-0.720,-0.556,24.0,101516467.0,0
2062008,95.0,4.0,-26.436,86.737,160.131,-0.460,0.266,-0.847,23.0,101545421.0,0
16221364,741.0,30.0,-26.931,-21.994,178.511,0.000,-0.830,0.558,27.0,101581891.0,0


In [14]:
# Write the combined frame to disk without the index column.
df.to_csv(path_or_buf="../data/data.csv", index=False)