In [1]:
import os, glob
import time
import pyarrow as pa
import pyarrow.parquet as pq
import torch
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import mplhep as hep
plt.style.use([hep.style.ROOT, hep.style.firamath])
#from skimage.transform import rescale
plt.rcParams["figure.figsize"] = (5,5)
from torch.utils.data import *
import pandas as pd
import pickle
import numpy as np

In [2]:
hcal_scale  = 1
ecal_scale  = 0.2
pt_scale    = 0.02
dz_scale    = 10
m0_scale    = 14
def transform_y(y):
    return y/m0_scale

def inv_transform(y):
    return y*m0_scale

In [3]:
class ParquetDataset(Dataset):
    def __init__(self, filename, label):
        self.parquet = pq.ParquetFile(filename)
        #self.cols = None # read all columns
        #self.cols = ['X_jet.list.item.list.item.list.item','am','apt','iphi','ieta']
        self.cols = ['X_jet.list.item.list.item.list.item','am','iphi','ieta']
        self.label = label
    def __getitem__(self, index):
        data = self.parquet.read_row_group(index, columns=self.cols).to_pydict()
        data['X_jet'] = np.float32(data['X_jet'][0])
        data['X_jet'][0] = pt_scale   * data['X_jet'][0] #Track pT
        data['X_jet'][1] = dz_scale   * data['X_jet'][1] #Track dZ
        data['X_jet'][2] = dz_scale   * data['X_jet'][2] #Track d0
        data['X_jet'][3] = ecal_scale * data['X_jet'][3] #ECAL
        data['X_jet'][4] = hcal_scale * data['X_jet'][4] #HCAL
        #data['X_jet'] = np.float32(data['X_jet'][0])/ecal_scale
        data['am'] = transform_y(np.float32(data['am']))
        #data['apt'] = np.float32(data['apt'])
        data['iphi'] = np.float32(data['iphi'])/360.
        data['ieta'] = np.float32(data['ieta'])/140.
        data['label'] = self.label
        # Preprocessing
        #data_dict['X_jet'] = data_dict['X_jet'][:, 20:105, 20:105]
        # High Value Suppressuib
        data['X_jet'][1][data['X_jet'][1] < -20] = 0
        data['X_jet'][1][data['X_jet'][1] >  20] = 0
        data['X_jet'][2][data['X_jet'][2] < -10] = 0
        data['X_jet'][2][data['X_jet'][2] >  10] = 0
        # Zero-Suppression
        data['X_jet'][0][data['X_jet'][0] < 1.e-3] = 0.
        # data['X_jet'][1][data['X_jet'][1] < 1.e-4] = 0.
        # data['X_jet'][2][data['X_jet'][2] < 1.e-4] = 0.
        data['X_jet'][3][data['X_jet'][3] < 1.e-3] = 0.
        data['X_jet'][4][data['X_jet'][4] < 1.e-3] = 0.
        data['label']=self.label
        indices = [0,1,2,3,4,5,6,7,8,9,10,11,12]
        newdata = [data['X_jet'][index,:,:] for index in indices]
        data['X_jet'] = np.reshape(newdata, (len(indices),125,125))
        return dict(data)
    def __len__(self):
        return self.parquet.num_row_groups

In [4]:
def label(mass):
    mass_ = {'/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To17p2_dataset_2_unbaised_v2_train/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To3p6_dataset_2_unbaised_unphysical_0003_train.parquet':-1,'/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To17p2_dataset_2_unbaised_v2_valid/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To3p6_dataset_2_unbaised_unphysical_0009_train.parquet':-2}.get(mass, None)
    return mass_
train_decays = glob.glob('/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To17p2_dataset_2_unbaised_v2_train/*unphysical_0003_train*.parquet*')
dset_train = ConcatDataset([ParquetDataset('%s'%d,label(d)) for i,d in enumerate(train_decays)])
val_decays = glob.glob('/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To17p2_dataset_2_unbaised_v2_valid/*unphysical_0009_train*.parquet*')
dset_val = ConcatDataset([ParquetDataset('%s'%d, label(d)) for i,d in enumerate(val_decays)])


In [5]:
n_train=20
n_val =10
BATCH_SIZE=5
idxs_train_t = np.random.permutation(len(dset_train))
idxs_val_t   = np.random.permutation(len(dset_val))
idxs_train = np.random.permutation(n_train)
idxs_val   = np.random.permutation(n_val)
train_sampler = RandomSampler(dset_train, replacement=True, num_samples=n_train)
train_loader  = DataLoader(dataset=dset_train, batch_size=BATCH_SIZE, num_workers=4, pin_memory=True, sampler=train_sampler)

# Val dataset
val_sampler   = RandomSampler(dset_val, replacement=True, num_samples=n_val)
val_loader    = DataLoader(dataset=dset_val, batch_size=BATCH_SIZE, num_workers=4, pin_memory=True, sampler=val_sampler)

In [6]:
for i, data in enumerate(val_loader):
    am , label = data['am'], data['label']
    print(i, am, label)
    

0 tensor([[0.2122],
        [0.0893],
        [0.2369],
        [0.2387],
        [0.0896]]) tensor([-2, -2, -2, -2, -2])
1 tensor([[0.0935],
        [0.2139],
        [0.1979],
        [0.1592],
        [0.1562]]) tensor([-2, -2, -2, -2, -2])


In [7]:
for i, data in enumerate(train_loader):
    am , label = data['am'], data['label']
    print(i, am, label)

0 tensor([[0.1353],
        [0.2219],
        [0.2427],
        [0.0944],
        [0.0888]]) tensor([-1, -1, -1, -1, -1])
1 tensor([[0.2231],
        [0.1373],
        [0.1843],
        [0.1702],
        [0.1983]]) tensor([-1, -1, -1, -1, -1])
2 tensor([[0.0978],
        [0.2438],
        [0.1766],
        [0.1856],
        [0.1562]]) tensor([-1, -1, -1, -1, -1])
3 tensor([[0.2316],
        [0.2253],
        [0.1387],
        [0.2538],
        [0.1307]]) tensor([-1, -1, -1, -1, -1])


In [8]:
train_decays = glob.glob('/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To17p2_dataset_2_unbaised_v2_train/*unphysical_0003_train*.parquet*')
dset_train = ConcatDataset([ParquetDataset('%s'%d,i) for i,d in enumerate(train_decays)])
val_decays = glob.glob('/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To17p2_dataset_2_unbaised_v2_valid/*unphysical_0009_train*.parquet*')
dset_val = ConcatDataset([ParquetDataset('%s'%d, i) for i,d in enumerate(val_decays)])


In [9]:
n_train=20
n_val =5
BATCH_SIZE=2
idxs_train_t = np.random.permutation(len(dset_train))
idxs_val_t   = np.random.permutation(len(dset_val))
idxs_train = np.random.permutation(n_train)
idxs_val   = np.random.permutation(n_val)
train_sampler = RandomSampler(dset_train, replacement=True, num_samples=n_train)
train_loader  = DataLoader(dataset=dset_train, batch_size=BATCH_SIZE, num_workers=4, pin_memory=True, sampler=train_sampler)

# Val dataset
val_sampler   = RandomSampler(dset_val, replacement=True, num_samples=n_val)
val_loader    = DataLoader(dataset=dset_val, batch_size=BATCH_SIZE, num_workers=4, pin_memory=True, sampler=val_sampler)

In [19]:
for i, data in enumerate(val_loader):
    am , label = data['X_jet'], data['label']
    print(i, am, label)
    break
    

0 tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         ...,

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0.

In [3]:
class ParquetDataset(Dataset):
    def __init__(self, filename, label):
        self.parquet = pq.ParquetFile(filename)
        #self.cols = None # read all columns
        #self.cols = ['X_jet.list.item.list.item.list.item','am','apt','iphi','ieta']
        self.cols = ['X_jet.list.item.list.item.list.item','am','iphi','ieta']
        self.label = label
    def __getitem__(self, index):
        data = self.parquet.read_row_group(index, columns=self.cols).to_pydict()
        return dict(data)
    def __len__(self):
        return self.parquet.num_row_groups

In [7]:

val_decays = glob.glob('/pscratch/sd/b/bbbam/IMG_aToTauTau_Hadronic_tauDR0p4_m1p2To17p2_dataset_2_unbaised_v2_valid/*unphysical_0009_train*.parquet*')
dset_val = ConcatDataset([ParquetDataset('%s'%d, i) for i,d in enumerate(val_decays)])
n_val =5
BATCH_SIZE=2
# idxs_train_t = np.random.permutation(len(dset_train))
idxs_val_t   = np.random.permutation(len(dset_val))
# idxs_train = np.random.permutation(n_train)
idxs_val   = np.random.permutation(n_val)
# Val dataset
val_sampler   = RandomSampler(dset_val, replacement=True, num_samples=n_val)
val_loader    = DataLoader(dataset=dset_val, batch_size=BATCH_SIZE, num_workers=1, pin_memory=True, sampler=val_sampler)

In [8]:
len(val_loader)

3

In [None]:
for i, data in enumerate(val_loader):
    x , label = np.float32(data['X_jet'][0]), transform_y(np.float32(data['am']))
    print(i, x, label)
    break
    

Traceback (most recent call last):
  File "/global/homes/b/bbbam/.conda/envs/Pytorch_VEN/lib/python3.8/multiprocessing/queues.py", line 239, in _feed
    obj = _ForkingPickler.dumps(obj)
  File "/global/homes/b/bbbam/.conda/envs/Pytorch_VEN/lib/python3.8/multiprocessing/reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
  File "/global/homes/b/bbbam/.conda/envs/Pytorch_VEN/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 358, in reduce_storage
    fd, size = storage._share_fd_cpu_()
RuntimeError: unable to mmap 16 bytes from file </torch_324830_602622697_63629>: Cannot allocate memory (12)
Traceback (most recent call last):
  File "/global/homes/b/bbbam/.conda/envs/Pytorch_VEN/lib/python3.8/multiprocessing/queues.py", line 239, in _feed
    obj = _ForkingPickler.dumps(obj)
  File "/global/homes/b/bbbam/.conda/envs/Pytorch_VEN/lib/python3.8/multiprocessing/reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
  File "/global/homes/b/bbbam

In [None]:
data[:, 0] *= pt_scale
data[:, 1] *= dz_scale
data[:, 2] *= dz_scale
data[:, 3] *= ecal_scale
data[:, 4] *= hcal_scale

# Transform am, iphi, and ieta
target = transform_y(target)  # Assuming transform_y is a PyTorch compatible function
iphi /= 360.
ieta /= 140.

# High Value Suppression
data[:, 1].clamp_(-20, 20)  # Clamps values to be between -20 and 20
data[:, 2].clamp_(-10, 10)

# Zero-Suppression
# data[data < 1.e-3] = 0.
data[:, 0][data[:, 0]<1.e-3] = 0.
data[:, 3][data[:, 3]<1.e-3] = 0.
data[:, 4][data[:, 4]<1.e-3] = 0.

indices = [0,1,2,3,4,5,6,7,8,9,10,11,12]
data = data[:, indices, :, :].reshape(len(indices), 125, 125)