In [1]:
import ROOT
import os, re, gc, h5py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

Welcome to JupyROOT 6.14/04


In [2]:
# Scratch check of the axis order produced by `.T`: two 3x3 planes stacked on
# axis 0 become the last axis after the full transpose, giving shape (3, 3, 2).
copa = np.stack((np.zeros((3, 3)), np.ones((3, 3)))).T

In [3]:
# Show the second channel (index 1 on the last axis) of the 3-D scratch array.
copa[:, :, 1]

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [4]:
def load_images(path, name_dir='C30keV', n_pols=8):
    """
    Loads images into numpy array of shape (N,h,w,n_pols)

    Files are read from ``<path>/<name_dir>/csvs/`` and must be named
    ``<HeaderID>_gr<GrainID>_cl<ClusterID>_pol<Pol>.csv``; any other directory
    entry is ignored. Files are grouped by (HeaderID, GrainID); a group is
    kept only if it holds exactly ``n_pols`` files. Groups are ordered by
    HeaderID, then GrainID, and the files inside a group by Pol. IDs made of
    digits are compared as numbers (so header 2 comes before header 10).

    Arguments:
    path -- path to the directory with images (with or without a trailing separator)
    name_dir -- name of the particular directory with signal or bckg samples
    n_pols -- number of polarizations (channels) of each image

    Returns:
    images -- numpy uint8 array with N images with n_pols channels. Each image
              is the full transpose of the (n_pols, rows, cols) stack read
              from the CSV files, so the channel axis is last. An empty
              1-D array is returned when no complete group is found.
    """

    csv_dir = os.path.join(path, name_dir, 'csvs')

    # An ID is any run of characters that the file-name separators cannot
    # contain ('_', '.', '^' and lowercase letters).
    token = r'([^_.^a-z]+)'
    name_re = re.compile('^' + token + '_gr' + token + '_cl' + token + '_pol' + token + r'\.csv$')

    def id_key(value):
        # Numeric IDs sort by value; anything else sorts after them, as text.
        if value.isdecimal():
            return (0, int(value), '')
        return (1, 0, value)

    # (HeaderID, GrainID) -> [(Pol, file name), ...]
    groups = {}
    for fname in os.listdir(csv_dir):
        match = name_re.match(fname)
        if match is None:
            continue
        header, grain, _cluster, pol = match.groups()
        groups.setdefault((header, grain), []).append((pol, fname))

    ordered_keys = sorted(groups, key=lambda hg: (id_key(hg[0]), id_key(hg[1])))

    im_array = []
    for key in ordered_keys:
        members = groups[key]
        if len(members) != n_pols:
            # Incomplete (or over-complete) grain: skip it entirely.
            continue
        members.sort(key=lambda item: id_key(item[0]))
        tmp_im = []
        for position, (_pol, fname) in enumerate(members):
            frame = pd.read_csv(os.path.join(csv_dir, fname), header=None)
            # The first polarization of a grain is cropped differently from
            # the others: only column 32 is removed, whereas the rest lose
            # columns 0 and 33 and row 32.
            # NOTE(review): presumably the files carry different padding/border
            # layouts and every plane ends up 32x32 -- confirm against the writer.
            if position == 0:
                frame = frame.drop(32, axis=1)
            else:
                frame = frame.drop([0, 33], axis=1).drop(32, axis=0)
            tmp_im.append(frame.values)
        im_array.append(np.array(tmp_im, dtype=np.uint8).T)
    return np.array(im_array)

In [5]:
def get_pol_feat(id_frame, n_pol, path_dir, class_name, feat_names):
    """
    Reads per-polarization cluster features from a ROOT tree into a DataFrame

    The file ``path_dir + class_name + '/dm_tracks.dm.root'`` is opened
    read-only and the object named 'Vdmr' is fetched from it. For each row of
    id_frame the first column left after dropping 'ViewID', 'GrainID' and
    'tr_flag' is used as the tree entry number, and every remaining column as
    a cluster index inside that entry.

    Arguments:
    id_frame -- pandas DataFrame with the entry number, the per-polarization
                cluster indices and the columns 'ViewID', 'GrainID', 'tr_flag'
    n_pol -- number of polarizations; used to build the output column names
    path_dir -- directory prefix of the data (concatenated as-is)
    class_name -- name of the sample directory below path_dir
    feat_names -- feature names; all but the last are read from the leaf
                  'cl.<name>'. The slot of the last name is filled with
                  (cl.lx + 1e-3) / (cl.ly + 1e-3) instead of a leaf value.

    Returns:
    pandas DataFrame with one row per row of id_frame and the columns
    '<name><i>' for i in range(n_pol) and name in feat_names

    Raises:
    IOError -- if the ROOT file cannot be opened
    ValueError -- if a row yields a different number of values than there are
                  output columns
    """

    feat_array = [name + str(i) for i in range(n_pol) for name in feat_names]
    # Small offset that keeps the lx/ly ratio finite when ly is zero.
    eps = 1e-3

    f = ROOT.TFile.Open(path_dir+class_name+'/dm_tracks.dm.root','read')
    if not f or f.IsZombie():
        raise IOError('cannot open ROOT file ' + path_dir+class_name+'/dm_tracks.dm.root')

    # Rows are collected in a list and converted once at the end; stacking
    # onto a growing array inside the loop copies all previous rows each time.
    rows = []
    try:
        t = f.Get('Vdmr')
        for hdr, *cl_ids in id_frame.drop(['ViewID','GrainID','tr_flag'], axis=1).values:
            t.GetEntry(int(hdr))
            pol_feat = []
            for cl_id in cl_ids:
                cluster = int(cl_id)
                for name in feat_names[:-1]:
                    pol_feat.append(t.GetLeaf('cl.'+name).GetValue(cluster))
                pol_feat.append(
                    (t.GetLeaf('cl.lx').GetValue(cluster)+eps)
                    / (t.GetLeaf('cl.ly').GetValue(cluster)+eps)
                )
            if len(pol_feat) != len(feat_array):
                raise ValueError(
                    'row for entry %d holds %d values, expected %d'
                    % (int(hdr), len(pol_feat), len(feat_array))
                )
            rows.append(pol_feat)
    finally:
        # Release the file handle even when reading fails part-way.
        f.Close()

    if rows:
        all_feat = np.array(rows, dtype=float)
    else:
        all_feat = np.zeros((0, len(feat_array)))
    return pd.DataFrame(all_feat, columns=feat_array)

In [6]:
def clean_zero_images(data, bad = None, class_name='C100keV'):
    """
    Drops the entries of 'data' listed in 'bad'. When 'bad' is not given, the
    entries that contain at least one zero element are detected, their indices
    are written to 'bad_edge_<class_name>.txt' in the current directory, their
    count is printed, and those entries are dropped.

    Arguments:
    data -- NumPy array; entries are indexed along the first axis
    bad -- indices of the entries to remove, or None to detect them
    class_name -- str, used in the name of the saved index file and in the
                  printed message (only when 'bad' is None)

    Returns:
    data -- NumPy array without the 'bad' entries
    """
    n_items = data.shape[0]

    if bad is None:
        # `.all()` is False as soon as one element of the entry is zero.
        bad = [idx for idx in range(n_items) if not data[idx].all()]
        np.savetxt('bad_edge_'+class_name+'.txt',bad,fmt='%d')
        print(class_name+' bad samples: ',len(bad))

    keep = np.full(n_items, True, dtype=bool)
    keep[bad] = False
    return data[keep,...]

In [7]:
# Settings for the image-loading step: number of polarization channels per
# image, the sample directory names, and the data root they live under.
n_pols = 8
class_names = ['C30keV', 'C60keV', 'gamma']
path = "/home/scanner-ml/Artem/"

In [8]:
# Settings for the feature-extraction step.
path_dir = '/home/scanner-ml/Artem/'

# Sample directories to read features from, and the keys used in the HDF5 files.
class_names_ft = [
    'C30keV',
    'C60keV',
    'C100keV/C1',
    'C100keV/C2',
    'gamma',
]
class_keys = ['C30keV', 'C60keV', 'C100keV', 'gamma']

# Per-cluster feature names and the column names of the ID table.
feat_names = ['x', 'y', 'z', 'lx', 'ly', 'phi', 'npx', 'vol', 'eps']
id_header = [
    'HeaderID', 'ViewID', 'GrainID',
    'pol0', 'pol1', 'pol2', 'pol3', 'pol4', 'pol5', 'pol6', 'pol7',
    'tr_flag',
]

In [9]:
# Disabled cell: the image-loading pipeline is wrapped in a string literal, so
# nothing in it runs; only the label string on the last line is evaluated and
# echoed as the cell output.
# The quoted code loads every sample with load_images, removes images that
# contain zero pixels, and stores the results in 'samples.h5' (the two C100keV
# subsets are stacked under a single 'C100keV' key).
# NOTE(review): `getsizeof` is not imported by the imports cell; presumably
#   `sys.getsizeof` is meant -- add the import before re-enabling.
# NOTE(review): both C100keV subsets are cleaned with class_name='C100keV', so
#   the second call overwrites the 'bad_edge_C100keV.txt' written by the first.
# NOTE(review): 'samples.h5' is opened in append mode while the removal of an
#   existing file is commented out -- confirm create_dataset does not collide
#   with datasets left over from an earlier run.
'''
%%time
C30 = load_images(path, 'C30keV')
C30 = clean_zero_images(C30, class_name='C30keV')
gc.collect()
C60 = load_images(path, 'C60keV')
C60 = clean_zero_images(C60, class_name='C60keV')
gc.collect()
C100 = load_images(path, 'C100keV/C1')
C100 = clean_zero_images(C100, class_name='C100keV')
gc.collect()
C100_2 = load_images(path, 'C100keV/C2')
C100_2 = clean_zero_images(C100_2, class_name='C100keV')
gc.collect()
gamma = load_images(path, 'gamma')
gamma = clean_zero_images(gamma, class_name='gamma')
gc.collect()

#if os.path.isfile('samples.h5'):
#    os.remove('samples.h5')

with h5py.File('samples.h5','a') as datafile:
    datafile.create_dataset('C30keV', data=C30)
    datafile.create_dataset('C60keV', data=C60)
    datafile.create_dataset('C100keV', data=np.vstack((C100,C100_2)))
    datafile.create_dataset('gamma', data=gamma)
    
print('gamma samples: ',gamma.shape, '\t', getsizeof(gamma)//1024**2, 'Mb')
print('C100keV_1 samples: ',C100.shape, '\t', getsizeof(C100)//1024**2, 'Mb')
print('C100keV_2 samples: ',C100_2.shape, '\t', getsizeof(C100_2)//1024**2, 'Mb')
print('C60keV samples: ',C60.shape, '\t', getsizeof(C60)//1024**2, 'Mb')
print('C30keV samples: ',C30.shape, '\t', getsizeof(C30)//1024**2, 'Mb')
'''
'datasamples'

'datasamples'

In [10]:
# Disabled cell: the feature extraction is wrapped in a string literal and does
# not run; only the label string on the last line is evaluated and echoed.
# The quoted code reads the ID table 'yandex_bfcl.txt' of each sample, sorts it
# by HeaderID and GrainID, and passes it to get_pol_feat.
# NOTE(review): pol_ids is filled for the names in class_names, but the second
#   loop indexes it with the names in class_names_ft, which also contain
#   'C100keV/C1' and 'C100keV/C2' -- as written this looks like a KeyError;
#   confirm which list the first loop should iterate over.
'''
%%time
pol_ids = {}
for name in class_names:
    pol_ids[name] = pd.read_csv(path_dir+name+'/yandex_bfcl.txt', header=None, names=id_header)
    pol_ids[name] = pol_ids[name].sort_values(by=['HeaderID','GrainID'])
feat_data = {}
for name in class_names_ft:
    feat_data[name] = get_pol_feat(pol_ids[name], n_pol=8, path_dir=path_dir, class_name=name, feat_names=feat_names)
'''
'load_features. wall time ~4h. cpu time ~24h'

'load_features. wall time ~4h. cpu time ~24h'

In [11]:
# Disabled cell: the feature cleaning is wrapped in a string literal and does
# not run; only the label string on the last line is evaluated and echoed.
# The quoted code loads the saved 'bad_edge_<key>.txt' index lists and removes
# those rows from the feature tables, so that features match the cleaned images.
# The 'C100keV' list is applied to 'C100keV/C2' only; for 'C100keV/C1' rows are
# removed at random (see the comment inside the quoted code).
# NOTE(review): `np.int` is a deprecated alias that newer NumPy releases no
#   longer provide -- presumably plain `int` is intended; confirm.
# NOTE(review): np.random.randint draws with replacement and no seed is set, so
#   fewer than 32 distinct rows may be removed and the result is not repeatable.
'''
bads = {}
for k in class_keys:
    bads[k] = np.loadtxt('/home/scanner-ml/Artem/Python/NEWS/data/bad_edge_'+k+'.txt', dtype=np.int)
for k in class_keys:
    if k=='C100keV':
        feat_data['C100keV/C2'] = clean_zero_images(feat_data['C100keV/C2'].values, bads[k])
    else:
        feat_data[k] = clean_zero_images(feat_data[k].values, bads[k], class_name=k)

### for C100keV/C1 we removed random images, since we didn't save the right indices. Must be fixed!

rand_bad = np.ones(feat_data['C100keV/C1'].shape[0], dtype=bool)
rand_bad[np.random.randint(0, high=feat_data['C100keV/C1'].shape[0], size=32, dtype=np.int)] = False
feat_data['C100keV/C1'] = (feat_data['C100keV/C1'].values)[rand_bad,...]
'''
'cleaning samples with "zero-images"'

'cleaning samples with "zero-images"'

In [12]:
# Disabled cell: the code that writes the cleaned feature tables to
# 'features.h5' is kept as a string; assigning it to `_` means the cell has no
# displayed value. The two C100keV subsets are stacked under one 'C100keV' key.
# NOTE(review): the file is opened in append mode -- confirm create_dataset does
#   not collide with datasets left over from an earlier run.
_ = '''
with h5py.File('/home/scanner-ml/Artem/Python/NEWS/data/features.h5','a') as datafile:
    datafile.create_dataset('C30keV', data=feat_data['C30keV'])
    datafile.create_dataset('C60keV', data=feat_data['C60keV'])
    datafile.create_dataset('C100keV', data=np.vstack((feat_data['C100keV/C1'],feat_data['C100keV/C2'])))
    datafile.create_dataset('gamma', data=feat_data['gamma'])
'''

In [13]:
# Sanity check: the image file and the feature file should expose the same keys.
# Both files are opened read-only before anything is printed.
with h5py.File('samples.h5','r') as sampfile, h5py.File('features.h5','r') as featfile:
    print(list(sampfile.keys()))
    print(list(featfile.keys()))

['C100keV', 'C30keV', 'C60keV', 'gamma']
['C100keV', 'C30keV', 'C60keV', 'gamma']


In [14]:
# Disabled cell: the train/test split is wrapped in a string literal and does
# not run; only the label string on the last line is evaluated and echoed.
# The quoted code recreates 'dataset.h5' and, for every key of 'samples.h5',
# splits images and features together (15% test) and stores them under
# '<key>/images/{train,test}' and '<key>/features/{train,test}'.
# Images and features are paired by row position, so both files must hold the
# rows of a key in the same order.
# NOTE(review): train_test_split is called without random_state, so the split
#   differs on every run.
'''
%%time
if os.path.isfile('dataset.h5'):
    os.remove('dataset.h5')
with h5py.File('dataset.h5','a') as datafile:
    with h5py.File('samples.h5','r') as sampfile:
        with h5py.File('features.h5','r') as featfile:
            for k in sampfile.keys():
                datas = sampfile[k][...]
                feats = featfile[k][...]
                im_train, im_test, ft_train, ft_test = train_test_split(datas, feats, test_size=0.15)
                datafile.create_dataset(k+'/images/train', data=im_train)
                datafile.create_dataset(k+'/images/test', data=im_test)
                datafile.create_dataset(k+'/features/train', data=ft_train)
                datafile.create_dataset(k+'/features/test', data=ft_test)
                gc.collect()
    print(list(datafile.keys()))
'''
'train_test_split'

'train_test_split'

In [15]:
# Print the shape of every dataset in dataset.h5, walking its three levels:
# first-level key / second-level key / third-level key.
with h5py.File('dataset.h5','r') as datafile:
    for k in datafile.keys():
        for p in datafile[k].keys():
            group_path = k+'/'+p
            for n in datafile[group_path].keys():
                dataset_path = group_path+'/'+n
                print(dataset_path+'\t', datafile[dataset_path].shape)

C100keV/features/test	 (22425, 72)
C100keV/features/train	 (127075, 72)
C100keV/images/test	 (22425, 32, 32, 8)
C100keV/images/train	 (127075, 32, 32, 8)
C30keV/features/test	 (17493, 72)
C30keV/features/train	 (99125, 72)
C30keV/images/test	 (17493, 32, 32, 8)
C30keV/images/train	 (99125, 32, 32, 8)
C60keV/features/test	 (18670, 72)
C60keV/features/train	 (105795, 72)
C60keV/images/test	 (18670, 32, 32, 8)
C60keV/images/train	 (105795, 32, 32, 8)
gamma/features/test	 (19329, 72)
gamma/features/train	 (109528, 72)
gamma/images/test	 (19329, 32, 32, 8)
gamma/images/train	 (109528, 32, 32, 8)


* Example of loading the libDMRoot library in PyROOT

In [17]:
# Open the C60keV track file read-only; the handle stays in `f`.
# NOTE(review): the execution counter jumps from 15 to 17 here, so a cell that
#   ran in between is not part of the notebook -- confirm nothing depends on it.
f = ROOT.TFile.Open(path_dir+'C60keV/dm_tracks.dm.root','read')



In [18]:
# Load the libDMRoot shared library into the ROOT session. The recorded run
# returned 0 and printed a TCling error about a missing 'DMRootCint_rdict.pcm'.
# NOTE(review): presumably the dictionary .pcm has to be findable next to the
#   library for this message to go away -- confirm with the libDMRoot build.
ROOT.gSystem.Load('libDMRoot')

0

Error in <TCling::RegisterModule>: cannot find dictionary module DMRootCint_rdict.pcm


In [19]:
# Construct a DMRRun on a file name given relative to the working directory.
# NOTE(review): DMRRun is presumably provided by libDMRoot, so the library has
#   to be loaded first. The recorded output says "Open new file", i.e. this did
#   not reuse the C60keV file opened under path_dir -- confirm that is intended.
arun = ROOT.DMRRun("dm_tracks.dm.root")

DMRRun::OpenNew : Open new file dm_tracks.dm.root


