In [1]:
import ROOT

Welcome to JupyROOT 6.14/04


In [2]:
import gc
import numpy as np
import pandas as pd

In [19]:
# Base directory; per-class files are addressed as path_dir + <class name> + '/...'.
path_dir = '/home/scanner-ml/Artem/'

# Sub-directories (relative to path_dir) that are read, one per data set.
class_names = [
    'C30keV',
    'C60keV',
    'C100keV/C1',
    'C100keV/C2',
    'gamma',
]

# Keys used for the saved 'bad_edge_<key>.txt' index files.
class_keys = [
    'C30keV',
    'C60keV',
    'C100keV',
    'gamma',
]

# Per-cluster feature names; all but the last are read from the tree's
# 'cl.<name>' leaves, the last one ('eps') is computed in get_pol_feat.
feat_names = [
    'x', 'y', 'z',
    'lx', 'ly',
    'phi', 'npx', 'vol',
    'eps',
]

# Column names assigned to the header-less 'yandex_bfcl.txt' id tables.
id_header = [
    'HeaderID', 'ViewID', 'GrainID',
    'pol0', 'pol1', 'pol2', 'pol3',
    'pol4', 'pol5', 'pol6', 'pol7',
    'tr_flag',
]

* Example of loading the libDMRoot in PyROOT

In [4]:
# Open the C60keV track file read-only. `f` is not referenced again by the
# visible top-level cells (get_pol_feat opens its own handle).
f = ROOT.TFile.Open(path_dir+'C60keV/dm_tracks.dm.root','read')



In [5]:
# Ask ROOT to load the libDMRoot shared library; the returned status code is
# displayed as the cell output because this is the cell's last expression.
ROOT.gSystem.Load('libDMRoot')

0

Error in <TCling::RegisterModule>: cannot find dictionary module DMRootCint_rdict.pcm


In [6]:
# Instantiate DMRRun (presumably provided by libDMRoot -- confirm) on a bare file name.
# NOTE(review): the name is relative, so it resolves against the current
# working directory rather than path_dir, and the library reports
# "Open new file" -- confirm that creating/opening a file there is intended.
arun = ROOT.DMRRun("dm_tracks.dm.root")

DMRRun::OpenNew : Open new file dm_tracks.dm.root




In [7]:
def get_pol_feat(id_frame, n_pol, path_dir, class_name, feat_names):
    """
    Builds a feature table from the 'Vdmr' tree of
    path_dir + class_name + '/dm_tracks.dm.root'.

    Arguments:
    id_frame -- pandas DataFrame whose columns, after dropping 'ViewID', 'GrainID'
                and 'tr_flag', are: the tree entry number followed by the
                cluster ids of that entry (n_pol of them)
    n_pol -- int, number of cluster ids per row of id_frame
    path_dir -- str, base directory (concatenated as-is, so it must end with '/')
    class_name -- str, sub-directory of path_dir holding the ROOT file
    feat_names -- list of str; every name but the last is read from the
                  'cl.<name>' leaf, the last one is computed as (lx+eps)/(ly+eps)

    Returns:
    pandas DataFrame with one row per row of id_frame and
    len(feat_names)*n_pol columns named <feature><cluster position>

    Raises:
    IOError -- if the ROOT file cannot be opened
    ValueError -- if a row does not yield len(feat_names)*n_pol values
    """
    n_cols = len(feat_names)*n_pol
    feat_array = [name+str(i) for i in range(n_pol) for name in feat_names]

    file_path = path_dir+class_name+'/dm_tracks.dm.root'
    f = ROOT.TFile.Open(file_path,'read')
    if not f or f.IsZombie():
        raise IOError('cannot open ROOT file: '+file_path)

    rows = []
    try:
        t = f.Get('Vdmr')
        # Leaf handles do not depend on the current entry, so look them up once
        # instead of once per cluster.
        leaves = [t.GetLeaf('cl.'+name) for name in feat_names[:-1]]
        lx_leaf = t.GetLeaf('cl.lx')
        ly_leaf = t.GetLeaf('cl.ly')

        eps = 1e-3  # keeps the lx/ly ratio finite when ly is zero
        for hdr, *cl_ids in id_frame.drop(['ViewID','GrainID','tr_flag'], axis=1).values:
            t.GetEntry(int(hdr))
            pol_feat = []
            for cl_id in cl_ids:
                idx = int(cl_id)
                pol_feat.extend(leaf.GetValue(idx) for leaf in leaves)
                pol_feat.append((lx_leaf.GetValue(idx)+eps)/(ly_leaf.GetValue(idx)+eps))
            # Collect rows in a list; stacking onto a NumPy array inside the loop
            # copies the whole table on every iteration.
            rows.append(pol_feat)
    finally:
        # All values are plain Python floats by now, so the file can be released.
        f.Close()

    if rows:
        all_feat = np.array(rows, dtype=float)
    else:
        all_feat = np.zeros((0, n_cols))
    return pd.DataFrame(all_feat, columns=feat_array)

In [8]:
%%time
# One id table per class directory, read without a header row and ordered by
# 'HeaderID', then 'GrainID'.
pol_ids = {}
for name in class_names:
    pol_ids[name] = (
        pd.read_csv(path_dir+name+'/yandex_bfcl.txt', header=None, names=id_header)
        .sort_values(by=['HeaderID','GrainID'])
    )

CPU times: user 505 ms, sys: 72 ms, total: 577 ms
Wall time: 574 ms


In [9]:
%%time
# Feature table per class directory, built from that class's id table.
feat_data = {}
for name in class_names:
    feat_data[name] = get_pol_feat(
        id_frame=pol_ids[name],
        n_pol=8,
        path_dir=path_dir,
        class_name=name,
        feat_names=feat_names,
    )

CPU times: user 22h 24min 8s, sys: 1h 39min 6s, total: 1d 3min 15s
Wall time: 4h 6min 22s


In [13]:
# Sanity check: (rows, columns) of the C60keV feature table.
feat_data['C60keV'].shape

(127051, 72)

In [15]:
def clean_zero_images(data, bad = None, class_name='C100keV'):
    """
    Removes the entries listed in 'bad', or, if 'bad' is not given, every entry
    that contains at least one zero value.

    Arguments:
    data -- NumPy array with data to be cleaned; entries are indexed along axis 0
    bad -- indices (list or array) of entries to be removed. If None, entries
           containing a zero are detected, removed, and their indices are saved
           to 'bad_edge_<class_name>.txt' in the working directory
    class_name -- str, used only when 'bad' is None: names the saved index file
                  and the printed summary

    Returns:
    data -- NumPy array with the 'bad' entries removed
    """
    n_items = data.shape[0]
    if bad is None:
        # An entry is kept only if all of its values are non-zero; reduce over
        # every axis except the first in a single vectorised call.
        entry_axes = tuple(range(1, data.ndim))
        bad = np.flatnonzero(~np.all(data, axis=entry_axes))
        np.savetxt('bad_edge_'+class_name+'.txt',bad,fmt='%d')
        print(class_name+' bad samples: ',len(bad))
    else:
        bad = np.asarray(bad)
        if bad.size == 0:
            # An empty array defaults to float dtype (e.g. np.array([]) or an
            # empty index file), which NumPy rejects as an index.
            bad = bad.astype(np.intp)

    mask = np.ones(n_items,dtype=bool)
    mask[bad] = False
    return data[mask,...]

In [26]:
# Previously saved indices of entries to drop, one file per class key.
bads = {}
for k in class_keys:
    # Builtin int replaces the np.int alias (deprecated in NumPy 1.20, removed
    # in 1.24); ndmin=1 keeps a file holding a single index as a 1-D array.
    bads[k] = np.loadtxt('/home/scanner-ml/Artem/Python/NEWS/data/bad_edge_'+k+'.txt', dtype=int, ndmin=1)

In [28]:
# Drop the saved bad indices from each feature table. The 'C100keV' index list
# is applied to the 'C100keV/C2' table; every other key maps to itself.
# Note: each table is replaced by a NumPy array, so this cell cannot be re-run
# as-is (the replaced entries no longer have a `.values` attribute).
for k in class_keys:
    if k != 'C100keV':
        feat_data[k] = clean_zero_images(feat_data[k].values, bads[k], class_name=k)
        continue
    feat_data['C100keV/C2'] = clean_zero_images(feat_data['C100keV/C2'].values, bads[k])

* **TODO (must be fixed):** for C100keV/C1 the rows removed below are chosen at random, because the correct bad-image indices were not saved for this set.

In [43]:
# Keep-mask for C100keV/C1: True = keep, False = drop.
rand_bad = np.ones(feat_data['C100keV/C1'].shape[0], dtype=bool)
# Draw exactly 32 distinct row indices. Sampling with replacement (randint) can
# repeat an index and then drops fewer than 32 rows. The fixed seed makes the
# selection reproducible across kernel restarts.
rand_bad[np.random.RandomState(0).choice(feat_data['C100keV/C1'].shape[0], size=32, replace=False)] = False
feat_data['C100keV/C1'] = (feat_data['C100keV/C1'].values)[rand_bad,...]

In [47]:
import h5py

# Store one dataset per class key; the two C100keV parts are stacked (C1 first).
with h5py.File('/home/scanner-ml/Artem/Python/NEWS/data/features.h5','a') as datafile:
    datasets = {
        'C30keV': feat_data['C30keV'],
        'C60keV': feat_data['C60keV'],
        'C100keV': np.vstack((feat_data['C100keV/C1'],feat_data['C100keV/C2'])),
        'gamma': feat_data['gamma'],
    }
    for key, values in datasets.items():
        # The file is opened in append mode, where create_dataset raises if the
        # name already exists; drop a stale dataset first so the cell can be re-run.
        if key in datafile:
            del datafile[key]
        datafile.create_dataset(key, data=values)

In [48]:
# List name and shape of every top-level dataset, first for the feature file,
# then (after a blank separator) for the sample file.
h5_paths = (
    '/home/scanner-ml/Artem/Python/NEWS/data/features.h5',
    '/home/scanner-ml/Artem/Python/NEWS/data/samples.h5',
)
for pos, h5_path in enumerate(h5_paths):
    if pos > 0:
        print('\n')
    with h5py.File(h5_path,'r') as datafile:
        for k in datafile.keys():
            print(k+'\t',datafile[k].shape)

C100keV	 (149500, 72)
C30keV	 (116618, 72)
C60keV	 (124465, 72)
gamma	 (128857, 72)


C100keV	 (149500, 32, 32, 8)
C30keV	 (116618, 32, 32, 8)
C60keV	 (124465, 32, 32, 8)
gamma	 (128857, 32, 32, 8)
