In [1]:
import ROOT
import os, re, gc, h5py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from data_utils import *

Welcome to JupyROOT 6.14/04


In [2]:
# Scratch check of axis order: stack a zeros and a ones 3x3 matrix along a new
# leading axis, then reverse the axes so the last axis selects the matrix.
copa = np.stack((np.zeros((3, 3)), np.ones((3, 3))), axis=0).T

In [3]:
# Index 1 on the last axis picks the all-ones matrix out of the stacked pair.
copa[:, :, 1]

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [4]:
# Settings for the three-class setup.
n_pols = 8  # presumably the number of polarizations per grain -- TODO confirm
class_names = ['C30keV', 'C60keV', 'gamma']
path = "/home/scanner-ml/Artem/"

In [5]:
# Root directory that holds one sub-directory per class.
path_dir = '/home/scanner-ml/Artem/'

# Sub-directories read from disk (C100keV is stored in two parts) and the
# class keys used once the two C100keV parts have been merged.
class_names_ft = ['C30keV','C60keV','C100keV/C1','C100keV/C2','gamma']
class_keys = ['C30keV','C60keV','C100keV','gamma']

# Column names assigned to the per-class id tables.
id_header = ['HeaderID','ViewID','GrainID','pol0','pol1','pol2','pol3','pol4','pol5','pol6','pol7','tr_flag','n_pol']

# Base feature names; each one is repeated with the suffixes 0..7.
feat_names = ['x','y','z','lx','ly','phi','npx','vol','eps']

# Feature columns grouped by suffix (x0..eps0, x1..eps1, ...), followed by the
# two trailing columns: 8 * 9 + 2 = 74 names in total.
feat_array = [base + str(idx) for idx in range(8) for base in feat_names]
feat_array = feat_array + ['tr_flag','n_pol']

In [6]:
%%time
# Read the id table of every class directory and order its rows by
# (HeaderID, GrainID).
pol_ids = {}
for name in class_names_ft:
    ids_file = path_dir+name+'/yandex_bfcl.txt'
    pol_ids[name] = (
        pd.read_csv(ids_file, header=None, names=id_header)
        .sort_values(by=['HeaderID','GrainID'])
    )

CPU times: user 814 ms, sys: 81.3 ms, total: 895 ms
Wall time: 1.28 s


In [7]:
# Preview the first ten rows of the gamma id table.
pol_ids['gamma'].iloc[:10]

Unnamed: 0,HeaderID,ViewID,GrainID,pol0,pol1,pol2,pol3,pol4,pol5,pol6,pol7,tr_flag,n_pol
0,4,3,1,81200,81322,82433,-1,-1,-1,-1,-1,-1,3
1,4,3,2,81198,81320,81464,81584,81710,-1,-1,-1,-1,5
2,4,3,5,-1,-1,-1,-1,81707,-1,-1,-1,-1,1
3,4,3,6,83502,83510,83515,83520,83532,83545,83558,83576,-1,8
4,4,3,7,82807,82829,82860,82883,82907,-1,-1,-1,-1,5
5,4,3,8,79373,79566,79766,79973,80165,80352,80530,-1,3,7
6,4,3,9,83420,83426,83436,83446,83454,83463,83473,83486,2,8
7,4,3,20,81173,81286,81428,81556,81686,80336,-1,-1,3,6
8,4,3,21,-1,81287,81430,81557,81685,81806,81914,82155,-1,7
9,4,3,22,81184,-1,82428,82476,82524,82579,82635,82165,-1,7


* Empty polarizations are loaded as zeros! Should they be interpolated instead?

In [8]:
# Peek at three file names from the C100keV/C2 csv directory.
c2_csv_dir = '/home/scanner-ml/Artem/C100keV/C2/csvs/'
os.listdir(c2_csv_dir)[:3]

['303_gr_204_pol_1_cl_436_tr_3_npol_8.csv',
 '333_gr_133_pol_6_cl_1628_tr_-1_npol_8.csv',
 '225_gr_267_pol_3_cl_2489_tr_-1_npol_8.csv']

In [9]:
# Disabled cell: the raw-image loader is kept inside a string literal, so none
# of it runs. As written, it would call load_pol_images(...) for every entry of
# class_names_ft and store each result in 'data_raw.h5' under '<name>/images'.
'''%%time
ims = {}
for name in class_names_ft:
    ims[name] = load_pol_images(pol_ids[name], path_dir, name)
    gc.collect()
    with h5py.File('data_raw.h5','a') as dfile:
        dfile.create_dataset(name+'/images', data=ims[name])
'''
# The value displayed by this cell is the note below (the recorded run time).
'load raw images. wall time ~12h (4 manual threads)'

'load raw images. wall time ~12h (4 manual threads)'

In [10]:
# Disabled cell: the feature loader is kept inside a string literal, so none of
# it runs. As written, it would re-read the id tables, call get_pol_feat(...)
# for every entry of class_names_ft and store each result in 'data_raw.h5'
# under '<name>/features'.
'''%%time
pol_ids = {}
for name in class_names_ft:
    pol_ids[name] = pd.read_csv(path_dir+name+'/yandex_bfcl.txt', header=None, names=id_header)
    pol_ids[name] = pol_ids[name].sort_values(by=['HeaderID','GrainID'])
feat_data = {}
for name in class_names_ft:
    print('loading:  ',name)
    feat_data[name] = get_pol_feat(pol_ids[name], n_pol=8, path_dir=path_dir, class_name=name, feat_names=feat_names)

with h5py.File('data_raw.h5', 'a') as datafile:
    for name in class_names_ft:
        datafile.create_dataset(name+'/features', data=feat_data[name])
'''
        
# The value displayed by this cell is the note below (the recorded run time).
'load features. wall time ~14h. cpu time ~100h (paralellized automatically)'

'load features. wall time ~14h. cpu time ~100h (paralellized automatically)'

In [11]:
%%time
# Read the cached images and features of every class from data_raw.h5 into
# memory; the features are wrapped in a DataFrame with the feat_array columns.
ims = {}
feats = {}
with h5py.File('data_raw.h5', 'r') as dfile:
    for name in class_names_ft:
        images_dset = dfile[name+'/images']
        features_dset = dfile[name+'/features']
        ims[name] = images_dset[...]
        feats[name] = pd.DataFrame(data=features_dset[...], columns=feat_array)
        print(name,'   \timgs: ',ims[name].shape,'\tfeats: ',feats[name].shape)


C30keV    	imgs:  (182179, 32, 32, 8) 	feats:  (182179, 74)
C60keV    	imgs:  (189537, 32, 32, 8) 	feats:  (189537, 74)
C100keV/C1    	imgs:  (108162, 32, 32, 8) 	feats:  (108162, 74)
C100keV/C2    	imgs:  (95683, 32, 32, 8) 	feats:  (95683, 74)
gamma    	imgs:  (217100, 32, 32, 8) 	feats:  (217100, 74)
CPU times: user 8.53 ms, sys: 2.89 s, total: 2.89 s
Wall time: 14.4 s


In [12]:
# Merge the two C100keV parts (C1 first, then C2) into a single 'C100keV'
# entry in each of the three containers.
name = 'C100keV'
c100_parts = ('C100keV/C1', 'C100keV/C2')

ims[name] = np.concatenate([ims[part] for part in c100_parts], axis=0)
feats[name] = pd.DataFrame(
    data=np.concatenate([feats[part].values for part in c100_parts], axis=0),
    columns=feat_array,
)
pol_ids[name] = pd.DataFrame(
    data=np.concatenate([pol_ids[part].values for part in c100_parts], axis=0),
    columns=id_header,
)
print(name,'   \timgs: ',ims[name].shape,'\tfeats: ',feats[name].shape)

C100keV    	imgs:  (203845, 32, 32, 8) 	feats:  (203845, 74)


In [13]:
%%time
# Drop the samples flagged by bad_inds, split what is left into
# train / val / test (no shuffling) and store every split in dataset_clean.h5.
print('Cleaning divergent samples and edge images')
print('Splitting into train-val-test (80-10-10) and saving to dataset_clean.h5')
bads = {}
for name in class_keys:
    # bad_inds is provided by data_utils; its result is used below as the
    # positions of the samples to discard for this class.
    bads[name] = bad_inds(pol_ids[name], imgs=ims[name], features=feats[name],
                          isolated=True, quant=0.999, f_name=name)
    mask = np.ones(pol_ids[name].shape[0], dtype=bool)
    mask[bads[name]] = False
    tmp_ims = ims[name][mask]
    tmp_feat = (feats[name].values)[mask]

    # Images and features go through the same call, so both are always cut at
    # the same positions; with shuffle=False this matches splitting them
    # one after the other.
    im_tr_val, im_test, feat_tr_val, feat_test = train_test_split(
        tmp_ims, tmp_feat, test_size=0.1, shuffle=False)
    # 0.11 of the remaining 90% is roughly 10% of the cleaned sample,
    # which gives the 80-10-10 proportions overall.
    im_tr, im_val, feat_tr, feat_val = train_test_split(
        im_tr_val, feat_tr_val, test_size=0.11, shuffle=False)

    splits = [
        (name+'/images/train', im_tr),
        (name+'/images/val', im_val),
        (name+'/images/test', im_test),
        (name+'/features/train', feat_tr),
        (name+'/features/val', feat_val),
        (name+'/features/test', feat_test),
    ]
    with h5py.File('dataset_clean.h5', 'a') as datafile:
        for dset_path, dset_data in splits:
            # The file is opened in append mode, so re-running this cell would
            # otherwise fail on the dataset names written by the previous run;
            # replace only the datasets this cell owns.
            if dset_path in datafile:
                del datafile[dset_path]
            datafile.create_dataset(dset_path, data=dset_data)
    _=gc.collect()


# Disabled summary: the code is stored as a string assigned to `_`, so it is
# not executed. As written, it would print, for every entry of class_names_ft,
# the number of rows in 'bad_sample_ids/<f_name>.txt', where <f_name> is the
# class name with '/' replaced by '_'.
_='''print('Wall time ~4 min\n')
for name in class_names_ft:
    f_name = '_'.join(name.split('/'))
    print(np.loadtxt('bad_sample_ids/'+f_name+'.txt').shape[0], '\tBad ',f_name)'''

Cleaning divergent samples and edge images
Splitting into train-val-test (80-10-10) and saving to dataset_clean.h5
50458 	Bad  C30keV
61307 	Bad  C60keV
72722 	Bad  C100keV
127785 	Bad  gamma
CPU times: user 4min 2s, sys: 25.8 s, total: 4min 28s
Wall time: 4min 5s


In [14]:
# Report the stored shape of every split written to dataset_clean.h5,
# one line per class and data kind.
with h5py.File('dataset_clean.h5', 'r') as dfile:
    for name in class_keys:
        print(name+':')
        for kind in ('images', 'features'):
            print(kind+':')
            # The separator printed after each shape differs between the
            # images row and the features row.
            sep = ' \t' if kind == 'images' else '\t\t'
            for split in ('train', 'val', 'test'):
                dset = dfile['/'.join((name, kind, split))]
                print(split+': ', dset.shape, end=sep)
            print('')
        print('')

C30keV:
images:
train:  (105507, 32, 32, 8) 	val:  (13041, 32, 32, 8) 	test:  (13173, 32, 32, 8) 	
features:
train:  (105507, 74)		val:  (13041, 74)		test:  (13173, 74)		

C60keV:
images:
train:  (102712, 32, 32, 8) 	val:  (12695, 32, 32, 8) 	test:  (12823, 32, 32, 8) 	
features:
train:  (102712, 74)		val:  (12695, 74)		test:  (12823, 74)		

C100keV:
images:
train:  (105028, 32, 32, 8) 	val:  (12982, 32, 32, 8) 	test:  (13113, 32, 32, 8) 	
features:
train:  (105028, 74)		val:  (12982, 74)		test:  (13113, 74)		

gamma:
images:
train:  (71540, 32, 32, 8) 	val:  (8843, 32, 32, 8) 	test:  (8932, 32, 32, 8) 	
features:
train:  (71540, 74)		val:  (8843, 74)		test:  (8932, 74)		



* Example of loading the libDMRoot library in PyROOT

In [15]:
# Open the C60keV tracks file read-only through PyROOT.
dm_tracks_path = path_dir+'C60keV/dm_tracks.dm.root'
f = ROOT.TFile.Open(dm_tracks_path, 'read')



In [16]:
# Load the libDMRoot shared library; the status value returned by Load is
# what this cell displays.
load_status = ROOT.gSystem.Load('libDMRoot')
load_status

0

Error in <TCling::RegisterModule>: cannot find dictionary module DMRootCint_rdict.pcm


In [17]:
# Create a DMRRun for the tracks file.
# NOTE(review): this is a bare file name, while the TFile opened earlier uses
# path_dir+'C60keV/dm_tracks.dm.root' -- confirm which file is intended.
dm_run_file = "dm_tracks.dm.root"
arun = ROOT.DMRRun(dm_run_file)

DMRRun::OpenExisting: Open an existing file dm_tracks.dm.root
DMRRun::Open    : ERROR: dm_tracks.dm.root has no Vdmr tree
