In [1]:
import ROOT
import numpy as np
import pandas as pd
import os, re, gc, h5py
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from datetime import datetime
from data_utils import *

Welcome to JupyROOT 6.14/04


In [2]:
# Per-sample data directories, keyed by the short sample tag.
dir_path = {
    'c100': 'C100keV_WithFilter_Blue_/',
    'c60': 'C60keV_H_2018Oct04_WithFilter_Blue_/',
    'c30': 'C30keV_H_withFilter_Blue_Exposed1500_Re_/',
}

# Rewrite every whitespace-separated table as a comma-separated copy.
for carb in ('c100', 'c60', 'c30'):
    src_path = dir_path[carb] + carb + '_yandex_bfcl.txt'
    dst_path = dir_path[carb] + carb + '_yandex_bfcl_coma.txt'
    with open(src_path, 'r') as yand:
        # Collapse any run of whitespace between fields into a single comma.
        lines = [','.join(raw.split()) for raw in yand.readlines()]
    print(len(lines))
    with open(dst_path, 'w') as comas:
        comas.writelines(row + '\n' for row in lines)

# Sanity check: show the first converted line of the last sample processed
# (`carb` still holds the final loop value here).
with open(dir_path[carb] + carb + '_yandex_bfcl_coma.txt', 'r') as ff:
    first_line = ff.readlines()[0]
    print('\n', first_line)

27659
22425
227540


In [2]:
# Column layout of the comma-separated '*_yandex_bfcl_coma.txt' tables.
bfcl_names = ['Header','View','Grain']+['pol'+str(i) for i in range(8)]+['tr_flag','n_pol']
# Column layout of the '*_sig_grains.txt' / '*_bkg_grains.txt' selections.
test_names = ['Header','View','Grain','Bar_sh','Phy']
# Rows of the two tables are paired on these columns only (Header is not compared).
match_keys = ['View','Grain']

# NOTE(review): files here are tagged 'bkg', while later cells read
# '*_bckg_res_yandex.txt' -- confirm the naming on disk.
for sb in ['sig','bkg']:
    for carb in ['c100','c60','c30']:
        ya_bfcl = pd.read_csv(dir_path[carb]+carb+'_yandex_bfcl_coma.txt', names = bfcl_names)
        ya_sig = pd.read_csv(dir_path[carb]+carb+'_'+sb+'_grains.txt', names=test_names)

        # Remember the original row positions so the output keeps the order of
        # the selection file (and, within one grain, the order of the bfcl table).
        sig_keys = (ya_sig[match_keys]
                    .assign(_sig_pos=np.arange(len(ya_sig)))
                    .dropna(subset=match_keys))  # NaN keys never matched under `==`
        bfcl_rows = ya_bfcl.assign(_bfcl_pos=np.arange(len(ya_bfcl)))

        # One keyed join replaces the per-row scan of the whole bfcl table and
        # the repeated np.vstack, both of which scaled quadratically.
        matched = (sig_keys
                   .merge(bfcl_rows, on=match_keys, how='inner')
                   .sort_values(['_sig_pos','_bfcl_pos']))

        # Report selected grains that have no counterpart in the bfcl table.
        n_missing = len(ya_sig) - matched['_sig_pos'].nunique()
        if n_missing:
            print(carb, sb, ':', n_missing, 'selected grains have no match in the bfcl table')

        res_bfcl = matched[bfcl_names].values
        np.savetxt(dir_path[carb]+carb+'_'+sb+'_res_yandex.txt',res_bfcl,fmt='%d',delimiter=',')

In [4]:
# Preview the first five rows of the last bfcl table that was read.
ya_bfcl.head(5)

In [6]:
# Preview the first five rows of the last grain selection that was read.
ya_sig.head(5)

Unnamed: 0,Header,View,Grain,Bar_sh,Phy
0,1,0,1,0.089997,-0.423279
1,1,0,11,0.084007,0.382205
2,1,0,20,0.058055,-0.11885
3,1,0,22,0.0542,-0.596416
4,1,0,23,0.048908,1.38551


In [3]:
# Show the matched rows produced by the last iteration of the matching loop.
print(res_bfcl)

[[   1    0    5 ... 2397   -1    8]
 [   1    0   10 ... 2400   -1    8]
 [   1    0   21 ... 6072   -2    6]
 ...
 [1500 1499  194 ...   -1   -1    2]
 [1500 1499  203 ...   -1   -2    3]
 [1500 1499  204 ...   -1   -2    3]]


In [4]:
# (rows, columns) of the full bfcl table.
ya_bfcl.shape

(227540, 13)

In [5]:
# (rows, columns) of the matched result; compare the row count with ya_sig.
res_bfcl.shape

(61509, 13)

In [6]:
# (rows, columns) of the grain selection table.
ya_sig.shape

(61509, 5)

In [2]:
# Value passed as `n_pol` to the loaders and used to build `feat_array` below.
n_pols = 9
# Root directory holding one sub-directory per sample class.
path_dir = '/mnt/ML-drive/Artem/Carbon_test/'
# Only the C60 and C100 sample directories are processed; sorted so the run
# order does not depend on the arbitrary order returned by os.listdir.
class_names_ft = sorted(k for k in os.listdir(path_dir) if ('C60' in k or 'C100' in k))
#class_keys = ['C30keV','C60keV','C100keV','gamma/Cs137','gamma/Co60','fog']
# Tags used in the per-class file names and in the HDF5 group names.
test_types = ['sig','bckg']
# Column names for the '*_res_yandex.txt' ID tables.
id_header = ['HeaderID','ViewID','GrainID','pol0','pol1','pol2','pol3','pol4','pol5','pol6','pol7','tr_flag','n_pol']
# Base names of the per-polarization features.
feat_names = ['x','y','z','lx','ly','phi','npx','vol','eps']

# Flat feature column names: every base name suffixed with the polarization
# index (x0, y0, ..., eps0, x1, ...), followed by the two trailing columns.
feat_array = [n+str(i) for i in range(n_pols) for n in feat_names]
feat_array += ['tr_flag','n_pol']

In [3]:
# Show which class directories were selected.
class_names_ft

['C100keV_WithFilter_Blue_', 'C60keV_H_2018Oct04_WithFilter_Blue_']

In [4]:
def short_name(name):
    """Return the HDF5 group label for a class directory name.

    Names containing 'keV' become 'Carbon/' followed by the text before the
    first underscore with its first character dropped
    (e.g. 'C100keV_WithFilter_Blue_' -> 'Carbon/100keV').
    Any other name is returned unchanged.
    """
    if 'keV' not in name:
        return name
    head = name.partition('_')[0]
    return 'Carbon/' + head[1:]

In [5]:
# Check the short tag derivation: text before the first 'k', lower-cased.
class_names_ft[0].split('k')[0].lower()

'c100'

In [6]:
%%time
pol_ids = {}
for name in class_names_ft:
    name_ = (name.split('k')[0]).lower()
    for tt in test_types:
        pol_ids[name_+'-'+tt] = pd.read_csv(path_dir+name+'/'+name_+'_'+tt+'_res_yandex.txt', header=None, names=id_header)
        pol_ids[name_+'-'+tt] = pol_ids[name_+'-'+tt].sort_values(by=['HeaderID','GrainID'])

CPU times: user 39.4 ms, sys: 7.99 ms, total: 47.4 ms
Wall time: 46.1 ms


In [7]:
# Preview one loaded ID table; `name_` and `tt` are whatever an earlier loop left behind.
pol_ids[name_+'-'+tt].head(5)

Unnamed: 0,HeaderID,ViewID,GrainID,pol0,pol1,pol2,pol3,pol4,pol5,pol6,pol7,tr_flag,n_pol
0,1,0,26,-1,29,-1,45,55,70,86,122,-1,6
1,1,0,28,3037,3061,3086,3112,3138,3163,3183,3234,-1,8
2,1,0,34,323,493,645,785,917,1059,1205,1483,-1,8
3,1,0,42,-1,-1,-1,-1,923,1063,1209,-1,-1,3
4,1,0,47,319,489,641,783,914,1056,1200,1478,-1,8


In [9]:
%%time
ims_check = []
ims = {}
start = datetime.now()
for name in class_names_ft:
    name_ = (name.split('k')[0]).lower()
    for tt in test_types:
        fold = datetime.now()
        ims[name_+'-'+tt] = load_pol_images(pol_ids[name_+'-'+tt], path_dir, name, csv_dir='csvs_'+tt, n_pol=9)
        print(short_name(name)+'-'+tt,' with ',ims[name_+'-'+tt].shape[0],' images')
        ims_check.append((ims[name_+'-'+tt][...,0]-ims[name_+'-'+tt][...,-1]).sum())
        with h5py.File('data_9pol.h5','a') as dfile:
            dfile.create_dataset(short_name(name)+'/'+tt+'/images', data=ims[name_+'-'+tt])
        print('loaded in ',datetime.now()-fold,'\n')
        del ims[name_+'-'+tt]
        gc.collect()
print('total loading time:',datetime.now()-start)

'load raw images'

Carbon/100keV-sig  with  5982  images
loaded in  0:07:25.307145 

Carbon/100keV-bckg  with  5850  images
loaded in  0:07:02.219042 

Carbon/60keV-sig  with  3911  images
loaded in  0:04:46.820459 

Carbon/60keV-bckg  with  5627  images
loaded in  0:06:41.947188 

total loading time: 0:25:56.434895
CPU times: user 25min 28s, sys: 19.2 s, total: 25min 47s
Wall time: 25min 56s


In [10]:
# One entry per loaded dataset: summed difference between the first and last slice of the image arrays.
ims_check

[0, 0, 0, 0]

In [11]:
%%time
start = datetime.now()
feat_check = []
feat_data = {}
for name in class_names_ft:
    name_ = (name.split('k')[0]).lower()
    for tt in test_types:
        fold = datetime.now()
        print('loading features:  ',short_name(name)+'-'+tt)
        feat_data[name_+'-'+tt] = get_pol_feat(pol_ids[name_+'-'+tt], n_pol=9, path_dir=path_dir, class_name=name, feat_names=feat_names)
        feat_check.append((feat_data[name_+'-'+tt].values[:,:9]-feat_data[name_+'-'+tt].values[:,-11:-2]).sum(axis=0))
        with h5py.File('data_9pol.h5', 'a') as datafile:
            datafile.create_dataset(short_name(name)+'/'+tt+'/features', data=feat_data[name_+'-'+tt])
        print('\tin ',datetime.now()-fold,'\n')
        del feat_data[name_+'-'+tt]
        gc.collect()
print('total loading time:',datetime.now()-start)

        
'load features (parallelized automatically)'

loading features:   Carbon/100keV-sig
	in  0:03:49.613946 

loading features:   Carbon/100keV-bckg
	in  0:03:41.772294 

loading features:   Carbon/60keV-sig
	in  0:02:24.176151 

loading features:   Carbon/60keV-bckg
	in  0:03:27.434858 

total loading time: 0:13:23.135177
CPU times: user 41min 23s, sys: 2.35 s, total: 41min 25s
Wall time: 13min 23s




In [12]:
# Tabulate the collected feature checks, one row per entry of feat_check.
pd.DataFrame(data=np.array(feat_check))

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,
1,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Feature base names, for reading the column positions of the check table.
print(feat_names)

['x', 'y', 'z', 'lx', 'ly', 'phi', 'npx', 'vol', 'eps']


In [14]:
# Store every ID table in 'data_9pol.h5' under '<short name>/<test type>/pol_ids'.
# The file is opened once for all datasets instead of once per iteration.
with h5py.File('data_9pol.h5','a') as dfile:
    for name in class_names_ft:
        name_ = (name.split('k')[0]).lower()
        for tt in test_types:
            dset_path = short_name(name)+'/'+tt+'/pol_ids'
            # create_dataset raises if the name already exists; replace any
            # stale copy so the cell can be re-run.
            if dset_path in dfile:
                del dfile[dset_path]
            dfile.create_dataset(dset_path, data=pol_ids[name_+'-'+tt].values)
_=gc.collect()

In [21]:
# Read the file back: for every group two levels below 'Carbon', list the
# stored datasets and show the first row of its ID table.
with h5py.File('data_9pol.h5','r') as dfile:
    carbon = dfile['Carbon']
    for energy in carbon.keys():
        energy_group = carbon[energy]
        for kind in energy_group.keys():
            entry = energy_group[kind]
            print(list(entry.keys()))
            print(entry['pol_ids'][0])

['features', 'images', 'pol_ids']
[   1    0    9  231  449  660  877 1114 1346   -1   -1   -2    6]
['features', 'images', 'pol_ids']
[   1    0    1 2467  457  667  887 1125 1354 3662 4068   -1    8]
['features', 'images', 'pol_ids']
[  1   0  26  -1  29  -1  45  55  70  86 122  -1   6]
['features', 'images', 'pol_ids']
[   1    0   12 2832 2852 2872 2893 2691 2711 2739 2788   -1    8]


In [27]:
# Scratch check: format values with no fractional part as integer strings and the rest as full floats.
[str(k) if (k-int(k)) else str(int(k)) for k in np.hstack((np.ones(5,dtype=int),np.ones(1)*np.pi/4))]

['1', '1', '1', '1', '1', '0.7853981633974483']