In [1]:
import os, sys, argparse, importlib, time, inspect
import numpy as np
import matplotlib.pyplot as plt
import os.path as osp
# Pick the tqdm flavour that matches the front end. IPython publishes the
# `__IPYTHON__` flag as a builtin; it is looked up on the `builtins` module
# because the `__builtins__` global is a CPython implementation detail that is
# a plain dict (not a module) outside of `__main__`, where hasattr() would
# always report False.
import builtins
if hasattr(builtins, '__IPYTHON__'):
    print('Notebook')
    from tqdm.notebook import tqdm
else:
    print('Not notebook')
    from tqdm import tqdm
# TF_CPP_MIN_LOG_LEVEL only takes effect if it is set before TensorFlow is
# imported for the first time, and `tensorflow.keras` counts as such an import,
# so the environment variable has to come first.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras import backend as K

gpu_devices = tf.config.list_physical_devices('GPU')
if len(gpu_devices) > 0:
    print("GPU detected")
    # Memory growth has to be configured the same way on every visible GPU,
    # so apply it to all of them rather than only to the first one.
    for gpu_device in gpu_devices:
        tf.config.experimental.set_memory_growth(gpu_device, True)
else:
    print('No GPU detected')

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.models import load_model, Model
import spektral
from sklearn.preprocessing import normalize
from spektral.data import DisjointLoader, BatchLoader, SingleLoader
from importlib import reload
import datetime as dt

Notebook
No GPU detected


In [2]:
# Input files used throughout the notebook: the SQLite event database, the
# pickled train/test split and the pickled transformers.
db_path = "../db_files/dev_lvl7/dev_lvl7_mu_nu_e_classification_v003.db"
set_path = "../db_files/dev_lvl7/sets.pkl"
transform_path = "../db_files/dev_lvl7/transformers.pkl"

In [6]:
from pandas import read_sql, read_pickle, concat, read_csv, DataFrame
from sklearn.preprocessing import normalize, RobustScaler
from sklearn.neighbors import kneighbors_graph as knn
import matplotlib.pyplot as plt

from spektral.data import Dataset, Graph
from scipy.sparse import csr_matrix
import sqlite3
import pickle

def get_event_no(sets_path=None):
    """Return the event numbers of the train and test splits.

    Parameters
    ----------
    sets_path : str or path-like, optional
        Pickle holding a mapping with ``'train'`` and ``'test'`` entries, each
        of them a table with an ``event_no`` column. When omitted, the
        notebook-level ``set_path`` is used, so existing zero-argument calls
        behave exactly as before.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_event_no, test_event_no)``.
    """
    if sets_path is None:
        sets_path = set_path
    print('Reading sets')
    # NOTE: unpickling can execute arbitrary code; only read trusted files.
    sets = read_pickle(sets_path)
    return tuple(sets[split]['event_no'].to_numpy() for split in ('train', 'test'))
# Columns selected from the `features` table and from the `truth` table.
features=["dom_x", "dom_y", "dom_z",  "dom_time", "charge_log10", "width", "rqe"]
targets= ["energy_log10", "zenith","azimuth"]

n_steps=10       # number of chunks the train/test event lists are split into
n_neighbors=30   # neighbour count handed to the kNN graph construction
db_file   = db_path
# Must be an f-string: without the prefix the directory was literally named
# "where_classic_{n_neighbors}" instead of carrying the neighbour count.
path=f'processed/where_classic_{n_neighbors}'

In [30]:

# Build spektral graphs for a debugging subset (chunk 0, train split, first
# 10000 events) and pickle them into `path`.
from contextlib import closing

# makedirs also creates a missing parent directory and tolerates an existing one.
os.makedirs(path, exist_ok=True)

# Columns holding the DOM coordinates; used for the unit conversion and the kNN search.
position_cols = ["dom_x", "dom_y", "dom_z"]

print("Connecting to db-file")
# sqlite3's own context manager only commits / rolls back the transaction;
# closing() makes sure the connection itself is released afterwards.
with closing(sqlite3.connect(db_file)) as conn:
    # SQL queries format
    feature_call = ", ".join(features)
    target_call  = ", ".join(targets)

    # Load data from db-file
    print("Reading files")

    train_events1, test_events1 = get_event_no()
    train_events = np.array_split(train_events1, n_steps)
    test_events  = np.array_split(test_events1, n_steps)

    for i, (train, test) in enumerate(zip(train_events, test_events)):
        if i != 0:
            continue  # debugging limit: only the first chunk is processed
        for tt, events in zip(['train', 'test'], [train, test]):
            if tt != 'train':
                continue  # debugging limit: only the train split is processed
            events = events[:10000]
            start = time.time()
            # Render the id list explicitly. str(tuple(events)) produces "(x,)"
            # for a single event and NumPy-2 style "np.int64(x)" items, neither
            # of which is valid SQL; int() keeps the text strictly numeric.
            event_call = ", ".join(str(int(event_no)) for event_no in events)
            df_feat  = read_sql(f"select event_no, {feature_call} from features where event_no in ({event_call})", conn).sort_values('event_no')
            print('Features read')
            df_targ  = read_sql(f"select {target_call}, event_no from truth where event_no in ({event_call})", conn).sort_values('event_no')
            stop = time.time()
            print(f'All read in {np.round(stop-start,2)} s, transforming')
            with open(transform_path, 'rb') as transform_file:
                # NOTE: unpickling can execute arbitrary code; only read trusted files.
                transformers = pickle.load(transform_file)
            trans_x      = transformers['features']
            trans_y      = transformers['truth']
            # Undo the stored scaling; positions are additionally divided by 1000.
            for col in position_cols:
                df_feat[col] = trans_x[col].inverse_transform(np.array(df_feat[col]).reshape(1, -1)).T/1000

            for col in ["energy_log10", "zenith","azimuth"]:
                df_targ[col] = trans_y[col].inverse_transform(np.array(df_targ[col]).reshape(1, -1)).T

            # Cut indices
            print("Splitting data to events")
            idx_list    = np.array(df_feat['event_no'])
            # event_no is not dropped here, so it stays in as column 0 of every node matrix.
            x_not_split = np.array(df_feat)

            # First row of every event in the event_no-sorted feature table.
            unique_event_no, idx = np.unique(idx_list.flatten(), return_index = True)
            xs          = np.split(x_not_split, idx[1:])

            ys          = np.array(df_targ)
            # Graphs and targets are paired purely by position below, so both
            # tables must list exactly the same events in the same order.
            if not np.array_equal(unique_event_no, df_targ['event_no'].to_numpy()):
                raise ValueError("features and truth tables returned different event_no sets; graphs and targets would be misaligned")
            print(df_feat.head())
            print(df_targ.head())

            # Column positions of the DOM coordinates. A plain x[:, :3] would
            # select (event_no, dom_x, dom_y) and leave dom_z out of the
            # neighbour search, because event_no occupies column 0.
            pos_idx = [df_feat.columns.get_loc(col) for col in position_cols]

            graph_list=[]
            # Generate adjacency matrices
            for x, y in tqdm(zip(xs, ys), total = len(xs), position=1, desc=f'Transform {tt} {i}'):
                try:
                    a = knn(x[:, pos_idx], n_neighbors)
                except ValueError:
                    # Presumably raised when the event has too few nodes for
                    # n_neighbors: fall back to a fully connected graph
                    # without self-loops.
                    a = csr_matrix(np.ones(shape = (x.shape[0], x.shape[0])) - np.eye(x.shape[0]))
                graph_list.append(Graph(x = x, a = a, y = y))
            print('List->array')
            graph_list = np.array(graph_list, dtype = object)
            print(f"Saving dataset {tt} {i}: {len(graph_list)} {tt}")
            with open(osp.join(path, f"{tt}_{i}.dat"), 'wb') as graph_file:
                pickle.dump(graph_list, graph_file)
            stop=time.time()
            print(f"Process {tt} {i} took {np.round(stop-start, 2)} s")


Connecting to db-file
Reading files
Reading sets
Features read
All read in 0.81 s, transforming
Splitting data to events
    event_no    dom_x    dom_y    dom_z  dom_time  charge_log10  width   rqe
0   13188817  0.00171 -0.15063 -0.28570 -0.659176     -0.666667    1.0 -0.35
14  13188817 -0.01097  0.00672 -0.28315  0.415730      0.000000    1.0  0.00
13  13188817 -0.00968 -0.07950 -0.42274 -1.026217      0.083333    1.0  0.00
12  13188817 -0.00968 -0.07950 -0.38069  1.007491      0.166667    1.0  0.00
10  13188817 -0.00968 -0.07950 -0.28957 -0.350187      1.166667    0.0  0.00
   energy_log10    zenith   azimuth  event_no
0      2.340637  1.027825  4.888147  13188817
1      2.416496  1.107861  4.361933  13188820
2      2.409569  1.011339  4.044844  13188832
3      2.405540  0.917276  5.294104  13188856
4      2.333035  1.197129  1.402828  13188860


HBox(children=(HTML(value='Transform train 0'), FloatProgress(value=0.0, max=10000.0), HTML(value='')))


List->array
Saving dataset train 0: 10000 train
Process train 0 took 6.91 s


In [27]:
# For every per-event node matrix, count the distinct values found in its
# first column.
lns = [len(np.unique(nodes[:, 0])) for nodes in xs]

In [28]:
# Total of the per-graph distinct-value counts.
print(np.asarray(lns).sum())

10000


In [24]:
# np.sum(np.unique(ys[:,3])==np.unique(x_not_split[:,7]))
# np.sum(np.in1d(np.unique(x_not_split[:,7]), test_events1))
# Read back the pickled graphs of train chunk 0; the context manager closes
# the file handle that a bare open() call would leave dangling.
# NOTE: unpickling can execute arbitrary code; only read trusted files.
with open(osp.join(path, f"train_{0}.dat"), 'rb') as data_file:
    data = pickle.load(data_file)

In [31]:
# Per-graph bookkeeping gathered from `data`:
#   enoy  - first entry of the label vector
#   enox  - distinct values of the first node-feature column
#   lenox - how many such distinct values there are
#   doms  - number of nodes
enoy, enox, lenox, doms = [], [], [], []
for graph_idx in tqdm(range(len(data))):
    graph = data[graph_idx]
    first_col_values = np.unique(graph.x[:, 0])
    enoy.append(graph.y[0])
    lenox.append(len(first_col_values))
    enox.append(first_col_values)
    doms.append(len(graph.x))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [37]:
import dev.datawhere as dl

# Re-import the module so edits made on disk are picked up, then build the
# dataset through its `graph_data` class.
reload(dl)
graph_data = dl.graph_data
dataset = graph_data(
    n_steps=10,
    n_neighbors=30,
    transform_path='../db_files/dev_lvl7/transformers.pkl',
    db_path='../db_files/dev_lvl7/dev_lvl7_mu_nu_e_classification_v003.db',
    restart=1,
)

Removed and ready to reload
Connecting to db-file
Reading files
Reading sets
Features read
Targets read, transforming
Reading train 0 took 349.35 s
Splitting data to events
           dom_x    dom_y    dom_z  dom_time  charge_log10  width   rqe
7989078  0.11319 -0.06047 -0.25221  1.812734     -0.750000    1.0  0.00
7989069  0.12497 -0.13125 -0.40845  1.058052      1.333333    1.0 -0.35
7989070  0.07237 -0.06660 -0.30000  1.528090      0.166667    1.0  0.00
7989071  0.07237 -0.06660 -0.31402 -0.546816      0.333333    0.0  0.00
7989072  0.07237 -0.06660 -0.32103  0.837079      0.750000    0.0  0.00
        energy_log10    zenith   azimuth  event_no
303764      0.421948  2.676098  1.044445         1
303765      0.463034  0.693687  6.133094         6
303766      0.671135  2.486230  5.891520        37
303767      0.566209  1.957056  0.632692        39
303768      0.568009  1.858723  2.070458        40


HBox(children=(HTML(value='Transform train 0'), FloatProgress(value=0.0, max=663345.0), HTML(value='')))


List->array
Saving dataset train 0: 663345 train
Process train 0 took 783.82 s
Features read


In [74]:
# Number of graphs held by the dataset.
len(dataset)

100000

In [60]:
# Every third integer in [0, 10000), used below to select events, and the
# event table exposed by the dataset.
test = np.arange(0, 10000, 3)
df = dataset.df_event

In [62]:
# Inspect the first graph of the dataset.
dataset[0]

Graph(n_nodes=9, n_node_features=7, n_edge_features=None, n_labels=3)

In [65]:
# Index labels of the rows in `df` whose event_no is one of the ids in `test`.
idx_lists = [df.loc[df['event_no'].isin(test)].index.to_numpy()]

In [66]:
# Subset of the dataset addressed by the first (and only) index array.
dataset_test = dataset[
    idx_lists[0]
]

In [63]:
# Single-epoch disjoint-mode loader over the whole dataset.
loader = DisjointLoader(
    dataset,
    epochs=1,
    batch_size=512,
)

In [4]:
# NOTE(review): this rebinds `path` (a directory in other cells) to a file and
# replaces `data`; confirm that later cells expect these values.
path = 'processed/submit_muon_0_n_data_10000_type_classic_nn_30/data.npy'
# allow_pickle=True unpickles object arrays, so only load trusted files.
data = np.load(
    path,
    allow_pickle=True,
)

In [67]:
# Disjoint-mode loader over the selected subset, cycling through it 20 times.
loader = DisjointLoader(
    dataset_test,
    epochs=20,
    batch_size=512,
)

In [68]:
@tf.function(input_signature = loader.tf_signature(), experimental_relax_shapes = True)
def train_step(inputs, targets):
    """Run one optimisation step on a batch and return its loss.

    Uses the notebook-level `model`, `loss_func` and `opt`, none of which are
    defined in the visible cells. The loss is the value of `loss_func` plus
    the sum of the losses registered on the model.

    NOTE(review): `loss_func` receives (predictions, targets) in that order;
    Keras loss objects conventionally take (y_true, y_pred) - confirm this is
    intended for the loss actually used.
    """
    with tf.GradientTape() as tape:
        y_pred = model(inputs, training = True)
        y_true = tf.cast(targets, tf.float32)
        loss = loss_func(y_pred, y_true) + sum(model.losses)

    # Read the variable list after the forward pass, as the model may only
    # create its weights on the first call.
    weights = model.trainable_variables
    grads = tape.gradient(loss, weights)
    opt.apply_gradients(zip(grads, weights))
    return loss


In [69]:
# Show the spec / shape / dtype signature exposed by the loader's dataset.
loader.dataset.signature

{'x': {'spec': tensorflow.python.framework.tensor_spec.TensorSpec,
  'shape': (None, 7),
  'dtype': tf.float64},
 'a': {'spec': tensorflow.python.framework.sparse_tensor.SparseTensorSpec,
  'shape': (None, None),
  'dtype': tf.float64},
 'y': {'spec': tensorflow.python.framework.tensor_spec.TensorSpec,
  'shape': (3,),
  'dtype': tf.float64}}

In [57]:
# Inspect the dataset entry at index 1000.
dataset[1000]

array([Graph(n_nodes=20, n_node_features=7, n_edge_features=None, n_labels=3)],
      dtype=object)