In [4]:
import sys, os, time
import pandas as pd

In [7]:
# train_meta.parquet is quite large: it takes a long time to load and consumes a lot of memory for as long
#           as it is held. Because it is a parquet file (and its row groups are quite large), it cannot easily
#           be read batch by batch. So split this file into 660 smaller files, one for each batch.

# Input directory, output directory, and the full path of the metadata file to split.
DATA_DIR = 'data'
OUTPUT_DIR = 'data_'
META_FILE = os.path.join(DATA_DIR, 'train_meta.parquet')

# Wall-clock reference point, taken just before the metadata is loaded.
start = time.time()

# Load the whole metadata table once. Use META_FILE rather than rebuilding the
# same path by hand, so the two cannot drift apart.
meta_train = pd.read_parquet(META_FILE)
# meta_train.head(3) => 
#    batch_id  event_id  first_pulse_index  last_pulse_index   azimuth    zenith
# 0         1        24                  0                60  5.029555  2.087498
# 1         1        41                 61               111  0.417742  1.549686
# 2         1        59                112               147  1.160466  2.401942



In [14]:
# Write one metadata parquet file per batch_id into OUTPUT_DIR/meta.
meta_out_dir = os.path.join(OUTPUT_DIR, 'meta')
# Make sure the target directory exists before the first write; writing into a
# missing directory would fail on a fresh checkout.
os.makedirs(meta_out_dir, exist_ok=True)

# Columns stored in each per-batch file (batch_id itself is not written; it is
# part of the file name instead).
meta_columns = ['event_id', 'first_pulse_index', 'last_pulse_index', 'azimuth', 'zenith']

for idx, df in meta_train.groupby('batch_id'):
    # idx => 1, 2, and so on (the batch_id of the current group).

    # type(df) => <class 'pandas.core.frame.DataFrame'>

    # df.head(3) =>                   (when idx=2)
    #          batch_id  event_id  first_pulse_index  last_pulse_index   azimuth          zenith
    # 200000         2   3266199                  0                73  0.523428       2.094954
    # 200001         2   3266228                 74               149  3.125242       2.695349
    # 200002         2   3266282                150               191  5.720676       2.581088

    file = os.path.join(meta_out_dir, f'train_meta_{idx}.parquet')
    df = df[meta_columns]
    df.to_parquet(file)

    # Elapsed seconds are measured from `start`, which is set before the
    # metadata file is loaded.
    print(f'{time.time()-start:8.1f} wrote {file}')

  2746.3 wrote data_/meta/train_meta_1.parquet
  2746.4 wrote data_/meta/train_meta_2.parquet
  2746.6 wrote data_/meta/train_meta_3.parquet
  2746.7 wrote data_/meta/train_meta_4.parquet
  2746.8 wrote data_/meta/train_meta_5.parquet
  2746.9 wrote data_/meta/train_meta_6.parquet
  2747.1 wrote data_/meta/train_meta_7.parquet
  2747.2 wrote data_/meta/train_meta_8.parquet
  2747.3 wrote data_/meta/train_meta_9.parquet
  2747.4 wrote data_/meta/train_meta_10.parquet
  2747.7 wrote data_/meta/train_meta_11.parquet
  2747.8 wrote data_/meta/train_meta_12.parquet
  2748.0 wrote data_/meta/train_meta_13.parquet
  2748.1 wrote data_/meta/train_meta_14.parquet
  2748.4 wrote data_/meta/train_meta_15.parquet
  2748.5 wrote data_/meta/train_meta_16.parquet
  2748.8 wrote data_/meta/train_meta_17.parquet
  2749.0 wrote data_/meta/train_meta_18.parquet


KeyboardInterrupt: 

In [15]:
# Run the data-preparation script with config.json.
# NOTE(review): prepare_data.py is not shown here; the progress bar in the saved
# output counts 660 items, presumably one per batch file — confirm.
!python prepare_data.py config.json

  df = df.groupby("event_id").agg([pl.count()])
100%|█████████████████████████████████████████| 660/660 [09:48<00:00,  1.12it/s]


### Train

In [37]:
# Download graphnet from its GitHub repository (installing it is complicated):
#   https://github.com/graphnet-team/graphnet/tree/main

# "pytorch_scatter" / "torch_scatter" — wheel index: https://data.pyg.org/whl/
#                                       source:      https://github.com/rusty1s/pytorch_scatter
# (the wheel name below targets torch 2.0.0 + cu118, cp38, linux_x86_64)
# pip3 install https://data.pyg.org/whl/torch-2.0.0%2Bcu118/torch_scatter-2.1.2%2Bpt20cu118-cp38-cp38-linux_x86_64.whl

# "torch_cluster" — same wheel index, same torch/CUDA/Python build:
# pip3 install https://data.pyg.org/whl/torch-2.0.0%2Bcu118/torch_cluster-1.6.3%2Bpt20cu118-cp38-cp38-linux_x86_64.whl

# B model 32: DeepIceModel run; key/value pairs after config.json override the
# corresponding entries of the configuration that train.py prints at start-up.
!python train.py config.json \
        MODEL DeepIceModel \
        MODEL_KWARGS.dim 384 \
        MODEL_KWARGS.dim_base 128 \
        MODEL_KWARGS.depth 12 \
        MODEL_KWARGS.head_size 32 \
        OUT B_MODEL_32 \
        LR_MAX 5e-4 \
        MOMS false \
        LOSS_FUNC loss_vms \
        BS 64 # batch_size (default: 32)
# OUT is the output path.
# LOSS_FUNC: this run uses loss_vms. Original note: switch to loss_comb after
# reaching near the peak (presumably once training with loss_vms stops improving — TODO confirm).

[1;34mgraphnet[0m [MainProcess] [32mINFO    [0m 2023-10-28 17:43:10 - NodesAsPulses.__init__ - Writing log to [1mlogs/graphnet_20231028-174310.log[0m
Training with the following configuration:
{
    "SELECTION": "total",
    "OUT": "B_MODEL_32",
    "PATH": "data/",
    "NUM_WORKERS": 4,
    "SEED": 2023,
    "BS": 64,
    "BS_VALID": 32,
    "L": 192,
    "L_VALID": 192,
    "EPOCHS": 8,
    "LR_MAX": 0.0005,
    "MODEL": "DeepIceModel",
    "MOMS": false,
    "DIV": 25,
    "DIV_FINAL": 25,
    "EMA": false,
    "MODEL_KWARGS": {
        "dim": 384,
        "dim_base": 128,
        "depth": 12,
        "head_size": 32
    },
    "WEIGHTS": false,
    "LOSS_FUNC": "loss_vms",
    "METRIC": "loss"
}
_______________________________________________________
epoch     train_loss  valid_loss  loss      time    
^Coch 1/8 : |█-------------------| 6.82% [17431/255770 2:24:17<32:52:54 2.0548]
Traceback (most recent call last):
  File "train.py", line 198, in <module>
    main()
  File "t

In [34]:
# B model 4 REL: DeepIceModel run with MODEL_KWARGS.n_rel set to 4.
# NOTE(review): the saved output of this cell reports "OUT": "B_MODEL_B_64" and "BS": 2,
# which does not match the OUT B_MODEL_B_64_REL / BS 64 passed here — the output looks
# stale (from an earlier version of this cell); re-run to confirm.
!python train.py config.json \
       MODEL DeepIceModel \
       MODEL_KWARGS.dim 768 \
       MODEL_KWARGS.dim_base 192 \
       MODEL_KWARGS.depth 12 \
       MODEL_KWARGS.head_size 32 \
       MODEL_KWARGS.n_rel 4 \
       OUT B_MODEL_B_64_REL \
       LR_MAX 1e-4 \
       MOMS false  \
       BS 64 # batch_size (default: 32)    
# OUT is the output path.
# LOSS_FUNC is not overridden here (the saved output shows loss_vms). Original note:
# switch to loss_comb after reaching near the peak — TODO confirm the exact criterion.

[1;34mgraphnet[0m [MainProcess] [32mINFO    [0m 2023-10-17 22:26:02 - NodesAsPulses.__init__ - Writing log to [1mlogs/graphnet_20231017-222602.log[0m
Training with the following configuration:
{
    "SELECTION": "total",
    "OUT": "B_MODEL_B_64",
    "PATH": "data/",
    "NUM_WORKERS": 4,
    "SEED": 2023,
    "BS": 2,
    "BS_VALID": 32,
    "L": 192,
    "L_VALID": 192,
    "EPOCHS": 8,
    "LR_MAX": 0.0001,
    "MODEL": "DeepIceModel",
    "MOMS": false,
    "DIV": 25,
    "DIV_FINAL": 25,
    "EMA": false,
    "MODEL_KWARGS": {
        "dim": 768,
        "dim_base": 192,
        "depth": 12,
        "head_size": 32,
        "n_rel": 4
    },
    "WEIGHTS": false,
    "LOSS_FUNC": "loss_vms",
    "METRIC": "loss"
}
_______________________________________________________


In [30]:
# S + GNN: EncoderWithDirectionReconstructionV22 run.
# NOTE(review): the saved output of this cell reports "OUT": "B_MODEL_B_64" and "BS": 2,
# which does not match the OUT S_MODEL_GNN_64 / BS 64 passed here — the output looks
# stale (from an earlier version of this cell); re-run to confirm.
!python train.py config.json \
       MODEL EncoderWithDirectionReconstructionV22 \
       MODEL_KWARGS.dim 384 \
       MODEL_KWARGS.dim_base 128 \
       MODEL_KWARGS.depth 8 \
       MODEL_KWARGS.head_size 32 \
       OUT S_MODEL_GNN_64 \
       LR_MAX 1e-4 \
       MOMS false  \
       BS 64 # batch_size (default: 32)    

# OUT is the output path.
# LOSS_FUNC is not overridden here (the saved output shows loss_vms). Original note:
# switch to loss_comb after reaching near the peak — TODO confirm the exact criterion.

[1;34mgraphnet[0m [MainProcess] [32mINFO    [0m 2023-10-17 22:18:11 - NodesAsPulses.__init__ - Writing log to [1mlogs/graphnet_20231017-221811.log[0m
Training with the following configuration:
{
    "SELECTION": "total",
    "OUT": "B_MODEL_B_64",
    "PATH": "data/",
    "NUM_WORKERS": 4,
    "SEED": 2023,
    "BS": 2,
    "BS_VALID": 32,
    "L": 192,
    "L_VALID": 192,
    "EPOCHS": 8,
    "LR_MAX": 0.0001,
    "MODEL": "EncoderWithDirectionReconstructionV22",
    "MOMS": false,
    "DIV": 25,
    "DIV_FINAL": 25,
    "EMA": false,
    "MODEL_KWARGS": {
        "dim": 384,
        "dim_base": 128,
        "depth": 8,
        "head_size": 32
    },
    "WEIGHTS": false,
    "LOSS_FUNC": "loss_vms",
    "METRIC": "loss"
}
_______________________________________________________


In [33]:
# B + GNN: EncoderWithDirectionReconstructionV23 run.
# NOTE(review): the saved output of this cell reports "OUT": "B_MODEL_B_64" and "BS": 2,
# which does not match the OUT B_MODEL_GNN_64 / BS 64 passed here — the output looks
# stale (from an earlier version of this cell); re-run to confirm.
!python train.py config.json \
       MODEL EncoderWithDirectionReconstructionV23 \
       MODEL_KWARGS.dim 768 \
       MODEL_KWARGS.dim_base 128 \
       MODEL_KWARGS.depth 12 \
       MODEL_KWARGS.head_size 64 \
       OUT B_MODEL_GNN_64 \
       LR_MAX 1e-4 \
       MOMS false  \
       BS 64 # batch_size (default: 32)    
# OUT is the output path.
# LOSS_FUNC is not overridden here (the saved output shows loss_vms). Original note:
# switch to loss_comb after reaching near the peak — TODO confirm the exact criterion.

[1;34mgraphnet[0m [MainProcess] [32mINFO    [0m 2023-10-17 22:20:20 - NodesAsPulses.__init__ - Writing log to [1mlogs/graphnet_20231017-222020.log[0m
Training with the following configuration:
{
    "SELECTION": "total",
    "OUT": "B_MODEL_B_64",
    "PATH": "data/",
    "NUM_WORKERS": 4,
    "SEED": 2023,
    "BS": 2,
    "BS_VALID": 32,
    "L": 192,
    "L_VALID": 192,
    "EPOCHS": 8,
    "LR_MAX": 0.0001,
    "MODEL": "EncoderWithDirectionReconstructionV23",
    "MOMS": false,
    "DIV": 25,
    "DIV_FINAL": 25,
    "EMA": false,
    "MODEL_KWARGS": {
        "dim": 768,
        "dim_base": 128,
        "depth": 12,
        "head_size": 64
    },
    "WEIGHTS": false,
    "LOSS_FUNC": "loss_vms",
    "METRIC": "loss"
}
_______________________________________________________
