In [1]:
import sys
import numpy as np
import matplotlib.pyplot as plt
from ncmcm.bundlenet.bundlenet import BunDLeNet, train_model
from ncmcm.bundlenet.utils import prep_data, timeseries_train_test_split
from ncmcm.visualisers.latent_space import LatentSpaceVisualiser
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.bayesopt import BayesOptSearch

2024-09-11 14:45:08.759900: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-11 14:45:08.759919: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-11 14:45:08.760767: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-11 14:45:08.765709: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm
202

In [2]:
algorithm = 'BunDLeNet'
rat_name = 'gatsby' # ['achilles', 'gatsby','cicero', 'buddy']
data = np.load(f'../../data/raw/rat_hippocampus/{rat_name}.npz')
x, b = data['x'], data['b']
x = x - np.min(x)


In [3]:
def train_bundlenet(config):    
    # Extract hyperparameters from the config dictionary
    learning_rate = config["learning_rate"]
    latent_dim = int(config["latent_dim"])
    n_epochs = int(config["n_epochs"])
    win = int(config["win"])
    
    x_, b_ = prep_data(x, b, win=win)

    x_train, x_test, b_train_1, b_test_1 = timeseries_train_test_split(x_, b_)

    model = BunDLeNet(latent_dim=latent_dim, num_behaviour=b_train_1.shape[1])

    train_history, test_history = train_model(
        x_train,
        b_train_1,
        model,
        b_type='continuous',
        gamma=0.9,
        learning_rate=learning_rate,
        n_epochs=n_epochs,
        initialisation= (5, 20),
        validation_data=(x_test, b_test_1),
    )

    # Report validation loss (or another metric) back to Ray Tune
    train.report({"val_loss": test_history[-1, -1]})

In [4]:
search_space = {
    "win": tune.loguniform(1, 50),
    "learning_rate": tune.loguniform(1e-5, 1e-1),
    "latent_dim": tune.uniform(1,10),
    "n_epochs": tune.uniform(10, 500)
}

scheduler = ASHAScheduler(metric="val_loss", mode="min", max_t=500, grace_period=20, reduction_factor=2)
search_algo = BayesOptSearch(metric="val_loss", mode="min")


tuner = tune.Tuner(
    tune.with_parameters(train_bundlenet),
    tune_config=tune.TuneConfig(
        search_alg=search_algo,
        num_samples=100,
        scheduler=scheduler,
    ),
    param_space=search_space,
)
results = tuner.fit()

0,1
Current time:,2024-09-11 14:48:34
Running for:,00:03:02.21
Memory:,59.8/62.7 GiB

Trial name,status,loc,latent_dim,learning_rate,n_epochs,win
train_bundlenet_9e0e40b6,RUNNING,131.130.118.84:3778710,4.37086,0.0950719,368.677,30.3343
train_bundlenet_0dd452ff,RUNNING,131.130.118.84:3778765,2.40417,0.0156079,38.461,43.4426
train_bundlenet_cbcff758,RUNNING,131.130.118.84:3778880,6.41004,0.0708102,20.0864,48.5256
train_bundlenet_89eb97dd,RUNNING,131.130.118.84:3779276,8.49198,0.0212418,99.0942,9.98682
train_bundlenet_f22ef9cd,RUNNING,131.130.118.84:3779686,3.73818,0.0524804,221.653,15.2702
train_bundlenet_d53578f2,RUNNING,131.130.118.84:3780151,6.50668,0.013958,153.151,18.9517
train_bundlenet_a5370f47,RUNNING,131.130.118.84:3780725,5.10463,0.0785197,107.84,26.1975
train_bundlenet_184bf5a0,RUNNING,131.130.118.84:3781487,6.33173,0.00465458,307.697,9.35568
train_bundlenet_148c34c2,RUNNING,131.130.118.84:3782313,1.58546,0.0948891,483.16,40.6115
train_bundlenet_2034799b,RUNNING,131.130.118.84:3783109,3.74152,0.00977623,345.274,22.5675


[36m(pid=3778710)[0m 2024-09-11 14:45:34.151773: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3778710)[0m 2024-09-11 14:45:34.151801: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3778710)[0m 2024-09-11 14:45:34.152653: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(pid=3778710)[0m 2024-09-11 14:45:34.157451: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[36m(pid=3778710)[0m To enable the following instructions: AVX2 FMA, in other operat

Loss [Markov, Behaviour, Total]: [0.0026 0.0287 0.0312]:  55%|█████▌    | 11/20 [00:08<00:05,  1.76it/s]
Loss [Markov, Behaviour, Total]: [0.0046 0.0292 0.0338]:   5%|▌         | 1/20 [00:04<01:27,  4.61s/it]
Loss [Markov, Behaviour, Total]: [0.0026 0.0286 0.0313]:  60%|██████    | 12/20 [00:08<00:05,  1.49it/s]
  0%|          | 0/20 [00:00<?, ?it/s] 
[36m(pid=3779276)[0m 2024-09-11 14:45:46.168503: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3779276)[0m 2024-09-11 14:45:46.168559: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3779276)[0m 2024-09-11 14:45:46.170017: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register facto

  0%|          | 0/20 [00:00<?, ?it/s] 
Loss [Markov, Behaviour, Total]: [0.0035 0.0225 0.026 ]:  45%|████▌     | 9/20 [00:13<00:12,  1.15s/it]
[36m(pid=3780725)[0m 2024-09-11 14:46:04.894789: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3780725)[0m 2024-09-11 14:46:04.894875: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3780725)[0m 2024-09-11 14:46:04.896674: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(pid=3780725)[0m 2024-09-11 14:46:04.905453: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use avail

Loss [Markov, Behaviour, Total]: [0.0024 0.0278 0.0302]:  95%|█████████▌| 19/20 [00:29<00:01,  1.53s/it]
  0%|          | 0/20 [00:00<?, ?it/s] 
Loss [Markov, Behaviour, Total]: [0.0027 0.0273 0.03  ]:  15%|█▌        | 3/20 [00:08<00:39,  2.31s/it][32m [repeated 3x across cluster][0m
Loss [Markov, Behaviour, Total]: [0.0028 0.0282 0.031 ]:  95%|█████████▌| 19/20 [00:40<00:02,  2.45s/it]
[36m(pid=3783109)[0m 2024-09-11 14:46:26.206985: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=3783109)[0m 2024-09-11 14:46:26.207074: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=3783109)[0m 2024-09-11 14:46:26.208844: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS

Loss [Markov, Behaviour, Total]: [0.0027 0.0282 0.031 ]:  70%|███████   | 14/20 [00:48<00:18,  3.05s/it][32m [repeated 2x across cluster][0m
Loss [Markov, Behaviour, Total]: [0.0042 0.0139 0.0181]: 100%|██████████| 20/20 [01:03<00:00,  3.19s/it]
Loss [Markov, Behaviour, Total]: [0.0034 0.0145 0.0179]:  80%|████████  | 16/20 [00:38<00:08,  2.08s/it][32m [repeated 17x across cluster][0m
  0%|          | 0/20 [00:00<?, ?it/s] 
Loss [Markov, Behaviour, Total]: [0.0049 0.0094 0.0143]:  95%|█████████▌| 19/20 [00:47<00:02,  2.29s/it]
Loss [Markov, Behaviour, Total]: [0.0049 0.0092 0.014 ]: 100%|██████████| 20/20 [00:49<00:00,  2.49s/it]
Loss [Markov, Behaviour, Total]: [0.0027 0.0284 0.0311]:  70%|███████   | 14/20 [00:43<00:15,  2.61s/it]
Loss [Markov, Behaviour, Total]: [0.0027 0.0284 0.0311]:  75%|███████▌  | 15/20 [00:43<00:12,  2.56s/it]
Loss [Markov, Behaviour, Total]: [0.0029 0.0282 0.031 ]:  80%|████████  | 16/20 [00:53<00:11,  2.86s/it][32m [repeated 3x across cluster][0m
Loss 

Loss [Markov, Behaviour, Total]: [0.0041 0.008  0.0122]: 100%|██████████| 20/20 [00:54<00:00,  2.71s/it]
  0%|          | 0/20 [00:00<?, ?it/s] 
  0%|          | 0/20 [00:00<?, ?it/s] 
Loss [Markov, Behaviour, Total]: [0.0023 0.0277 0.03  ]:  40%|████      | 8/20 [00:33<00:44,  3.69s/it]
Loss [Markov, Behaviour, Total]: [0.0025 0.0288 0.0313]:  65%|██████▌   | 13/20 [00:59<00:35,  5.12s/it][32m [repeated 7x across cluster][0m
Loss [Markov, Behaviour, Total]: [0.0042 0.0124 0.0166]:  95%|█████████▌| 19/20 [01:01<00:03,  3.53s/it]
Loss [Markov, Behaviour, Total]: [0.0059 0.0276 0.0335]:   5%|▌         | 1/20 [00:12<03:49, 12.09s/it]
Loss [Markov, Behaviour, Total]: [0.004 0.012 0.016]: 100%|██████████| 20/20 [01:05<00:00,  3.25s/it]   
Loss [Markov, Behaviour, Total]: [0.0026 0.0279 0.0305]:  50%|█████     | 10/20 [00:38<00:31,  3.14s/it][32m [repeated 8x across cluster][0m
  0%|          | 0/20 [00:00<?, ?it/s] 
Loss [Markov, Behaviour, Total]: [0.0027 0.0283 0.031 ]:  50%|█████    

RuntimeError: Caught unexpected exception: Task was killed due to the node running low on memory.
Memory on the node (IP: 131.130.118.84, ID: 4562bd7a3c6eb2f14550e4d8aef42ccaf9f5ba05b996e6bc223b4500) where the task (actor ID: 375bd7aececd97c368cf036f01000000, name=ImplicitFunc.__init__, pid=3783109, memory used=2.01GB) was running was 59.60GB / 62.71GB (0.950306), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: 2c828c597d5a28eb0bb868c7d3094cde979aaccf465d0827e19ba32d) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 131.130.118.84`. To see the logs of the worker, use `ray logs worker-2c828c597d5a28eb0bb868c7d3094cde979aaccf465d0827e19ba32d*out -ip 131.130.118.84. Top 10 memory users:
PID	MEM(GB)	COMMAND
1552548	5.77	/snap/pycharm-professional/409/bin/pycharm .
3793934	4.07	ray::ImplicitFunc.train
3778880	3.74	ray::ImplicitFunc.train
3793175	3.66	ray::ImplicitFunc.train
3778765	3.48	ray::ImplicitFunc.train
3782313	3.18	ray::ImplicitFunc.train
3792400	2.66	ray::ImplicitFunc.train
3778710	2.62	ray::ImplicitFunc.train
3780725	2.36	ray::ImplicitFunc.train
3783109	2.01	ray::ImplicitFunc.train
Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. Set max_restarts and max_task_retries to enable retry when the task crashes due to OOM. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.

[33m(raylet)[0m [2024-09-11 14:49:24,846 E 3778040 3778040] (raylet) node_manager.cc:3064: 1 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: 4562bd7a3c6eb2f14550e4d8aef42ccaf9f5ba05b996e6bc223b4500, IP: 131.130.118.84) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 131.130.118.84`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


In [None]:
best_result = results.get_best_result(metric='val_loss', mode='min')
print("Minimum validation loss:", best_result.metrics['val_loss'])
print("Best hyperparameters found were: ", best_result.config)


In [None]:
learning_rate = best_result.config["learning_rate"]
latent_dim = int(best_result.config["latent_dim"])
n_epochs = int(best_result.config["n_epochs"])
win = int(best_result.config["win"])


In [None]:
x_, b_ = prep_data(x, b, win=win)

x_train, x_test, b_train_1, b_test_1 = timeseries_train_test_split(x_, b_)


# Deploy BunDLe Net
model = BunDLeNet(latent_dim=latent_dim, num_behaviour=b_.shape[1])

train_history, test_history = train_model(
    x_train,
    b_train_1,
    model,
    b_type='continuous',
    gamma=0.9,
    learning_rate=learning_rate,
    n_epochs=n_epochs,
    initialisation=(5,20),
    validation_data=(x_test, b_test_1),
)
print(f'val loss: {test_history[-1,-1]}')

In [None]:
plt.figure()
colors =  ['#1f77b4', '#ff7f0e', '#2ca02c']
for i, label in enumerate([
    r"$\mathcal{L}_{\mathrm{Markov}}$",
    r"$\mathcal{L}_{\mathrm{ehavior}}$",
    r"Total $\mathcal{L}$"
]):
    plt.plot(train_history[:, i], label=label, c=colors[i])
    plt.plot(test_history[:, i], label=label + ' test', c=colors[i], linestyle='--')
plt.legend()

In [None]:
# Projecting into latent space
y0_tr = model.tau(x_train[:, 0]).numpy()
y1_tr = model.tau(x_train[:, 1]).numpy()

y0_tst = model.tau(x_test[:, 0]).numpy()
y1_tst = model.tau(x_test[:, 1]).numpy()

y0_ = model.tau(x_[:, 0]).numpy()
y1_ = model.tau(x_[:, 1]).numpy()

In [None]:
%matplotlib notebook
plt.figure()
plt.plot(y0_)

In [None]:
y0_tr = np.c_[y0_tr, np.zeros_like(y0_tr[:,0])]
y1_tr = np.c_[y1_tr, np.zeros_like(y1_tr[:,0])]
y0_tst = np.c_[y0_tst, np.zeros_like(y0_tst[:,0])]
y1_tst = np.c_[y1_tst, np.zeros_like(y1_tst[:,0])]
y0_tr

In [None]:
%matplotlib notebook
# Continuous variable plotting
fig = plt.figure(figsize=(4, 4))
ax = plt.axes(projection='3d')
# ax.axis('off')
tr_pts = ax.scatter(y0_tr[:, 0], y0_tr[:, 1], y0_tr[:, 2], c=b_train_1[:, 0], s=0.5)
tst_pts = ax.scatter(y0_tst[:, 0], y0_tst[:, 1], y0_tst[:, 2], c=b_test_1[:, 0], s=10)
plt.colorbar(tr_pts)
plt.show()

In [None]:
# Discrete variable plotting
fig = plt.figure(figsize=(4, 4))
ax = plt.axes(projection='3d')

vis = LatentSpaceVisualiser(
    y=y0_tr,
    b=b_train_1[:, 1].astype(int),
    b_names=['0', '1']
)
fig, ax = vis._plot_ps(fig, ax, arrow_length_ratio=0.0001)

vis = LatentSpaceVisualiser(
    y=y0_tst,
    b=b_test_1[:, 1].astype(int),
    b_names=['0', '1'],
    show_points=True
)
fig, ax = vis._plot_ps(fig, ax, arrow_length_ratio=0.1)

plt.show()

In [None]:
save_data=False
if save_data:
    # Save the weights
    # model.save_weights(f'data/generated/BunDLeNet_model_rat_{rat_name}')
    print(f'data/generated/saved_Y/y0_tr__{algorithm}_rat_{rat_name}')
    np.savetxt(f'data/generated/saved_Y/y0_tr__{algorithm}_rat_{rat_name}', y0_tr)
    np.savetxt(f'data/generated/saved_Y/y1_tr__{algorithm}_rat_{rat_name}', y1_tr)
    np.savetxt(f'data/generated/saved_Y/y0_tst__{algorithm}_rat_{rat_name}', y0_tst)
    np.savetxt(f'data/generated/saved_Y/y1_tst__{algorithm}_rat_{rat_name}', y1_tst)
    np.savetxt(f'data/generated/saved_Y/b_train_1__{algorithm}_rat_{rat_name}', b_train_1)
    np.savetxt(f'data/generated/saved_Y/b_test_1__{algorithm}_rat_{rat_name}', b_test_1)
