In [1]:
import os

In [2]:
# Work from the project root so the relative paths used by the rest of the notebook
# ("src/config.yaml", "data/output", ...) resolve. The guard makes the cell idempotent:
# re-running it in the same kernel no longer climbs one more directory each time.
if not os.path.isfile(os.path.join("src", "config.yaml")):
    os.chdir("../")

In [3]:

from datetime import datetime
from pathlib import Path
from shutil import copy, copytree, rmtree

import seaborn as sns
import yaml
from loguru import logger
from matplotlib import pyplot as plt

from src.analysis import analysis_factory
from src.data.load_data import load_fitness_data
from src.factors import factor_factory
from src.preprocessing import preprocessing_factory
from src.utils.misc import set_seed

# Seed via the project helper (src.utils.misc.set_seed), called with its defaults, before any
# data or model work runs.
# NOTE(review): which random number generators it seeds is not visible from this notebook —
# confirm it covers the libraries used for model training further down.
set_seed()


In [4]:

# Global plot styling: apply seaborn's default theme first, then switch the plotting context
# to "paper". Order matters because set_theme() itself sets a context that the second call
# overrides.
sns.set_theme()
sns.set_context("paper")

# Filesystem layout, all relative to the current working directory.
#   CONFIG_FILE  - YAML run configuration
#   OUTPUT_PATH  - artifacts of the current run (cleared at the start of each run)
#   HISTORY_PATH - per-run snapshots of OUTPUT_PATH
CONFIG_FILE = Path("src") / "config.yaml"
OUTPUT_PATH = Path("data") / "output"
HISTORY_PATH = Path("data") / "output_history"

In [5]:
# Read the run configuration from the YAML file at CONFIG_FILE into `config`.
logger.info("Loading config file.")

# safe_load only builds plain Python objects from the YAML document.
with CONFIG_FILE.open("r") as config_stream:
    config = yaml.safe_load(config_stream)


[32m2025-01-31 23:57:13.065[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mLoading config file.[0m


In [6]:
# Unique identifier for this run: the configured run name plus the start timestamp.
run_name = f"{config['run_name']}_{datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}"

# Start from an empty output directory. parents=True also creates a missing parent directory
# (e.g. "data/" on a fresh checkout), where a bare mkdir() would raise FileNotFoundError.
rmtree(OUTPUT_PATH, ignore_errors=True)
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Keep the exact configuration used for this run next to its outputs.
copy(CONFIG_FILE, OUTPUT_PATH)

# From here on, log messages are also written to a file inside the output directory.
logger.add(OUTPUT_PATH / "logs.log")
logger.info("Start run {}.", run_name)

[32m2025-01-31 23:57:13.105[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mStart run cnn_vae_synonymous_batches_2025_01_31_23_57_13.[0m


In [7]:
# load data
# Calls the project loader load_fitness_data() (from src.data.load_data) with its defaults and
# keeps the result as `indata`. Later stages work on deep copies, so `indata` itself is not
# modified by the preprocessing cell.

logger.info("Loading data.")

indata = load_fitness_data()


[32m2025-01-31 23:57:14.945[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mLoading data.[0m


In [8]:
# preprocess data
# Applies the steps listed under config["preprocessing"], in order, to a deep copy of the
# input, so `indata` is left unchanged and this cell can be re-run from the same starting point.

logger.info("Start preprocessing.")

predata = indata.copy(deep=True)

for preprocessing_step in config["preprocessing"]:
    logger.info("Apply {} preprocessing.", preprocessing_step["name"])
    # The whole config entry is passed as keyword arguments to the factory; the callable it
    # returns takes the current data and returns the transformed data.
    function = preprocessing_factory(**preprocessing_step)
    predata = function(predata)

predata.to_csv(OUTPUT_PATH / "preprocessed.csv")
logger.info("Preprocessed data written to {}.", OUTPUT_PATH / "preprocessed.csv")

# Snapshot the output directory under this run's name. dirs_exist_ok=True lets the cell be
# re-run within the same run (the snapshot is refreshed instead of raising FileExistsError).
# Binding the result keeps the returned path from being echoed as cell output.
history_dir = copytree(OUTPUT_PATH, HISTORY_PATH / run_name, dirs_exist_ok=True)



[32m2025-01-31 23:57:17.363[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mStart preprocessing.[0m
[32m2025-01-31 23:57:17.396[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mApply extend_mutated_sequence preprocessing.[0m
[32m2025-01-31 23:57:17.848[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mApply mutated_amino_acids preprocessing.[0m
[32m2025-01-31 23:57:22.929[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mApply rna_loops_minimum_free_energy preprocessing.[0m
[32m2025-02-01 00:34:29.512[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mPreprocessed data written to data/output/preprocessed.csv.[0m


PosixPath('data/output_history/cnn_vae_synonymous_batches_2025_01_31_23_57_13')

In [9]:
# add factors
# Applies the entries listed under config["factors"], in order, to a deep copy of the
# preprocessed data, so `predata` is left unchanged and this cell can be re-run.

logger.info("Start adding factors.")

data = predata.copy(deep=True)

for factor in config["factors"]:
    logger.info("Add {} factor.", factor["name"])
    # The whole config entry is passed as keyword arguments to the factory; the callable it
    # returns takes the current data and returns the data with the factor applied.
    function = factor_factory(**factor)
    data = function(data)

data.to_csv(OUTPUT_PATH / "with_factors.csv")
logger.info("Data with factors written to {}.", OUTPUT_PATH / "with_factors.csv")



[32m2025-02-01 00:34:29.627[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mStart adding factors.[0m
[32m2025-02-01 00:34:29.786[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mAdd neural_network factor.[0m
CometLogger will be initialized in online mode
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/aapsonn/cnn-vae-synonymous-batches/3ca3ba2d7774405fa68a939a70019206

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/spot/.cache/pypoetry/virtualenvs/geneticfitness-dxHE3pA3-py3.13/lib/python3.13/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /media/DATA/man/folA_synon/GeneticFitness/data/models/cnn_vae_synonymous_batches exists and is not empty.

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | embedding | Embedding  | 96     | train
1 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/spot/.cache/pypoetry/virtualenvs/geneticfitness-dxHE3pA3-py3.13/lib/python3.13/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/home/spot/.cache/pypoetry/virtualenvs/geneticfitness-dxHE3pA3-py3.13/lib/python3.13/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=50` reached.
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : blonde_dividend_6210
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/aapsonn/cnn-vae-synonymous-batches/3ca3ba2d7774405fa68a939a70019206
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     train_loss [610] : (0.17826060950756073, 0.7279582619667053)
[1;38;5;39mCOMET INFO:[0m     val_loss [50]    : (0.3143503963947296, 0.35667458176612854)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Created from : pytorch-lightning


Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml ExistingExperiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : blonde_dividend_6210
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/aapsonn/cnn-vae-synonymous-batches/3ca3ba2d7774405fa68a939a70019206
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     val_loss : 0.31474319100379944
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Created from : pytorch-lightning
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     embedding_size       : 32
[1;38;5;39mCOMET INFO:[0m     filter_size          : 5
[1;38;5;39mCOMET INFO:[

Testing: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml ExistingExperiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : blonde_dividend_6210
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/aapsonn/cnn-vae-synonymous-batches/3ca3ba2d7774405fa68a939a70019206
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     test_loss : 0.31425148248672485
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Created from : pytorch-lightning
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     embedding_size       : 32
[1;38;5;39mCOMET INFO:[0m     filter_size          : 5
[1;38;5;39mCOMET INFO:

Predicting: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml ExistingExperiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : blonde_dividend_6210
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/aapsonn/cnn-vae-synonymous-batches/3ca3ba2d7774405fa68a939a70019206
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Created from : pytorch-lightning
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     embedding_size       : 32
[1;38;5;39mCOMET INFO:[0m     filter_size          : 5
[1;38;5;39mCOMET INFO:[0m     fully_connected_size : 32
[1;38;5;39mCOMET INFO:[0m     input_size           : 117
[1;38;5;3

Predicting: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml ExistingExperiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : blonde_dividend_6210
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/aapsonn/cnn-vae-synonymous-batches/3ca3ba2d7774405fa68a939a70019206
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Created from : pytorch-lightning
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     embedding_size       : 32
[1;38;5;39mCOMET INFO:[0m     filter_size          : 5
[1;38;5;39mCOMET INFO:[0m     fully_connected_size : 32
[1;38;5;39mCOMET INFO:[0m     input_size           : 117
[1;38;5;3

In [6]:

# Define the optimizer (per the original note, it must match the one used in training)
def optimizer_fn(params):
    """Create an Adam optimizer with learning rate 1e-3 for the given parameters.

    Passed as the ``optimizer`` argument when the model is restored from its checkpoint.

    Args:
        params: Iterable of parameters (or parameter groups) to optimize, as accepted by
            ``torch.optim.Adam``.

    Returns:
        A ``torch.optim.Adam`` instance over ``params`` with ``lr=1e-3``.
    """
    # Imported locally because none of the visible notebook cells import torch; without this
    # the function raises NameError when the notebook is run top to bottom on a fresh kernel.
    import torch

    return torch.optim.Adam(params, lr=1e-3)


In [4]:
from src.modeling import CNN_VAE

In [7]:

# Load model from checkpoint
# The checkpoint is addressed relative to the working directory instead of through a
# machine-specific absolute path, so the notebook also runs from other checkouts.
# NOTE(review): assumes the working directory is the project directory (the parent of "data/"
# and "src/"), as the checkpoint-directory warning in the training output suggests — confirm.
checkpoint_path = Path("data/models/cnn_vae_synonymous_batches") / "epoch=48-val_loss=0.31.ckpt"
model = CNN_VAE.load_from_checkpoint(checkpoint_path, optimizer=optimizer_fn)

# Set the model to evaluation mode
model.eval()

CNN_VAE(
  (embedding): Embedding(3, 32)
  (encoder): Sequential(
    (0): Conv1d(32, 32, kernel_size=(5,), stride=(1,))
    (1): ReLU()
    (2): Conv1d(32, 32, kernel_size=(5,), stride=(1,))
    (3): ReLU()
    (4): Conv1d(32, 32, kernel_size=(5,), stride=(1,))
    (5): ReLU()
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=3360, out_features=32, bias=True)
    (8): ReLU()
  )
  (mu): Linear(in_features=32, out_features=5, bias=True)
  (var): Linear(in_features=32, out_features=5, bias=True)
  (decoder): Sequential(
    (0): Linear(in_features=5, out_features=3360, bias=True)
    (1): ReLU()
    (2): Unflatten(dim=1, unflattened_size=(32, 105))
    (3): ConvTranspose1d(32, 32, kernel_size=(5,), stride=(1,))
    (4): ReLU()
    (5): ConvTranspose1d(32, 32, kernel_size=(5,), stride=(1,))
    (6): ReLU()
    (7): ConvTranspose1d(32, 32, kernel_size=(5,), stride=(1,))
  )
)