In [1]:
import os



In [2]:
# Switch the working directory to the parent folder (presumably the project
# root) so that the `src` package and the relative data/config paths resolve.
# Guarded on the presence of `src` so that re-running this cell, or starting
# the kernel from the project root, does not climb one directory too far.
if not os.path.isdir("src"):
    os.chdir("../")

In [3]:

from datetime import datetime
from pathlib import Path
from shutil import copy, copytree, rmtree

import seaborn as sns
import yaml
from loguru import logger
from matplotlib import pyplot as plt

from src.analysis import analysis_factory
from src.data.load_data import load_fitness_data
from src.factors import factor_factory
from src.preprocessing import preprocessing_factory
from src.utils.misc import set_seed

# Seed the run before any data loading or training happens. set_seed is the
# project helper from src.utils.misc and is called with its defaults; which
# libraries it seeds is not visible from this notebook.
set_seed()


In [4]:

# Apply seaborn's default theme, then switch to the "paper" plotting context.
sns.set_theme()
sns.set_context("paper")

# All paths are relative to the current working directory.
OUTPUT_PATH = Path("data/output")  # output directory of the current run
HISTORY_PATH = Path("data/output_history")  # receives one copy of the output directory per run
CONFIG_FILE = Path("src/config.yaml")  # YAML file holding the run configuration



In [5]:

# load config file

logger.info("Loading config file.")

# Parse the YAML run configuration with the safe loader. The file is decoded
# explicitly as UTF-8 so the result does not depend on the platform's locale
# encoding.
with open(CONFIG_FILE, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)



[32m2025-01-25 01:41:31.007[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mLoading config file.[0m


In [6]:

# Unique name for this run: the configured run name plus a timestamp.
run_name = f"{config['run_name']}_{datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}"

# Start from an empty output directory. parents=True also creates missing
# parent directories (e.g. `data/` on a fresh checkout) instead of raising
# FileNotFoundError.
rmtree(OUTPUT_PATH, ignore_errors=True)
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Keep a copy of the configuration used next to the results.
copy(CONFIG_FILE, OUTPUT_PATH)


# Additionally write all log messages to a file inside the output directory.
logger.add(OUTPUT_PATH / "logs.log")
logger.info("Start run {}.", run_name)



[32m2025-01-25 01:41:33.035[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mStart run cnn_vae_synonymous_batches_2025_01_25_01_41_33.[0m


In [7]:

# load data

logger.info("Loading data.")

# load_fitness_data is the project loader from src.data.load_data, called
# without arguments. The result is presumably a pandas DataFrame (later cells
# call `.to_csv` on it) - confirm against the loader.
data = load_fitness_data()


[32m2025-01-25 01:41:35.144[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mLoading data.[0m


In [8]:

# preprocess data

logger.info("Start preprocessing.")

# Each config entry is turned into a callable by the factory and applied to
# the data in the order listed in the config.
for step_config in config["preprocessing"]:
    logger.info("Apply {} preprocessing.", step_config["name"])
    data = preprocessing_factory(**step_config)(data)

# Persist the preprocessed table in the output directory.
preprocessed_csv = OUTPUT_PATH / "preprocessed.csv"
data.to_csv(preprocessed_csv)
logger.info("Preprocessed data written to {}.", preprocessed_csv)



[32m2025-01-25 01:41:36.989[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mStart preprocessing.[0m
[32m2025-01-25 01:41:36.990[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mApply remove_non_functional preprocessing.[0m
[32m2025-01-25 01:41:36.992[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mApply extend_mutated_sequence preprocessing.[0m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["mutated_wildtype_dna"] = df["sequence_dna"].apply(
[32m2025-01-25 01:41:37.014[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mApply rna_loops preprocessing.[0m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,co

In [9]:

# add factors

logger.info("Start adding factors.")

# Build one callable per configured factor and feed the data through them in
# the order listed in the config.
for factor_config in config["factors"]:
    logger.info("Add {} factor.", factor_config["name"])
    add_factor = factor_factory(**factor_config)
    data = add_factor(data)

# Persist the table with the factor columns in the output directory.
factors_csv = OUTPUT_PATH / "with_factors.csv"
data.to_csv(factors_csv)
logger.info("Data with factors written to {}.", factors_csv)



[32m2025-01-25 01:48:49.110[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mStart adding factors.[0m
[32m2025-01-25 01:48:49.111[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mAdd mutated_amino_acids factor.[0m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["mutated_amino_acids"] = df["sequence_dna"].apply(dna_to_aa)
[32m2025-01-25 01:48:49.313[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mAdd neural_network factor.[0m
CometLogger will be initialized in online mode
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/aapsonn/cnn-vae-synonymous-batches/88bf1c6041d4423a90399a1c2c442222

GPU available: False, used: False
TPU available: False, usi

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/spot/.cache/pypoetry/virtualenvs/geneticfitness-dxHE3pA3-py3.13/lib/python3.13/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/home/spot/.cache/pypoetry/virtualenvs/geneticfitness-dxHE3pA3-py3.13/lib/python3.13/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/home/spot/.cache/pypoetry/virtualenvs/geneticfitness-dxHE3pA3-py3.13/lib/python3.13/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (43) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_st

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : human_fort_5737
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/aapsonn/cnn-vae-synonymous-batches/88bf1c6041d4423a90399a1c2c442222
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     val_loss : 0.7594785094261169
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Created from : pytorch-lightning
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     batch_size           : 256
[1;38;5;39mCOMET INFO:[0m     embedding_size  

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml ExistingExperiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : human_fort_5737
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/aapsonn/cnn-vae-synonymous-batches/88bf1c6041d4423a90399a1c2c442222
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     val_loss : 0.7598253488540649
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Created from : pytorch-lightning
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     embedding_size       : 32
[1;38;5;39mCOMET INFO:[0m     filter_size          : 5
[1;38;5;39mCOMET INFO:[0m    

Testing: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml ExistingExperiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : human_fort_5737
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/aapsonn/cnn-vae-synonymous-batches/88bf1c6041d4423a90399a1c2c442222
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     test_loss : 0.7600781321525574
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Created from : pytorch-lightning
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     embedding_size       : 32
[1;38;5;39mCOMET INFO:[0m     filter_size          : 5
[1;38;5;39mCOMET INFO:[0m   

Predicting: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml ExistingExperiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : human_fort_5737
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/aapsonn/cnn-vae-synonymous-batches/88bf1c6041d4423a90399a1c2c442222
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Created from : pytorch-lightning
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     embedding_size       : 32
[1;38;5;39mCOMET INFO:[0m     filter_size          : 5
[1;38;5;39mCOMET INFO:[0m     fully_connected_size : 32
[1;38;5;39mCOMET INFO:[0m     input_size           : 117
[1;38;5;39mCOM

Structure of val_prediction: <class 'list'> [(tensor([[[-0.6136, -0.6136, -0.6136,  ..., -0.5692, -0.5692, -0.5692],
         [ 0.0316,  0.0316,  0.0316,  ...,  0.9200,  0.9200,  0.9200],
         [-0.4927, -0.4927, -0.4927,  ...,  1.1108,  1.1108,  1.1108],
         ...,
         [-1.2341, -1.2341, -1.2341,  ..., -0.9565, -0.9565, -0.9565],
         [ 1.8197,  1.8197,  1.8197,  ...,  0.0335,  0.0335,  0.0335],
         [-0.5515, -0.5515, -0.5515,  ...,  0.7101,  0.7101,  0.7101]]]), tensor([[[-0.0761, -0.1967, -0.3799,  ..., -0.1007, -0.0495, -0.0200],
         [ 0.0595,  0.0192, -0.0262,  ...,  0.1796,  0.1219,  0.0878],
         [-0.0512, -0.1451, -0.2504,  ...,  0.2146,  0.0979,  0.0453],
         ...,
         [-0.0079, -0.0149, -0.1146,  ..., -0.1990, -0.0971, -0.0527],
         [ 0.1725,  0.3525,  0.5074,  ...,  0.2541,  0.1856,  0.1264],
         [-0.1018, -0.1523, -0.2226,  ...,  0.0092, -0.0241, -0.0731]]]), tensor([[ 0.0008,  0.0230,  0.0278,  0.0100, -0.0202]]), tensor([[-0

IndexError: index 1 is out of bounds for axis 0 with size 1

In [None]:


# run analysis

logger.info("Start analysis.")

for analysis_step in config["analysis"]:
    logger.info("Run {} analysis.", analysis_step["name"])

    # Build the analysis callable from its config entry (it receives the
    # output directory) and run it on the data.
    function = analysis_factory(output_path=OUTPUT_PATH, **analysis_step)
    function(data)

    # Close every open figure rather than only clearing the current one, so
    # figures created by the analysis steps do not accumulate in memory.
    plt.close("all")

# copy to history

# Archive the complete output directory under the run name.
# dirs_exist_ok=True allows this cell to be re-run for the same run without
# raising FileExistsError.
copytree(OUTPUT_PATH, HISTORY_PATH / run_name, dirs_exist_ok=True)
