In [1]:
import os
import importlib
import math
import time

import itertools
from collections import defaultdict

import numpy as np
import pandas as pd
import scipy.stats as ss
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm as tqdm_auto
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import pytorch_lightning as pl

from IPython.display import clear_output

In [2]:
# `sys` is needed for the path tweak below; without this import the cell
# raises NameError on a fresh kernel.
import sys

# Make the project's model code importable. The membership check keeps
# re-running this cell from appending the same entry again.
MODEL_DIR = "../model/"
if MODEL_DIR not in sys.path:
    sys.path.append(MODEL_DIR)

import stability_data
from legnet import LegNetClassifier
from pl_regressor import RNARegressor

## Loading data

In [3]:
# CSV with one row per sequence: the sequence itself, its fold label and the
# measured log ratio (see the preview below).
PATH_FROM = './ratios_log_stability.csv'

df_src = pd.read_csv(filepath_or_buffer=PATH_FROM)

# Preview the first three rows as the cell output.
df_src.iloc[:3]

Unnamed: 0,seq,fold,log_ratio
0,AAAAAAAAACAACAGCACCTGTCCAGGCTTCCTTAGGTACATCTTC...,train,-0.806329
1,AAAAAACTCACCCGTTTTCCTGGGATTTGTTGTAAGGAGTTTTCAC...,train,-0.798038
2,AAAAAGACATAAACTGGCACCAGTTAACTTTCTTGTACTTTTTTGC...,train,-0.84301


In [4]:
# Name of the target column in the source table; it is passed to the datasets
# as `predict_cols` when models are launched.
Y_NAME = "log_ratio"

In [5]:
# Split the source table by its `fold` label into independent frames, each with
# a fresh 0..n-1 index. Building new frames (instead of `inplace=True` on the
# groupby pieces) keeps the cell idempotent and leaves `df_src` untouched.
splits = {
    fold_name: fold_df.reset_index(drop=True)
    for fold_name, fold_df in df_src.groupby('fold')
}

# Training below indexes splits["train"] and splits["val"]; fail here with a
# clear message rather than with a KeyError deep inside the training code.
missing_folds = {"train", "val"} - set(splits)
assert not missing_folds, f"missing folds in source table: {sorted(missing_folds)}"

splits["val"].head()

Unnamed: 0,seq,fold,log_ratio
0,AAACAGGCCCCCTTCCCATCTACCTAGCCAGTACCCATCCAATGAG...,val,-0.665711
1,AAAGTCCCAAAGGCGGACCCGCTGGTGCTGGAAGCCAGAACTGTGG...,val,-0.801373
2,AAATACATATTTAGTGTTACTTGGAAAACAGCTGCTGCCAGCTAGC...,val,-0.989821
3,AACCACATGAACTGGACTGAGAGGGGGAAGAAGCGGGGAGGAAGAA...,val,-0.843702
4,AACCGGAGGGGCTGCCATTACGAGTTTACCAGCTTTTGCACGGGTA...,val,-0.737987


In [6]:
# Mini-batch size used by the training DataLoader.
batch_size = 128

# Number of full batches in one pass over the training split (at least 1);
# this value is handed to the LR scheduler as `steps_per_epoch`.
steps_per_epoch = max(len(splits["train"]) // batch_size, 1)

In [7]:
# Number of DataLoader worker processes: at most 32, but never more than the
# CPUs reported by the OS, so smaller machines are not oversubscribed.
# `os.cpu_count()` may return None, hence the fallback to 1.
num_workers = min(32, os.cpu_count() or 1)

In [8]:
def launch_model(
    seed: int,
    train_ds_kws: dict,
    val_ds_kws: dict,
    model_class,
    model_kws: dict,
    criterion_class,
    criterion_kws: dict,
    optimizer_class,
    optimizer_kws: dict,
    lr_scheduler_class,
    lr_scheduler_kws: dict,
    epochs: int,
):
    """Train one model configuration and predict on the validation split.

    Uses the notebook-level globals ``splits`` (train/val frames),
    ``batch_size`` and ``num_workers``.

    Args:
        seed: Seed passed to ``pl.seed_everything``.
        train_ds_kws: Extra keyword arguments for the training ``StabilityData``.
        val_ds_kws: Extra keyword arguments for the validation ``StabilityData``.
        model_class: Network class forwarded to ``RNARegressor``.
        model_kws: Keyword arguments for ``model_class``; ``in_channels`` is
            set here from the training dataset's ``num_channels``.
        criterion_class: Loss class forwarded to ``RNARegressor``.
        criterion_kws: Keyword arguments for ``criterion_class``.
        optimizer_class: Optimizer class forwarded to ``RNARegressor``.
        optimizer_kws: Keyword arguments for ``optimizer_class``.
        lr_scheduler_class: LR scheduler class forwarded to ``RNARegressor``.
        lr_scheduler_kws: Keyword arguments for ``lr_scheduler_class``.
        epochs: Maximum number of training epochs.

    Returns:
        ``(trainer, val_df)``: the fitted ``pl.Trainer`` and a copy of
        ``splits["val"]`` with an added ``"prediction"`` column produced by
        the checkpoint with the best ``val_pearson_r_0``.
    """
    # workers=True additionally makes Lightning seed the DataLoader worker
    # processes, which is needed for reproducible runs with num_workers > 0.
    pl.seed_everything(seed, workers=True)

    # Creating Datasets
    train_set = stability_data.StabilityData(
        df=splits["train"],
        **train_ds_kws,
    )
    val_set = stability_data.StabilityData(
        df=splits["val"],
        **val_ds_kws,
    )

    # Both datasets must encode inputs with the same number of channels,
    # because the model is built from the training set's channel count.
    assert train_set.num_channels == val_set.num_channels

    # Creating DataLoaders
    dl_train = DataLoader(
        train_set,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
        drop_last=True
    )
    # The whole validation split is served as a single batch.
    dl_val = DataLoader(
        val_set,
        batch_size=len(val_set),
        num_workers=num_workers,
        shuffle=False,
        drop_last=False
    )

    model = RNARegressor(
        model_class=model_class,
        # Merge into a new dict so the caller's model_kws is not mutated.
        model_kws=model_kws | dict(
            in_channels=train_set.num_channels
        ),
        criterion_class=criterion_class,
        criterion_kws=criterion_kws,
        optimizer_class=optimizer_class,
        optimizer_kws=optimizer_kws,
        lr_scheduler_class=lr_scheduler_class,
        lr_scheduler_kws=lr_scheduler_kws,
    )
    # Keep only the single checkpoint with the highest validation Pearson r.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath="saved_models",
        save_top_k=1,
        save_last=False,
        monitor="val_pearson_r_0",
        mode="max"
    )
    # refresh_rate is a number of batches and is documented as an integer.
    progressbar_callback = pl.callbacks.TQDMProgressBar(refresh_rate=1)

    logger = pl.loggers.tensorboard.TensorBoardLogger("tb_logs", name=model.model_name)
    trainer = pl.Trainer(
        callbacks=[checkpoint_callback, progressbar_callback],
        logger=logger,
        accelerator="gpu",
        devices=1,
        deterministic=True,
        max_epochs=epochs,
        num_sanity_val_steps=0,
        log_every_n_steps=10,
        # Clip every gradient element by value to +/- 1e-3.
        gradient_clip_val=1e-3,
        gradient_clip_algorithm="value",
    )
    trainer.fit(model=model, train_dataloaders=dl_train, val_dataloaders=dl_val)

    # Reload the best checkpoint rather than using the last-epoch weights.
    best_model = RNARegressor.load_from_checkpoint(checkpoint_callback.best_model_path)

    # Each predict output is unpacked as a (prediction, target) pair; only the
    # predictions are needed, since the targets are already in splits["val"].
    prediction = trainer.predict(model=best_model, dataloaders=dl_val)
    val_pred, _ = zip(*prediction)
    val_pred = torch.concat(val_pred).numpy()

    val_df = splits["val"].copy()
    # Predictions are 2-D; column 0 is stored as the prediction.
    val_df["prediction"] = val_pred[:, 0]

    return trainer, val_df

In [None]:
# Grid of settings to try; every combination of the listed values is trained.
checked = {
    "seed": [0],
    "features": [
        ("sequence",),  # alternative: ("sequence", "positional",)
    ],
    "epochs": [25],
}

for subset in itertools.product(*checked.values()):
    # Map each setting name to the value chosen for this combination.
    PARAMS = {name: value for name, value in zip(checked, subset)}

    # Results are overwritten on every iteration, so after the loop these
    # names hold the trainer and validation predictions of the last run.
    trainer_last, prediction_best_last = launch_model(
        seed=PARAMS["seed"],
        train_ds_kws={
            "features": PARAMS["features"],
            "predict_cols": [Y_NAME],
        },
        val_ds_kws={
            "features": PARAMS["features"],
            "predict_cols": [Y_NAME],
        },
        model_class=LegNetClassifier,
        model_kws={
            "seqsize": 186,
            "ks": 3,
            "out_channels": 1,
            "conv_sizes": (128, 64, 64, 32, 32),
            "mapper_size": 256,
            "linear_sizes": None,
            "use_max_pooling": False,
            "final_activation": nn.Identity,
        },
        criterion_class=nn.HuberLoss,  # alternative: nn.MSELoss
        criterion_kws={},
        optimizer_class=torch.optim.AdamW,
        optimizer_kws={
            # lr is left at the optimizer default (0.01 was an earlier setting)
            "weight_decay": 0.1,
        },
        lr_scheduler_class=torch.optim.lr_scheduler.OneCycleLR,
        lr_scheduler_kws={
            "max_lr": 0.025,
            "steps_per_epoch": steps_per_epoch,
            "epochs": PARAMS["epochs"],
            "pct_start": 0.3,
            "three_phase": False,
            "cycle_momentum": True,
        },
        epochs=PARAMS["epochs"],
    )