In [1]:
import sys
sys.path.append("..")

import pandas as pd
import torch

from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.tensorboard import SummaryWriter

from dataset import *
from features import *
from train import *
from model import *

In [2]:
# Load the differential-expression training table.
de_df = pd.read_parquet("../data/de_train.parquet")

# Split row positions, stratifying on the cell-type column.
# NOTE(review): 0.2 and 45 are passed positionally; presumably the validation
# fraction and the random seed -- confirm against stratified_split.
train_index, val_index = stratified_split(de_df["cell_type"], 0.2, 45)

# Wrap each row subset in a DataFrameDataset (train first, then validation).
de_df_dataset_train, de_df_dataset_val = (
    DataFrameDataset(de_df.iloc[row_positions], mode="df")
    for row_positions in (train_index, val_index)
)

In [3]:
# Unique labels are sorted on purpose: iterating a bare set of strings follows
# hash order, which changes between interpreter sessions, so the index that
# Type2OneHot / CType2CSVEncoding receive for each label would otherwise
# differ from run to run.
mtypes = sorted(set(de_df["sm_name"].to_list()))

# Feature pipelines available for the small-molecule name, keyed by feature name.
mol_transforms = {
    "morgan_fp": TransformList([Sm2Smiles("../config/sm_smiles.csv", mode="path"), Smiles2Mol(), Mol2Morgan()]),
    "one_hot": TransformList([Type2OneHot(mtypes)])
}

ctypes = sorted(set(de_df["cell_type"].to_list()))

# One CSV path per cell type, in the same order as ctypes. The file stem is the
# cell-type name with spaces turned into underscores and "+" removed.
file_names = ["../data/temp/"+name.replace(" ", "_").replace("+", "")+"_control_mean.csv"
              for name in ctypes]

# Row count of the first CSV; used later as the first width handed to
# VecAutoEncoder.
# NOTE(review): presumably one row per gene and identical for every file -- confirm.
gene_num = len(pd.read_csv(file_names[0]))

# Feature pipelines available for the cell type, keyed by feature name.
cell_transforms = {
    "one_hot": TransformList([Type2OneHot(ctypes)]),
    "gene_exp": TransformList([CType2CSVEncoding(ctypes, file_names)])
}

In [4]:
# Attach the same molecule / cell-type transform tables to both splits
# (train first, then validation).
de_dataset_train, de_dataset_val = (
    DEDataset(split_dataset, mol_transforms, cell_transforms)
    for split_dataset in (de_df_dataset_train, de_df_dataset_val)
)

In [5]:
import os

# Move the working directory one level up so the later relative paths
# ("./runs/...", "temp/...") resolve against the parent directory.
# The starting directory is recorded only the first time this cell runs, so
# re-running the cell lands in the same place instead of climbing one more
# level on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, ".."))

In [6]:
#### MODEL ####
# Autoencoder over the "gene_exp" cell feature.
# NOTE(review): the list is presumably the layer widths from input (gene_num)
# down to a 50-wide code -- confirm against VecAutoEncoder.
cell_ae = VecAutoEncoder([gene_num, 200, 100, 50])

#### LOADERS, DATA ####
# Batches of 128, default (sequential) sampling for both splits.
train_dataloader_ = DataLoader(de_dataset_train, 128)
val_dataloader_ = DataLoader(de_dataset_val, 128)
# The loaders keep a reference to these dataset objects, so configuring after
# loader creation is fine. The same objects are re-configured in a later cell,
# which also changes what these two loaders would yield from then on.
de_dataset_train.configure(cell_out_feature="gene_exp", ae_mode="cell")
de_dataset_val.configure(cell_out_feature="gene_exp", ae_mode="cell")

#### CONFIG ####
lr = 0.01
epochs = 300
# Use the first GPU when available; otherwise fall back to CPU so the cell
# still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

loss_fn = nn.MSELoss()
optimizer = Adam(cell_ae.parameters(), lr=lr, weight_decay=1e-4)
# Multiply the learning rate by 0.8 after 7 epochs without improvement of the
# monitored (minimised) quantity.
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.8, patience=7)

#### TENSORBOARD ####
writer = SummaryWriter("./runs/cell_ae/1")


In [7]:
# Train the cell autoencoder for `epochs` epochs on the autoencoder loaders,
# passing the TensorBoard writer and the target device through.
train_many_epochs(
    cell_ae,
    train_dataloader_,
    val_dataloader_,
    epochs,
    loss_fn,
    optimizer,
    scheduler,
    writer=writer,
    device=device,
)

100%|██████████| 300/300 [00:09<00:00, 31.39it/s]


In [13]:
# Use the first GPU when available; otherwise fall back to CPU so the cell
# still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

#### MODEL ####
# Encoder output widths; the regressor input is their sum (100 + 50 = 150).
# NOTE(review): this assumes CombinerModel concatenates the two encodings and
# that the frozen autoencoder encoder emits 50 values -- confirm.
MOL_EMB_DIM = 100
CELL_EMB_DIM = 50

# NOTE(review): 2048 is presumably the length of the "morgan_fp" feature --
# confirm against Mol2Morgan.
mol_enc = nn.Sequential(
    nn.Linear(2048, MOL_EMB_DIM)
).float()
# Reuse the previously trained autoencoder as the cell encoder (grads=False).
cell_enc = nn.Sequential(
    VecAEEncoder(cell_ae, grads=False, device=device)
    # nn.Linear(len(ctypes), 50)
).float()
regressor = nn.Sequential(
    nn.Linear(MOL_EMB_DIM + CELL_EMB_DIM, 100),
    nn.Tanh(),
    nn.Dropout(0.2),
    nn.Linear(100, 100),
    nn.Tanh(),
    nn.Dropout(0.2),
    nn.Linear(100, 500),
    nn.Tanh(),
    # One output per column of de_df except five.
    # NOTE(review): presumably the five non-target metadata columns -- confirm.
    nn.Linear(500, len(de_df.columns)-5)
).float()

model = CombinerModel(
        mol_encoder=mol_enc,
        cell_encoder=cell_enc,
        regressor=regressor)
# Snapshot of the freshly initialised weights, taken before any training.
torch.save(model.state_dict(), "temp/ini_model.pkl")

#### LOADERS, DATA ####
# Switch the shared dataset objects from autoencoder mode to the supervised
# setup with the Morgan-fingerprint molecule feature.
de_dataset_train.configure(sm_out_feature="morgan_fp",
                           cell_out_feature="gene_exp", ae_mode=False)
de_dataset_val.configure(sm_out_feature="morgan_fp",
                           cell_out_feature="gene_exp", ae_mode=False)
train_dataloader = DataLoader(de_dataset_train, 128)
val_dataloader = DataLoader(de_dataset_val, 128)

#### CONFIG ####
lr = 0.01
epochs = 200

# Build the loss here instead of relying on a `loss_fn` left in the kernel by
# an earlier cell (the original line was the no-op `loss_fn = loss_fn`).
loss_fn = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-4)
# Multiply the learning rate by 0.8 after 7 epochs without improvement of the
# monitored (minimised) quantity.
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.8, patience=7)

#### TENSORBOARD ####
writer = SummaryWriter("./runs/kaggle/trying_out")

In [14]:
# Train the combined model for `epochs` epochs on the supervised loaders,
# passing the TensorBoard writer and the target device through.
train_many_epochs(
    model,
    train_dataloader,
    val_dataloader,
    epochs,
    loss_fn,
    optimizer,
    scheduler,
    writer=writer,
    device=device,
)

100%|██████████| 200/200 [00:27<00:00,  7.18it/s]
