In [1]:
import sys

import pandas as pd
import torch
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

# Make the project modules one directory up importable from the notebook folder.
sys.path.append("..")

from dataset import *
from features import *
from train import *
from model import *

In [2]:
# Read the training table (parquet) into a DataFrame.
de_df = pd.read_parquet("../data/de_train.parquet")

# Produce positional train / validation indices from the `cell_type` column.
# NOTE(review): 0.2 and 45 are passed positionally; presumably they are the
# validation fraction and the random seed of `stratified_split` -- confirm
# against its definition in the project.
cell_type_labels = de_df["cell_type"]
train_index, val_index = stratified_split(cell_type_labels, 0.2, 45)

# Wrap each positional subset of the frame in a DataFrameDataset.
train_rows = de_df.iloc[train_index]
de_df_dataset_train = DataFrameDataset(train_rows, mode="df")
val_rows = de_df.iloc[val_index]
de_df_dataset_val = DataFrameDataset(val_rows, mode="df")

In [3]:
# CSV handed to Sm2Smiles (presumably a small-molecule name -> SMILES table;
# confirm against the project's `features` module).
SM_SMILES_CSV = "../config/sm_smiles.csv"

# Distinct small-molecule names. They are sorted because iterating a set of
# strings yields a different order in every Python process (hash
# randomization), which would reshuffle the one-hot columns on every kernel
# restart and make runs and saved weights incomparable.
mtypes = sorted(set(de_df["sm_name"].to_list()))

# Molecule feature pipelines, keyed by feature name. Each fingerprint pipeline
# gets its own Sm2Smiles / Smiles2Mol instances, as in the original cell.
mol_transforms = {
    "morgan2_fp": TransformList([Sm2Smiles(SM_SMILES_CSV, mode="path"), Smiles2Mol(), Mol2Morgan(2048, 2)]),
    "morgan3_fp": TransformList([Sm2Smiles(SM_SMILES_CSV, mode="path"), Smiles2Mol(), Mol2Morgan(2048, 3)]),
    "one_hot": TransformList([Type2OneHot(mtypes)])
}

# Distinct cell types, sorted for the same reproducibility reason as `mtypes`.
ctypes = sorted(set(de_df["cell_type"].to_list()))

# One "<cell type>_control_mean.csv" file per cell type, in the same order as
# `ctypes`; spaces become underscores and "+" is dropped from the name.
file_names = ["../data/temp/"+name.replace(" ", "_").replace("+", "")+"_control_mean.csv"
              for name in ctypes]

# Row count of the first control-mean file; used later as the input width of
# the gene-expression encoders.
# NOTE(review): assumes every file in `file_names` has the same number of rows.
gene_num = len(pd.read_csv(file_names[0]))

# Cell feature pipelines, keyed by feature name.
# Alternative kept for reference: append NormCount2CPM() after
# CType2CSVEncoding(ctypes, file_names) in the "gene_exp" pipeline.
cell_transforms = {
    "one_hot": TransformList([Type2OneHot(ctypes)]),
    "gene_exp": TransformList([CType2CSVEncoding(ctypes, file_names)])
}

In [4]:
# Build the train and validation DEDataset objects (train first, then val).
# Both receive the very same `mol_transforms` / `cell_transforms` dict objects.
de_dataset_train, de_dataset_val = (
    DEDataset(frame_dataset, mol_transforms, cell_transforms)
    for frame_dataset in (de_df_dataset_train, de_df_dataset_val)
)

In [5]:
import os

# Switch the working directory to the parent of the directory the notebook
# started in. The relative paths used by the cells below ("./runs/...",
# "temp/...") are resolved against this new working directory.
#
# The starting directory is remembered the first time this cell runs, so that
# re-running the cell is idempotent instead of climbing one level per run.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, ".."))

In [82]:
#### MODEL ####
# Autoencoder over the "gene_exp" cell feature.
# NOTE(review): the list is presumably the layer widths, starting at the
# input width `gene_num` -- confirm against VecAutoEncoder.
cell_ae = VecAutoEncoder([gene_num, 500, 250, 100])

#### LOADERS, DATA ####
# Put the shared datasets into cell-autoencoder mode first, then wrap them in
# loaders. The loaders only hold a reference to the dataset, so a later
# `configure(...)` call in another cell also changes what these loaders yield.
de_dataset_train.configure(cell_out_feature="gene_exp", ae_mode="cell")
de_dataset_val.configure(cell_out_feature="gene_exp", ae_mode="cell")
# NOTE(review): DataLoader defaults to shuffle=False, so training batches are
# drawn in dataset order every epoch -- confirm this is intended.
train_dataloader_ = DataLoader(de_dataset_train, 128)
val_dataloader_ = DataLoader(de_dataset_val, 128)

#### CONFIG ####
lr = 0.01
epochs = 100
# Use the first GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs on GPU-less machines.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

loss_fn = nn.MSELoss()
optimizer = Adam(cell_ae.parameters(), lr=lr, weight_decay=1e-4)
# Multiply the learning rate by 0.8 after 7 epochs without improvement of the
# monitored ("min") quantity.
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.8, patience=7)

#### TENSORBOARD ####
writer = SummaryWriter("./runs/cell_ae/4")


In [83]:
# Train the cell autoencoder. The TensorBoard writer is closed even if
# training raises, so pending events are flushed and the event file is
# released instead of staying open for the life of the kernel.
try:
    train_many_epochs(cell_ae, train_dataloader_, val_dataloader_, epochs,
                      loss_fn, optimizer, scheduler, writer=writer, device=device)
finally:
    writer.close()

100%|██████████| 100/100 [00:03<00:00, 30.83it/s]


In [84]:
#### MODEL ####
# Autoencoder over the "morgan2_fp" molecule feature (2048 inputs).
# NOTE(review): the list is presumably the layer widths -- confirm against
# VecAutoEncoder.
mol_ae = VecAutoEncoder([2048, 500, 250, 100])

#### LOADERS, DATA ####
# Put the shared datasets into molecule-autoencoder mode first, then wrap them
# in loaders. The loaders only hold a reference to the dataset, so this call
# also changes what loaders created in earlier cells yield.
de_dataset_train.configure(sm_out_feature="morgan2_fp", ae_mode="sm")
de_dataset_val.configure(sm_out_feature="morgan2_fp", ae_mode="sm")
# NOTE(review): DataLoader defaults to shuffle=False, so training batches are
# drawn in dataset order every epoch -- confirm this is intended.
train_dataloader_ = DataLoader(de_dataset_train, 128)
val_dataloader_ = DataLoader(de_dataset_val, 128)

#### CONFIG ####
lr = 0.01
epochs = 100
# Use the first GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs on GPU-less machines.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

loss_fn = nn.MSELoss()
optimizer = Adam(mol_ae.parameters(), lr=lr, weight_decay=1e-4)
# Multiply the learning rate by 0.8 after 7 epochs without improvement of the
# monitored ("min") quantity.
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.8, patience=7)

#### TENSORBOARD ####
writer = SummaryWriter("./runs/mol_ae/2")


In [85]:
# Train the molecule autoencoder. The TensorBoard writer is closed even if
# training raises, so pending events are flushed and the event file is
# released instead of staying open for the life of the kernel.
try:
    train_many_epochs(mol_ae, train_dataloader_, val_dataloader_, epochs,
                      loss_fn, optimizer, scheduler, writer=writer, device=device)
finally:
    writer.close()

100%|██████████| 100/100 [00:04<00:00, 23.98it/s]


In [15]:
class MyMolEncoder(nn.Module):
    """MLP encoder with a fixed hidden width of 100 and periodic skip connections.

    The input is projected to 100 features, passed through ``num_h`` hidden
    blocks (Linear -> BatchNorm1d -> LeakyReLU -> Dropout(0.2)) and finally
    projected to ``out_size`` features.

    Args:
        in_size: number of input features.
        num_h: number of hidden blocks.
        out_size: number of output features.
        skip: period of the skip connection. Within each run of ``skip``
            consecutive blocks, the output of the first block is added to the
            output of the last one.
    """

    def __init__(self, in_size, num_h, out_size, skip=3):
        super(MyMolEncoder, self).__init__()

        self.layer_1 = nn.Linear(in_size, 100)
        self.act_1 = nn.LeakyReLU()
        self.h_layers = nn.ModuleList()
        for _ in range(num_h):
            self.h_layers.append(nn.Sequential(
                nn.Linear(100, 100),
                nn.BatchNorm1d(100),
                nn.LeakyReLU(),
                nn.Dropout(0.2)
            ))
        self.last_layer = nn.Linear(100, out_size)
        # Not applied in forward(); kept only so the module's state_dict keys
        # stay the same as before.
        self.norm = nn.BatchNorm1d(200)

        self.skip = skip
        self.num_h = num_h

    def forward(self, x):
        """Encode ``x`` (last dimension ``in_size``) into ``out_size`` features."""
        x = self.layer_1(x)
        x = self.act_1(x)
        for i, layer in enumerate(self.h_layers):
            x = layer(x)
            # Remember the output of the first block of each group ...
            if i % self.skip == 0:
                x_skip = x
            # ... and add it back after the last block of that group.
            if i % self.skip == self.skip - 1:
                x = x + x_skip

        # Project to the requested output width. Previously this layer was
        # constructed but never applied, so `out_size` had no effect and the
        # output was always 100 features wide.
        x = self.last_layer(x)

        return x

class MyEncoder(nn.Module):
    """MLP encoder with configurable hidden width and periodic skip connections.

    Pipeline: optional BatchNorm1d on the input -> Linear(in_size, h_size) ->
    LeakyReLU -> ``num_h`` hidden blocks -> Linear(h_size, out_size).

    Args:
        in_size: number of input features.
        num_h: number of hidden blocks.
        out_size: number of output features.
        skip: period of the skip connection. Within each run of ``skip``
            consecutive blocks, the output of the first block is added to the
            output of the last one.
        h_size: width of the hidden blocks.
        ini_batch_norm: when equal to ``True``, the input is batch-normalised
            before the first linear layer.
    """

    def __init__(self, in_size, num_h, out_size, skip=3, h_size=100, ini_batch_norm=False):
        super().__init__()

        # The input normalisation layer is always constructed (so it always
        # shows up in the state_dict); the flag only decides whether
        # forward() applies it.
        self.if_ini_batch = ini_batch_norm
        self.ini_batch_norm = nn.BatchNorm1d(in_size)

        self.layer_1 = nn.Linear(in_size, h_size)
        self.act_1 = nn.LeakyReLU()
        self.h_layers = nn.ModuleList(
            [self._hidden_block(h_size) for _ in range(num_h)]
        )
        self.last_layer = nn.Linear(h_size, out_size)

        self.skip = skip
        self.num_h = num_h

        self.float()

    @staticmethod
    def _hidden_block(width):
        """Return one hidden block: Linear -> BatchNorm1d -> LeakyReLU -> Dropout(0.2)."""
        return nn.Sequential(
            nn.Linear(width, width),
            nn.BatchNorm1d(width),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
        )

    def forward(self, x):
        """Encode ``x`` (last dimension ``in_size``) into ``out_size`` features."""
        hidden = x
        # Equality with True (rather than plain truthiness) is kept on purpose
        # so non-bool flag values behave exactly as before.
        if self.if_ini_batch == True:
            hidden = self.ini_batch_norm(hidden)

        hidden = self.act_1(self.layer_1(hidden))

        for idx, block in enumerate(self.h_layers):
            hidden = block(hidden)
            phase = idx % self.skip
            if phase == 0:
                # First block of a group: remember its output.
                residual = hidden
            if phase == self.skip - 1:
                # Last block of a group: add the remembered output back.
                hidden = hidden + residual

        return self.last_layer(hidden)

In [18]:
# Use the first GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs on GPU-less machines.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

#### LOADERS, DATA ####
# Leave autoencoder mode and select the features fed to the two encoders.
de_dataset_train.configure(sm_out_feature="morgan3_fp",
                           cell_out_feature="gene_exp", ae_mode=False)
de_dataset_val.configure(sm_out_feature="morgan3_fp",
                         cell_out_feature="gene_exp", ae_mode=False)
# NOTE(review): DataLoader defaults to shuffle=False, so training batches are
# drawn in dataset order every epoch -- confirm this is intended.
train_dataloader = DataLoader(de_dataset_train, 64)
val_dataloader = DataLoader(de_dataset_val, 64)

#### MODEL ####
# Width of the regression output.
# NOTE(review): the 5 is presumably the number of non-target metadata columns
# in `de_df` -- confirm against the table schema.
n_targets = len(de_df.columns) - 5

# Alternatives tried instead of the two encoders below:
#   VecAEEncoder(mol_ae, grads=True, device=device)
#   VecAEEncoder(cell_ae, grads=True, device=device)
mol_enc = MyEncoder(in_size=2048, num_h=6, out_size=300, skip=3, h_size=450)
cell_enc = MyEncoder(in_size=gene_num, num_h=6, out_size=300, skip=3, h_size=450)
# NOTE(review): in_size=600 presumably matches the two 300-wide embeddings
# being concatenated inside CombinerModel -- confirm.
regressor = MyEncoder(in_size=600, num_h=6, out_size=n_targets, skip=3, h_size=800)

model = CombinerModel(
        mol_encoder=mol_enc,
        cell_encoder=cell_enc,
        regressor=regressor)
# Snapshot of the freshly initialised weights.
torch.save(model.state_dict(), "temp/ini_model.pkl")

#### CONFIG ####
lr = 0.01
epochs = 1000

# Defined explicitly instead of the former no-op `loss_fn = loss_fn`, which
# silently reused whatever an earlier cell had left in the kernel. The earlier
# cells of this notebook set it to nn.MSELoss(), so a top-to-bottom run is
# unchanged.
loss_fn = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=lr, weight_decay=5e-4)
# Multiply the learning rate by 0.8 after 10 epochs without improvement of the
# monitored ("min") quantity.
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.8, patience=10)

#### TENSORBOARD ####
writer = SummaryWriter("./runs/test/18")

In [19]:
# Train the combined model. The TensorBoard writer is closed even if training
# raises, so pending events are flushed and the event file is released instead
# of staying open for the life of the kernel.
try:
    train_many_epochs(model, train_dataloader, val_dataloader, epochs,
                      loss_fn, optimizer, scheduler, writer=writer, device=device)
finally:
    writer.close()

100%|██████████| 1000/1000 [05:09<00:00,  3.23it/s]


In [24]:
# Use the first GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs on GPU-less machines.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Names of the molecule / cell features known to the training dataset.
sm_feats = de_dataset_train.sm_feats.keys()
cell_feats = de_dataset_train.cell_feats.keys()

#### CONFIG (identical for every feature combination) ####
lr = 0.01
epochs = 500
# Defined explicitly instead of the former no-op `loss_fn = loss_fn`, which
# silently reused whatever an earlier cell had left in the kernel. The earlier
# cells of this notebook set it to nn.MSELoss(), so a top-to-bottom run is
# unchanged.
loss_fn = nn.MSELoss()
# Width of the regression output.
# NOTE(review): the 5 is presumably the number of non-target metadata columns
# in `de_df` -- confirm against the table schema.
n_targets = len(de_df.columns) - 5


def _mlp_encoder(in_size, batch_norm=False):
    """Return a float32 encoder: [BatchNorm1d ->] Linear(in_size, 150) -> Tanh -> Linear(150, 300)."""
    layers = [nn.BatchNorm1d(in_size)] if batch_norm else []
    layers += [nn.Linear(in_size, 150), nn.Tanh(), nn.Linear(150, 300)]
    return nn.Sequential(*layers).float()


# Train one model per (molecule feature, cell feature) combination.
for sm_feat in sm_feats:
    for cell_feat in cell_feats:

        # Fingerprint features are 2048 wide; anything else is treated as a
        # one-hot over the known molecule names.
        sm_input_size = 2048 if "fp" in sm_feat else len(mtypes)
        # "gene_exp" is gene_num wide; anything else is treated as a one-hot
        # over the known cell types.
        cell_input_size = gene_num if cell_feat == "gene_exp" else len(ctypes)

        #### LOADERS, DATA ####
        de_dataset_train.configure(sm_out_feature=sm_feat,
                                   cell_out_feature=cell_feat, ae_mode=False)
        de_dataset_val.configure(sm_out_feature=sm_feat,
                                 cell_out_feature=cell_feat, ae_mode=False)
        # NOTE(review): DataLoader defaults to shuffle=False -- confirm that
        # fixed-order training batches are intended.
        train_dataloader = DataLoader(de_dataset_train, 256)
        val_dataloader = DataLoader(de_dataset_val, 256)

        #### MODEL ####
        mol_enc = _mlp_encoder(sm_input_size)
        # Only the gene-expression input gets a leading BatchNorm1d.
        cell_enc = _mlp_encoder(cell_input_size, batch_norm=(cell_feat == "gene_exp"))
        regressor = nn.Sequential(
            nn.Linear(600, 1000),
            nn.Tanh(),
            nn.Dropout(0.3),
            nn.Linear(1000, 1000),
            nn.Tanh(),
            nn.Dropout(0.3),
            nn.Linear(1000, 1000),
            nn.Tanh(),
            nn.Linear(1000, n_targets)
        ).float()

        model = CombinerModel(
                mol_encoder=mol_enc,
                cell_encoder=cell_enc,
                regressor=regressor)
        # Snapshot of the freshly initialised weights (overwritten each pass).
        torch.save(model.state_dict(), "temp/ini_model.pkl")

        #### OPTIMISATION ####
        optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-4)
        scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.85, patience=7)

        #### TENSORBOARD ####
        writer = SummaryWriter(f"./runs/compare_xxxlarge/{sm_feat}_{cell_feat}")

        #### TRAIN ####
        print(f"mol feat: {sm_feat}, cell feat: {cell_feat}")
        # Close this run's writer before the next one is opened (also when
        # training raises), so event files are flushed and not left open.
        try:
            train_many_epochs(model, train_dataloader, val_dataloader, epochs,
                              loss_fn, optimizer, scheduler, writer=writer, device=device)
        finally:
            writer.close()
            

mol feat: morgan2_fp, cell feat: one_hot


100%|██████████| 500/500 [01:00<00:00,  8.22it/s]


mol feat: morgan2_fp, cell feat: gene_exp


100%|██████████| 500/500 [01:00<00:00,  8.24it/s]


mol feat: morgan3_fp, cell feat: one_hot


100%|██████████| 500/500 [01:01<00:00,  8.14it/s]


mol feat: morgan3_fp, cell feat: gene_exp


100%|██████████| 500/500 [01:01<00:00,  8.10it/s]


mol feat: one_hot, cell feat: one_hot


100%|██████████| 500/500 [01:00<00:00,  8.29it/s]


mol feat: one_hot, cell feat: gene_exp


100%|██████████| 500/500 [00:59<00:00,  8.41it/s]
