In [1]:
import os
import sys

# Make the modules in the parent directory importable.
# The path is registered in absolute form because a relative "sys.path" entry
# is resolved against the *current* working directory at import time, and this
# notebook changes directory later on. The membership check keeps re-runs of
# this cell from appending the same entry again.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from data_utils import SCPDataSet, stratified_split
from data_utils import Sm2Smiles, Smiles2Mol, Mol2Morgan, Type2OneHot
from train import train_many_epochs, loss_fn
from model import *

import torch
from torch.optim import SGD, Adam
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader

from torch.utils.tensorboard import SummaryWriter

from sklearn.model_selection import KFold

import pandas as pd

In [2]:
# Load the training table. The path is relative to the directory the notebook
# was started in, so this cell has to run before the working directory changes.
de_train_path = "../data/de_train.parquet"
de_train = pd.read_parquet(de_train_path)

In [3]:
import os

# Move to the parent directory so that the relative paths used from here on
# ("config/...", "temp/...", "./runs/...") resolve from there.
# The target is resolved to an absolute path exactly once per kernel session:
# a bare os.chdir("..") would climb one level further on every re-run of this
# cell, whereas changing to a fixed absolute path is safe to repeat.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("..")
os.chdir(PROJECT_ROOT)

In [4]:
# Build a {sm_name: SMILES} lookup from the config CSV.
sm_smiles_df = pd.read_csv("config/sm_smiles.csv")
sm_smiles_dict = sm_smiles_df.set_index("sm_name")["SMILES"].to_dict()

# Distinct labels present in the training table, in order of first appearance.
# NOTE(review): "ctypes" shadows the standard-library module of the same name;
# harmless as long as nothing in this notebook imports that module.
ctypes = de_train['cell_type'].unique()
mtypes = de_train['sm_name'].unique()

In [5]:
#### MODEL ####
# Layer widths are named once so that the regressor's input width is derived
# from the two encoder outputs instead of being repeated as a literal (210).
MOL_INPUT_DIM = 2048  # presumably the Morgan fingerprint length - confirm against Mol2Morgan
MOL_EMBED_DIM = 200
CELL_EMBED_DIM = 10
# Output width: one unit per column of de_train except 5 of them.
# NOTE(review): the 5 excluded columns are presumably non-target metadata -
# confirm against the parquet schema.
N_OUTPUTS = len(de_train.columns) - 5

mol_enc = nn.Sequential(
    nn.Linear(MOL_INPUT_DIM, MOL_EMBED_DIM)
)
# One input unit per distinct cell type found in the training table.
cell_enc = nn.Sequential(
    nn.Linear(len(ctypes), CELL_EMBED_DIM)
)
# NOTE(review): the input width assumes CombinerModel concatenates the two
# encodings before calling the regressor - confirm in model.py.
regressor = nn.Sequential(
    nn.Linear(MOL_EMBED_DIM + CELL_EMBED_DIM, 100),
    nn.Tanh(),
    nn.Dropout(0.2),
    nn.Linear(100, 100),
    nn.Tanh(),
    nn.Dropout(0.2),
    nn.Linear(100, 500),
    nn.Tanh(),
    nn.Linear(500, N_OUTPUTS)
)

model = CombinerModel(
        mol_encoder=mol_enc,
        cell_encoder=cell_enc,
        regressor=regressor)

# Snapshot the freshly initialised weights. torch.save does not create missing
# parent directories, so make sure "temp/" exists before writing.
os.makedirs("temp", exist_ok=True)
torch.save(model.state_dict(), "temp/ini_model.pkl")

#### LOADERS, DATA ####
# Split row positions using the cell-type label of every row.
# NOTE(review): 0.30 and 194 are presumably the hold-out fraction and the
# random seed - confirm against data_utils.stratified_split.
split_labels = list(de_train['cell_type'])
train_index, val_index = stratified_split(split_labels, 0.30, 194)

# Positional selection of the two partitions, then one dataset per partition.
train_df, val_df = de_train.iloc[train_index], de_train.iloc[val_index]
train_dataset, val_dataset = SCPDataSet(train_df), SCPDataSet(val_df)

# NOTE(review): DataLoader defaults to shuffle=False, so the training batches
# are served in the same order every epoch - confirm this is intended.
train_dataloader = DataLoader(train_dataset, batch_size=256)
val_dataloader = DataLoader(val_dataset, batch_size=256)

#### CONFIG ####
lr = 0.01
epochs = 45
# Use the first GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# loss_fn is used exactly as imported from train; the previous
# "loss_fn = loss_fn" self-assignment had no effect and was removed.
optimizer = Adam(model.parameters(), lr=lr)
# Multiply the learning rate by 0.8 once the monitored value (lower is better,
# mode="min") has stopped improving for more than 7 consecutive epochs.
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.8, patience=7)

#### TENSORBOARD ####
# Event files for this run are written below this directory (relative to the
# current working directory).
tensorboard_log_dir = "./runs/kaggle/trying_out"
writer = SummaryWriter(log_dir=tensorboard_log_dir)