In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import opendatasets
import torch
import rdkit
from torch.utils.data import Dataset
from dataset import *
from features import *
from joblib import parallel_backend

In [5]:
import os
os.getcwd()

'c:\\Users\\yayag\\Kaggle_SCP'

In [2]:
#get train and val x and y
de_df = pd.read_parquet("data/de_train.parquet")

from dataset import stratified_split
train_index, val_index = stratified_split(de_df["cell_type"], 0.2, 45)
from dataset import DataFrameDataset
de_df_dataset_train = DataFrameDataset(de_df.iloc[train_index], mode="df")
de_df_dataset_val = DataFrameDataset(de_df.iloc[val_index], mode="df")

In [3]:
mtypes = list(set(de_df["sm_name"].to_list()))
mol_transforms = {
    "morgan2_fp": TransformList([Sm2Smiles("config/sm_smiles.csv", mode="path"), Smiles2Mol(), Mol2Morgan(2048, 2)]),
    "morgan3_fp": TransformList([Sm2Smiles("config/sm_smiles.csv", mode="path"), Smiles2Mol(), Mol2Morgan(2048, 3)]),
    "one_hot": TransformList([Type2OneHot(mtypes)])
}

ctypes = list(set(de_df["cell_type"].to_list()))

file_names = ["data/temp/"+name.replace(" ", "_").replace("+", "")+"_control_mean.csv"
              for name in ctypes]
gene_num = len(pd.read_csv(file_names[0]))

cell_transforms = {
    "one_hot": TransformList([Type2OneHot(ctypes)]),
    "gene_exp": TransformList([CType2CSVEncoding(ctypes, file_names)])
    # "gene_exp": TransformList([CType2CSVEncoding(ctypes, file_names), NormCount2CPM()])
}

In [4]:
de_dataset_train = DEDataset(de_df_dataset_train, mol_transforms, cell_transforms)
de_dataset_val = DEDataset(de_df_dataset_val, mol_transforms, cell_transforms)

In [5]:
de_dataset_train.configure(sm_out_feature="one_hot", cell_out_feature="one_hot", return_y=True, ae_mode=False)
de_dataset_val.configure(sm_out_feature="one_hot", cell_out_feature="one_hot", return_y=True, ae_mode=False)

In [29]:
X_train, y_train = de_dataset_train[:]
X_val, y_val = de_dataset_val[:]

In [44]:
mols = np.array(X_train[0])
cells = np.array(X_train[1])
X_train = np.concatenate([mols, cells], axis=1)

In [45]:
#scale
x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = x_scaler.fit_transform(np.array(X_train))
y_train = y_scaler.fit_transform(np.array(y_train))

In [50]:
y_train[:,0].shape

(491,)

In [66]:
regressor = SVR(kernel="poly")
regressor.fit(X_train, y_train[:,2])

In [53]:
mols = np.array(X_val[0])
cells = np.array(X_val[1])
X_val = np.concatenate([mols, cells], axis=1)

In [64]:
#evaluate
y_0_pred = regressor.predict(X_val)


In [74]:
mse = torch.nn.MSELoss()
float(mse(torch.tensor(y_0_pred), y_val[:,2]))

1.6135245772365234

In [82]:
from tqdm import tqdm
all_y_pred = pd.DataFrame()
for i in tqdm(range(18211)):
    regressor = SVR(kernel="poly")
    regressor.fit(X_train, y_train[:,i])
    y_pred = regressor.predict(X_val)
    all_y_pred = pd.concat([pd.DataFrame(y_pred), all_y_pred], axis=1)


   

    


100%|██████████| 18211/18211 [08:27<00:00, 35.90it/s]


In [83]:
from train import loss_mrrmse
poly_loss = loss_mrrmse(np.array(all_y_pred), y_val)

In [84]:
poly_loss

tensor(1.1790, dtype=torch.float64)

In [85]:
from tqdm import tqdm
rbf_all_y_pred = pd.DataFrame()
for i in tqdm(range(18211)):
    regressor = SVR(kernel="rbf")
    regressor.fit(X_train, y_train[:,i])
    y_pred = regressor.predict(X_val)
    rbf_all_y_pred = pd.concat([pd.DataFrame(y_pred), rbf_all_y_pred], axis=1)

100%|██████████| 18211/18211 [10:40<00:00, 28.44it/s]


In [86]:
poly_loss = loss_mrrmse(np.array(rbf_all_y_pred), y_val)
poly_loss

tensor(1.1777, dtype=torch.float64)

In [87]:
from tqdm import tqdm
train_all_y_pred = pd.DataFrame()
for i in tqdm(range(18211)):
    regressor = SVR(kernel="poly")
    regressor.fit(X_train, y_train[:,i])
    y_pred = regressor.predict(X_train)
    train_all_y_pred = pd.concat([pd.DataFrame(y_pred), train_all_y_pred], axis=1)


100%|██████████| 18211/18211 [17:34<00:00, 17.27it/s]


In [90]:
y_train

array([[-0.25960985, -0.9292144 , -1.8282267 , ...,  0.82168317,
        -1.3241875 , -0.00286939],
       [ 0.015005  ,  0.21437776,  0.2190293 , ...,  1.6843605 ,
         0.4202997 ,  0.22115749],
       [-0.3298378 ,  0.03484344,  0.16391155, ..., -0.62679297,
        -0.07077432, -1.7265558 ],
       ...,
       [-0.156492  ,  0.15783927, -0.08796211, ...,  0.8739869 ,
         0.44605368, -0.5428026 ],
       [-0.25941092, -0.21589983,  0.07954812, ..., -0.13734768,
         0.17077693, -0.7044342 ],
       [-0.82053643, -0.24625742,  0.03101921, ..., -0.65131146,
         0.52928084, -0.97066206]], dtype=float32)

In [92]:
poly_loss = loss_mrrmse(torch.tensor(np.array(train_all_y_pred)), torch.tensor(y_train))
poly_loss

tensor(0.7299, dtype=torch.float64)

In [34]:
    de_dataset_train.configure(sm_out_feature="morgan3_fp", cell_out_feature="gene_exp", return_y=True, ae_mode=False)
    de_dataset_val.configure(sm_out_feature="morgan3_fp", cell_out_feature="gene_exp", return_y=True, ae_mode=False)

    X_train, y_train = de_dataset_train[:]
    X_val, y_val = de_dataset_val[:]

    mols = np.array(X_train[0])
    cells = np.array(X_train[1])
    X_train = np.concatenate([mols, cells], axis=1)
    val_mols = np.array(X_val[0])
    val_cells = np.array(X_val[1])
    X_val = np.concatenate([val_mols, val_cells], axis=1)

In [95]:
X_train.shape

(491, 3166)

In [103]:
from tqdm import tqdm
morgan_exp_all_y_pred = pd.DataFrame()
for i in tqdm(range(18211)):
    regressor = SVR(kernel="poly")
    regressor.fit(X_train, y_train[:,i])
    y_pred = regressor.predict(X_val)
    morgan_exp_all_y_pred = pd.concat([pd.DataFrame(y_pred), morgan_exp_all_y_pred], axis=1)

100%|██████████| 18211/18211 [41:53<00:00,  7.25it/s]


In [104]:
exp_loss = loss_mrrmse(torch.tensor(np.array(train_all_y_pred)), torch.tensor(y_train))
exp_loss

  exp_loss = loss_mrrmse(torch.tensor(np.array(train_all_y_pred)), torch.tensor(y_train))


tensor(1.3407, dtype=torch.float64)

In [47]:
from train import loss_mrrmse  
from sklearn.multioutput import MultiOutputRegressor
with parallel_backend("loky", n_jobs=8):
    regressor = SVR(kernel="rbf", C=5, gamma=0.001)
    regressor = MultiOutputRegressor(regressor, )
    regressor.fit(X_train, y_train[:,:500])
    val_y_pred = regressor.predict(X_val)
    train_y_pred = regressor.predict(X_train)

val_loss = loss_mrrmse(y_val[:,:500], torch.tensor(val_y_pred))
train_loss = loss_mrrmse(y_train[:,:500], torch.tensor(train_y_pred))
print("val:", val_loss, "train:", train_loss)

val: tensor(1.0642, dtype=torch.float64) train: tensor(1.0426, dtype=torch.float64)


In [11]:
y_train[:,:100].shape

torch.Size([491, 100])