In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scipy import stats
from sklearn.linear_model import Ridge
from tqdm import tqdm

# Silence every warning category in this kernel process.
warnings.simplefilter("ignore")
# Also publish the policy through the environment. Presumably this is meant
# for the joblib worker processes started by later cells, which would not
# see the in-process filter above -- TODO confirm.
os.environ["PYTHONWARNINGS"] = "ignore" 

In [2]:
# Test-set table: tab-separated and read without a header row, so its columns
# are addressed by integer position (later cells match drugs against column 1).
test = pd.read_csv(
    "../DrugCell/data_rcellminer/test_DNA.txt",
    header=None,
    sep="\t",
)
# Model predictions as a NumPy array. Later cells index it with row labels
# taken from `test`, so it is presumably aligned row-for-row with `test`
# -- TODO confirm.
pred = np.loadtxt("../DrugCell/code/Result/drugcell.predict")

In [3]:
def get_list(i):
    """
    Collect the row labels of every test sample that belongs to one drug

    The lookup is done against column ``1`` of the module-level ``test``
    DataFrame.

    Parameters
    ----------
    i : Drug identifier, compared for equality with ``test[1]``

    Returns
    -------
    list
        ``[row_labels, drug]`` where ``row_labels`` is the list of index
        labels of the matching rows and ``drug`` is the identifier as stored
        in the first matching row.

    Raises
    ------
    IndexError
        If no row of ``test`` matches ``i``.

    """

    # Filter once and reuse the result for both returned items; the previous
    # version rebuilt the same boolean mask a second time just to read the
    # drug value back.
    rows = test[test[1] == i]
    return [list(rows.index), list(rows[1])[0]]

In [4]:
# One row per distinct drug: the row labels of its samples plus the drug id.
# The drugs are enumerated with .unique().tolist() (first-appearance order)
# rather than set(...): set iteration order for strings changes between
# interpreter runs, which made the row order of `t` non-reproducible.
# The column names are passed at construction so `t` is only ever a DataFrame.
t = pd.DataFrame(
    Parallel(n_jobs=-1)(
        delayed(get_list)(drug) for drug in tqdm(test[1].unique().tolist())
    ),
    columns=["drug_index", "drug"],
)

100%|██████████| 244/244 [00:00<00:00, 408.43it/s]


In [5]:
def get_corr(X, y):
    """
    Score how well a ridge model on hidden features reproduces the final prediction

    A Ridge regressor with default settings is fitted on (X, y); the Spearman
    rank correlation between its in-sample predictions and y is returned.

    Parameters
    ----------

    X : Hidden feature
    y : Final prediction

    Returns
    -------

    Spearman correlation coefficient (the accompanying p-value is discarded)

    """

    fitted = Ridge().fit(X, y).predict(X)

    # spearmanr yields (correlation, p-value); only the correlation is kept.
    return stats.spearmanr(fitted, y)[0]

In [6]:
def collect_corr(term, t):
    """
    Compute the correlation score of one GO term for every drug

    term : File name of the GO term's hidden-feature table, read from
           ../DrugCell/code/Hidden/ (space-separated, no header row)
    t : Table whose "drug_index" column holds, per drug, the row labels of
        that drug's samples

    Returns a list with one get_corr score per row of t, in t's row order.
    Predictions are taken from the module-level ``pred`` array.

    """

    hidden_path = "../DrugCell/code/Hidden/" + term
    hidden = pd.read_csv(hidden_path, sep=" ", header=None)

    # Keyword arguments keep the original evaluation order: predictions are
    # sliced first, then the hidden features.
    return [
        get_corr(y=pred[rows], X=hidden.loc[rows])
        for rows in list(t["drug_index"])
    ]

In [7]:
# Distinct values of the first column of go.txt, in order of first appearance.
GO = (
    pd.read_csv("../DrugCell/data_rcellminer/go.txt", header=None, sep="\t")[0]
    .unique()
    .tolist()
)

# One parallel task per GO term; each task reads "<term>.hidden" and returns a
# list of per-drug scores ordered like the rows of `t`.
p = Parallel(n_jobs=-1)(delayed(collect_corr)(i + ".hidden", t) for i in tqdm(GO))
# Score matrix: rows are GO terms, columns are drugs (taken from t["drug"]).
importance = pd.DataFrame(p, columns=list(t["drug"]), index=GO)

100%|██████████| 2086/2086 [01:34<00:00, 22.17it/s]


In [8]:
# Lookup table from the 'SMILES' column to the 'CID' column of the CSV;
# when a SMILES string occurs more than once, the last row wins.
pubchem_id = pd.read_csv('../data/nsc_cid_smiles.csv')
pubchem_id = dict(zip(pubchem_id['SMILES'].values, pubchem_id['CID'].values))

In [9]:
# Disabled step: relabel the columns through the `pubchem_id` lookup.
# importance.columns = [pubchem_id[i] for i in importance.columns]

# Order the columns by label, replace missing scores with 0, then write the
# matrix to disk.
importance = importance.sort_index(axis=1).fillna(0)
importance.to_csv('../DrugCell/data_rcellminer/corr_score.csv')