In [107]:
import pandas as pd
import numpy as np
import warnings

warnings.simplefilter('ignore')

from tqdm import tqdm
from os import listdir
from scipy.stats import spearmanr
from sklearn.linear_model import RidgeCV, Ridge, LinearRegression
from joblib import Parallel, delayed

In [2]:
test = pd.read_csv(
    '../DrugCell/data_rcellminer/test_rcell_wo_other.txt',
    header=None,
    sep='\t'
)

In [3]:
def get_list(i):
    tmp = test[test[1] == i]
    return [len(tmp[2]), tmp.index[0], list(tmp[2]), list(test[test[1] == i][1])[0]]

In [4]:
t = Parallel(n_jobs=-1)(delayed(get_list)(i) for i in tqdm(set(test[1])))

100%|██████████| 309/309 [00:00<00:00, 502.28it/s]


In [5]:
t = pd.DataFrame(t).sort_values(0)
t = t[t[0] > 9]
t = t[[1, 2, 3]].reset_index(drop=True)
t.columns = ['drug_index', 'drug_response', 'drug']

In [6]:
def get_corr(GO, t):
    hidden = pd.read_csv(
        '/export/scratch/inoue019/Hidden/' + GO,
        header=None,
        sep=' '
    )

    drug_index = list(t['drug_index'])

    corr = []
    for i, j in enumerate(drug_index):
        y = t['drug_response'][i]
        X = np.tile(list(hidden.iloc[j]), (len(y), 1))


        regr = RidgeCV(cv=3) 
        regr.fit(X, y)
        y_pred = regr.predict(X)
        corr.append(np.correlate(y, y_pred)[0])
        
    return corr

In [7]:
hidden = listdir('/export/scratch/inoue019/Hidden/')
hidden = [i for i in hidden if "GO" in i]

In [8]:
p = Parallel(n_jobs=-1)(delayed(get_corr)(i, t) for i in tqdm(hidden))

100%|██████████| 2086/2086 [06:39<00:00,  5.22it/s]


In [9]:
pubchem_id = pd.read_csv(
    '../DrugCell/data_rcellminer/SMILES_from_PubchemID.txt',
    header=None,
    sep='\t'
)
pubchem_id = {pubchem_id[1][i]:pubchem_id[0][i] for i in pubchem_id.index}

In [10]:
res = pd.DataFrame(
    p,
    index=[i.split('.')[0] for i in hidden],
    columns=[pubchem_id[i] for i in list(t['drug'])]
)
# .to_csv("correlation_score.csv")

In [11]:
res

Unnamed: 0,5746,77082,222160,3136570,397887,4267516,54613769,6712269,5702003,396363,...,395430,72403,249332,54608410,4578,24203889,60148419,5702228,65348,11683
GO:0043368,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
GO:0015914,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
GO:0021766,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
GO:0072378,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
GO:0061001,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:0045773,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
GO:0071392,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
GO:0045926,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
GO:0007286,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592


In [12]:
pd.DataFrame(p)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,188,189,190,191,192,193,194,195,196,197
0,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
1,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
2,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
3,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
4,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2081,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
2082,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
2083,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592
2084,0.051898,0.999991,0.283846,2.367686,0.038054,1.223366,0.442696,0.210713,0.060825,0.009162,...,0.000534,0.001514,0.019282,0.044856,0.010541,0.004961,0.299788,0.027025,0.059299,0.029592


In [36]:
hidden = pd.read_csv(
    '/export/scratch/inoue019/Hidden/GO:0000038.hidden',
    header=None,
    sep=' '
)

hidden_ = pd.read_csv(
    '/export/scratch/inoue019/Hidden/GO:0000083.hidden',
    header=None,
    sep=' '
)

In [82]:
y = t['drug_response'][0]
y

[0.239485127060008,
 -0.731824908794209,
 1.17339839993705,
 0.3305345238508499,
 -0.485100183955274,
 -0.898323972305552,
 0.328549454124694,
 1.1624068225864,
 -0.066223845292541,
 -0.3324974357802669]

In [99]:
X = np.array([list(hidden.loc[t['drug_index'][0]])]*len(y))
regr = RidgeCV(cv=3) 
regr.fit(X, y)
y_pred = regr.predict(X)
np.correlate(y, y_pred)[0]

0.051898189646186574

In [111]:
model = LinearRegression()
model.fit(X, y)
model.predict(X)

array([0.0720404, 0.0720404, 0.0720404, 0.0720404, 0.0720404, 0.0720404,
       0.0720404, 0.0720404, 0.0720404, 0.0720404])

In [112]:
model = LinearRegression()
model.fit(X_, y)
model.predict(X_)

array([0.0720404, 0.0720404, 0.0720404, 0.0720404, 0.0720404, 0.0720404,
       0.0720404, 0.0720404, 0.0720404, 0.0720404])

In [101]:
X = np.array([list(hidden_.loc[t['drug_index'][0]])]*len(y))
regr = RidgeCV(cv=3) 
regr.fit(X, y)
y_pred = regr.predict(X)
np.correlate(y, y_pred)[0]

0.051898189646186574

In [102]:
X

array([[-0.21737,  0.12057,  0.14649, -0.14202,  0.01379,  0.22923],
       [-0.21737,  0.12057,  0.14649, -0.14202,  0.01379,  0.22923],
       [-0.21737,  0.12057,  0.14649, -0.14202,  0.01379,  0.22923],
       [-0.21737,  0.12057,  0.14649, -0.14202,  0.01379,  0.22923],
       [-0.21737,  0.12057,  0.14649, -0.14202,  0.01379,  0.22923],
       [-0.21737,  0.12057,  0.14649, -0.14202,  0.01379,  0.22923],
       [-0.21737,  0.12057,  0.14649, -0.14202,  0.01379,  0.22923],
       [-0.21737,  0.12057,  0.14649, -0.14202,  0.01379,  0.22923],
       [-0.21737,  0.12057,  0.14649, -0.14202,  0.01379,  0.22923],
       [-0.21737,  0.12057,  0.14649, -0.14202,  0.01379,  0.22923]])