This notebook takes the drug and protein representations learnt by various methods and feeds them to a downstream classifier that predicts the presence or absence of a drug-protein interaction in the test split.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Which sampled train/test split to evaluate; the link files read further
# down are named after this value.
sample_id = 1
# Folder the representation file is loaded from.
repr_folder = "../../CSGNN/"
# Representation file for the selected split. Deriving the name from
# sample_id keeps the embeddings and the link files on the same split.
# NOTE(review): assumes the "1" in the original hard-coded name
# "PolyP3_1_repr.npy" is the sample id - confirm against the file naming.
emb_file = f"PolyP3_{sample_id}_repr.npy"

In [3]:
# Read the stored representation matrix and show its dimensions.
repr_path = repr_folder + emb_file
repr_np = np.load(repr_path)
repr_np.shape

(1482, 64)

In [4]:
# The leading 645 rows of the representation matrix are taken as the drug block.
drug_repr_np = repr_np[:645]
drug_repr_np.shape

(645, 64)

In [5]:
# Everything from row 645 onwards is taken as the protein block.
protein_repr_np = repr_np[645:]
protein_repr_np.shape

(837, 64)

In [6]:
# Lookup table from entity id (as a string) to its embedding row.
# Drug rows are keyed by their position; protein rows continue from 645.
emb_dict = {str(drug_id): vec for drug_id, vec in enumerate(drug_repr_np)}
emb_dict.update(
    (str(protein_id), vec)
    for protein_id, vec in enumerate(protein_repr_np, start=645)
)

In [7]:
# Sanity check: number of entity ids that received an embedding.
len(emb_dict)

1482

In [8]:
# Folder holding the link and entity files used below.
ncmf_input_files_path = "../../datasets/NCMF/PolyP3/"

# Tab-separated, header-less link file of the selected split.
train_link_file = ncmf_input_files_path + f"sampled{sample_id}_link.dat"
train_df = pd.read_csv(train_link_file, sep="\t", header=None)
train_df.columns = ["left", "right", "link_type", "value"]
train_df

Unnamed: 0,left,right,link_type,value
0,0,0,0,1.0
1,0,6,0,1.0
2,0,8,0,1.0
3,0,10,0,1.0
4,0,17,0,1.0
...,...,...,...,...
160429,1480,1429,2,1.0
160430,1480,1448,2,1.0
160431,1480,1455,2,1.0
160432,1480,1459,2,1.0


In [9]:
# Distribution of the values stored on the link_type == 1 rows.
is_type1_link = train_df["link_type"] == 1
train_df.loc[is_type1_link, "value"].value_counts()

1.0    12545
Name: value, dtype: int64

In [10]:
# Entity table read from the same dataset folder.
entity_file = ncmf_input_files_path + "entity.csv"
entity_df = pd.read_csv(entity_file)
entity_df

Unnamed: 0,Entity Names
0,CID000000085
1,CID000000119
2,CID000000143
3,CID000000158
4,CID000000159
...,...
1477,84816
1478,4190
1479,92483
1480,10988


In [11]:
# Keep only the link_type == 1 rows of the training links.
reduced_train_df = train_df.loc[train_df["link_type"] == 1]
reduced_train_df

Unnamed: 0,left,right,link_type,value
127591,1,804,1,1.0
127592,1,828,1,1.0
127593,1,829,1,1.0
127594,1,830,1,1.0
127595,1,831,1,1.0
...,...,...,...,...
140131,628,1446,1,1.0
140132,629,892,1,1.0
140133,632,892,1,1.0
140134,632,942,1,1.0


In [12]:
# Dense 645 x 837 indicator matrix of the link_type == 1 training links:
# entry [left, right - 645] is set to 1 for every such link, all others stay 0.
#
# The links are selected from train_df directly instead of reduced_train_df:
# that name is rebound to a dense all-pairs frame further down, so re-running
# this cell afterwards would otherwise mark every pair as a link.
# The fill is a single fancy-indexed assignment rather than a Python-level
# iterrows() loop; the resulting matrix is the same.
link_pairs = train_df.loc[train_df["link_type"] == 1, ["left", "right"]].values.astype(int)
X1_train = np.zeros((645, 837))
# Column ids are shifted by 645 so that the first "right" id maps to column 0.
X1_train[link_pairs[:, 0], link_pairs[:, 1] - 645] = 1
X1_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
# Sanity check: number of matrix entries that were set to 1.
(X1_train == 1).sum()

12545

In [14]:
# Turn the indicator matrix into one row per (left, right) pair, in the same
# row-major order as X1_train.flatten(): "left" repeats each row index once per
# column, "right" cycles through the column ids shifted by the number of rows.
# Dimensions come from X1_train itself instead of repeating 645 / 837 here.
n_left, n_right = X1_train.shape
reduced_train_df = pd.DataFrame(
    {
        "left": np.repeat(np.arange(n_left, dtype=np.int64), n_right),
        "right": np.tile(np.arange(n_left, n_left + n_right, dtype=np.int64), n_left),
        "link_type": 1,
        "value": X1_train.flatten(),
    }
)
reduced_train_df

Unnamed: 0,left,right,link_type,value
0,0,645,1,0.0
1,0,646,1,0.0
2,0,647,1,0.0
3,0,648,1,0.0
4,0,649,1,0.0
...,...,...,...,...
539860,644,1477,1,0.0
539861,644,1478,1,0.0
539862,644,1479,1,0.0
539863,644,1480,1,0.0


In [15]:
# Disabled alternative, kept for reference only: it would add the zero-valued
# pairs one row at a time. The dense frame built from X1_train.flatten()
# already contains those rows.
# NOTE(review): DataFrame.append was removed in pandas 2.0, so this snippet
# would need rewriting before it could be re-enabled.
# # add in zeros for training
# x, y = np.where(X1_train == 0)
# for i, j in list(zip(x, y)):
#     reduced_train_df = reduced_train_df.append({"left": i, "right": j + 645, "link_type": 1, "value": 0}, ignore_index=True)

In [16]:
# Count of each label value in the dense training frame.
reduced_train_df["value"].value_counts()

0.0    527320
1.0     12545
Name: value, dtype: int64

In [17]:
# Tab-separated, header-less test file of the selected split.
test_link_file = ncmf_input_files_path + f"sampled{sample_id}_link.dat.test"
test_df = pd.read_csv(test_link_file, sep="\t", header=None)
test_df.columns = ["left", "right", "value"]
test_df

Unnamed: 0,left,right,value
0,89,964,0
1,315,718,0
2,375,1230,0
3,170,1476,0
4,467,900,0
...,...,...,...
107968,346,1457,0
107969,37,1140,0
107970,384,1448,0
107971,358,906,0


In [18]:
# Count of each label value in the test frame.
test_df["value"].value_counts()

0    104943
1      3030
Name: value, dtype: int64

In [19]:
def get_hadamard_product(row):
    """Return the element-wise product of a row's two entity embeddings.

    ``row`` must provide ``"left"`` and ``"right"`` entries. Each is cast to
    ``int`` and then to ``str`` to form the key looked up in the module-level
    ``emb_dict``; the two embeddings found are multiplied element-wise.
    """
    def _embedding(side):
        # Ids may arrive as floats (e.g. 12.0), hence the int() before str().
        return emb_dict[str(int(row[side]))]

    return np.multiply(_embedding("left"), _embedding("right"))

In [20]:
# Store the element-wise embedding product of every training pair as one
# array-valued column.
train_hadamard = reduced_train_df.apply(get_hadamard_product, axis=1)
reduced_train_df["emb_hadamard"] = train_hadamard
reduced_train_df

Unnamed: 0,left,right,link_type,value,emb_hadamard
0,0,645,1,0.0,"[-1.8212148, 5.9452343, 1.0131403, -0.88977885..."
1,0,646,1,0.0,"[0.13809718, 0.079022944, 0.2682429, -0.076491..."
2,0,647,1,0.0,"[0.10282508, 0.017780248, 0.3075451, -0.158705..."
3,0,648,1,0.0,"[-1.4421167, 1.9020268, 1.4195262, -1.09569, -..."
4,0,649,1,0.0,"[0.5706106, 2.0810227, 0.7682032, -0.9107525, ..."
...,...,...,...,...,...
539860,644,1477,1,0.0,"[-0.13021949, 0.023691248, 0.33992016, 0.16471..."
539861,644,1478,1,0.0,"[-0.13021949, 0.023691248, 0.33992016, 0.16471..."
539862,644,1479,1,0.0,"[-0.13021949, 0.023691248, 0.33992016, 0.16471..."
539863,644,1480,1,0.0,"[-0.13021949, 0.023691248, 0.33992016, 0.16471..."


In [21]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
reduced_train_df = reduced_train_df.reset_index(drop=True)

In [22]:
# Store the element-wise embedding product of every test pair as one
# array-valued column.
test_hadamard = test_df.apply(get_hadamard_product, axis=1)
test_df["emb_hadamard"] = test_hadamard
test_df

Unnamed: 0,left,right,value,emb_hadamard
0,89,964,0,"[0.09075111, 0.041328628, 0.38914156, -0.07902..."
1,315,718,0,"[-0.13021949, 0.023691248, 0.33992016, 0.16471..."
2,375,1230,0,"[0.008635388, 0.0052813864, 0.043045323, -0.00..."
3,170,1476,0,"[0.24490812, 0.07306566, 0.65255815, 0.0586173..."
4,467,900,0,"[-0.1494785, 0.030308595, 0.36949617, 0.188757..."
...,...,...,...,...
107968,346,1457,0,"[0.03434074, 0.014892706, 0.067611165, 0.00090..."
107969,37,1140,0,"[0.043088626, 0.096630685, 0.003501656, -0.099..."
107970,384,1448,0,"[-0.23969303, 0.100399494, 0.21276757, 0.18248..."
107971,358,906,0,"[0.11612494, 0.035137523, 0.2492491, 0.2015531..."


In [23]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
test_df = test_df.reset_index(drop=True)

In [24]:
# Shape of the first stored product vector, i.e. the number of feature values per pair.
reduced_train_df["emb_hadamard"].values[0].shape

(64,)

In [25]:
# Expand the array-valued "emb_hadamard" column into one numeric column per
# embedding dimension (emb_0, emb_1, ...). The number of columns is taken from
# the expanded data instead of a hard-coded 64, so the cell keeps working for
# representations of a different width.
train_emb_df = pd.DataFrame(reduced_train_df["emb_hadamard"].tolist(), index=reduced_train_df.index)
train_emb_cols = [f"emb_{i}" for i in range(train_emb_df.shape[1])]
# Columns are matched by position: the i-th expanded column becomes emb_i.
reduced_train_df[train_emb_cols] = train_emb_df
reduced_train_df

Unnamed: 0,left,right,link_type,value,emb_hadamard,emb_0,emb_1,emb_2,emb_3,emb_4,...,emb_54,emb_55,emb_56,emb_57,emb_58,emb_59,emb_60,emb_61,emb_62,emb_63
0,0,645,1,0.0,"[-1.8212148, 5.9452343, 1.0131403, -0.88977885...",-1.821215,5.945234,1.013140,-0.889779,-0.262592,...,-1.827264,3.288082,8.221779,0.120182,1.301546,27.221521,1.169931,-1.073008,15.123514,0.445496
1,0,646,1,0.0,"[0.13809718, 0.079022944, 0.2682429, -0.076491...",0.138097,0.079023,0.268243,-0.076492,-0.096612,...,-0.021794,0.117816,1.169727,1.123920,-0.663700,-0.183654,0.029088,-0.152775,0.228305,0.062976
2,0,647,1,0.0,"[0.10282508, 0.017780248, 0.3075451, -0.158705...",0.102825,0.017780,0.307545,-0.158706,-0.132873,...,-1.542418,2.812193,6.782505,0.021862,1.024804,22.191002,1.042344,-0.829485,12.697905,0.255706
3,0,648,1,0.0,"[-1.4421167, 1.9020268, 1.4195262, -1.09569, -...",-1.442117,1.902027,1.419526,-1.095690,-0.598701,...,-1.353783,2.198962,6.231427,0.160635,0.732945,18.974056,0.813757,-0.777589,10.135212,0.182005
4,0,649,1,0.0,"[0.5706106, 2.0810227, 0.7682032, -0.9107525, ...",0.570611,2.081023,0.768203,-0.910752,-0.470612,...,0.309119,0.631366,2.334873,3.523643,-1.394017,0.013561,0.029503,0.327611,1.622178,1.507172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539860,644,1477,1,0.0,"[-0.13021949, 0.023691248, 0.33992016, 0.16471...",-0.130219,0.023691,0.339920,0.164714,0.247615,...,0.034062,0.062687,0.594033,0.035101,-0.247830,-0.071430,0.149042,0.080147,0.081064,0.001927
539861,644,1478,1,0.0,"[-0.13021949, 0.023691248, 0.33992016, 0.16471...",-0.130219,0.023691,0.339920,0.164714,0.247615,...,0.034062,0.062687,0.594033,0.035101,-0.247830,-0.071430,0.149042,0.080147,0.081064,0.001927
539862,644,1479,1,0.0,"[-0.13021949, 0.023691248, 0.33992016, 0.16471...",-0.130219,0.023691,0.339920,0.164714,0.247615,...,0.034062,0.062687,0.594033,0.035101,-0.247830,-0.071430,0.149042,0.080147,0.081064,0.001927
539863,644,1480,1,0.0,"[-0.13021949, 0.023691248, 0.33992016, 0.16471...",-0.130219,0.023691,0.339920,0.164714,0.247615,...,0.034062,0.062687,0.594033,0.035101,-0.247830,-0.071430,0.149042,0.080147,0.081064,0.001927


In [26]:
# Expand the array-valued "emb_hadamard" column into one numeric column per
# embedding dimension (emb_0, emb_1, ...). The number of columns is taken from
# the expanded data instead of a hard-coded 64, so the cell keeps working for
# representations of a different width.
test_emb_df = pd.DataFrame(test_df["emb_hadamard"].tolist(), index=test_df.index)
test_emb_cols = [f"emb_{i}" for i in range(test_emb_df.shape[1])]
# Columns are matched by position: the i-th expanded column becomes emb_i.
test_df[test_emb_cols] = test_emb_df
test_df

Unnamed: 0,left,right,value,emb_hadamard,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,...,emb_54,emb_55,emb_56,emb_57,emb_58,emb_59,emb_60,emb_61,emb_62,emb_63
0,89,964,0,"[0.09075111, 0.041328628, 0.38914156, -0.07902...",0.090751,0.041329,0.389142,-0.079029,0.036516,-0.033170,...,-0.019441,0.113710,1.408695,1.338279,-0.833943,-0.213304,0.024551,-0.122698,0.235410,0.064528
1,315,718,0,"[-0.13021949, 0.023691248, 0.33992016, 0.16471...",-0.130219,0.023691,0.339920,0.164714,0.247615,0.004210,...,0.034062,0.062687,0.594033,0.035101,-0.247830,-0.071430,0.149042,0.080147,0.081064,0.001927
2,375,1230,0,"[0.008635388, 0.0052813864, 0.043045323, -0.00...",0.008635,0.005281,0.043045,-0.001166,0.007864,0.001038,...,0.036951,0.068085,0.615460,-0.009978,-0.277466,-0.075500,0.174148,0.076937,0.089647,0.001895
3,170,1476,0,"[0.24490812, 0.07306566, 0.65255815, 0.0586173...",0.244908,0.073066,0.652558,0.058617,-0.250775,0.032265,...,0.009511,0.064844,0.574805,0.260778,-0.229256,-0.071897,0.075553,0.018091,0.091310,0.013220
4,467,900,0,"[-0.1494785, 0.030308595, 0.36949617, 0.188757...",-0.149478,0.030309,0.369496,0.188758,0.279063,0.004809,...,0.010077,0.062762,0.622890,0.314750,-0.240313,-0.075523,0.063400,0.029537,0.087154,0.012400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107968,346,1457,0,"[0.03434074, 0.014892706, 0.067611165, 0.00090...",0.034341,0.014893,0.067611,0.000910,-0.024365,0.002636,...,0.040826,0.130356,0.888121,0.031663,-0.690417,-0.124907,0.238199,0.063981,0.162968,0.016487
107969,37,1140,0,"[0.043088626, 0.096630685, 0.003501656, -0.099...",0.043089,0.096631,0.003502,-0.099873,-0.085165,-0.024032,...,-0.018292,0.096528,1.326070,1.143845,-0.523089,-0.179127,0.005689,-0.107749,0.201472,0.047806
107970,384,1448,0,"[-0.23969303, 0.100399494, 0.21276757, 0.18248...",-0.239693,0.100399,0.212768,0.182483,0.229122,0.008924,...,0.009226,0.062764,0.634729,0.350784,-0.247161,-0.077289,0.057294,0.031603,0.087792,0.013259
107971,358,906,0,"[0.11612494, 0.035137523, 0.2492491, 0.2015531...",0.116125,0.035138,0.249249,0.201553,0.276799,0.003305,...,-0.010370,0.032173,0.329862,1.086370,0.580083,-0.004442,0.016063,-0.056522,0.022758,0.024745


In [29]:
from sklearn.svm import SVC

# Feature columns are derived from the width of the stored product vectors
# (as the random-forest cells do) rather than a hard-coded 64.
svc_n_features = reduced_train_df["emb_hadamard"].values[0].shape[0]
svc_feature_cols = [f"emb_{i}" for i in range(svc_n_features)]

# probability=True is required for the predict_proba call used for scoring.
clf = SVC(random_state=42, probability=True)
clf.fit(reduced_train_df[svc_feature_cols].values, reduced_train_df["value"].values)

SVC(probability=True, random_state=42)

In [30]:
# Score the test pairs with the fitted SVC. The feature columns are derived
# from the width of the training product vectors (as the random-forest cells
# do) rather than a hard-coded 64, so they match what the model was fitted on.
svc_n_features = reduced_train_df["emb_hadamard"].values[0].shape[0]
svc_test_features = test_df[[f"emb_{i}" for i in range(svc_n_features)]].values
preds_prob = clf.predict_proba(svc_test_features)
# Ground-truth labels of the test pairs.
y_true = test_df["value"].values

In [31]:
# Shape of the second probability column, which the scoring cell uses as the prediction score.
preds_prob[:, 1].shape

(107973,)

In [32]:
# Shape of the label vector, for comparison with the score vector above.
y_true.shape

(107973,)

In [33]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Second probability column of the SVC output is used as the ranking score.
svc_scores = preds_prob[:, 1]

auroc = roc_auc_score(y_true, svc_scores)
# Area under the precision-recall curve, integrated over the curve points.
precision, recall, thresholds = precision_recall_curve(y_true, svc_scores)
auprc = auc(recall, precision)

print(f"AUROC = {auroc}")
print(f"AUPRC = {auprc}")

AUROC = 0.9283406859024429
AUPRC = 0.5857911321901195


In [27]:
from sklearn.ensemble import RandomForestClassifier

# One feature column per entry of the stored product vectors.
rf_n_features = reduced_train_df["emb_hadamard"].values[0].shape[0]
rf_feature_cols = [f"emb_{i}" for i in range(0, rf_n_features)]

rf = RandomForestClassifier(random_state=42, max_depth=50)
rf.fit(reduced_train_df[rf_feature_cols].values, reduced_train_df["value"].values)

RandomForestClassifier(max_depth=50, random_state=42)

In [28]:
# Score the test pairs with the fitted random forest, using the same
# emb_* columns (width taken from the training product vectors).
rf_n_features = reduced_train_df["emb_hadamard"].values[0].shape[0]
rf_test_features = test_df[[f"emb_{i}" for i in range(0, rf_n_features)]].values
preds_prob_rf = rf.predict_proba(rf_test_features)
# Ground-truth labels of the test pairs.
y_true = test_df["value"].values

In [29]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Second probability column of the random-forest output is used as the ranking score.
rf_scores = preds_prob_rf[:, 1]

auroc = roc_auc_score(y_true, rf_scores)
# Area under the precision-recall curve, integrated over the curve points.
precision, recall, thresholds = precision_recall_curve(y_true, rf_scores)
auprc = auc(recall, precision)

print('Random forest')
print(f"AUROC = {auroc}")
print(f"AUPRC = {auprc}")

Random forest
AUROC = 0.9587179999552801
AUPRC = 0.6218023172614912
