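This notebook takes the drug and protein representations learnt by a representation-learning method (here the GRGMF embeddings for one sampled split of PolyP3), forms the Hadamard product of each drug-protein pair's embeddings, and trains downstream classifiers (an SVM and a random forest) to predict the presence/absence of a drug-protein interaction in the test split.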

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Which sampled split of PolyP3 to evaluate.
sample_id = 1
# Folder with the saved GRGMF representations and the per-sample file names inside it.
repr_folder = "../../GRGMF/"
drug_emb_file = "PolyP3_{}_drug_repr.npy".format(sample_id)
protein_emb_file = "PolyP3_{}_protein_repr.npy".format(sample_id)

In [3]:
# Load the drug representation matrix (one row per drug) and display its shape.
drug_repr_path = repr_folder + drug_emb_file
drug_repr_np = np.load(drug_repr_path)
drug_repr_np.shape

(645, 50)

In [4]:
# Load the protein representation matrix (one row per protein) and display its shape.
protein_repr_path = repr_folder + protein_emb_file
protein_repr_np = np.load(protein_repr_path)
protein_repr_np.shape

(837, 50)

In [5]:
# Map every entity id (as a string) to its embedding vector.
# Drugs keep their row index as id; proteins follow directly after the drugs,
# so their ids are offset by the number of drug rows (derived from the data
# rather than hard-coded).
n_drugs = drug_repr_np.shape[0]
emb_dict = {str(i): vec for i, vec in enumerate(drug_repr_np)}
emb_dict.update({str(n_drugs + j): vec for j, vec in enumerate(protein_repr_np)})

In [6]:
# Sanity check: number of entries in the id -> embedding lookup (drugs + proteins).
len(emb_dict)

1482

In [7]:
# Folder with the NCMF-formatted PolyP3 files; the training links are a
# header-less, tab-separated file with four columns.
ncmf_input_files_path = "../../datasets/NCMF/PolyP3/"
train_link_path = ncmf_input_files_path + f"sampled{sample_id}_link.dat"
train_df = pd.read_csv(
    train_link_path,
    sep="\t",
    header=None,
    names=["left", "right", "link_type", "value"],
)
train_df

Unnamed: 0,left,right,link_type,value
0,0,0,0,1.0
1,0,6,0,1.0
2,0,8,0,1.0
3,0,10,0,1.0
4,0,17,0,1.0
...,...,...,...,...
160429,1480,1429,2,1.0
160430,1480,1448,2,1.0
160431,1480,1455,2,1.0
160432,1480,1459,2,1.0


In [8]:
# Distribution of `value` among the link_type == 1 rows of the training links.
is_link_type_1 = train_df["link_type"] == 1
train_df.loc[is_link_type_1, "value"].value_counts()

1.0    12545
Name: value, dtype: int64

In [9]:
# Entity name table stored next to the link files.
entity_csv_path = ncmf_input_files_path + "entity.csv"
entity_df = pd.read_csv(entity_csv_path)
entity_df

Unnamed: 0,Entity Names
0,CID000000085
1,CID000000119
2,CID000000143
3,CID000000158
4,CID000000159
...,...
1477,84816
1478,4190
1479,92483
1480,10988


In [10]:
# Keep only the link_type == 1 rows of the training links.
link_type_mask = train_df["link_type"] == 1
reduced_train_df = train_df.loc[link_type_mask]
reduced_train_df

Unnamed: 0,left,right,link_type,value
127591,1,804,1,1.0
127592,1,828,1,1.0
127593,1,829,1,1.0
127594,1,830,1,1.0
127595,1,831,1,1.0
...,...,...,...,...
140131,628,1446,1,1.0
140132,629,892,1,1.0
140133,632,892,1,1.0
140134,632,942,1,1.0


In [11]:
# Build the dense drug x protein training matrix: 1 where a link is listed in
# reduced_train_df, 0 everywhere else.
# NOTE: this must run while reduced_train_df still holds the filtered link rows;
# a later cell rebinds that name to the dense grid.
n_drugs = drug_repr_np.shape[0]
n_proteins = protein_repr_np.shape[0]
X1_train = np.zeros((n_drugs, n_proteins))

drug_idx = reduced_train_df["left"].to_numpy(dtype=int)
# Protein ids follow the drug ids, so subtract the drug count to get a column index.
protein_idx = reduced_train_df["right"].to_numpy(dtype=int) - n_drugs
# Negative indices would silently wrap around in NumPy, so fail loudly instead.
assert (drug_idx >= 0).all() and (protein_idx >= 0).all(), "link ids outside the drug/protein ranges"

X1_train[drug_idx, protein_idx] = 1
X1_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# Number of cells set to 1 in the dense training matrix.
(X1_train == 1).sum()

12545

In [13]:
# Rebuild reduced_train_df as the full drug x protein grid (one row per pair),
# labelled with the corresponding X1_train cell. Note that this rebinds the
# name that previously held only the listed link rows.
n_drugs, n_proteins = X1_train.shape
reduced_train_df = pd.DataFrame(
    {
        # Each drug id repeated once per protein: 0,0,...,1,1,...
        "left": np.repeat(np.arange(n_drugs), n_proteins),
        # The protein id range (offset by the drug count) cycled once per drug.
        "right": np.tile(np.arange(n_drugs, n_drugs + n_proteins), n_drugs),
        "link_type": 1,
        # Row-major flattening matches the (left, right) ordering above.
        "value": X1_train.flatten(),
    }
)
reduced_train_df

Unnamed: 0,left,right,link_type,value
0,0,645,1,0.0
1,0,646,1,0.0
2,0,647,1,0.0
3,0,648,1,0.0
4,0,649,1,0.0
...,...,...,...,...
539860,644,1477,1,0.0
539861,644,1478,1,0.0
539862,644,1479,1,0.0
539863,644,1480,1,0.0


In [14]:
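# NOTE: superseded -- the dense grid built in the previous cell already contains
# every zero entry, so the row-by-row append below is kept only for reference.
# # add in zeros for training
# x, y = np.where(X1_train == 0)
# for i, j in list(zip(x, y)):
#     reduced_train_df = reduced_train_df.append({"left": i, "right": j + 645, "link_type": 1, "value": 0}, ignore_index=True)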

In [15]:
# Label balance (0 vs 1) of the dense training grid.
reduced_train_df["value"].value_counts()

0.0    527320
1.0     12545
Name: value, dtype: int64

In [16]:
# Held-out pairs: header-less, tab-separated file with three columns.
test_link_path = ncmf_input_files_path + f"sampled{sample_id}_link.dat.test"
test_df = pd.read_csv(
    test_link_path,
    sep="\t",
    header=None,
    names=["left", "right", "value"],
)
test_df

Unnamed: 0,left,right,value
0,89,964,0
1,315,718,0
2,375,1230,0
3,170,1476,0
4,467,900,0
...,...,...,...
107968,346,1457,0
107969,37,1140,0
107970,384,1448,0
107971,358,906,0


In [17]:
# Label balance (0 vs 1) of the test split.
test_df["value"].value_counts()

0    104943
1      3030
Name: value, dtype: int64

In [18]:
def get_hadamard_product(row, embeddings=None):
    """Return the element-wise (Hadamard) product of a pair's two embeddings.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide "left" and "right" entries holding numeric entity ids.
    embeddings : dict, optional
        Lookup from stringified integer id to embedding vector. When omitted,
        the notebook-level ``emb_dict`` is used.

    Returns
    -------
    numpy.ndarray
        ``np.multiply`` of the left and right embedding vectors.

    Raises
    ------
    KeyError
        If either id has no entry in the lookup.
    """
    lookup = emb_dict if embeddings is None else embeddings
    # Ids are passed through int() first so that float-valued ids such as 3.0
    # still map to the key "3".
    left_key = str(int(row["left"]))
    right_key = str(int(row["right"]))
    return np.multiply(lookup[left_key], lookup[right_key])

In [19]:
# One Hadamard-product vector per training pair, stored as an object column.
train_pair_embeddings = reduced_train_df.apply(get_hadamard_product, axis=1)
reduced_train_df["emb_hadamard"] = train_pair_embeddings
reduced_train_df

Unnamed: 0,left,right,link_type,value,emb_hadamard
0,0,645,1,0.0,"[-0.054444745, 0.10296837, -0.54536915, -0.001..."
1,0,646,1,0.0,"[-0.054514788, 0.11160903, -1.5704677, -0.0480..."
2,0,647,1,0.0,"[-0.0802612, 0.11765806, -0.9328888, 0.0865237..."
3,0,648,1,0.0,"[0.0035546252, -0.46733618, -0.5065348, 0.0191..."
4,0,649,1,0.0,"[-0.011081617, -0.0079977065, -0.9280622, 0.01..."
...,...,...,...,...,...
539860,644,1477,1,0.0,"[0.018072033, 0.054573316, 0.01602536, -0.1418..."
539861,644,1478,1,0.0,"[0.06536987, 0.029236367, 0.39910662, -0.05270..."
539862,644,1479,1,0.0,"[0.04196575, -0.0018556889, 0.39130285, -0.001..."
539863,644,1480,1,0.0,"[-0.04260242, -0.061229695, 0.3627048, 0.35160..."


In [20]:
# Renumber the rows 0..n-1, discarding the old index.
reduced_train_df = reduced_train_df.reset_index(drop=True)

In [21]:
# One Hadamard-product vector per test pair, stored as an object column.
test_pair_embeddings = test_df.apply(get_hadamard_product, axis=1)
test_df["emb_hadamard"] = test_pair_embeddings
test_df

Unnamed: 0,left,right,value,emb_hadamard
0,89,964,0,"[0.0057807583, 0.15669572, 0.42289734, 0.01115..."
1,315,718,0,"[-0.037276134, -0.18977623, -1.1039468, 0.0287..."
2,375,1230,0,"[0.016731013, 0.36551604, -0.79233557, -0.0020..."
3,170,1476,0,"[0.26960438, -0.7249424, -0.5593923, 0.0334723..."
4,467,900,0,"[0.2801725, -0.048824374, -0.709033, 0.0257184..."
...,...,...,...,...
107968,346,1457,0,"[0.02255107, -0.1489515, -0.7086432, 0.0412895..."
107969,37,1140,0,"[0.21558098, -0.2971523, -2.292778, 0.01702204..."
107970,384,1448,0,"[-0.102002464, 3.156612, -5.9415064, -0.003712..."
107971,358,906,0,"[0.25591987, -0.108881, -0.026541354, 0.021006..."


In [22]:
# Renumber the rows 0..n-1, discarding the old index.
test_df = test_df.reset_index(drop=True)

In [23]:
# Shape of a single stored Hadamard-product vector (i.e. the embedding width).
reduced_train_df["emb_hadamard"].values[0].shape

(50,)

In [24]:
# Expand the per-row embedding vectors into one numeric column per dimension
# (emb_0, emb_1, ...). The width is read from the data instead of being
# hard-coded, matching how the random-forest cells select their features.
train_emb_dim = reduced_train_df["emb_hadamard"].values[0].shape[0]
train_emb_cols = [f"emb_{i}" for i in range(train_emb_dim)]
reduced_train_df[train_emb_cols] = pd.DataFrame(
    reduced_train_df["emb_hadamard"].tolist(), index=reduced_train_df.index
)
reduced_train_df

Unnamed: 0,left,right,link_type,value,emb_hadamard,emb_0,emb_1,emb_2,emb_3,emb_4,...,emb_40,emb_41,emb_42,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49
0,0,645,1,0.0,"[-0.054444745, 0.10296837, -0.54536915, -0.001...",-0.054445,0.102968,-0.545369,-0.001834,-0.017372,...,0.239521,-0.004145,-0.011741,-0.001939,0.006379,-0.070699,-0.015004,0.015661,0.093016,0.131670
1,0,646,1,0.0,"[-0.054514788, 0.11160903, -1.5704677, -0.0480...",-0.054515,0.111609,-1.570468,-0.048077,0.019404,...,-0.124141,-0.008147,-0.001551,0.008083,-0.196888,0.048356,0.015469,-0.031457,0.082788,-0.110117
2,0,647,1,0.0,"[-0.0802612, 0.11765806, -0.9328888, 0.0865237...",-0.080261,0.117658,-0.932889,0.086524,0.003305,...,0.022434,-0.001976,0.000526,-0.000988,-0.020426,0.249783,-0.010896,0.015477,0.071581,0.020198
3,0,648,1,0.0,"[0.0035546252, -0.46733618, -0.5065348, 0.0191...",0.003555,-0.467336,-0.506535,0.019187,0.003867,...,-0.018792,0.024355,-0.008535,-0.003843,0.036640,-0.101031,0.033899,0.013880,0.007801,-0.000989
4,0,649,1,0.0,"[-0.011081617, -0.0079977065, -0.9280622, 0.01...",-0.011082,-0.007998,-0.928062,0.016688,0.001426,...,0.112709,-0.020461,0.003058,-0.001441,-0.098515,0.032770,0.021029,-0.039871,-0.065157,0.169396
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539860,644,1477,1,0.0,"[0.018072033, 0.054573316, 0.01602536, -0.1418...",0.018072,0.054573,0.016025,-0.141852,-0.021044,...,0.008449,-0.395031,-0.138288,-0.014148,-0.071140,0.287880,-0.185896,-0.449274,-0.106277,-0.102826
539861,644,1478,1,0.0,"[0.06536987, 0.029236367, 0.39910662, -0.05270...",0.065370,0.029236,0.399107,-0.052706,0.011207,...,0.006542,0.046676,0.052032,-0.019163,-0.102175,0.077064,-0.094373,-0.257220,-0.070449,-0.152707
539862,644,1479,1,0.0,"[0.04196575, -0.0018556889, 0.39130285, -0.001...",0.041966,-0.001856,0.391303,-0.001653,-0.006171,...,0.005797,0.058086,0.061256,-0.011417,-0.090069,0.088907,0.014429,-0.066296,-0.023275,-0.100700
539863,644,1480,1,0.0,"[-0.04260242, -0.061229695, 0.3627048, 0.35160...",-0.042602,-0.061230,0.362705,0.351602,0.115362,...,-0.020421,0.127872,-0.178293,0.391793,0.076130,0.343802,-0.057263,0.604478,-0.126475,-0.162552


In [25]:
# Expand the per-row embedding vectors into one numeric column per dimension
# (emb_0, emb_1, ...). The width is read from the data instead of being
# hard-coded, matching how the random-forest cells select their features.
test_emb_dim = test_df["emb_hadamard"].values[0].shape[0]
test_emb_cols = [f"emb_{i}" for i in range(test_emb_dim)]
test_df[test_emb_cols] = pd.DataFrame(
    test_df["emb_hadamard"].tolist(), index=test_df.index
)
test_df

Unnamed: 0,left,right,value,emb_hadamard,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,...,emb_40,emb_41,emb_42,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49
0,89,964,0,"[0.0057807583, 0.15669572, 0.42289734, 0.01115...",0.005781,0.156696,0.422897,0.011152,0.153249,-0.014131,...,0.002042,-0.007311,0.000010,0.021289,0.017233,0.017202,-0.000126,0.001978,0.009760,0.001937
1,315,718,0,"[-0.037276134, -0.18977623, -1.1039468, 0.0287...",-0.037276,-0.189776,-1.103947,0.028770,-0.026932,-0.028906,...,-0.051305,-0.076320,-0.014615,-0.313216,-0.204194,0.040122,-0.043276,-0.025743,0.010548,-0.093224
2,375,1230,0,"[0.016731013, 0.36551604, -0.79233557, -0.0020...",0.016731,0.365516,-0.792336,-0.002029,0.009216,0.006715,...,-0.008093,-0.024294,-0.003483,-0.003414,-0.001449,-0.005086,-0.024705,0.002292,0.044788,-0.003189
3,170,1476,0,"[0.26960438, -0.7249424, -0.5593923, 0.0334723...",0.269604,-0.724942,-0.559392,0.033472,-0.001059,-0.031464,...,0.015620,-0.131364,-0.043305,-0.031374,-0.221050,-0.004050,-0.013919,0.000864,-0.125346,0.081181
4,467,900,0,"[0.2801725, -0.048824374, -0.709033, 0.0257184...",0.280172,-0.048824,-0.709033,0.025718,0.003547,-0.062496,...,-0.017003,-0.003171,-0.074845,0.009884,-0.002805,0.003451,-0.043501,0.084408,0.022579,-0.000303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107968,346,1457,0,"[0.02255107, -0.1489515, -0.7086432, 0.0412895...",0.022551,-0.148952,-0.708643,0.041290,0.016965,-0.240126,...,0.001967,-0.008345,0.006946,-0.001582,-0.011056,0.000909,0.002762,-0.023678,-0.000170,0.002113
107969,37,1140,0,"[0.21558098, -0.2971523, -2.292778, 0.01702204...",0.215581,-0.297152,-2.292778,0.017022,0.053679,0.238225,...,-0.028388,-0.683870,0.050961,0.007438,-0.012742,0.026029,-0.109624,0.039643,0.563127,0.035217
107970,384,1448,0,"[-0.102002464, 3.156612, -5.9415064, -0.003712...",-0.102002,3.156612,-5.941506,-0.003713,-1.377272,-1.353324,...,-0.003037,-0.037282,-0.039783,0.157868,-0.020017,-0.144187,0.029312,0.138866,-0.159435,0.002863
107971,358,906,0,"[0.25591987, -0.108881, -0.026541354, 0.021006...",0.255920,-0.108881,-0.026541,0.021007,0.206045,-0.004974,...,-0.186522,0.042858,0.007933,0.012236,0.008721,-0.004936,-0.018340,0.047088,0.001914,-0.002207


In [26]:
from sklearn.svm import SVC

# Select every emb_* feature column. The count comes from the stored embedding
# width (same rule as the random-forest cells) so a different embedding size
# cannot silently drop or miss features.
svm_emb_dim = reduced_train_df["emb_hadamard"].values[0].shape[0]
svm_feature_cols = [f"emb_{i}" for i in range(svm_emb_dim)]

# probability=True is required for the predict_proba call used at evaluation time.
clf = SVC(random_state=42, probability=True)
clf.fit(reduced_train_df[svm_feature_cols].values, reduced_train_df["value"].values)

SVC(probability=True, random_state=42)

In [27]:
# Score the test pairs with the fitted SVM. Feature columns are derived from the
# training embedding width (same rule as the random-forest cells) rather than a
# hard-coded count.
svm_emb_dim = reduced_train_df["emb_hadamard"].values[0].shape[0]
svm_feature_cols = [f"emb_{i}" for i in range(svm_emb_dim)]
preds_prob = clf.predict_proba(test_df[svm_feature_cols].values)
# Ground-truth labels for the same rows, in the same order.
y_true = test_df["value"].values

In [28]:
# Shape of the second predict_proba column (one score per test row).
preds_prob[:, 1].shape

(107973,)

In [29]:
# Shape of the label vector; should match the score vector above.
y_true.shape

(107973,)

In [30]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Use the second predict_proba column as the ranking score for the SVM.
svm_scores = preds_prob[:, 1]

auroc = roc_auc_score(y_true, svm_scores)

# AUPRC is computed as the area under the (recall, precision) curve.
precision, recall, thresholds = precision_recall_curve(y_true, svm_scores)
auprc = auc(recall, precision)

print(f"AUROC = {auroc}")
print(f"AUPRC = {auprc}")

AUROC = 0.9814029769232891
AUPRC = 0.6663972143379774


In [26]:
from sklearn.ensemble import RandomForestClassifier

# Feature columns: one emb_* column per dimension of the stored embedding.
rf_emb_dim = reduced_train_df["emb_hadamard"].values[0].shape[0]
rf_feature_cols = [f"emb_{i}" for i in range(0, rf_emb_dim)]

rf = RandomForestClassifier(random_state=42, max_depth=50)
rf.fit(reduced_train_df[rf_feature_cols].values, reduced_train_df["value"].values)

RandomForestClassifier(max_depth=50, random_state=42)

In [27]:
# Score the test pairs with the fitted random forest, using the same emb_*
# feature columns (count taken from the training embedding width).
rf_emb_dim = reduced_train_df["emb_hadamard"].values[0].shape[0]
rf_feature_cols = [f"emb_{i}" for i in range(0, rf_emb_dim)]
rf_test_features = test_df[rf_feature_cols].values
preds_prob_rf = rf.predict_proba(rf_test_features)
# Ground-truth labels for the same rows, in the same order.
y_true = test_df["value"].values

In [28]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Use the second predict_proba column as the ranking score for the random forest.
rf_scores = preds_prob_rf[:, 1]

auroc = roc_auc_score(y_true, rf_scores)

# AUPRC is computed as the area under the (recall, precision) curve.
precision, recall, thresholds = precision_recall_curve(y_true, rf_scores)
auprc = auc(recall, precision)

print('Random forest')
print(f"AUROC = {auroc}")
print(f"AUPRC = {auprc}")

Random forest
AUROC = 0.9858142840955717
AUPRC = 0.8459038003376732


In [29]:
# Completion marker: printed once every cell above has run.
print("Done")

Done
