This notebook takes the representations learnt by various methods for drugs and proteins and feeds them to a downstream classifier that predicts the presence/absence of a drug-protein interaction in the test split.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Index of the sampled split; it is also used further down to pick the
# train/test link files.
sample_id = 1
# NOTE(review): the last folder component used to be a hard-coded "1"; it is
# assumed to be the sample id, so it now follows `sample_id` - confirm.
repr_folder = f"../../NIMCGCN/data/PolyP3/{sample_id}/"
# File names of the saved drug / protein representation matrices.
drug_emb_file = "drug_repr.npy"
protein_emb_file = "protein_repr.npy"

In [3]:
# Read the drug representation matrix from disk and display its dimensions.
drug_repr_np = np.load(f"{repr_folder}{drug_emb_file}")
drug_repr_np.shape

(645, 64)

In [4]:
# Read the protein representation matrix from disk and display its dimensions.
protein_repr_np = np.load(f"{repr_folder}{protein_emb_file}")
protein_repr_np.shape

(837, 64)

In [5]:
# Lookup table from entity id (as a string) to its embedding row.
# Drug i is stored under str(i); protein j is stored under
# str(number_of_drugs + j). The offset is taken from the drug matrix itself
# instead of a literal 645, so it stays correct if the drug count changes.
emb_dict = {str(drug_id): vec for drug_id, vec in enumerate(drug_repr_np)}
emb_dict.update(
    {
        str(protein_id): vec
        for protein_id, vec in enumerate(protein_repr_np, start=drug_repr_np.shape[0])
    }
)

In [6]:
# Sanity check: number of entries in the id -> embedding lookup table.
len(emb_dict)

1482

In [7]:
# Folder with the NCMF input files for this dataset.
ncmf_input_files_path = "../../datasets/NCMF/PolyP3/"
# The training link file is tab-separated and has no header row, so the
# column names are assigned after loading.
train_df = pd.read_csv(
    f"{ncmf_input_files_path}sampled{sample_id}_link.dat",
    sep="\t",
    header=None,
)
train_df.columns = ["left", "right", "link_type", "value"]
train_df

Unnamed: 0,left,right,link_type,value
0,0,0,0,1.0
1,0,6,0,1.0
2,0,8,0,1.0
3,0,10,0,1.0
4,0,17,0,1.0
...,...,...,...,...
160429,1480,1429,2,1.0
160430,1480,1448,2,1.0
160431,1480,1455,2,1.0
160432,1480,1459,2,1.0


In [8]:
# Distribution of `value` among the training links with link_type == 1.
train_df.loc[train_df["link_type"] == 1, "value"].value_counts()

1.0    12545
Name: value, dtype: int64

In [9]:
# Load the entity table that sits next to the link files.
entity_df = pd.read_csv(f"{ncmf_input_files_path}entity.csv")
entity_df

Unnamed: 0,Entity Names
0,CID000000085
1,CID000000119
2,CID000000143
3,CID000000158
4,CID000000159
...,...
1477,84816
1478,4190
1479,92483
1480,10988


In [10]:
# Keep only the training links whose link_type is 1.
is_link_type_1 = train_df["link_type"] == 1
reduced_train_df = train_df.loc[is_link_type_1]
reduced_train_df

Unnamed: 0,left,right,link_type,value
127591,1,804,1,1.0
127592,1,828,1,1.0
127593,1,829,1,1.0
127594,1,830,1,1.0
127595,1,831,1,1.0
...,...,...,...,...
140131,628,1446,1,1.0
140132,629,892,1,1.0
140133,632,892,1,1.0
140134,632,942,1,1.0


In [11]:
# Dense (drugs x proteins) indicator matrix of the training links.
# The dimensions come from the representation matrices instead of the
# literals 645 / 837, and the cells are set with one vectorised assignment
# instead of a Python-level iterrows() loop.
n_drugs, n_proteins = drug_repr_np.shape[0], protein_repr_np.shape[0]
X1_train = np.zeros((n_drugs, n_proteins))
drug_idx = reduced_train_df["left"].to_numpy(dtype=int)
# Protein ids are stored after the drug ids, so shift them back to 0-based columns.
protein_idx = reduced_train_df["right"].to_numpy(dtype=int) - n_drugs
X1_train[drug_idx, protein_idx] = 1
X1_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# Sanity check: number of cells of the training matrix that are set to 1.
(X1_train == 1).sum()

12545

In [13]:
# Rebuild the training frame so that it holds one row per (drug, protein)
# cell of X1_train, zeros included. Rows follow the row-major order of
# X1_train.flatten(): the drug id is repeated for a whole row, the protein
# id cycles through all columns. Sizes are read from X1_train instead of the
# literals 645 / 837, and np.repeat / np.tile replace sorting a
# multiplied Python list.
n_drugs, n_proteins = X1_train.shape
reduced_train_df = pd.DataFrame(
    {
        "left": np.repeat(np.arange(n_drugs, dtype=np.int64), n_proteins),
        # Protein ids start right after the last drug id.
        "right": np.tile(np.arange(n_drugs, n_drugs + n_proteins, dtype=np.int64), n_drugs),
        "link_type": 1,
        "value": X1_train.flatten(),
    }
)
reduced_train_df

Unnamed: 0,left,right,link_type,value
0,0,645,1,0.0
1,0,646,1,0.0
2,0,647,1,0.0
3,0,648,1,0.0
4,0,649,1,0.0
...,...,...,...,...
539860,644,1477,1,0.0
539861,644,1478,1,0.0
539862,644,1479,1,0.0
539863,644,1480,1,0.0


In [14]:
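# Superseded: an earlier, row-by-row way of adding the zero-valued pairs to the training frame.
# The dense construction in the previous cell already contains those rows, so this stays disabled.
# x, y = np.where(X1_train == 0)
# for i, j in list(zip(x, y)):
#     reduced_train_df = reduced_train_df.append({"left": i, "right": j + 645, "link_type": 1, "value": 0}, ignore_index=True)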

In [15]:
# Class balance of the training frame (counts per distinct `value`).
reduced_train_df["value"].value_counts()

0.0    527320
1.0     12545
Name: value, dtype: int64

In [16]:
# The test link file is tab-separated, has no header row and only three
# columns (no link_type), so the column names are assigned after loading.
test_df = pd.read_csv(
    f"{ncmf_input_files_path}sampled{sample_id}_link.dat.test",
    sep="\t",
    header=None,
)
test_df.columns = ["left", "right", "value"]
test_df

Unnamed: 0,left,right,value
0,89,964,0
1,315,718,0
2,375,1230,0
3,170,1476,0
4,467,900,0
...,...,...,...
107968,346,1457,0
107969,37,1140,0
107970,384,1448,0
107971,358,906,0


In [17]:
# Class balance of the test split (counts per distinct `value`).
test_df["value"].value_counts()

0    104943
1      3030
Name: value, dtype: int64

In [18]:
def get_hadamard_product(row, embeddings=None):
    """Return the element-wise product of the embeddings of a pair's two ends.

    Parameters
    ----------
    row : mapping-like
        Must support ``row["left"]`` and ``row["right"]``. Each value is cast
        to ``int`` and then to ``str`` to build the lookup key, so float ids
        such as ``3.0`` resolve to the key ``"3"``.
    embeddings : dict, optional
        Lookup table from string id to embedding array. When omitted, the
        notebook-level ``emb_dict`` is used, which keeps existing
        single-argument calls working unchanged.

    Returns
    -------
    numpy.ndarray
        Result of ``np.multiply`` applied to the two looked-up embeddings.

    Raises
    ------
    KeyError
        If either id has no entry in the lookup table.
    """
    if embeddings is None:
        # Fall back to the global table built earlier in the notebook.
        embeddings = emb_dict
    left_emb = embeddings[str(int(row["left"]))]
    right_emb = embeddings[str(int(row["right"]))]
    return np.multiply(left_emb, right_emb)

In [19]:
# Attach to every training pair the element-wise product of its two embeddings.
reduced_train_df["emb_hadamard"] = reduced_train_df.apply(get_hadamard_product, axis=1)
reduced_train_df

Unnamed: 0,left,right,link_type,value,emb_hadamard
0,0,645,1,0.0,"[0.0, 0.0053919507, 0.0018105786, 0.0, 0.0, 0...."
1,0,646,1,0.0,"[0.0, 0.0052175275, 0.0017493754, 0.0, 0.0, 0...."
2,0,647,1,0.0,"[0.0, 0.0052579595, 0.0017556173, 0.0, 0.0, 0...."
3,0,648,1,0.0,"[0.0, 0.00553453, 0.0019905972, 0.0, 0.0, 0.0,..."
4,0,649,1,0.0,"[0.0, 0.0052552004, 0.0018880406, 0.0, 0.0, 0...."
...,...,...,...,...,...
539860,644,1477,1,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
539861,644,1478,1,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
539862,644,1479,1,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
539863,644,1480,1,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [20]:
# Renumber the rows 0..n-1 and discard the previous index.
reduced_train_df = reduced_train_df.reset_index(drop=True)

In [21]:
# Attach to every test pair the element-wise product of its two embeddings.
test_df["emb_hadamard"] = test_df.apply(get_hadamard_product, axis=1)
test_df

Unnamed: 0,left,right,value,emb_hadamard
0,89,964,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,315,718,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,375,1230,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,170,1476,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,467,900,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
107968,346,1457,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
107969,37,1140,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
107970,384,1448,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
107971,358,906,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [22]:
# Renumber the rows 0..n-1 and discard the previous index.
test_df = test_df.reset_index(drop=True)

In [23]:
# Sanity check: shape of the product vector stored in the first training row.
reduced_train_df["emb_hadamard"].values[0].shape

(64,)

In [24]:
# Spread each product vector over one scalar column per dimension
# (emb_0, emb_1, ...). The number of columns is read from the stored vectors
# instead of a literal 64, matching how the random-forest cells select them.
emb_dim = reduced_train_df["emb_hadamard"].values[0].shape[0]
emb_cols = [f"emb_{i}" for i in range(emb_dim)]
reduced_train_df[emb_cols] = pd.DataFrame(
    reduced_train_df["emb_hadamard"].tolist(), index=reduced_train_df.index
)
reduced_train_df

Unnamed: 0,left,right,link_type,value,emb_hadamard,emb_0,emb_1,emb_2,emb_3,emb_4,...,emb_54,emb_55,emb_56,emb_57,emb_58,emb_59,emb_60,emb_61,emb_62,emb_63
0,0,645,1,0.0,"[0.0, 0.0053919507, 0.0018105786, 0.0, 0.0, 0....",0.0,0.005392,0.001811,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001456
1,0,646,1,0.0,"[0.0, 0.0052175275, 0.0017493754, 0.0, 0.0, 0....",0.0,0.005218,0.001749,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001408
2,0,647,1,0.0,"[0.0, 0.0052579595, 0.0017556173, 0.0, 0.0, 0....",0.0,0.005258,0.001756,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001419
3,0,648,1,0.0,"[0.0, 0.00553453, 0.0019905972, 0.0, 0.0, 0.0,...",0.0,0.005535,0.001991,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001493
4,0,649,1,0.0,"[0.0, 0.0052552004, 0.0018880406, 0.0, 0.0, 0....",0.0,0.005255,0.001888,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539860,644,1477,1,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
539861,644,1478,1,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
539862,644,1479,1,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
539863,644,1480,1,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [25]:
# Spread each product vector over one scalar column per dimension
# (emb_0, emb_1, ...). The number of columns is read from the stored vectors
# instead of a literal 64.
emb_dim = test_df["emb_hadamard"].values[0].shape[0]
emb_cols = [f"emb_{i}" for i in range(emb_dim)]
test_df[emb_cols] = pd.DataFrame(test_df["emb_hadamard"].tolist(), index=test_df.index)
test_df

Unnamed: 0,left,right,value,emb_hadamard,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,...,emb_54,emb_55,emb_56,emb_57,emb_58,emb_59,emb_60,emb_61,emb_62,emb_63
0,89,964,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,315,718,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,375,1230,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,170,1476,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,467,900,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107968,346,1457,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
107969,37,1140,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
107970,384,1448,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
107971,358,906,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
from sklearn.svm import SVC

# Feature columns are selected from the width of the stored product vectors,
# the same way the random-forest cell does, instead of a literal 64.
feature_cols = [f"emb_{i}" for i in range(reduced_train_df["emb_hadamard"].values[0].shape[0])]
# probability=True is required so that predict_proba is available afterwards.
clf = SVC(random_state=42, probability=True)
clf.fit(reduced_train_df[feature_cols].values, reduced_train_df["value"].values)

SVC(probability=True, random_state=42)

In [30]:
# Score the test pairs with the fitted SVC. The feature columns are selected
# from the width of the stored product vectors, the same way the
# random-forest cell does, instead of a literal 64.
feature_cols = [f"emb_{i}" for i in range(reduced_train_df["emb_hadamard"].values[0].shape[0])]
preds_prob = clf.predict_proba(test_df[feature_cols].values)
# Ground-truth labels of the test pairs.
y_true = test_df["value"].values

In [31]:
# Sanity check: shape of the second predict_proba column, which is used as the score below.
preds_prob[:, 1].shape

(107973,)

In [32]:
# Sanity check: shape of the label vector, to compare against the score vector.
y_true.shape

(107973,)

In [33]:
from sklearn.metrics import auc, precision_recall_curve, roc_auc_score

# Evaluate the SVC scores (second predict_proba column) against the labels:
# area under the precision-recall curve and area under the ROC curve.
precision, recall, thresholds = precision_recall_curve(y_true, preds_prob[:, 1])
auprc = auc(recall, precision)
auroc = roc_auc_score(y_true, preds_prob[:, 1])
print(f"AUROC = {auroc}")
print(f"AUPRC = {auprc}")

AUROC = 0.8940700356305318
AUPRC = 0.8361574920024466


In [26]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the per-dimension product columns; the column
# count is read from the width of the stored product vectors.
rf = RandomForestClassifier(max_depth=50, random_state=42)
rf.fit(
    reduced_train_df[
        [f"emb_{k}" for k in range(reduced_train_df["emb_hadamard"].values[0].shape[0])]
    ].values,
    reduced_train_df["value"].values,
)

RandomForestClassifier(max_depth=50, random_state=42)

In [30]:
# Score the test pairs with the fitted random forest, using the same
# per-dimension product columns as in training.
preds_prob_rf = rf.predict_proba(
    test_df[
        [f"emb_{k}" for k in range(reduced_train_df["emb_hadamard"].values[0].shape[0])]
    ].values
)
# Ground-truth labels of the test pairs.
y_true = test_df["value"].values

In [31]:
from sklearn.metrics import auc, precision_recall_curve, roc_auc_score

# Evaluate the random-forest scores (second predict_proba column) against
# the labels: area under the precision-recall curve and under the ROC curve.
precision, recall, thresholds = precision_recall_curve(y_true, preds_prob_rf[:, 1])
auprc = auc(recall, precision)
auroc = roc_auc_score(y_true, preds_prob_rf[:, 1])
print('Random forest')
print(f"AUROC = {auroc}")
print(f"AUPRC = {auprc}")

Random forest
AUROC = 0.9405357989559571
AUPRC = 0.853142431704718
