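This notebook takes the drug and protein representations learnt by an embedding method (here, an NCMF trial), combines each drug–protein pair into a single feature vector via the Hadamard (element-wise) product, and trains a downstream classifier on those features to predict the presence/absence of a drug–protein interaction in the test split.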

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Which sampled train/test split to evaluate.
sample_id = 1
# Folder holding the learnt embeddings of the trial under evaluation
# (a plain string: there is nothing to interpolate here).
repr_folder = "../../datasets/NCMF/PolyP3/trials/sigmoid_actf_lr1e-6_ct1e-5_wd1e-3_k200_hadamard_deep_recon1/"
# Embedding file name is derived from the chosen sample.
emb_file = f"emb_sample_{sample_id}.dat"

In [3]:
def load(emb_file_path):
    """Read an embedding file into a header string and an id -> vector mapping.

    The first line of the file is returned as ``train_para`` with its line
    terminator removed. Every following non-blank line must consist of an
    index and a whitespace-separated list of numbers, separated by one TAB.

    Parameters
    ----------
    emb_file_path : str or path-like
        Path of the embedding file to read.

    Returns
    -------
    train_para : str
        Contents of the first line.
    emb_dict : dict[str, numpy.ndarray]
        Maps each index (kept as the string found in the file) to its
        embedding as a 1-D ``float32`` array.

    Raises
    ------
    ValueError
        If the file is empty, or if a data line does not contain exactly one
        TAB separating the index from the values.
    """
    train_para = None
    emb_dict = {}
    with open(emb_file_path, 'r') as emb_file:
        for i, line in enumerate(emb_file):
            # Remove only the newline. Slicing with [:-1] would eat a real
            # character when the final line has no trailing newline.
            line = line.rstrip('\n')
            if i == 0:
                train_para = line
                continue
            if not line.strip():
                # Tolerate blank lines (e.g. a stray one at end of file).
                continue
            index, emb = line.split('\t')
            emb_dict[index] = np.array(emb.split()).astype(np.float32)

    if train_para is None:
        raise ValueError(f"embedding file is empty: {emb_file_path}")

    return train_para, emb_dict

In [4]:
train_para, emb_dict = load(repr_folder + emb_file) # indices 0 to 644 form the 645 drug embeddings here; the cells below offset the other entity type by 645

In [5]:
ncmf_input_files_path = "../../datasets/NCMF/PolyP3/"
# Training links: tab-separated, no header row, so the column names are supplied here.
train_df = pd.read_csv(
    ncmf_input_files_path + f"sampled{sample_id}_link.dat",
    sep="\t",
    header=None,
    names=["left", "right", "link_type", "value"],
)
train_df

Unnamed: 0,left,right,link_type,value
0,0,0,0,1.0
1,0,6,0,1.0
2,0,8,0,1.0
3,0,10,0,1.0
4,0,17,0,1.0
...,...,...,...,...
160429,1480,1429,2,1.0
160430,1480,1448,2,1.0
160431,1480,1455,2,1.0
160432,1480,1459,2,1.0


In [6]:
# Distribution of link values among the link_type == 1 rows.
train_df.loc[train_df["link_type"] == 1, "value"].value_counts()

1.0    12545
Name: value, dtype: int64

In [7]:
# Lookup table of entity names stored next to the link files.
entity_csv = ncmf_input_files_path + "entity.csv"
entity_df = pd.read_csv(entity_csv)
entity_df

Unnamed: 0,Entity Names
0,CID000000085
1,CID000000119
2,CID000000143
3,CID000000158
4,CID000000159
...,...
1477,84816
1478,4190
1479,92483
1480,10988


In [8]:
# Keep only the link_type == 1 rows; these are the links the classifier is trained on.
reduced_train_df = train_df.loc[train_df["link_type"].eq(1)]
reduced_train_df

Unnamed: 0,left,right,link_type,value
127591,1,804,1,1.0
127592,1,828,1,1.0
127593,1,829,1,1.0
127594,1,830,1,1.0
127595,1,831,1,1.0
...,...,...,...,...
140131,628,1446,1,1.0
140132,629,892,1,1.0
140133,632,892,1,1.0
140134,632,942,1,1.0


In [9]:
# Dense 0/1 matrix of the observed links: one row per "left" entity and one
# column per "right" entity.
n_left_entities = 645   # "left" ids run 0..644
n_right_entities = 837  # "right" ids start at 645, hence the offset below

# Vectorised scatter instead of a Python-level iterrows() loop: all observed
# (left, right) cells are set to 1 in a single indexing operation.
link_rows = reduced_train_df["left"].to_numpy().astype(int)
link_cols = reduced_train_df["right"].to_numpy().astype(int) - n_left_entities

X1_train = np.zeros((n_left_entities, n_right_entities))
X1_train[link_rows, link_cols] = 1
X1_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# Sanity check: count the cells set to 1 in the dense matrix. This equals the
# number of link rows written above unless some (left, right) pair repeats.
(X1_train == 1).sum()

12545

In [11]:
# Rebuild the training frame as the full 645 x 837 grid of (left, right)
# pairs, in the same row-major order as X1_train.flatten(), so every cell of
# the matrix (ones and zeros) becomes one labelled row.
grid_left = np.repeat(np.arange(0, 645, dtype=np.int64), 837)         # 0,0,...,1,1,...
grid_right = np.tile(np.arange(645, 645 + 837, dtype=np.int64), 645)  # 645..1481 repeated per left id
reduced_train_df = pd.DataFrame(
    {
        "left": grid_left,
        "right": grid_right,
        "link_type": 1,
        "value": X1_train.flatten(),
    },
    columns=["left", "right", "link_type", "value"],
)
reduced_train_df

Unnamed: 0,left,right,link_type,value
0,0,645,1,0.0
1,0,646,1,0.0
2,0,647,1,0.0
3,0,648,1,0.0
4,0,649,1,0.0
...,...,...,...,...
539860,644,1477,1,0.0
539861,644,1478,1,0.0
539862,644,1479,1,0.0
539863,644,1480,1,0.0


In [12]:
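# # add in zeros for training (superseded by the dense grid built in the previous cell; kept for reference only - row-wise DataFrame.append is quadratic and was removed in pandas 2.0)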
# x, y = np.where(X1_train == 0)
# for i, j in list(zip(x, y)):
#     reduced_train_df = reduced_train_df.append({"left": i, "right": j + 645, "link_type": 1, "value": 0}, ignore_index=True)

In [13]:
# Class balance of the dense training grid (0 = no link recorded, 1 = link).
reduced_train_df["value"].value_counts()

0.0    527320
1.0     12545
Name: value, dtype: int64

In [14]:
# Held-out pairs for the same sample: tab-separated, no header row.
test_links_path = ncmf_input_files_path + f"sampled{sample_id}_link.dat.test"
test_df = pd.read_csv(
    test_links_path,
    sep="\t",
    header=None,
    names=["left", "right", "value"],
)
test_df

Unnamed: 0,left,right,value
0,89,964,0
1,315,718,0
2,375,1230,0
3,170,1476,0
4,467,900,0
...,...,...,...
107968,346,1457,0
107969,37,1140,0
107970,384,1448,0
107971,358,906,0


In [15]:
# Class balance of the test split.
test_df["value"].value_counts()

0    104943
1      3030
Name: value, dtype: int64

In [16]:
def get_hadamard_product(row, embeddings=None):
    """Return the element-wise product of a pair's two embeddings.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide ``row["left"]`` and ``row["right"]``. Each value is
        converted with ``str(int(...))`` to form the lookup key.
    embeddings : dict[str, numpy.ndarray], optional
        Lookup table from id string to embedding vector. Defaults to the
        notebook-level ``emb_dict`` so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        ``embeddings[left] * embeddings[right]``, element-wise.

    Raises
    ------
    KeyError
        If either id is missing from the lookup table.
    """
    if embeddings is None:
        # Resolved at call time so the notebook global can be (re)loaded later.
        embeddings = emb_dict
    left_emb = embeddings[str(int(row["left"]))]
    right_emb = embeddings[str(int(row["right"]))]
    return np.multiply(left_emb, right_emb)

In [17]:
# One pair-feature vector per training row; the function is passed directly, no wrapper needed.
reduced_train_df["emb_hadamard"] = reduced_train_df.apply(get_hadamard_product, axis=1)
reduced_train_df

Unnamed: 0,left,right,link_type,value,emb_hadamard
0,0,645,1,0.0,"[-2.843807e-05, -0.00023545056, -0.00041261216..."
1,0,646,1,0.0,"[8.591136e-05, -0.0003064621, -0.0005390078, -..."
2,0,647,1,0.0,"[0.00029363463, -0.00028544624, -0.00058832654..."
3,0,648,1,0.0,"[-0.0012859012, -0.00036496072, -0.00043236735..."
4,0,649,1,0.0,"[-0.0008193139, -0.00030301115, -0.00066416775..."
...,...,...,...,...,...
539860,644,1477,1,0.0,"[-0.0004823991, 0.002453412, -0.0004459532, -0..."
539861,644,1478,1,0.0,"[-3.782197e-05, 0.0024777607, -0.0002187362, -..."
539862,644,1479,1,0.0,"[-7.1266e-05, 0.0022306112, -0.00042474567, -0..."
539863,644,1480,1,0.0,"[7.6308825e-05, 0.0021923312, -0.0005966915, -..."


In [18]:
# Renumber rows 0..n-1, discarding the old index.
reduced_train_df = reduced_train_df.reset_index(drop=True)

In [19]:
# Same pair-feature construction for the held-out rows.
test_df["emb_hadamard"] = test_df.apply(get_hadamard_product, axis=1)
test_df

Unnamed: 0,left,right,value,emb_hadamard
0,89,964,0,"[2.0518235e-05, 0.003596268, -0.0005952283, -0..."
1,315,718,0,"[-0.00027651945, 0.0009618267, -0.00038450595,..."
2,375,1230,0,"[8.324516e-05, 0.0027363673, 0.0008073522, -0...."
3,170,1476,0,"[-0.00050329993, 0.00077419385, 4.9240505e-05,..."
4,467,900,0,"[-0.0011531753, 0.00043058532, 0.001284379, -0..."
...,...,...,...,...
107968,346,1457,0,"[-0.00029146194, 0.00021748731, 0.00081251503,..."
107969,37,1140,0,"[-0.00043620938, -0.0005631415, -4.9192662e-05..."
107970,384,1448,0,"[-0.00013446993, -3.2599738e-07, 9.801878e-05,..."
107971,358,906,0,"[0.00022103061, 0.0027756426, 0.00050498283, -..."


In [20]:
# Renumber rows 0..n-1, discarding the old index.
test_df = test_df.reset_index(drop=True)

In [21]:
# Length of one pair-feature vector; the column-expansion cells below create one emb_<i> column per entry.
reduced_train_df["emb_hadamard"].values[0].shape

(200,)

In [22]:
# Spread each pair-feature vector over scalar columns emb_0 .. emb_{d-1}.
train_emb_dim = reduced_train_df["emb_hadamard"].values[0].shape[0]
train_emb_cols = [f"emb_{i}" for i in range(0, train_emb_dim)]
train_emb_frame = pd.DataFrame(
    reduced_train_df.emb_hadamard.tolist(),
    index=reduced_train_df.index,
)
reduced_train_df[train_emb_cols] = train_emb_frame
reduced_train_df

Unnamed: 0,left,right,link_type,value,emb_hadamard,emb_0,emb_1,emb_2,emb_3,emb_4,...,emb_190,emb_191,emb_192,emb_193,emb_194,emb_195,emb_196,emb_197,emb_198,emb_199
0,0,645,1,0.0,"[-2.843807e-05, -0.00023545056, -0.00041261216...",-0.000028,-0.000235,-0.000413,-0.011143,0.000491,...,-0.000774,-0.001678,-0.000781,-0.001169,0.000931,-0.005930,0.001148,-0.000826,-0.000326,-0.002248
1,0,646,1,0.0,"[8.591136e-05, -0.0003064621, -0.0005390078, -...",0.000086,-0.000306,-0.000539,-0.011171,0.000398,...,-0.000748,-0.001933,-0.001015,-0.002895,0.000361,-0.005471,0.000809,0.000465,-0.000237,-0.003768
2,0,647,1,0.0,"[0.00029363463, -0.00028544624, -0.00058832654...",0.000294,-0.000285,-0.000588,-0.011942,0.000413,...,-0.000973,0.002920,-0.000857,-0.002215,0.000769,-0.005281,0.000717,-0.000263,-0.000296,-0.001142
3,0,648,1,0.0,"[-0.0012859012, -0.00036496072, -0.00043236735...",-0.001286,-0.000365,-0.000432,-0.013082,0.000595,...,-0.001102,-0.000156,-0.000447,-0.001635,0.000246,-0.005513,0.000419,0.000085,-0.000009,-0.010382
4,0,649,1,0.0,"[-0.0008193139, -0.00030301115, -0.00066416775...",-0.000819,-0.000303,-0.000664,-0.013293,0.000612,...,-0.001062,0.001828,-0.000479,-0.001415,0.000180,-0.005921,0.000982,-0.000321,-0.000053,-0.009772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539860,644,1477,1,0.0,"[-0.0004823991, 0.002453412, -0.0004459532, -0...",-0.000482,0.002453,-0.000446,-0.009273,-0.000576,...,0.000105,0.000034,-0.000718,0.000537,-0.000186,-0.006342,0.000616,-0.000274,0.000245,-0.001122
539861,644,1478,1,0.0,"[-3.782197e-05, 0.0024777607, -0.0002187362, -...",-0.000038,0.002478,-0.000219,-0.008138,-0.000526,...,0.000095,0.000600,-0.000932,0.000764,0.000300,-0.006334,0.000681,0.000265,0.000372,-0.002807
539862,644,1479,1,0.0,"[-7.1266e-05, 0.0022306112, -0.00042474567, -0...",-0.000071,0.002231,-0.000425,-0.010882,-0.000416,...,0.000090,0.000078,-0.000735,0.000073,0.000017,-0.006323,0.000529,0.000103,0.000320,-0.000734
539863,644,1480,1,0.0,"[7.6308825e-05, 0.0021923312, -0.0005966915, -...",0.000076,0.002192,-0.000597,-0.010015,-0.000561,...,0.000113,-0.002812,-0.000837,0.000821,0.000200,-0.008109,0.000689,0.000096,0.000233,-0.001099


In [23]:
# Same expansion for the test rows. The column count is taken from the
# training frame so that both frames expose identical emb_<i> columns.
test_emb_dim = reduced_train_df["emb_hadamard"].values[0].shape[0]
test_emb_cols = [f"emb_{i}" for i in range(0, test_emb_dim)]
test_emb_frame = pd.DataFrame(
    test_df.emb_hadamard.tolist(),
    index=test_df.index,
)
test_df[test_emb_cols] = test_emb_frame
test_df

Unnamed: 0,left,right,value,emb_hadamard,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,...,emb_190,emb_191,emb_192,emb_193,emb_194,emb_195,emb_196,emb_197,emb_198,emb_199
0,89,964,0,"[2.0518235e-05, 0.003596268, -0.0005952283, -0...",0.000021,3.596268e-03,-0.000595,-0.007119,-0.001565,-0.000581,...,0.000426,-0.000194,0.000151,0.000351,-0.000269,-0.007335,0.000968,3.271742e-04,-0.000004,-0.004561
1,315,718,0,"[-0.00027651945, 0.0009618267, -0.00038450595,...",-0.000277,9.618267e-04,-0.000385,-0.007499,-0.002460,-0.001276,...,0.000051,0.001010,-0.000321,-0.002968,0.000673,-0.005449,0.000798,4.275242e-04,0.000030,-0.005228
2,375,1230,0,"[8.324516e-05, 0.0027363673, 0.0008073522, -0....",0.000083,2.736367e-03,0.000807,-0.007663,-0.001065,-0.000420,...,-0.002726,-0.001105,0.000112,-0.000485,0.000261,-0.006686,0.000129,3.130826e-05,0.000027,-0.006222
3,170,1476,0,"[-0.00050329993, 0.00077419385, 4.9240505e-05,...",-0.000503,7.741938e-04,0.000049,-0.008690,-0.001223,-0.000343,...,0.000532,-0.000228,-0.001007,-0.000236,0.000200,-0.006588,0.000968,3.943312e-07,-0.000074,-0.003780
4,467,900,0,"[-0.0011531753, 0.00043058532, 0.001284379, -0...",-0.001153,4.305853e-04,0.001284,-0.003561,-0.000291,-0.000215,...,-0.000391,0.000447,-0.000714,-0.001694,0.000252,-0.004547,0.000469,1.171313e-03,-0.000095,0.003478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107968,346,1457,0,"[-0.00029146194, 0.00021748731, 0.00081251503,...",-0.000291,2.174873e-04,0.000813,-0.008102,-0.001487,-0.001240,...,-0.001387,-0.000457,-0.000148,0.000468,0.000023,-0.007164,0.000333,-2.164784e-04,0.000139,-0.004475
107969,37,1140,0,"[-0.00043620938, -0.0005631415, -4.9192662e-05...",-0.000436,-5.631415e-04,-0.000049,-0.006378,-0.000514,-0.001500,...,0.000911,-0.000351,-0.001227,0.000232,-0.000013,-0.007310,0.000164,2.145133e-04,-0.000887,0.000156
107970,384,1448,0,"[-0.00013446993, -3.2599738e-07, 9.801878e-05,...",-0.000134,-3.259974e-07,0.000098,-0.009075,0.000424,-0.001587,...,-0.001432,-0.000947,-0.000864,0.000175,0.000027,-0.005087,0.000536,-7.811708e-04,-0.000725,0.000534
107971,358,906,0,"[0.00022103061, 0.0027756426, 0.00050498283, -...",0.000221,2.775643e-03,0.000505,-0.008419,-0.000984,-0.001075,...,-0.000688,-0.001295,-0.000097,-0.000011,0.000070,-0.007075,0.000340,2.408595e-04,0.000189,-0.005915


In [24]:
from sklearn.svm import SVC

# Feature matrix = the emb_<i> columns; target = the 0/1 link indicator.
svc_feature_cols = [
    f"emb_{i}"
    for i in range(0, reduced_train_df["emb_hadamard"].values[0].shape[0])
]
svc_X_train = reduced_train_df[svc_feature_cols].values
svc_y_train = reduced_train_df["value"].values

# probability=True is needed for predict_proba in the evaluation cells.
clf = SVC(random_state=42, probability=True)
clf.fit(svc_X_train, svc_y_train)

SVC(probability=True, random_state=42)

In [25]:
# Score the held-out pairs using the same emb_<i> feature columns as in training.
eval_feature_cols = [
    f"emb_{i}"
    for i in range(0, reduced_train_df["emb_hadamard"].values[0].shape[0])
]
eval_X_test = test_df[eval_feature_cols].values
preds_prob = clf.predict_proba(eval_X_test)
y_true = test_df["value"].values

In [26]:
# Shape of the second probability column, which the metrics cell uses as the score.
preds_prob[:, 1].shape

(107973,)

In [27]:
# Shape of the ground-truth label vector taken from test_df["value"].
y_true.shape

(107973,)

In [28]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Second predict_proba column is used as the ranking score for both metrics.
positive_scores = preds_prob[:, 1]

auroc = roc_auc_score(y_true, positive_scores)

# AUPRC here is the trapezoidal area under the precision-recall curve.
precision, recall, thresholds = precision_recall_curve(y_true, positive_scores)
auprc = auc(recall, precision)

print(f"AUROC = {auroc}")
print(f"AUPRC = {auprc}")

AUROC = 0.9693550033085696
AUPRC = 0.9044914487971679
