This notebook takes the representations learnt by various methods for drugs and proteins and feeds them to a downstream classifier, which predicts the presence or absence of a drug-protein interaction for each pair in the test split.

In [1]:
import pandas as pd
import numpy as np
import torch
# Seed torch's global RNG so that the network's initial weights are reproducible.
torch.manual_seed(0)

<torch._C.Generator at 0x7f22fcbde190>

In [2]:
# Run configuration: which sampled split to use, how drug/protein embeddings
# are combined, and where the learnt representations live on disk.
sample_id = 1
method = "hadamard"  # set to hadamard or concat
repr_folder = "../../datasets/NCMF/PolyP3/trials/sigmoid_actf_lr1e-6_ct1e-5_wd1e-3_k200_hadamard_recon/"
emb_file = f"emb_sample_{sample_id}.dat"

In [3]:
def load(emb_file_path):
    """Read an embedding file into a dictionary of float32 vectors.

    The first line of the file is returned as ``train_para`` with its line
    terminator removed. Every following non-blank line is expected to be
    ``<index><TAB><v1> <v2> ...``: the text before the tab becomes the
    dictionary key (kept as a string) and the whitespace-separated values
    after it become a 1-D ``np.float32`` array.

    Args:
        emb_file_path: Path of the embedding file to read.

    Returns:
        A ``(train_para, emb_dict)`` tuple.

    Raises:
        ValueError: If the file is empty, or if a data line does not contain
            exactly one tab separator or holds non-numeric values.
    """
    train_para = None
    emb_dict = {}
    with open(emb_file_path, 'r') as emb_file:
        for i, raw_line in enumerate(emb_file):
            # Remove only the line terminator. Slicing with [:-1] would drop a
            # real character when the last line has no trailing newline.
            line = raw_line.rstrip('\n')
            if i == 0:
                train_para = line
                continue
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            index, emb = line.split('\t')
            emb_dict[index] = np.array(emb.split()).astype(np.float32)

    if train_para is None:
        raise ValueError(f"Embedding file is empty: {emb_file_path}")

    return train_para, emb_dict

In [4]:
# Load the learnt embeddings for this sample. The cells below treat ids 0-644
# as the "left" entities (645 rows of the interaction matrix) and subtract 645
# from the "right" ids.
train_para, emb_dict = load(repr_folder + emb_file) # indices 0 to 644 form drug embeddings here

In [5]:
# Read the tab-separated, header-less training link file for this sample.
ncmf_input_files_path = "../../datasets/NCMF/PolyP3/"
train_df = pd.read_csv(
    ncmf_input_files_path + f"sampled{sample_id}_link.dat",
    sep="\t",
    header=None,
    names=["left", "right", "link_type", "value"],
)
train_df

Unnamed: 0,left,right,link_type,value
0,0,0,0,1.0
1,0,6,0,1.0
2,0,8,0,1.0
3,0,10,0,1.0
4,0,17,0,1.0
...,...,...,...,...
160429,1480,1429,2,1.0
160430,1480,1448,2,1.0
160431,1480,1455,2,1.0
160432,1480,1459,2,1.0


In [6]:
# Distribution of `value` among the link_type == 1 rows.
train_df.loc[train_df["link_type"] == 1, "value"].value_counts()

1.0    12545
Name: value, dtype: int64

In [7]:
# Load the entity name table for inspection.
# NOTE(review): presumably the row position is the integer id used in the link files - confirm.
entity_df = pd.read_csv(ncmf_input_files_path + "entity.csv")
entity_df

Unnamed: 0,Entity Names
0,CID000000085
1,CID000000119
2,CID000000143
3,CID000000158
4,CID000000159
...,...
1477,84816
1478,4190
1479,92483
1480,10988


In [8]:
# Keep only the link_type == 1 rows; these are the links turned into the
# interaction matrix below.
reduced_train_df = train_df.loc[train_df["link_type"] == 1]
reduced_train_df

Unnamed: 0,left,right,link_type,value
127591,1,804,1,1.0
127592,1,828,1,1.0
127593,1,829,1,1.0
127594,1,830,1,1.0
127595,1,831,1,1.0
...,...,...,...,...
140131,628,1446,1,1.0
140132,629,892,1,1.0
140133,632,892,1,1.0
140134,632,942,1,1.0


In [9]:
# Dense 0/1 interaction matrix: one row per "left" id (645 of them) and one
# column per "right" id (837 of them, stored with an offset of 645).
X1_train = np.zeros((645, 837))

# Fill every observed (left, right) pair in a single vectorised assignment
# rather than looping over the frame with iterrows().
left_idx = reduced_train_df["left"].values.astype(int)
right_idx = reduced_train_df["right"].values.astype(int) - 645
X1_train[left_idx, right_idx] = 1
X1_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# Sanity check: number of cells set to 1 in the interaction matrix.
(X1_train == 1).sum()

12545

In [11]:
# Rebuild the training frame as the full 645 x 837 grid of (left, right) pairs
# in row-major order, so that it lines up with X1_train.flatten(): each left id
# is repeated 837 times while the right ids 645..1481 cycle 645 times.
reduced_train_df = pd.DataFrame(
    {
        "left": [left_id for left_id in range(0, 645) for _ in range(837)],
        "right": [right_id for _ in range(645) for right_id in range(645, 645 + 837)],
        "link_type": 1,
        "value": X1_train.flatten(),
    }
)
reduced_train_df

Unnamed: 0,left,right,link_type,value
0,0,645,1,0.0
1,0,646,1,0.0
2,0,647,1,0.0
3,0,648,1,0.0
4,0,649,1,0.0
...,...,...,...,...
539860,644,1477,1,0.0
539861,644,1478,1,0.0
539862,644,1479,1,0.0
539863,644,1480,1,0.0


In [12]:
# # add in zeros for training
# x, y = np.where(X1_train == 0)
# for i, j in list(zip(x, y)):
#     reduced_train_df = reduced_train_df.append({"left": i, "right": j + 645, "link_type": 1, "value": 0}, ignore_index=True)

In [13]:
# Label balance of the dense training grid (zero-filled pairs vs. observed links).
reduced_train_df["value"].value_counts()

0.0    527320
1.0     12545
Name: value, dtype: int64

In [14]:
# Read the tab-separated, header-less test link file for this sample.
test_df = pd.read_csv(
    ncmf_input_files_path + f"sampled{sample_id}_link.dat.test",
    sep="\t",
    header=None,
    names=["left", "right", "value"],
)
test_df

Unnamed: 0,left,right,value
0,89,964,0
1,315,718,0
2,375,1230,0
3,170,1476,0
4,467,900,0
...,...,...,...
107968,346,1457,0
107969,37,1140,0
107970,384,1448,0
107971,358,906,0


In [15]:
# Label balance of the test pairs.
test_df["value"].value_counts()

0    104943
1      3030
Name: value, dtype: int64

In [16]:
def get_hadamard_product(row):
    """Return the element-wise product of the two embeddings named by a row.

    ``row["left"]`` and ``row["right"]`` are converted to ``int`` and then to
    ``str`` to form the keys looked up in the module-level ``emb_dict``.
    """
    left_key, right_key = (str(int(row[side])) for side in ("left", "right"))
    return emb_dict[left_key] * emb_dict[right_key]

In [17]:
# One Hadamard-product vector per training pair, stored as an object column.
reduced_train_df["emb_hadamard"] = reduced_train_df.apply(get_hadamard_product, axis=1)
reduced_train_df

Unnamed: 0,left,right,link_type,value,emb_hadamard
0,0,645,1,0.0,"[0.00023570865, 0.0017247543, 0.011380602, 0.0..."
1,0,646,1,0.0,"[0.00016482345, 0.00063030113, 0.012250309, 0...."
2,0,647,1,0.0,"[0.00011385756, 0.0005549456, 0.012023116, 0.0..."
3,0,648,1,0.0,"[0.00021123305, 0.00039221224, 0.010079711, 0...."
4,0,649,1,0.0,"[0.00023507388, 0.0010328647, 0.010273051, 0.0..."
...,...,...,...,...,...
539860,644,1477,1,0.0,"[-0.00089835894, 0.0019469011, 0.006007275, 0...."
539861,644,1478,1,0.0,"[-0.0011698911, 0.0012682296, 0.0059938203, 0...."
539862,644,1479,1,0.0,"[-0.0009473978, 0.001829997, 0.005998034, 0.02..."
539863,644,1480,1,0.0,"[-0.0008848954, 0.0022914514, 0.006154008, 0.0..."


In [18]:
# Renumber the rows 0..n-1, discarding the previous index.
reduced_train_df = reduced_train_df.reset_index(drop=True)

In [19]:
# Re-check the label counts after adding the embedding column and resetting the index.
reduced_train_df["value"].value_counts()

0.0    527320
1.0     12545
Name: value, dtype: int64

In [20]:
# One Hadamard-product vector per test pair, stored as an object column.
test_df["emb_hadamard"] = test_df.apply(get_hadamard_product, axis=1)
test_df

Unnamed: 0,left,right,value,emb_hadamard
0,89,964,0,"[-0.00048404784, 0.0018291, 0.00698553, 0.0288..."
1,315,718,0,"[6.523688e-05, 0.00036913407, 0.0076034935, 0...."
2,375,1230,0,"[8.879748e-05, 0.0013260657, 0.0082228985, 0.0..."
3,170,1476,0,"[-0.00078403443, 0.0035086067, 0.008227681, 0...."
4,467,900,0,"[0.00055179757, 0.0016926968, 0.016691053, 0.0..."
...,...,...,...,...
107968,346,1457,0,"[0.00020281032, 0.0011192303, 0.014424974, 0.0..."
107969,37,1140,0,"[0.0002029961, 0.0019380077, 0.014615621, 0.01..."
107970,384,1448,0,"[0.00027665167, 0.0020447352, 0.0084478725, 0...."
107971,358,906,0,"[-0.00063080085, 0.0022097486, 0.010958516, 0...."


In [21]:
# Renumber the rows 0..n-1, discarding the previous index.
test_df = test_df.reset_index(drop=True)

In [22]:
# Length of one Hadamard-product vector; later cells use it as the number of emb_<i> feature columns.
reduced_train_df["emb_hadamard"].values[0].shape

(200,)

In [23]:
# Spread each embedding vector over scalar columns emb_0 .. emb_{d-1}.
train_emb_dim = reduced_train_df["emb_hadamard"].values[0].shape[0]
train_emb_columns = [f"emb_{i}" for i in range(0, train_emb_dim)]
train_emb_frame = pd.DataFrame(
    reduced_train_df.emb_hadamard.tolist(),
    index=reduced_train_df.index,
)
reduced_train_df[train_emb_columns] = train_emb_frame
reduced_train_df

Unnamed: 0,left,right,link_type,value,emb_hadamard,emb_0,emb_1,emb_2,emb_3,emb_4,...,emb_190,emb_191,emb_192,emb_193,emb_194,emb_195,emb_196,emb_197,emb_198,emb_199
0,0,645,1,0.0,"[0.00023570865, 0.0017247543, 0.011380602, 0.0...",0.000236,0.001725,0.011381,0.030737,-0.000234,...,0.001829,0.007265,0.012407,-0.008401,-0.004963,-0.000938,0.002927,-0.000693,-0.013598,0.000437
1,0,646,1,0.0,"[0.00016482345, 0.00063030113, 0.012250309, 0....",0.000165,0.000630,0.012250,0.031762,-0.000074,...,0.001191,0.006947,0.014814,-0.010631,-0.004768,-0.000913,0.001581,-0.000611,-0.010726,0.000328
2,0,647,1,0.0,"[0.00011385756, 0.0005549456, 0.012023116, 0.0...",0.000114,0.000555,0.012023,0.029355,-0.000067,...,0.001761,0.006746,0.014027,-0.009045,-0.003745,-0.000902,0.001123,-0.001431,-0.014374,0.000364
3,0,648,1,0.0,"[0.00021123305, 0.00039221224, 0.010079711, 0....",0.000211,0.000392,0.010080,0.032105,-0.000244,...,-0.001717,0.006689,0.015582,-0.008349,-0.002011,-0.001017,0.002054,-0.000650,-0.012568,0.000325
4,0,649,1,0.0,"[0.00023507388, 0.0010328647, 0.010273051, 0.0...",0.000235,0.001033,0.010273,0.034397,-0.000377,...,-0.001162,0.006054,0.015494,-0.008367,-0.002394,-0.001149,0.002301,-0.000280,-0.014652,0.000479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539860,644,1477,1,0.0,"[-0.00089835894, 0.0019469011, 0.006007275, 0....",-0.000898,0.001947,0.006007,0.026003,-0.000221,...,0.002572,0.008575,0.012251,-0.005174,-0.004265,-0.001973,0.003349,-0.000232,-0.010692,0.000351
539861,644,1478,1,0.0,"[-0.0011698911, 0.0012682296, 0.0059938203, 0....",-0.001170,0.001268,0.005994,0.026223,-0.000269,...,0.003370,0.009631,0.012453,-0.005346,-0.004525,-0.001718,0.003209,0.000104,-0.011791,0.000316
539862,644,1479,1,0.0,"[-0.0009473978, 0.001829997, 0.005998034, 0.02...",-0.000947,0.001830,0.005998,0.025326,-0.000297,...,0.002399,0.011029,0.013510,-0.006060,-0.005521,-0.001911,0.002957,-0.000111,-0.011799,0.000145
539863,644,1480,1,0.0,"[-0.0008848954, 0.0022914514, 0.006154008, 0.0...",-0.000885,0.002291,0.006154,0.023310,-0.000316,...,0.001779,0.009856,0.011909,-0.003764,-0.004496,-0.001560,0.001740,-0.000919,-0.014956,0.000428


In [24]:
# Spread each test embedding vector over scalar columns emb_0 .. emb_{d-1}.
# The column count is taken from the training frame's first embedding.
test_emb_dim = reduced_train_df["emb_hadamard"].values[0].shape[0]
test_emb_columns = [f"emb_{i}" for i in range(0, test_emb_dim)]
test_emb_frame = pd.DataFrame(
    test_df.emb_hadamard.tolist(),
    index=test_df.index,
)
test_df[test_emb_columns] = test_emb_frame
test_df

Unnamed: 0,left,right,value,emb_hadamard,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,...,emb_190,emb_191,emb_192,emb_193,emb_194,emb_195,emb_196,emb_197,emb_198,emb_199
0,89,964,0,"[-0.00048404784, 0.0018291, 0.00698553, 0.0288...",-0.000484,0.001829,0.006986,0.028838,-0.000708,0.012887,...,0.001437,0.007900,0.013239,-0.004949,-0.003293,-0.002360,0.002764,-0.000234,-0.009127,0.000295
1,315,718,0,"[6.523688e-05, 0.00036913407, 0.0076034935, 0....",0.000065,0.000369,0.007603,0.031199,0.000070,0.019466,...,0.000963,0.009413,0.015290,-0.006225,-0.003654,-0.000838,0.002688,0.000053,-0.010107,0.000769
2,375,1230,0,"[8.879748e-05, 0.0013260657, 0.0082228985, 0.0...",0.000089,0.001326,0.008223,0.029855,-0.001170,0.017782,...,0.003029,0.006545,0.014845,-0.003615,-0.004668,-0.004507,0.001990,-0.000218,-0.011511,0.000507
3,170,1476,0,"[-0.00078403443, 0.0035086067, 0.008227681, 0....",-0.000784,0.003509,0.008228,0.025607,0.000081,0.016861,...,0.001504,0.011866,0.014570,-0.005597,-0.004388,-0.000971,0.002002,-0.000860,-0.013065,0.000704
4,467,900,0,"[0.00055179757, 0.0016926968, 0.016691053, 0.0...",0.000552,0.001693,0.016691,0.020178,0.000214,0.015509,...,0.006503,0.003277,0.016757,-0.007639,0.000169,-0.003095,-0.000478,-0.000632,-0.015055,0.000695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107968,346,1457,0,"[0.00020281032, 0.0011192303, 0.014424974, 0.0...",0.000203,0.001119,0.014425,0.022415,0.000549,0.018064,...,0.002778,0.004414,0.018418,-0.008146,0.000039,-0.004602,-0.000279,-0.000351,-0.019296,0.001294
107969,37,1140,0,"[0.0002029961, 0.0019380077, 0.014615621, 0.01...",0.000203,0.001938,0.014616,0.018420,0.000648,0.012788,...,0.004502,0.002293,0.013229,-0.010133,-0.000979,-0.002325,-0.000755,-0.000471,-0.013052,0.000675
107970,384,1448,0,"[0.00027665167, 0.0020447352, 0.0084478725, 0....",0.000277,0.002045,0.008448,0.027166,0.000123,0.014925,...,0.004016,0.007680,0.010972,-0.003500,0.000002,-0.000578,0.001337,0.000569,-0.015479,-0.000233
107971,358,906,0,"[-0.00063080085, 0.0022097486, 0.010958516, 0....",-0.000631,0.002210,0.010959,0.027748,-0.000313,0.013710,...,0.001264,0.002144,0.013782,-0.003745,-0.002579,-0.003338,0.001582,-0.000499,-0.014092,0.001044


In [25]:
import torch.nn as nn
from typing import List, Tuple, Optional, Any
from torch import Tensor

In [26]:
class recon_net(nn.Module):
    """Four-layer fully connected network with a sigmoid after every layer.

    Layer widths are ``in_features -> 150 -> 100 -> 50 -> 1``; because the
    last layer is also passed through the sigmoid, each output lies in (0, 1).

    Args:
        in_features: Size of each input vector.
        method: Stored on the instance as ``self.method``; it is not read by
            ``forward``.
    """

    def __init__(self, in_features: int, method: str) -> None:
        super().__init__()
        self.actf = nn.Sigmoid()
        self.method = method
        # Layers are created in this order so that parameter initialisation
        # draws from the RNG in the same sequence.
        self.fc1 = nn.Linear(in_features, 150)
        self.fc2 = nn.Linear(150, 100)
        self.fc3 = nn.Linear(100, 50)
        self.fc4 = nn.Linear(50, 1)

    def forward(self, emb: Tensor) -> Tensor:
        """Apply fc1..fc4 in turn, each followed by the sigmoid activation."""
        hidden = emb
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = self.actf(layer(hidden))
        return hidden

In [27]:
# Class weights. NOTE: `weights` is only referenced by the commented-out
# CrossEntropyLoss in the model-setup cell; the active MSELoss does not use it.
# weights = reduced_train_df["value"].value_counts().sort_index(ascending=True).values/reduced_train_df.shape[0]
weights = [0.9, 0.1]
print(weights)

[0.9, 0.1]


In [28]:
# Training hyper-parameters: maximum number of epochs and the loss-change
# threshold used by the training loop's convergence check.
epochs = 1000
threshold = 1e-6

# Objective: mean squared error between the prediction and the 0/1 label.
# loss = nn.CrossEntropyLoss(weight = torch.Tensor(weights))
loss = nn.MSELoss(reduction="mean")

# Model sized to the embedding length, optimised with Adam.
net = recon_net(
    in_features=len(reduced_train_df["emb_hadamard"].values[0]),
    method=method,
)
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)

In [29]:
# Full-batch training: every epoch runs one forward/backward pass over the
# whole training frame.
# The inputs and targets never change between epochs, so the DataFrame ->
# tensor conversion is done once here instead of on every iteration.
feature_columns = [f"emb_{i}" for i in range(0, reduced_train_df["emb_hadamard"].values[0].shape[0])]
net_input = torch.Tensor(reduced_train_df[feature_columns].values)
output_actual = torch.tensor(reduced_train_df["value"].values).type(torch.FloatTensor).view(-1, 1)

epoch_loss = []
for e in range(epochs):
    optimizer.zero_grad()
    output_pred = net(net_input)
    loss_e = loss(output_pred, output_actual)
    loss_e.backward()
    optimizer.step()

    epoch_loss.append(loss_e.item())

    if e % 10 == 0:
        print(f"Epoch loss for epoch {e} = {loss_e.item()}")
    # Stop early once the loss changes by less than `threshold` between two
    # consecutive epochs.
    if (len(epoch_loss) >= 2) and abs(epoch_loss[-1] - epoch_loss[-2]) < threshold:
        print("Converged!!!")
        break

Epoch loss for epoch 0 = 0.22501206398010254
Epoch loss for epoch 10 = 0.206857830286026
Epoch loss for epoch 20 = 0.18976792693138123
Epoch loss for epoch 30 = 0.17385371029376984
Epoch loss for epoch 40 = 0.1591562181711197
Epoch loss for epoch 50 = 0.1456659436225891
Epoch loss for epoch 60 = 0.13335110247135162
Epoch loss for epoch 70 = 0.12217070162296295
Epoch loss for epoch 80 = 0.11207748204469681
Epoch loss for epoch 90 = 0.10301680862903595
Epoch loss for epoch 100 = 0.0949256420135498
Epoch loss for epoch 110 = 0.08773314952850342
Epoch loss for epoch 120 = 0.08136282861232758
Epoch loss for epoch 130 = 0.07573547214269638
Epoch loss for epoch 140 = 0.0707721933722496
Epoch loss for epoch 150 = 0.06639714539051056
Epoch loss for epoch 160 = 0.06253939867019653
Epoch loss for epoch 170 = 0.0591340996325016
Epoch loss for epoch 180 = 0.056123122572898865
Epoch loss for epoch 190 = 0.05345501750707626
Epoch loss for epoch 200 = 0.051084764301776886
Epoch loss for epoch 210 = 0.

In [30]:
# Score the test pairs with the trained network.
test_inputs = torch.Tensor(test_df[[f"emb_{i}" for i in range(0, reduced_train_df["emb_hadamard"].values[0].shape[0])]].values)

# Inference only: switch to eval mode and disable autograd so no computation
# graph is kept for the whole test set.
net.eval()
with torch.no_grad():
    test_outputs = net(test_inputs)

In [31]:
# Use the network's raw outputs as the prediction scores (no argmax over classes).
# res = torch.argmax(test_outputs, dim = 1)
res = test_outputs

In [32]:
# Ground-truth labels of the test pairs, as a NumPy array.
y_true = test_df["value"].values

In [33]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# The network emits an (N, 1) tensor; flatten it to the 1-D score vector the
# metric functions expect.
scores = res.detach().numpy().ravel()

auroc = roc_auc_score(y_true, scores)
precision, recall, thresholds = precision_recall_curve(y_true, scores)
# Area under the precision-recall curve, integrated over recall.
auprc = auc(recall, precision)

# The classifier evaluated here is the recon_net MLP, not a random forest.
print('MLP classifier (recon_net)')
print(f"AUROC = {auroc}")
print(f"AUPRC = {auprc}")

Random forest
AUROC = 0.5782970507107599
AUPRC = 0.053212997141920404
