This notebook uses the files from NCMF and converts the drug-protein matrix into an edgelist that can be used by CSGNN.

In [1]:
import pandas as pd
import numpy as np

In [2]:
sample_id = 3
drug_count = 645
protein_count = 837
input_data_folder = "../../datasets/NCMF/PolyP3/"
link_file_train = input_data_folder + f"sampled{sample_id}_link.dat"

In [3]:
train_df = pd.read_csv(link_file_train, sep = "\t", header=None)
train_df.columns = ["left", "right", "link_type", "value"]
train_df

Unnamed: 0,left,right,link_type,value
0,0,0,0,1.0
1,0,6,0,1.0
2,0,8,0,1.0
3,0,10,0,1.0
4,0,17,0,1.0
...,...,...,...,...
160396,1480,1429,2,1.0
160397,1480,1448,2,1.0
160398,1480,1455,2,1.0
160399,1480,1459,2,1.0


In [4]:
# create matrices
X0 = np.zeros((drug_count, drug_count))
X1 = np.zeros((drug_count, protein_count))
X2 = np.zeros((protein_count, protein_count))
print(X0.shape)
print(X1.shape)
print(X2.shape)

(645, 645)
(645, 837)
(837, 837)


In [5]:
for idx, row in train_df[train_df["link_type"] == 0].iterrows():
    X0[int(row["left"])][int(row["right"])] = 1
for idx, row in train_df[train_df["link_type"] == 1].iterrows():
    X1[int(row["left"])][int(row["right"])-645] = 1
for idx, row in train_df[train_df["link_type"] == 2].iterrows():
    X2[int(row["left"])-645][int(row["right"])-645] = 1

In [6]:
print(np.count_nonzero(X0))
print(np.count_nonzero(X1))
print(np.count_nonzero(X2))

127591
12512
20298


In [7]:
print(train_df[train_df["link_type"] == 0].shape)
print(train_df[train_df["link_type"] == 1].shape)
print(train_df[train_df["link_type"] == 2].shape)

(127591, 4)
(12512, 4)
(20298, 4)


In [8]:
entity_df = pd.read_csv(input_data_folder + "entity.csv")
entity_df

Unnamed: 0,Entity Names
0,CID000000085
1,CID000000119
2,CID000000143
3,CID000000158
4,CID000000159
...,...
1477,84816
1478,4190
1479,92483
1480,10988


In [9]:
drugs = list(entity_df.iloc[0:645]["Entity Names"])
proteins = list(entity_df.iloc[645:]["Entity Names"])
print(len(drugs), len(proteins))

645 837


In [10]:
X1_df = pd.DataFrame(X1)
X1_df.columns = proteins
X1_df["drugs"] = drugs
X1_df.set_index("drugs", drop=True, inplace=True)
X1_df.index.name = None
X1_df = X1_df.add_prefix("protein_")
X1_df

Unnamed: 0,protein_3356,protein_3358,protein_3357,protein_3350,protein_150,protein_151,protein_152,protein_4988,protein_1814,protein_148,...,protein_51069,protein_6476,protein_3948,protein_728378,protein_4126,protein_84816,protein_4190,protein_92483,protein_10988,protein_221656
CID000000085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID000000119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID000000143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID000000158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID000000159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CID006435110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID006436173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID006447131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID006918453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
x, y = np.where(X1 != 0)
len(x)

12512

In [12]:
with open(f"../../CSGNN/data/PolyP3_{sample_id}.edgelist", "w") as f:
    for i, j in list(zip(x, y)):
        f.write(f"{i} {j+645}\n")
f.close()
    

In [13]:
X1_original = pd.read_csv("../../datasets/NCMF/PolyP3/drug-protein.csv", header=None)
X1_original

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,827,828,829,830,831,832,833,834,835,836
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
x, y = np.where(X1_original != 0)
len(x)

15575

In [15]:
len(np.unique(np.array(list(set(x) | set(y + 645)))))

1090

In [16]:
# with open(f"../../CSGNN/data/PolyP3_full.edgelist", "w") as f:
#     for i, j in list(zip(x, y)):
#         f.write(f"{i} {j}\n")
# f.close()

In [17]:
test_df = pd.read_csv("../../datasets/NCMF/PolyP3/sampled1_link.dat.test",sep="\t", header=None)
test_df.columns = ["left", "right", "value"]
test_df

Unnamed: 0,left,right,value
0,89,964,0
1,315,718,0
2,375,1230,0
3,170,1476,0
4,467,900,0
...,...,...,...
107968,346,1457,0
107969,37,1140,0
107970,384,1448,0
107971,358,906,0


In [18]:
test_df.sort_values(by="value", ascending=False)

Unnamed: 0,left,right,value
20767,192,645,1
98783,277,655,1
95293,419,708,1
2288,402,687,1
91901,401,755,1
...,...,...,...
36498,35,1168,0
36497,20,1356,0
36496,562,777,0
36495,252,675,0
