This notebook uses the PolyP3 sample files generated for NCMF as input and creates corresponding input files to be used by GRGMF.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# --- Configuration -----------------------------------------------------------
# Index of the sampled split to convert; it is embedded in the input file name
# and in the names of the generated output files.
sample_id = 3
# Entity counts; they determine the dimensions of the relation matrices.
drug_count = 645
protein_count = 837
# Folder holding the NCMF-formatted PolyP3 files (trailing slash is required
# because paths are built by plain string concatenation).
input_data_folder = "../../datasets/NCMF/PolyP3/"
# Tab-separated edge list of the selected sample.
link_file_train = f"{input_data_folder}sampled{sample_id}_link.dat"

In [3]:
# Load the header-less, tab-separated edge list and name its four columns
# directly while parsing: source entity id, target entity id, relation type
# and link value.
train_df = pd.read_csv(
    link_file_train,
    sep="\t",
    header=None,
    names=["left", "right", "link_type", "value"],
)
train_df

Unnamed: 0,left,right,link_type,value
0,0,0,0,1.0
1,0,6,0,1.0
2,0,8,0,1.0
3,0,10,0,1.0
4,0,17,0,1.0
...,...,...,...,...
160396,1480,1429,2,1.0
160397,1480,1448,2,1.0
160398,1480,1455,2,1.0
160399,1480,1459,2,1.0


In [4]:
# create matrices
X0 = np.zeros((drug_count, drug_count))
X1 = np.zeros((drug_count, protein_count))
X2 = np.zeros((protein_count, protein_count))
print(X0.shape)
print(X1.shape)
print(X2.shape)

(645, 645)
(645, 837)
(837, 837)


In [5]:
# Populate the three relation matrices from the edge list in train_df.
#
# Entity ids in the link file are global: the protein ids start right after
# the drug ids, so they are shifted by drug_count to obtain 0-based positions
# inside the protein axis of X1 / X2.
#   link_type 0 -> X0 (drug x drug)
#   link_type 1 -> X1 (drug x protein)
#   link_type 2 -> X2 (protein x protein)
# Every listed link is stored as 1; the "value" column is not consulted.
link_type_ids = train_df["link_type"].to_numpy()
left_ids = train_df["left"].to_numpy(dtype=int)
right_ids = train_df["right"].to_numpy(dtype=int)

for link_type, matrix, row_offset, col_offset in (
    (0, X0, 0, 0),
    (1, X1, 0, drug_count),
    (2, X2, drug_count, drug_count),
):
    is_link = link_type_ids == link_type
    # One vectorised assignment per matrix instead of a Python-level loop
    # over every row of the edge list.
    matrix[left_ids[is_link] - row_offset, right_ids[is_link] - col_offset] = 1

In [6]:
# Number of populated cells in each relation matrix; compare these against
# the number of links per relation type in the edge list.
for matrix in (X0, X1, X2):
    print(np.count_nonzero(matrix))

127591
12512
20298


In [7]:
# Number of edge-list rows per relation type (0, 1, 2), printed as the shape
# of the corresponding filtered frame.
for link_type in (0, 1, 2):
    rows_of_type = train_df[train_df["link_type"] == link_type]
    print(rows_of_type.shape)

(127591, 4)
(12512, 4)
(20298, 4)


In [8]:
# Load the entity table; its "Entity Names" column supplies the row/column
# labels for the exported matrices.
entity_file = f"{input_data_folder}entity.csv"
entity_df = pd.read_csv(entity_file)
entity_df

Unnamed: 0,Entity Names
0,CID000000085
1,CID000000119
2,CID000000143
3,CID000000158
4,CID000000159
...,...
1477,84816
1478,4190
1479,92483
1480,10988


In [9]:
# Split the entity names into drugs and proteins. The first drug_count rows
# of the entity table are the drugs and the remaining rows are the proteins;
# the split point comes from the configuration instead of a repeated literal
# so it cannot drift from the matrix dimensions.
entity_names = entity_df["Entity Names"]
drugs = entity_names.iloc[:drug_count].tolist()
proteins = entity_names.iloc[drug_count:].tolist()

# The labels must match the matrix dimensions, otherwise labelling the
# matrices below would fail (or be wrong).
assert len(drugs) == drug_count, "entity table has fewer rows than drug_count"
assert len(proteins) == protein_count, "protein rows do not match protein_count"
print(len(drugs), len(proteins))

645 837


In [10]:
# Drug x drug matrix labelled with drug names on both axes (unnamed index).
X0_df = pd.DataFrame(X0, index=drugs, columns=drugs)
X0_df

Unnamed: 0,CID000000085,CID000000119,CID000000143,CID000000158,CID000000159,CID000000191,CID000000206,CID000000214,CID000000271,CID000000298,...,CID005487301,CID005493381,CID005493444,CID006398525,CID006398970,CID006435110,CID006436173,CID006447131,CID006918453,CID009571074
CID000000085,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
CID000000119,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID000000143,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
CID000000158,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID000000159,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CID006435110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
CID006436173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
CID006447131,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
CID006918453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# Drug x protein matrix: rows labelled with drug names, columns labelled with
# protein names carrying a "protein_" prefix.
X1_df = pd.DataFrame(X1, index=drugs, columns=proteins).add_prefix("protein_")
X1_df

Unnamed: 0,protein_3356,protein_3358,protein_3357,protein_3350,protein_150,protein_151,protein_152,protein_4988,protein_1814,protein_148,...,protein_51069,protein_6476,protein_3948,protein_728378,protein_4126,protein_84816,protein_4190,protein_92483,protein_10988,protein_221656
CID000000085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID000000119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID000000143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID000000158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID000000159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CID006435110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID006436173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID006447131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CID006918453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Protein x protein matrix: rows keep the raw protein names, while the
# columns carry the "protein_" prefix.
X2_df = pd.DataFrame(X2, index=proteins, columns=proteins).add_prefix("protein_")
X2_df

Unnamed: 0,protein_3356,protein_3358,protein_3357,protein_3350,protein_150,protein_151,protein_152,protein_4988,protein_1814,protein_148,...,protein_51069,protein_6476,protein_3948,protein_728378,protein_4126,protein_84816,protein_4190,protein_92483,protein_10988,protein_221656
3356,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3358,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3357,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3350,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4190,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
92483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Write the three tab-separated files into the GRGMF data folder:
#   *_A_sim.txt  <- X0_df (drug x drug)
#   *_int.txt    <- X1_df cast to integers (drug x protein)
#   *_B_sim.txt  <- X2_df (protein x protein)
export_targets = [
    (X0_df, f"../../GRGMF/data/PolyP3_{sample_id}_A_sim.txt"),
    (X1_df.astype(int), f"../../GRGMF/data/PolyP3_{sample_id}_int.txt"),
    (X2_df, f"../../GRGMF/data/PolyP3_{sample_id}_B_sim.txt"),
]
for frame, target_path in export_targets:
    frame.to_csv(target_path, sep="\t")