This notebook takes the PolyP3 sample link file generated for NCMF as input and converts it into the corresponding adjacency-matrix CSV files (`d-d.csv`, `d-p.csv`, `p-p.csv`) to be used as input by NIMCGCN.

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Which PolyP3 sample to convert; used in the input file name and the output folder.
sample_id = 3

# Entity counts that fix the dimensions of the matrices built below.
drug_count, protein_count = 645, 837

# Location of the NCMF-format link list for the chosen sample.
input_data_folder = "../../datasets/NCMF/PolyP3/"
link_file_train = f"{input_data_folder}sampled{sample_id}_link.dat"

In [3]:
# Read the tab-separated, header-less link list and label its four columns.
link_columns = ["left", "right", "link_type", "value"]
train_df = pd.read_csv(link_file_train, header=None, sep="\t")
train_df.columns = link_columns
train_df

Unnamed: 0,left,right,link_type,value
0,0,0,0,1.0
1,0,6,0,1.0
2,0,8,0,1.0
3,0,10,0,1.0
4,0,17,0,1.0
...,...,...,...,...
160396,1480,1429,2,1.0
160397,1480,1448,2,1.0
160398,1480,1455,2,1.0
160399,1480,1459,2,1.0


In [4]:
# Allocate one all-zero matrix per link type:
#   X0 is drug x drug, X1 is drug x protein, X2 is protein x protein.
drug_drug_shape = (drug_count, drug_count)
drug_protein_shape = (drug_count, protein_count)
protein_protein_shape = (protein_count, protein_count)

X0 = np.zeros(drug_drug_shape)
X1 = np.zeros(drug_protein_shape)
X2 = np.zeros(protein_protein_shape)

# Show the shapes as a quick sanity check.
for matrix in (X0, X1, X2):
    print(matrix.shape)

(645, 645)
(645, 837)
(837, 837)


In [5]:
# Fill the adjacency matrices from the link list, one link type per matrix:
#   link_type 0 -> X0, link_type 1 -> X1, link_type 2 -> X2.
# Indices that address a protein axis are shifted down by drug_count so that
# they start at 0; this assumes protein ids directly follow the drug ids in
# the link file (TODO confirm against the NCMF data description).
# The shift uses the configured drug_count rather than a hard-coded 645, so
# the cell stays correct if the configuration changes.
# NOTE: an id smaller than its offset would become a negative index, which
# NumPy resolves by wrapping around silently instead of raising.
link_types = train_df["link_type"].to_numpy()
left_ids = train_df["left"].to_numpy().astype(int)
right_ids = train_df["right"].to_numpy().astype(int)

# (target matrix, link type, row offset, column offset)
targets = (
    (X0, 0, 0, 0),
    (X1, 1, 0, drug_count),
    (X2, 2, drug_count, drug_count),
)
for matrix, link_type, row_offset, col_offset in targets:
    selected = link_types == link_type
    # Integer-array assignment marks every listed cell in one vectorised step,
    # replacing the row-by-row iterrows() loop.
    matrix[left_ids[selected] - row_offset, right_ids[selected] - col_offset] = 1

In [6]:
# Number of non-zero cells in each matrix, printed in the order X0, X1, X2.
for matrix in (X0, X1, X2):
    print(np.count_nonzero(matrix))

127591
12512
20298


In [7]:
# Shape of the link-list subset for each link type; the row counts can be
# compared with the non-zero counts of the matrices.
for relation in (0, 1, 2):
    print(train_df[train_df["link_type"] == relation].shape)

(127591, 4)
(12512, 4)
(20298, 4)


In [8]:
# Wrap the drug x drug matrix in a DataFrame for display and CSV export.
X0_df = pd.DataFrame(data=X0)
X0_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,635,636,637,638,639,640,641,642,643,644
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
642,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
# Wrap the drug x protein matrix in a DataFrame for display and CSV export.
X1_df = pd.DataFrame(data=X1)
X1_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,827,828,829,830,831,832,833,834,835,836
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Wrap the protein x protein matrix in a DataFrame for display and CSV export.
X2_df = pd.DataFrame(data=X2)
X2_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,827,828,829,830,831,832,833,834,835,836
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Create the per-sample NIMCGCN output folder, including missing parents.
# exist_ok=True makes this a no-op when the folder is already there, and
# os.makedirs avoids depending on a shell that provides `mkdir -p`.
os.makedirs(f"../../NIMCGCN/data/PolyP3/{sample_id}/", exist_ok=True)

In [12]:
# Write each matrix as a bare CSV (no header row, no index column) into the
# per-sample NIMCGCN data folder.
output_folder = f"../../NIMCGCN/data/PolyP3/{sample_id}/"
for frame, file_name in ((X0_df, "d-d.csv"), (X1_df, "d-p.csv"), (X2_df, "p-p.csv")):
    frame.to_csv(output_folder + file_name, header=False, index=False)