In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Which sampled split to convert, and where the NCMF input / CMF output live.
# Both directory strings end with "/" because later cells append file names
# to them by plain string concatenation.
sample_id = 3
ncmf_file_path = "../../datasets/NCMF/CellLine/"
cmf_file_path = "../../datasets/CMF/CellLine/sampled{}/".format(sample_id)

In [3]:
# NCMF files: every input file for this sample shares the
# "<ncmf_file_path>sampled<sample_id>_" prefix.
sampled_node_file = f"{ncmf_file_path}sampled{sample_id}_node.dat"
sampled_link_file = f"{ncmf_file_path}sampled{sample_id}_link.dat"
sampled_link_test_file = f"{ncmf_file_path}sampled{sample_id}_link.dat.test"
sampled_label_file = f"{ncmf_file_path}sampled{sample_id}_label.dat"
sampled_label_test_file = f"{ncmf_file_path}sampled{sample_id}_label.dat.test"
sampled_meta_file = f"{ncmf_file_path}sampled{sample_id}_meta.dat"
sampled_info_file = f"{ncmf_file_path}sampled{sample_id}_info.dat"

In [4]:
# Load the entity-name table that sits next to the NCMF files; the row order
# is what the following cells rely on to assign node types and local ids.
entity_df = pd.read_csv(f"{ncmf_file_path}entity_oncokb.csv")
entity_df

Unnamed: 0,Entity Names
0,ACH-000001
1,ACH-000002
2,ACH-000004
3,ACH-000006
4,ACH-000007
...,...
1721,ZG-10
1722,ZIBOTENTAN
1723,ZM-447439
1724,ZOLEDRONATE


In [5]:
# Label rows by position: the first 692 rows are node type 0, the next 638
# are type 1 and the last 396 are type 2. The counts are hard-coded, so the
# assignment only succeeds when entity_df has exactly 692 + 638 + 396 rows.
entity_df["Node Types"] = [
    node_type
    for node_type, count in ((0, 692), (1, 638), (2, 396))
    for _ in range(count)
]
entity_df

Unnamed: 0,Entity Names,Node Types
0,ACH-000001,0
1,ACH-000002,0
2,ACH-000004,0
3,ACH-000006,0
4,ACH-000007,0
...,...,...
1721,ZG-10,2
1722,ZIBOTENTAN,2
1723,ZM-447439,2
1724,ZOLEDRONATE,2


In [6]:
# Turn the positional row index into a regular "index" column so that
# get_updated_id can read it from each row.
# NOTE: this cell is not idempotent - running it again adds a further index
# column to the frame.
entity_df = entity_df.reset_index(drop=False)
entity_df

Unnamed: 0,index,Entity Names,Node Types
0,0,ACH-000001,0
1,1,ACH-000002,0
2,2,ACH-000004,0
3,3,ACH-000006,0
4,4,ACH-000007,0
...,...,...,...
1721,1721,ZG-10,2
1722,1722,ZIBOTENTAN,2
1723,1723,ZM-447439,2
1724,1724,ZOLEDRONATE,2


In [7]:
def get_updated_id(x, n_type0=692, n_type1=638):
    """Convert a global node index into an index local to the node's type.

    Nodes are laid out contiguously by type (all type 0 first, then type 1,
    then everything else), so the local index is the global index minus the
    number of nodes that precede the node's own type block.

    Args:
        x: Row-like object supporting ``x["Node Types"]`` and ``x["index"]``
            (e.g. a DataFrame row passed by ``DataFrame.apply(axis=1)``).
        n_type0: Number of type-0 nodes. Defaults to 692, the count used when
            the "Node Types" column is built in this notebook.
        n_type1: Number of type-1 nodes. Defaults to 638 for the same reason.

    Returns:
        ``x["index"]`` for type 0, ``x["index"] - n_type0`` for type 1, and
        ``x["index"] - n_type0 - n_type1`` for any other node type.
    """
    node_type = x["Node Types"]
    if node_type == 0:
        return x["index"]
    if node_type == 1:
        return x["index"] - n_type0
    # Anything that is not type 0 or 1 is treated as the last block.
    return x["index"] - n_type0 - n_type1

In [8]:
# Compute the per-type local id of every entity, one row at a time.
entity_df["update node id"] = entity_df.apply(get_updated_id, axis=1)

In [9]:
# Preview entity_df with the newly added "update node id" column.
entity_df

Unnamed: 0,index,Entity Names,Node Types,update node id
0,0,ACH-000001,0,0
1,1,ACH-000002,0,1
2,2,ACH-000004,0,2
3,3,ACH-000006,0,3
4,4,ACH-000007,0,4
...,...,...,...,...
1721,1721,ZG-10,2,391
1722,1722,ZIBOTENTAN,2,392
1723,1723,ZM-447439,2,393
1724,1724,ZOLEDRONATE,2,394


In [10]:
# Persist, for every entity, its global row index (written as the CSV index),
# its per-type local id and its node type. No header row is written.
# This is the first write into the CMF output folder, so create it if needed;
# os.path.join also avoids the doubled "/" that concatenating onto the
# slash-terminated cmf_file_path would produce.
os.makedirs(cmf_file_path, exist_ok=True)
entity_df[["update node id", "Node Types"]].to_csv(
    os.path.join(cmf_file_path, "id_idx.csv"), header=False
)

In [11]:
# Copy the info, label, label-test, meta and node files as-is into the CMF
# folder. The two link files are not copied here: later cells write them to
# the CMF folder with re-indexed right-hand ids instead.
for f in [sampled_info_file, sampled_label_file, sampled_label_test_file, sampled_meta_file, sampled_node_file]:
    # IPython shell escape; {f} and {cmf_file_path} are filled in from the
    # Python variables of the same name before the command runs.
    ! cp -prf {f} {cmf_file_path}

In [12]:
# Read the headerless, tab-separated training links and name the four columns.
train_link_df = pd.read_csv(sampled_link_file, header=None, sep="\t")
train_link_df.columns = ["left", "right", "link_type", "value"]
train_link_df

Unnamed: 0,left,right,link_type,value
0,0,722,0,1.000000
1,0,835,0,1.000000
2,0,974,0,1.000000
3,0,1066,0,1.000000
4,0,1068,0,1.000000
...,...,...,...,...
872365,691,1719,3,0.169045
872366,691,1720,3,0.043135
872367,691,1721,3,0.196274
872368,691,1722,3,0.024832


In [13]:
def get_updated_id_train(x, n_type0=692, n_type1=638):
    """Convert the global right-hand node id of a link into a local id.

    Links of type 0, 1 or 2 have their right id shifted down by ``n_type0``;
    every other link type is shifted down by ``n_type0 + n_type1``.

    Args:
        x: Row-like object supporting ``x["link_type"]`` and ``x["right"]``
            (e.g. a DataFrame row passed by ``DataFrame.apply(axis=1)``).
        n_type0: Number of type-0 nodes. Defaults to 692, the count used
            elsewhere in this notebook.
        n_type1: Number of type-1 nodes. Defaults to 638 for the same reason.

    Returns:
        The shifted right-hand id as a built-in ``int``. The conversion
        matters because rows of a frame that mixes int and float columns
        reach this function with float values.
    """
    if x["link_type"] in (0, 1, 2):
        return int(x["right"] - n_type0)
    # Subtract in two steps, exactly as before, so float inputs give the
    # same result as the original expression.
    return int(x["right"] - n_type0 - n_type1)

In [14]:
# Re-index the right-hand node of every training link to its local id.
train_link_df["updated_right"] = train_link_df.apply(get_updated_id_train, axis=1)
train_link_df

Unnamed: 0,left,right,link_type,value,updated_right
0,0,722,0,1.000000,30
1,0,835,0,1.000000,143
2,0,974,0,1.000000,282
3,0,1066,0,1.000000,374
4,0,1068,0,1.000000,376
...,...,...,...,...,...
872365,691,1719,3,0.169045,389
872366,691,1720,3,0.043135,390
872367,691,1721,3,0.196274,391
872368,691,1722,3,0.024832,392


In [15]:
# Write the re-indexed training links in the same headerless, tab-separated
# layout as the input, with "updated_right" taking the place of "right".
train_link_df[["left", "updated_right", "link_type", "value"]].to_csv(
    f"{cmf_file_path}sampled{sample_id}_link.dat",
    sep="\t",
    header=False,
    index=False,
)

In [16]:
# Read the headerless, tab-separated test links. The test file has only three
# columns, so a constant link_type of 3 is added to let get_updated_id_train
# be reused on these rows.
test_link_df = pd.read_csv(sampled_link_test_file, header=None, sep="\t")
test_link_df.columns = ["left", "right", "value"]
test_link_df = test_link_df.assign(link_type=3)

In [17]:
# Preview the test links together with the constant link_type column.
test_link_df

Unnamed: 0,left,right,value,link_type
0,598,1375,0.380364,3
1,454,1371,0.312764,3
2,248,1572,0.209578,3
3,0,1651,0.115166,3
4,468,1508,0.005335,3
...,...,...,...,...
54801,321,1700,0.013843,3
54802,168,1473,0.773214,3
54803,349,1717,0.419108,3
54804,3,1445,0.102049,3


In [18]:
# Re-index the right-hand node of every test link to its local id, using the
# same helper as for the training links.
test_link_df["updated_right"] = test_link_df.apply(get_updated_id_train, axis=1)
test_link_df

Unnamed: 0,left,right,value,link_type,updated_right
0,598,1375,0.380364,3,45
1,454,1371,0.312764,3,41
2,248,1572,0.209578,3,242
3,0,1651,0.115166,3,321
4,468,1508,0.005335,3,178
...,...,...,...,...,...
54801,321,1700,0.013843,3,370
54802,168,1473,0.773214,3,143
54803,349,1717,0.419108,3,387
54804,3,1445,0.102049,3,115


In [19]:
# Write the re-indexed test links as headerless, tab-separated rows of
# left, updated_right, value (the helper link_type column is not written).
test_link_df[["left", "updated_right", "value"]].to_csv(
    f"{cmf_file_path}sampled{sample_id}_link.dat.test",
    sep="\t",
    header=False,
    index=False,
)

In [20]:
# One dense, zero-initialised matrix per link type. Link types 0-2 get
# 692 x 638 matrices and link type 3 gets a 692 x 396 matrix. Each name is
# bound to its own independent array.
X0_train, X1_train, X2_train = (np.zeros((692, 638)) for _ in range(3))
X3_train = np.zeros((692, 396))

In [21]:
# Scatter all links of type 0 into X0_train with one vectorized assignment
# instead of a Python-level iterrows loop: X0_train[left, updated_right] = value.
# The integer casts mirror the int(...) conversions of the row loop, and for
# repeated (left, right) pairs the last value assigned remains, as in the loop.
type0_links = train_link_df[train_link_df["link_type"] == 0]
X0_train[
    type0_links["left"].to_numpy(dtype=np.intp),
    type0_links["updated_right"].to_numpy(dtype=np.intp),
] = type0_links["value"].to_numpy()
X0_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
# Scatter all links of type 1 into X1_train with one vectorized assignment
# instead of a Python-level iterrows loop: X1_train[left, updated_right] = value.
# The integer casts mirror the int(...) conversions of the row loop, and for
# repeated (left, right) pairs the last value assigned remains, as in the loop.
type1_links = train_link_df[train_link_df["link_type"] == 1]
X1_train[
    type1_links["left"].to_numpy(dtype=np.intp),
    type1_links["updated_right"].to_numpy(dtype=np.intp),
] = type1_links["value"].to_numpy()
X1_train

array([[ 1.,  1.,  1., ...,  1.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [-1.,  1.,  1., ...,  1., -1.,  0.],
       ...,
       [-1.,  1.,  1., ...,  1., -1.,  0.],
       [-1.,  1.,  1., ...,  1.,  1.,  0.],
       [ 0.,  0.,  1., ...,  1.,  0.,  0.]])

In [23]:
# Scatter all links of type 2 into X2_train with one vectorized assignment
# instead of a Python-level iterrows loop: X2_train[left, updated_right] = value.
# The integer casts mirror the int(...) conversions of the row loop, and for
# repeated (left, right) pairs the last value assigned remains, as in the loop.
type2_links = train_link_df[train_link_df["link_type"] == 2]
X2_train[
    type2_links["left"].to_numpy(dtype=np.intp),
    type2_links["updated_right"].to_numpy(dtype=np.intp),
] = type2_links["value"].to_numpy()
X2_train

array([[ 5.14077866,  2.9981955 , 11.41864325, ...,  1.94485845,
         3.7441611 ,  3.39917109],
       [ 3.88752527,  1.32192809, 10.37275626, ...,  0.0976108 ,
         1.69153416,  3.73443867],
       [ 4.32192809,  2.54843662, 10.4629112 , ...,  0.05658353,
         1.53106949,  3.23266076],
       ...,
       [ 4.11935618,  1.76128527, 11.04083765, ...,  0.25096157,
         1.79493566,  2.96901231],
       [ 4.49441561,  4.90255601, 11.67516316, ...,  0.01435529,
         2.48542683,  3.19377174],
       [ 4.61706334,  2.75702325, 11.24917272, ...,  0.08406426,
         3.67016051,  3.50080205]])

In [24]:
# Scatter all links of type 3 into X3_train with one vectorized assignment
# instead of a Python-level iterrows loop: X3_train[left, updated_right] = value.
# The integer casts mirror the int(...) conversions of the row loop, and for
# repeated (left, right) pairs the last value assigned remains, as in the loop.
type3_links = train_link_df[train_link_df["link_type"] == 3]
X3_train[
    type3_links["left"].to_numpy(dtype=np.intp),
    type3_links["updated_right"].to_numpy(dtype=np.intp),
] = type3_links["value"].to_numpy()
X3_train

array([[0.221568 , 0.048679 , 0.159713 , ..., 0.106223 , 0.016139 ,
        0.066965 ],
       [0.211673 , 0.       , 0.239863 , ..., 0.160825 , 0.       ,
        0.466796 ],
       [0.26237  , 0.061267 , 0.164559 , ..., 0.       , 0.007395 ,
        0.238044 ],
       ...,
       [0.3081946, 0.023054 , 0.       , ..., 0.075491 , 0.06766  ,
        0.184072 ],
       [0.396938 , 0.084936 , 0.289398 , ..., 0.       , 0.031999 ,
        0.018559 ],
       [0.       , 0.       , 0.274271 , ..., 0.       , 0.024583 ,
        0.       ]])

In [25]:
# Store each dense training matrix as X<link_type>.npy in the CMF folder.
np.save(f"{cmf_file_path}X0.npy", X0_train)
np.save(f"{cmf_file_path}X1.npy", X1_train)
np.save(f"{cmf_file_path}X2.npy", X2_train)
np.save(f"{cmf_file_path}X3.npy", X3_train)