This notebook converts the MIMIC dataset used for NCMF into a format that can be used by DCMF.

In [1]:
import pandas as pd
import numpy as np

In [2]:
data_folder = "../../datasets/NCMF/"
dataset_name = "PubMed"
sample_no = 3
target_link_type = 1

In [3]:
node_file = f'sampled{sample_no}_node.dat'
link_file = f'sampled{sample_no}_link.dat'
link_test_file = f'sampled{sample_no}_link.dat.test'
label_file = f'sampled{sample_no}_label.dat'
label_test_file = f'sampled{sample_no}_label.dat.test'
meta_file = f'sampled{sample_no}_meta.dat'
info_file = f'sampled{sample_no}_info.dat'
record_file = f'sampled{sample_no}_record.dat'

In [4]:
genes_count = 2675
diseases_count = 4478
chemicals_count = 5766
species_count = 622

In [5]:
# Creating matrix from link.dat file 

In [6]:
link_train_df = pd.read_csv(data_folder + dataset_name + "/" + link_file, sep = "\t", header = None)
link_train_df.columns = ["left", "right", "link_type", "link_value"]
link_train_df.head()

Unnamed: 0,left,right,link_type,link_value
0,466,525,0,2
1,2534,1485,0,2
2,1485,2534,0,2
3,599,2080,0,1
4,2080,599,0,1


In [7]:
link_train_df.tail()

Unnamed: 0,left,right,link_type,link_value
37550,13119,13391,9,1
37551,13391,13119,9,1
37552,13145,12997,9,1
37553,13025,13025,9,2
37554,12977,13310,9,1


In [8]:
link_train_df0 = link_train_df[link_train_df["link_type"] == 0]
link_train_df1 = link_train_df[link_train_df["link_type"] == 1]
link_train_df2 = link_train_df[link_train_df["link_type"] == 2]
link_train_df3 = link_train_df[link_train_df["link_type"] == 3]
link_train_df4 = link_train_df[link_train_df["link_type"] == 4]
link_train_df5 = link_train_df[link_train_df["link_type"] == 5]
link_train_df6 = link_train_df[link_train_df["link_type"] == 6]
link_train_df7 = link_train_df[link_train_df["link_type"] == 7]
link_train_df8 = link_train_df[link_train_df["link_type"] == 8]
link_train_df9 = link_train_df[link_train_df["link_type"] == 9]

In [9]:
link_train_df0.head()

Unnamed: 0,left,right,link_type,link_value
0,466,525,0,2
1,2534,1485,0,2
2,1485,2534,0,2
3,599,2080,0,1
4,2080,599,0,1


In [10]:
link_train_df0.tail()

Unnamed: 0,left,right,link_type,link_value
4758,2607,1719,0,1
4759,328,2504,0,4
4760,2405,404,0,1
4761,1015,2600,0,8
4762,1719,2607,0,1


In [11]:
max(link_train_df0["link_value"])

471

In [12]:
gene_gene_np = np.zeros((genes_count, genes_count))
gene_gene_np.shape

(2675, 2675)

In [13]:
gene_gene_np[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [14]:
for idx, v in link_train_df0.iterrows():
    gene_gene_np[int(v["left"])][int(v["right"])] = v["link_value"]

In [15]:
gene_disease_np = np.zeros((genes_count, diseases_count))
print(gene_disease_np[1])

[0. 0. 0. ... 0. 0. 0.]


In [16]:
link_train_df1.head()

Unnamed: 0,left,right,link_type,link_value
4763,1790,4401,1,2
4764,204,6039,1,2
4765,775,5525,1,2
4766,1418,3191,1,2
4767,200,3191,1,2


In [17]:
for idx, v in link_train_df1.iterrows():
    gene_disease_np[int(v["left"])][int(v["right"]) - genes_count] = v["link_value"]
print(gene_disease_np[1])

[0. 0. 0. ... 0. 0. 0.]


In [18]:
disease_disease_np = np.zeros((diseases_count, diseases_count))
print(disease_disease_np[0])

[0. 0. 0. ... 0. 0. 0.]


In [19]:
link_train_df2.head()

Unnamed: 0,left,right,link_type,link_value
9879,6916,3227,2,1
9880,3520,6795,2,1
9881,6916,3112,2,4
9882,4837,3191,2,2
9883,4821,6384,2,1


In [20]:
for idx, v in link_train_df2.iterrows():
    disease_disease_np[int(v["left"]) - genes_count][int(v["right"]) - genes_count] = v["link_value"]
print(disease_disease_np[0])

[0. 0. 0. ... 0. 0. 0.]


In [21]:
chemical_gene_np = np.zeros((chemicals_count, genes_count))
print(chemical_gene_np[0])
print(link_train_df3.head())
for idx, v in link_train_df3.iterrows():
    chemical_gene_np[int(v["left"]) - genes_count - diseases_count][int(v["right"])] = v["link_value"]
print(chemical_gene_np[0])

[0. 0. 0. ... 0. 0. 0.]
        left  right  link_type  link_value
15523   8817    401          3           2
15524  12589    963          3          14
15525  11012   1948          3           2
15526  10766    223          3           3
15527  11716   1127          3           2
[0. 0. 0. ... 0. 0. 0.]


In [22]:
chemical_disease_np = np.zeros((chemicals_count, diseases_count))
print(chemical_disease_np[0])
print(link_train_df4.head())
for idx, v in link_train_df4.iterrows():
    chemical_disease_np[int(v["left"]) - genes_count - diseases_count][int(v["right"]) - genes_count] = v["link_value"]
print(chemical_disease_np[0])

[0. 0. 0. ... 0. 0. 0.]
        left  right  link_type  link_value
20624  11351   5144          4           2
20625   8191   5102          4           3
20626  12544   4775          4           5
20627   9016   3191          4           2
20628   8817   6423          4           2
[0. 0. 0. ... 0. 0. 0.]


In [23]:
chemical_chemical_np = np.zeros((chemicals_count, chemicals_count))
print(chemical_chemical_np[0])
print(link_train_df5.head())
for idx, v in link_train_df5.iterrows():
    chemical_chemical_np[int(v["left"]) - genes_count - diseases_count][int(v["right"]) - genes_count - diseases_count] = v["link_value"]
print(chemical_chemical_np[0])

[0. 0. 0. ... 0. 0. 0.]
        left  right  link_type  link_value
27741  10406   8675          5           1
27742  10166   8191          5           1
27743  10923   8826          5           1
27744  10420  12431          5           1
27745   9527   7317          5           1
[0. 1. 0. ... 0. 0. 0.]


In [24]:
chemical_species_np = np.zeros((chemicals_count, species_count))
print(chemical_species_np[0])
print(link_train_df6.head())
for idx, v in link_train_df6.iterrows():
    chemical_species_np[int(v["left"]) - genes_count - diseases_count][int(v["right"]) - genes_count - diseases_count - chemicals_count] = v["link_value"]
print(chemical_species_np[0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [25]:
species_gene_np = np.zeros((species_count, genes_count))
print(species_gene_np[0])
print(link_train_df7.head())
for idx, v in link_train_df7.iterrows():
    species_gene_np[int(v["left"]) - genes_count - diseases_count - chemicals_count][int(v["right"])] = v["link_value"]
print(species_gene_np[0])

[0. 0. 0. ... 0. 0. 0.]
        left  right  link_type  link_value
36015  13119   1391          7           2
36016  12964    668          7           2
36017  13091     19          7           2
36018  13527   2620          7           2
36019  13508   2541          7           2
[0. 0. 0. ... 0. 0. 0.]


In [26]:
species_disease_np = np.zeros((species_count, diseases_count))
print(species_disease_np[0])
print(link_train_df8.head())
for idx, v in link_train_df8.iterrows():
    species_disease_np[int(v["left"]) - genes_count - diseases_count - chemicals_count][int(v["right"]) - genes_count] = v["link_value"]
print(species_disease_np[0])

[0. 0. 0. ... 0. 0. 0.]
        left  right  link_type  link_value
36612  13162   5131          8           2
36613  13026   3763          8           2
36614  13218   3219          8           3
36615  13119   6337          8           2
36616  13319   3280          8           3
[0. 0. 0. ... 0. 0. 0.]


In [27]:
species_species_np = np.zeros((species_count, species_count))
print(species_species_np[0])
print(link_train_df9.head())
for idx, v in link_train_df9.iterrows():
    species_species_np[int(v["left"]) - genes_count - diseases_count - chemicals_count][int(v["right"]) - genes_count - diseases_count - chemicals_count] = v["link_value"]
print(species_species_np[0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [28]:
gene_disease_test_np = np.zeros((genes_count, diseases_count))
link_test_df = pd.read_csv(data_folder + dataset_name + "/" + link_test_file, sep = "\t", header = None)
link_test_df.columns = ["left", "right", "link_value"]
link_test_df.head()

Unnamed: 0,left,right,link_value
0,1597,3280,1
1,1779,6581,1
2,1060,5776,1
3,1015,5629,1
4,328,4725,1


In [29]:
for idx, v in link_test_df.iterrows():
    gene_disease_test_np[int(v["left"])][int(v["right"]) - genes_count] = v["link_value"]

In [30]:
# Rdoublets creation
Rdoublets = pd.DataFrame(columns = ["left", "right"])
Rdoublets["left"] = link_train_df1["left"].copy()
Rdoublets["left"] = Rdoublets["left"]
Rdoublets["right"] = link_train_df1["right"].copy()
Rdoublets["right"] = Rdoublets["right"] - genes_count

In [31]:
Rdoublets.head()

Unnamed: 0,left,right
4763,1790,1726
4764,204,3364
4765,775,2850
4766,1418,516
4767,200,516


In [32]:
# getting traing and test idx
link_train_df1["indices"] = link_train_df1.apply(lambda row: int((row["left"]) * diseases_count + (row["right"] - genes_count)), axis = 1)
link_train_df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,left,right,link_type,link_value,indices
4763,1790,4401,1,2,8017346
4764,204,6039,1,2,916876
4765,775,5525,1,2,3473300
4766,1418,3191,1,2,6350320
4767,200,3191,1,2,896116


In [33]:
# getting traing and test idx
link_test_df["indices"] = link_test_df.apply(lambda row: int((row["left"]) * diseases_count + (row["right"] - genes_count)), axis = 1)
link_test_df.head()

Unnamed: 0,left,right,link_value,indices
0,1597,3280,1,7151971
1,1779,6581,1,7970268
2,1060,5776,1,4749781
3,1015,5629,1,4548124
4,328,4725,1,1470834


In [34]:
link_test_df.tail()

Unnamed: 0,left,right,link_value,indices
97,1105,5540,0,4951055
98,1106,2726,0,4952719
99,1106,5264,0,4955257
100,955,5298,0,4279113
101,1015,6886,0,4549381


In [35]:
genes_count * diseases_count

11978650

In [36]:
# Saving matrices
import pickle

In [37]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_12_train_fold_1.pkl", "wb") as f:
    pickle.dump(gene_disease_np, f)

In [38]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_12_test_fold_1.pkl", "wb") as f:
    pickle.dump(gene_disease_test_np, f)

In [39]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_12_train_idx_1.pkl", "wb") as f:
    pickle.dump(link_train_df1["indices"].values, f)

In [40]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_12_test_idx_1.pkl", "wb") as f:
    pickle.dump(link_test_df["indices"].values, f)

In [41]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_11.pkl", "wb") as f:
    pickle.dump(gene_gene_np, f)

In [42]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_22.pkl", "wb") as f:
    pickle.dump(disease_disease_np, f)

In [43]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_31.pkl", "wb") as f:
    pickle.dump(chemical_gene_np, f)

In [44]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_32.pkl", "wb") as f:
    pickle.dump(chemical_disease_np, f)

In [45]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_33.pkl", "wb") as f:
    pickle.dump(chemical_chemical_np, f)

In [46]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_34.pkl", "wb") as f:
    pickle.dump(chemical_species_np, f)

In [47]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_41.pkl", "wb") as f:
    pickle.dump(species_gene_np, f)

In [48]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_42.pkl", "wb") as f:
    pickle.dump(species_disease_np, f)

In [49]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_44.pkl", "wb") as f:
    pickle.dump(species_species_np, f)

In [50]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "R_doublets_1.pkl", "wb") as f:
    pickle.dump(Rdoublets.values, f)