This notebook converts the MIMIC dataset used for NCMF into a format that can be used by DCMF.

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Notebook configuration: where the HNE-formatted input lives and which
# sample / link type is being converted.
data_folder, dataset_name = "../../HNE/Data/", "MIMIC"
sample_no, target_link_type = 4, 1

In [3]:
# Names of the input files for the selected sample; every file shares the
# "sampled<sample_no>_" prefix.
_sample_prefix = f'sampled{sample_no}'
node_file = f'{_sample_prefix}_node.dat'
link_file = f'{_sample_prefix}_link.dat'
link_test_file = f'{_sample_prefix}_link.dat.test'
label_file = f'{_sample_prefix}_label.dat'
label_test_file = f'{_sample_prefix}_label.dat.test'
meta_file = f'{_sample_prefix}_meta.dat'
info_file = f'{_sample_prefix}_info.dat'
record_file = f'{_sample_prefix}_record.dat'

In [4]:
# Number of nodes per entity type. These also act as id offsets further down:
# disease ids are shifted by patients_count, and drug ids by
# patients_count + diseases_count, to obtain 0-based matrix indices.
patients_count, diseases_count, drugs_count = 5911, 1321, 596

In [5]:
# Creating matrix from link.dat file 

In [6]:
# Load the tab-separated training links (the file has no header row) and
# name the four columns.
link_train_path = data_folder + dataset_name + "/" + link_file
link_train_df = pd.read_csv(link_train_path, sep="\t", header=None)
link_train_df.columns = ["left", "right", "link_type", "link_value"]
link_train_df.head(5)

Unnamed: 0,left,right,link_type,link_value
0,1,6048,0,1
1,1,6177,0,1
2,1,6237,0,1
3,1,6566,0,1
4,1,6575,0,1


In [7]:
# Inspect the last rows of the training links.
link_train_df.tail(5)

Unnamed: 0,left,right,link_type,link_value
435339,7827,4619,2,1
435340,6222,7273,1,1
435341,6230,7258,1,1
435342,6944,7729,1,1
435343,6995,7544,1,1


In [8]:
# Split the training links by relation type. Further down, type 0 fills the
# patient x disease matrix, type 1 the disease x drug matrix and type 2 the
# drug x patient matrix.
# .copy() makes every subset an independent frame, so adding a column to one
# of them later (e.g. "indices" on link_train_df1) does not write into a slice
# of link_train_df and does not raise SettingWithCopyWarning.
link_train_df0 = link_train_df[link_train_df["link_type"] == 0].copy()
link_train_df1 = link_train_df[link_train_df["link_type"] == 1].copy()
link_train_df2 = link_train_df[link_train_df["link_type"] == 2].copy()

In [9]:
# First rows of the type-0 links.
link_train_df0.head(5)

Unnamed: 0,left,right,link_type,link_value
0,1,6048,0,1
1,1,6177,0,1
2,1,6237,0,1
3,1,6566,0,1
4,1,6575,0,1


In [10]:
# Last rows of the type-0 links.
link_train_df0.tail(5)

Unnamed: 0,left,right,link_type,link_value
29208,5909,6787,0,1
29209,5909,7166,0,1
29210,5910,6427,0,1
29211,5910,6486,0,1
29212,5910,7166,0,1


In [11]:
# Dense patients x diseases matrix, all zeros until the links are written in.
patient_disease_shape = (patients_count, diseases_count)
patient_disease_np = np.zeros(patient_disease_shape)
patient_disease_np.shape

(5911, 1321)

In [12]:
# Peek at the first row of the (still empty) patient x disease matrix.
patient_disease_np[0, :]

array([0., 0., 0., ..., 0., 0., 0.])

In [13]:
# Write the type-0 links into the patient x disease matrix with a single
# vectorized assignment instead of a Python-level iterrows() loop.
# "left" is used directly as the row index; "right" is shifted by
# patients_count to obtain a 0-based column index.
patient_rows = link_train_df0["left"].astype("int64").values
disease_cols = link_train_df0["right"].astype("int64").values - patients_count
patient_disease_np[patient_rows, disease_cols] = link_train_df0["link_value"].values

In [14]:
# Dense diseases x drugs matrix for the training links, initialised to zeros.
disease_drug_shape = (diseases_count, drugs_count)
disease_drug_np = np.zeros(disease_drug_shape)
print(disease_drug_np[1])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [15]:
# First rows of the type-1 links.
link_train_df1.head(5)

Unnamed: 0,left,right,link_type,link_value
29213,5911,7233,1,1
29214,5911,7234,1,1
29215,5911,7236,1,1
29216,5911,7241,1,1
29217,5911,7243,1,1


In [16]:
# Write the type-1 training links into the disease x drug matrix with a single
# vectorized assignment instead of a Python-level iterrows() loop.
# "left" is shifted by patients_count to get the row index; "right" is shifted
# by patients_count + diseases_count to get the column index.
disease_rows = link_train_df1["left"].astype("int64").values - patients_count
drug_cols = link_train_df1["right"].astype("int64").values - patients_count - diseases_count
disease_drug_np[disease_rows, drug_cols] = link_train_df1["link_value"].values
print(disease_drug_np[1])

[0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0.
 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0.

In [17]:
# Dense drugs x patients matrix, initialised to zeros.
drug_patient_shape = (drugs_count, patients_count)
drug_patient_np = np.zeros(drug_patient_shape)
print(drug_patient_np[0])

[0. 0. 0. ... 0. 0. 0.]


In [18]:
# First rows of the type-2 links.
link_train_df2.head(5)

Unnamed: 0,left,right,link_type,link_value
305565,7232,3,2,1
305566,7232,24,2,1
305567,7232,69,2,1
305568,7232,75,2,1
305569,7232,104,2,1


In [19]:
# Write the type-2 links into the drug x patient matrix with a single
# vectorized assignment instead of a Python-level iterrows() loop.
# "left" is shifted by patients_count + diseases_count to get the row index;
# "right" is used directly as the column index.
drug_rows = link_train_df2["left"].astype("int64").values - patients_count - diseases_count
patient_cols = link_train_df2["right"].astype("int64").values
drug_patient_np[drug_rows, patient_cols] = link_train_df2["link_value"].values
print(drug_patient_np[0])

[0. 0. 0. ... 0. 0. 0.]


In [20]:
# Allocate the diseases x drugs matrix for the held-out links, then load the
# tab-separated test file (no header row; three columns, no link_type).
disease_drug_test_np = np.zeros(shape=(diseases_count, drugs_count))
link_test_path = data_folder + dataset_name + "/" + link_test_file
link_test_df = pd.read_csv(link_test_path, sep="\t", header=None)
link_test_df.columns = ["left", "right", "link_value"]
link_test_df.head(5)

Unnamed: 0,left,right,link_value
0,6692,7460,0
1,6467,7645,1
2,6057,7257,1
3,6600,7666,1
4,7188,7711,0


In [21]:
# Write the test links into the disease x drug test matrix with a single
# vectorized assignment instead of a Python-level iterrows() loop.
# Uses the same id-to-index shifts as the training disease x drug matrix.
test_disease_rows = link_test_df["left"].astype("int64").values - patients_count
test_drug_cols = link_test_df["right"].astype("int64").values - patients_count - diseases_count
disease_drug_test_np[test_disease_rows, test_drug_cols] = link_test_df["link_value"].values

In [22]:
# Rdoublets creation
Rdoublets = pd.DataFrame(columns = ["left", "right"])
Rdoublets["left"] = link_train_df1["left"].copy()
Rdoublets["left"] = Rdoublets["left"] - patients_count
Rdoublets["right"] = link_train_df1["right"].copy()
Rdoublets["right"] = Rdoublets["right"] - patients_count - diseases_count

In [23]:
# First rows of the (row, column) index pairs.
Rdoublets.head(5)

Unnamed: 0,left,right
29213,0,1
29214,0,2
29215,0,4
29216,0,9
29217,0,11


In [24]:
# getting traing and test idx
link_train_df1["indices"] = link_train_df1.apply(lambda row: int((row["left"] - patients_count) * drugs_count + (row["right"] - patients_count - diseases_count)), axis = 1)
link_train_df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,left,right,link_type,link_value,indices
29213,5911,7233,1,1,1
29214,5911,7234,1,1,2
29215,5911,7236,1,1,4
29216,5911,7241,1,1,9
29217,5911,7243,1,1,11


In [25]:
# getting traing and test idx
link_test_df["indices"] = link_test_df.apply(lambda row: int((row["left"] - patients_count) * drugs_count + (row["right"] - patients_count - diseases_count)), axis = 1)
link_test_df.head()

Unnamed: 0,left,right,link_value,indices
0,6692,7460,0,465704
1,6467,7645,1,331789
2,6057,7257,1,87041
3,6600,7666,1,411078
4,7188,7711,0,761571


In [26]:
# Manual spot-check of the id-to-index shifts on two hand-picked ids
# (presumably one disease id and one drug id - confirm against the data).
spot_check_left_id = 6580
spot_check_right_id = 7646
print(spot_check_left_id - patients_count)
print(spot_check_right_id - patients_count - diseases_count)

669
414


In [27]:
# Last rows of the test links, including the flat indices.
link_test_df.tail(5)

Unnamed: 0,left,right,link_value,indices
157475,6685,7598,1,461670
157476,6738,7689,1,493349
157477,7216,7400,1,777948
157478,7046,7279,1,676507
157479,5981,7718,1,42206


In [28]:
# Total number of cells in the diseases x drugs matrix; every flat index
# computed above should be smaller than this value.
drugs_count * diseases_count

787316

In [29]:
# Saving matrices
import pickle

In [30]:
# Persist the training disease x drug matrix.
# The output directory is created first so that the notebook also runs on a
# fresh checkout where ../data/<dataset_name>/sample<sample_no>/ does not exist.
output_dir = f"../data/{dataset_name}/sample{sample_no}/"
os.makedirs(output_dir, exist_ok=True)
with open(output_dir + "X_23_train_fold_1.pkl", "wb") as f:
    pickle.dump(disease_drug_np, f)

In [31]:
# Persist the disease x drug matrix built from the test links.
out_path = f"../data/{dataset_name}/sample{sample_no}/" + "X_23_test_fold_1.pkl"
with open(out_path, "wb") as out_file:
    pickle.dump(disease_drug_test_np, out_file)

In [32]:
# Persist the flat indices of the training links as a NumPy array.
out_path = f"../data/{dataset_name}/sample{sample_no}/" + "X_23_train_idx_1.pkl"
train_idx_values = link_train_df1["indices"].values
with open(out_path, "wb") as out_file:
    pickle.dump(train_idx_values, out_file)

In [33]:
# Persist the flat indices of the test links as a NumPy array.
out_path = f"../data/{dataset_name}/sample{sample_no}/" + "X_23_test_idx_1.pkl"
test_idx_values = link_test_df["indices"].values
with open(out_path, "wb") as out_file:
    pickle.dump(test_idx_values, out_file)

In [34]:
# Persist the patient x disease matrix.
out_path = f"../data/{dataset_name}/sample{sample_no}/" + "X_12.pkl"
with open(out_path, "wb") as out_file:
    pickle.dump(patient_disease_np, out_file)

In [35]:
# Persist the drug x patient matrix.
out_path = f"../data/{dataset_name}/sample{sample_no}/" + "X_31.pkl"
with open(out_path, "wb") as out_file:
    pickle.dump(drug_patient_np, out_file)

In [36]:
# Persist the (row, column) index pairs of the training links as a 2-D array.
out_path = f"../data/{dataset_name}/sample{sample_no}/" + "R_doublets_1.pkl"
doublet_values = Rdoublets.values
with open(out_path, "wb") as out_file:
    pickle.dump(doublet_values, out_file)