This notebook performs scaling on the dataset (i.e., the link weights in `link.dat`) before embeddings are learned from it.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Seed NumPy's global random state so any random operation is repeatable.
np.random.seed(42)

# Index of the sampled dataset; it selects both the link file and the pickle.
sample_no = 1

# Relative path of the folder that holds the input files.
dataset_folder = "../../datasets/NCMF/MIMIC/"
link_file = f"sampled{sample_no}_link.dat"
filename = f'dict_nsides_mimic_data_v{sample_no}_case1_part.pkl'

In [3]:
# Read the tab-separated link file (it has no header row) and name its four
# columns. Only the weight column is meant to be replaced by scaled values.
link_df = pd.read_csv(
    dataset_folder + link_file,
    sep="\t",
    header=None,
)
link_df.columns = ["left", "right", "link_type", "link_weight"]
link_df.head()

Unnamed: 0,left,right,link_type,link_weight
0,1,6015,0,1.0
1,1,6679,0,1.0
2,1,6954,0,1.0
3,2,6154,0,1.0
4,2,6230,0,1.0


In [4]:
# Load the pickled dataset object and rebuild the three relation matrices.
# NOTE(security): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
object_pd = pd.read_pickle(dataset_folder + filename)
matrices_data = object_pd['matrices_data']

patient_disease_np = matrices_data['mat_pat_dis_treat']
# These two matrices are transposed relative to how they are stored.
disease_drug_np = np.transpose(matrices_data['mat_drugs_dis_side'])
drug_patient_np = np.transpose(matrices_data['mat_pat_drugs'])

In [5]:
from sklearn.preprocessing import StandardScaler

# A single scaler object is re-fitted on each matrix in turn, so every matrix
# is standardized independently of the others (StandardScaler works per
# column). After this cell `scaler` holds the fit of the last matrix only.
# Order matches the link types: 0 = patient-disease, 1 = disease-drug,
# 2 = drug-patient.
scaler = StandardScaler()
patient_disease_scaled_np, disease_drug_scaled_np, drug_patient_scaled_np = (
    scaler.fit_transform(matrix)
    for matrix in (patient_disease_np, disease_drug_np, drug_patient_np)
)

In [6]:
# Entity counts, taken as the sizes of the corresponding metadata maps.
# They are used later as offsets when converting node ids to matrix indices.
patient_count, disease_count, drug_count = (
    len(object_pd['metadata'][map_key])
    for map_key in ('dict_p_idx_id_map', 'dict_d_idx_id_map', 'dict_r_idx_id_map')
)

In [7]:
# Preview the first five rows of the link table.
link_df.head(n=5)

Unnamed: 0,left,right,link_type,link_weight
0,1,6015,0,1.0
1,1,6679,0,1.0
2,1,6954,0,1.0
3,2,6154,0,1.0
4,2,6230,0,1.0


In [8]:
def assign_values(l, r, ltype):
    """Return the scaled weight of one link from the matrix of its link type.

    Node ids are converted to matrix indices by subtracting offsets built from
    ``patient_count`` and ``disease_count``. This presumably reflects a global
    id layout of patients first, then diseases, then drugs — TODO confirm
    against the code that generated the link file.

    Parameters
    ----------
    l : number
        Id of the left node of the link; converted with ``int()``.
    r : number
        Id of the right node of the link; converted with ``int()``.
    ltype : number
        Link type; converted with ``int()``. 0 reads from
        ``patient_disease_scaled_np``, 1 from ``disease_drug_scaled_np`` and
        2 from ``drug_patient_scaled_np``.

    Returns
    -------
    The entry of the selected scaled matrix for this link.

    Raises
    ------
    ValueError
        If ``ltype`` is not 0, 1 or 2. Previously such a link silently
        produced ``None``, which would be written out as a missing weight.
    """
    # Rows arrive from DataFrame.apply(axis=1) as floats, hence the casts.
    l = int(l)
    r = int(r)
    ltype = int(ltype)
    if ltype == 0:
        # left is used as the row index directly; right is shifted past the patients.
        return patient_disease_scaled_np[l][r - patient_count]
    if ltype == 1:
        # left is shifted past the patients; right past patients and diseases.
        return disease_drug_scaled_np[l - patient_count][r - patient_count - disease_count]
    if ltype == 2:
        # left is shifted past patients and diseases; right is used directly.
        return drug_patient_scaled_np[l - patient_count - disease_count][r]
    raise ValueError(f"Unknown link type: {ltype!r} (expected 0, 1 or 2)")

In [9]:
# Look up the scaled weight for every link, one row at a time.
link_df["scaled_link_weight"] = link_df.apply(
    lambda link: assign_values(link["left"], link["right"], link["link_type"]),
    axis=1,
)

In [10]:
# Spot check of a single scaled entry: row 1, column 6177 - 5911.
# NOTE(review): 5911 is presumably the value of patient_count — confirm.
patient_disease_scaled_np[1, 6177 - 5911]

-0.0483896667850337

In [11]:
# Inspect the last five rows, now including the scaled weight column.
link_df.tail(n=5)

Unnamed: 0,left,right,link_type,link_weight,scaled_link_weight
432511,7807,5861,2,2.0,1.167023
432512,7807,5864,2,3.0,0.285719
432513,7807,5868,2,2.0,0.572921
432514,7807,5882,2,1.0,0.31722
432515,7807,5884,2,5.0,1.393184


In [13]:
# NOTE(review): the output name equals the input name, so the original link
# file is overwritten in place by its scaled version. Keep a backup of the
# unscaled file if it is still needed.
scaled_link_file = link_file

# Write the links with the scaled weight in place of the original one, as a
# tab-separated file without a header row or index column. `header` and
# `index` are passed as explicit booleans, as documented for to_csv.
link_df[["left", "right", "link_type", "scaled_link_weight"]].to_csv(
    dataset_folder + scaled_link_file,
    sep="\t",
    header=False,
    index=False,
)