## dCMF
Example of running the "dcmf" module with user-provided parameters.

In [1]:
import sys
sys.path.append("..")

In [2]:
import pprint
import numpy as np
import pickle as pkl
import time
import itertools
import os
from datetime import datetime

In [3]:
from src.dcmf import dcmf

## Loading the sample dataset

This directory contains a sample synthetic dataset generated for the augmented setting of Fig 1(c) in the [paper](https://arxiv.org/abs/1811.11427).
You can download the sample data from [here](https://drive.google.com/open?id=1EFF_kuOIg2aYyOGZY_peX3NziqCSxxP1) and unzip it to the data directory.

In [4]:
# Index of the PubMed sample to work with; it selects the sample sub-directory below.
sample_no = 1
# Directory (relative to this notebook) that holds the pickled matrices of that sample.
data_dir = f"../data/PubMed/sample{sample_no}/"

In [5]:
# Loads the dataset into a dict.
# Note: This dataset contains 5 folds for the matrix X_12 (matrix R below);
# only the first `num_folds` folds are loaded here.
num_folds = 1
#
pp = pprint.PrettyPrinter()
print("Loading data from data_dir: ",data_dir)


def _load_pickle(file_name):
    """Unpickle `file_name` from `data_dir` and close the file handle afterwards.

    NOTE(security): unpickling can execute arbitrary code, so only load data
    files obtained from a trusted source.
    """
    # os.path.join copes with data_dir both with and without a trailing separator.
    with open(os.path.join(data_dir, file_name), "rb") as pkl_file:
        return pkl.load(pkl_file)


# Side matrices, named after the X_<row entity><col entity> pickle they come from.
U1 = _load_pickle("X_11.pkl")
U2 = _load_pickle("X_22.pkl")
V1 = _load_pickle("X_31.pkl")
V2 = _load_pickle("X_32.pkl")
V3 = _load_pickle("X_33.pkl")
V4 = _load_pickle("X_34.pkl")
W1 = _load_pickle("X_41.pkl")
W2 = _load_pickle("X_42.pkl")
W3 = _load_pickle("X_44.pkl")

# Train/test splits of X_12, keyed by the 1-based fold number.
R_temp_dict = {}
for fold_num in range(1, num_folds + 1):
    Rtrain = _load_pickle(f"X_12_train_fold_{fold_num}.pkl")
    Rtrain_idx = _load_pickle(f"X_12_train_idx_{fold_num}.pkl")
    Rtest = _load_pickle(f"X_12_test_fold_{fold_num}.pkl")
    Rtest_idx = _load_pickle(f"X_12_test_idx_{fold_num}.pkl")
    Rdoublets = _load_pickle(f"R_doublets_{fold_num}.pkl")
    R_temp_dict[fold_num] = {"Rtrain":Rtrain,"Rtrain_idx":Rtrain_idx,"Rtest":Rtest,"Rtest_idx":Rtest_idx,"Rdoublets":Rdoublets}
#
data_dict = {"U1":U1,"U2":U2,"V1":V1,"V2":V2,"V3":V3,"V4":V4,"W1":W1,"W2":W2,"W3":W3,"R":R_temp_dict}

Loading data from data_dir:  ../data/PubMed/sample1/


In [6]:
# Print the shape of every loaded matrix; each label names the variable it reports.
print("U1.shape: ",U1.shape)
print("U2.shape: ",U2.shape)  # was mislabeled "U1.shape" in the original cell
print("V1.shape: ",V1.shape)
print("V2.shape: ",V2.shape)
print("V3.shape: ",V3.shape)
print("V4.shape: ",V4.shape)
print("W1.shape: ",W1.shape)
print("W2.shape: ",W2.shape)
print("W3.shape: ",W3.shape)
# Training part of fold 1 of the relation matrix R (X_12).
print("R.shape: ",data_dict['R'][1]['Rtrain'].shape)

U1.shape:  (2661, 2661)
U1.shape:  (4288, 4288)
V1.shape:  (5546, 2661)
V2.shape:  (5546, 4288)
V3.shape:  (5546, 5546)
V4.shape:  (5546, 592)
W1.shape:  (592, 2661)
W2.shape:  (592, 4288)
W3.shape:  (592, 592)
R.shape:  (2661, 4288)


## Building the required data structures

Here we construct the data structures required as input to the dcmf API

#### *entity-matrix relationship graph*

- **G**: dict, keys are entity IDs and values are lists of associated matrix IDs

#### *training data*
- **X_data**: dict, keys are matrix IDs and values are (1) np.array, or (2) dict, (if this matrix is in validation set **X_val**) with validation set IDs as keys & values as np.array
- **X_meta**: dict, keys are matrix IDs and values are lists of the 2 associated entity IDs

#### *validation data*
- **X_val**: dict, keys are IDs of the matrices that are part of validation set and values are dict with validation set IDs as keys and values are (1) scipy.sparse matrix, or (2) list of triplets corresponding to the validation entries (if you would like to perform classification and measure AUC)  
**Note**: To perform K-fold cross-validation, use K validation sets for the corresponding matrix/matrices. In the example below, we used a single validation set with ID "1" for each of the matrices with IDs "X1" and "X2".

In [7]:
# Entity-matrix relationship graph: each entity ID maps to the IDs of the
# matrices in which that entity appears (as row or column entity).
G = dict(
    e1=["X1", "X2", "X4", "X8"],
    e2=["X2", "X3", "X5", "X9"],
    e3=["X4", "X5", "X6", "X7"],
    e4=["X7", "X8", "X9", "X10"],
)

In [8]:
# Training matrices keyed by matrix ID. "X1" and "X2" are also part of the
# validation set, so their values are dicts keyed by validation-set ID ("1");
# every other matrix is passed directly.
X_data = {
    "X1": {"1": U1},
    "X2": {"1": data_dict['R'][1]["Rtrain"]},
    "X3": U2,
    "X4": V1,
    "X5": V2,
    "X6": V3,
    "X7": V4,
    "X8": W1,
    "X9": W2,
    "X10": W3,
}

In [9]:
# For every matrix ID, the pair [row entity ID, column entity ID] it relates.
_matrix_entities = (
    ("X1", "e1", "e1"),
    ("X2", "e1", "e2"),
    ("X3", "e2", "e2"),
    ("X4", "e3", "e1"),
    ("X5", "e3", "e2"),
    ("X6", "e3", "e3"),
    ("X7", "e3", "e4"),
    ("X8", "e4", "e1"),
    ("X9", "e4", "e2"),
    ("X10", "e4", "e4"),
)
X_meta = {x_id: [row_entity, col_entity] for x_id, row_entity, col_entity in _matrix_entities}

In [10]:
# Validation entries, given as lists of triplets, for the matrices placed in X_val.
# Presumably each triplet is [row index, column index, label (0 or 1)] — TODO confirm against the dcmf API.
# Alternative toy triplets, kept for reference:
# Rtest_triplets1 = [[1,1,1],[2,2,0]]
# Rtest_triplets2 = [[1,1,1],[3,3,0],[1,2,0],[0,1,0],[0,2,0],[0,3,0]]
Rtest_triplets1 = [[57, 1769, 0], [1708, 538, 1]]
Rtest_triplets2 = [[360, 2032, 1], [2506, 2430, 0]]

In [11]:
# Validation data: matrix ID -> {validation-set ID -> list of validation triplets}.
# A single validation set with ID "1" is used for each of "X1" and "X2".
X_val = dict(
    X1={"1": Rtest_triplets1},
    X2={"1": Rtest_triplets2},
)

#### *dCMF network construction - hyperparameters*

- **kf**: float, in the range (0,1) 
- **k**: int, entity representation or encoding size. Refer Appendix A in the [paper](https://arxiv.org/abs/1811.11427) for info about how k and kf are used in the dCMF network construction. 
- **e_actf**: str, autoencoder's encoding activation function.
- **d_actf**: str, autoencoder's decoding activation function. Supported functions are "tanh","sigma","relu","lrelu"
- **is_linear_last_enc_layer**: bool, True to set linear activation for the bottleneck/encoding generation layer 
- **is_linear_last_dec_layer**: bool, True to set linear activation for the output/decoding generation layer 
- **num_chunks**: int, number of training batches to create.

In [12]:
# Network-construction hyperparameters (Appendix A of the paper explains how k and kf are used).
kf = 0.5  # float in the range (0,1)
k = 100  # entity representation / encoding size
e_actf = "tanh"  # autoencoder's encoding activation function
d_actf = "tanh"  # autoencoder's decoding activation function
is_linear_last_enc_layer = False  # True would give the bottleneck/encoding layer a linear activation
is_linear_last_dec_layer = False  # True would give the output/decoding layer a linear activation
num_chunks = 2  # number of training batches to create

#### *Optimization/training - hyperparameters*

- **learning_rate**: float, Adam optimizer's learning rate
- **weight_decay**: float, Adam optimizer's weight decay (L2 penalty)
- **max_epochs**: int, maximum number of training epochs at which the training stops 
- **convg_thres**: float, convergence threshold 

In [13]:
learning_rate = 0.001  # Adam optimizer's learning rate
weight_decay = 0.05  # Adam optimizer's weight decay (L2 penalty)
max_epochs = 500  # training stops once this many epochs have run
convg_thres = 0.1  # convergence threshold

#### *Hyperparameters related to pre-training*

- **is_pretrain**: bool, True for pretraining
- **pretrain_thres**: float, pre-training convergence threshold
- **max_pretrain_epochs**: int, maximum number of pre-training epochs at which the training stops

In [14]:
is_pretrain=True  # enable pre-training
pretrain_thres= 0.1  # pre-training convergence threshold
max_pretrain_epochs = 2  # pre-training stops once this many epochs have run

#### *Parameters related to validation*

- **val_metric**: str, Validation performance metric. Supported metrics: ["rmse","r@k","p@k","auc"]. Where,  
     *rmse* - Root [mean square error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html)  
     *r@k* - Recall@k. Refer section 5.2's sub-section "Evaluation metric" in the [paper](https://arxiv.org/abs/1811.11427)      
     *p@k* - Probability@k. Refer section 5.3's sub-section "Evaluation metric" in the [paper](https://arxiv.org/abs/1811.11427)      
     *auc* - [Area under the curve](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)
    
- **is_val_transpose**: bool, True if the reconstructed matrix has to be transposed before computing the validation performance
- **at_k**: int, the value of k if the **val_metric** is either "r@k" or "p@k"

In [15]:
val_metric = "auc"  # validation metric; supported: "rmse", "r@k", "p@k", "auc"
is_val_transpose = True  # transpose the reconstructed matrix before computing the validation performance
at_k = 10  # k for "r@k" / "p@k"; only used with those two metrics

#### *GPU - parameters*

- **is_gpu**: bool, True if PyTorch tensor storage and operations are to be performed on the GPU
- **gpu_ids**: str, comma-separated string of CUDA GPU IDs

In [16]:
is_gpu = False  # False: tensor storage and operations are not placed on the GPU
gpu_ids = "1"  # comma-separated CUDA GPU IDs; presumably only consulted when is_gpu is True — TODO confirm

#### *Instantiating the dCMF model...*
- Initializes dCMF after validating the input data and the (hyper)parameters

In [17]:
# Build the dCMF model from the graph, the data and every (hyper)parameter set
# in the cells above; the inputs are validated during construction.
dcmf_model = dcmf(
    G,
    X_data,
    X_meta,
    num_chunks=num_chunks,
    k=k,
    kf=kf,
    e_actf=e_actf,
    d_actf=d_actf,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    convg_thres=convg_thres,
    max_epochs=max_epochs,
    is_gpu=is_gpu,
    gpu_ids=gpu_ids,
    is_pretrain=is_pretrain,
    pretrain_thres=pretrain_thres,
    max_pretrain_epochs=max_pretrain_epochs,
    X_val=X_val,
    val_metric=val_metric,
    is_val_transpose=is_val_transpose,
    at_k=at_k,
    is_linear_last_enc_layer=is_linear_last_enc_layer,
    is_linear_last_dec_layer=is_linear_last_dec_layer,
    num_val_sets=num_folds,
)

dcmf_base.__init__ - start
dcmf_base.__init__ - end
#
dCMF:
---
#
dCMF: 
#
learning_rate:  0.001
weight_decay:  0.05
convg_thres:  0.1
max_epochs:  500
isPretrain:  True
pretrain_thres:  0.1
max_pretrain_epochs:  2
num_chunks:  2
k:  100
kf:  0.5
e_actf:  tanh
d_actf:  tanh
is_gpu:  False
gpu_ids:  1
num entities:  4
num matrices:  10
num_val_sets:  1
X_val #matrices:  2
val_metric (used only if X_val #matrices > 0):  auc
at_k (used only if X_val #matrices > 0 and val_metric is r@k or p@k):  10
is_val_transpose:  True
is_linear_last_enc_layer:  False
is_linear_last_dec_layer:  False
#


#### *Fitting...*
- Performs the input transformation and network construction
- (Pre-trains and) trains the model to obtain the entity representations
- Reconstructs the input matrices using the entity representations obtained

In [18]:
# Record the wall-clock start time so the total runtime can be reported after fitting.
start_time = datetime.now()

In [19]:
# Run the fit: input transformation, network construction, (pre-)training and matrix reconstruction.
dcmf_model.fit()

## fold_num:  1  ##
dcmf_base.__init__ - start
dcmf_base.__init__ - end
#
dCMF: 
#
learning_rate:  0.001
weight_decay:  0.05
convg_thres:  0.1
max_epochs:  500
isPretrain:  True
pretrain_thres:  0.1
max_pretrain_epochs:  2
num_chunks:  2
k:  100
kf:  0.5
e_actf:  tanh
d_actf:  tanh
is_gpu:  False
gpu_ids:  1
num entities:  4
num matrices:  10
num_val_sets:  1
X_val #matrices:  2
val_metric (used only if X_val #matrices > 0):  auc
at_k (used only if X_val #matrices > 0 and val_metric is r@k or p@k):  10
is_val_transpose:  True
is_linear_last_enc_layer:  False
is_linear_last_dec_layer:  False
#
dcmf - model construction - start
__input_transformation - start
#
concatenated-matrix construction...
e_id:  e1
X_id_list:  ['X1', 'X2', 'X4', 'X8']
X_id:  X1
X[X_id].shape:  (2661, 2661)
X_id:  X2
X[X_id].shape:  (2661, 4288)
X_id:  X4
X[X_id].shape:  (5546, 2661)
X_id:  X8
X[X_id].shape:  (592, 2661)
C_dict[e].shape:  torch.Size([2661, 13087])
---
e_id:  e2
X_id_list:  ['X2', 'X3', 'X5', 'X9']


In [20]:
# Wall-clock time elapsed since start_time was recorded, reported in seconds.
end_time = datetime.now()
runtime = end_time - start_time
runtime_seconds = runtime.total_seconds()
print(f"Total runtime = {runtime_seconds}")

Total runtime = 128.288569


#### *Result attributes:*
- **out_dict_U**:  dict, keys are validation set IDs and values are dict with entity IDs as keys and the entity representations/encodings as values (returned as torch tensors in this example; convert with `.detach().cpu().numpy()` to obtain an np.array)
- **out_dict_X_prime**: dict, keys are matrix IDs and values are matrix reconstructions
- **out_dict_info**: dict, keys are loss/validation performance attributes and values are corresponding results.

In [21]:
dcmf_model.out_dict_U['1'].keys()

dict_keys(['e1', 'e2', 'e3', 'e4'])

In [22]:
dcmf_model.out_dict_X_prime['1'].keys()

dict_keys(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10'])

In [23]:
dcmf_model.out_dict_info

{'params': {'learning_rate': 0.001,
  'weight_decay': 0.05,
  'convg_thres': 0.1,
  'max_epochs': 500,
  'is_pretrain': True,
  'pretrain_thres': 0.1,
  'max_pretrain_epochs': 2,
  'num_chunks': 2,
  'k': 100,
  'kf': 0.5,
  'e_actf': 'tanh',
  'd_actf': 'tanh',
  'is_linear_last_enc_layer': False,
  'is_linear_last_dec_layer': False},
 'num_val_sets': 1,
 'loss_all_folds': {'1': [5.904464602470398,
   4.30480569601059,
   4.563240230083466,
   0.08669799938797951,
   0.15517068840563297,
   5.06783401966095,
   0.3950411770492792,
   0.23750270530581474,
   13.897372342646122,
   0.054021626710891724,
   0.013634377857670188,
   0.14469264540821314,
   0.12048300029709935,
   0.03301609680056572]},
 'loss_all_folds_avg_tuple': [5.904464602470398,
  4.30480569601059,
  4.563240230083466,
  0.08669799938797951,
  0.15517068840563297,
  5.06783401966095,
  0.3950411770492792,
  0.23750270530581474,
  13.897372342646122,
  0.054021626710891724,
  0.013634377857670188,
  0.1446926454082131

In [24]:
# Entity representations learned for validation set "1", converted to NumPy arrays.
# Entities are looked up by ID instead of by position in the dict, so the mapping
# no longer depends on key order: e1 -> gene, e2 -> disease, e3 -> chemical, e4 -> species.
# .cpu() leaves CPU tensors untouched and keeps the conversion working when is_gpu=True.
entity_emb = dcmf_model.out_dict_U['1']
gene_emb = entity_emb['e1'].detach().cpu().numpy()
disease_emb = entity_emb['e2'].detach().cpu().numpy()
chemical_emb = entity_emb['e3'].detach().cpu().numpy()
species_emb = entity_emb['e4'].detach().cpu().numpy()

In [25]:
import pandas as pd

In [26]:
# Wrap the gene embedding array in a DataFrame and preview its first rows.
gene_emb_df = pd.DataFrame(gene_emb)
gene_emb_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.012586,-0.008959,0.033649,0.038172,0.047238,-0.019642,0.027918,-0.046837,0.035617,0.037527,...,0.014126,-0.00385,-0.083101,0.044,0.000114,-0.038394,0.007904,0.015998,-0.027901,-0.022369
1,0.012552,-0.008971,0.033674,0.038215,0.047258,-0.019656,0.027943,-0.046863,0.035648,0.037529,...,0.014116,-0.003859,-0.083104,0.044027,0.000111,-0.038383,0.007912,0.015982,-0.027914,-0.022379
2,0.012498,-0.008996,0.033681,0.038279,0.047285,-0.019693,0.027974,-0.046895,0.035672,0.037519,...,0.014109,-0.003855,-0.083101,0.044064,0.000125,-0.038385,0.007916,0.015981,-0.027953,-0.022416
3,0.012526,-0.00898,0.033693,0.038247,0.047274,-0.019666,0.027961,-0.046883,0.03567,0.03753,...,0.014109,-0.003867,-0.083107,0.044048,0.000108,-0.038375,0.007919,0.01597,-0.027923,-0.022387
4,0.012526,-0.00898,0.033693,0.038247,0.047274,-0.019667,0.027961,-0.046883,0.03567,0.03753,...,0.014109,-0.003866,-0.083107,0.044048,0.000108,-0.038374,0.00792,0.01597,-0.027923,-0.022387


In [27]:
gene_emb_df.shape

(2661, 100)

In [28]:
# Wrap the disease embedding array in a DataFrame and preview its first rows.
disease_emb_df = pd.DataFrame(disease_emb)
disease_emb_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.004316,-0.084514,-0.02798,0.033638,-0.004902,-0.078709,0.007908,0.059953,0.03102,-0.062141,...,-0.013238,0.030658,-0.019459,-0.035335,0.039591,-0.002671,-0.030361,0.032964,-0.062233,-0.097052
1,-0.004316,-0.084514,-0.02798,0.033638,-0.004901,-0.078708,0.007908,0.059953,0.031021,-0.062141,...,-0.013238,0.03066,-0.019459,-0.035334,0.03959,-0.002671,-0.030362,0.032965,-0.062233,-0.097052
2,-0.004316,-0.084514,-0.027981,0.033638,-0.004901,-0.078708,0.007908,0.059952,0.031021,-0.062141,...,-0.013238,0.03066,-0.01946,-0.035334,0.039591,-0.002671,-0.030362,0.032965,-0.062233,-0.097052
3,-0.004316,-0.08451,-0.027979,0.033636,-0.004898,-0.078714,0.007903,0.059949,0.031016,-0.062136,...,-0.013236,0.030662,-0.019457,-0.035327,0.039591,-0.002675,-0.03036,0.032964,-0.062227,-0.097043
4,-0.004316,-0.084515,-0.027981,0.033638,-0.004901,-0.078708,0.007908,0.059952,0.031021,-0.062141,...,-0.013238,0.03066,-0.019459,-0.035334,0.03959,-0.002671,-0.030362,0.032965,-0.062233,-0.097052


In [29]:
disease_emb_df.shape

(4288, 100)

In [30]:
# Wrap the chemical embedding array in a DataFrame and preview its first rows.
chemical_emb_df = pd.DataFrame(chemical_emb)
chemical_emb_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.001212,-0.049071,-0.025815,0.01318,-0.005914,-0.05358,0.005332,-0.024344,-0.037511,0.041849,...,-0.057714,0.022621,-0.042388,-0.033961,0.07267,-0.023662,0.033286,0.037168,-0.050068,0.021526
1,0.001204,-0.049092,-0.025825,0.013156,-0.005905,-0.053592,0.005288,-0.02439,-0.037525,0.041852,...,-0.057715,0.022637,-0.042375,-0.033959,0.072695,-0.023686,0.033293,0.03718,-0.050067,0.021501
2,0.001238,-0.048997,-0.025776,0.013266,-0.005941,-0.05354,0.005488,-0.024175,-0.037461,0.041841,...,-0.057712,0.022561,-0.042436,-0.033963,0.072582,-0.02358,0.033263,0.037126,-0.050071,0.021615
3,0.001207,-0.049084,-0.025821,0.013165,-0.005908,-0.053587,0.005305,-0.024373,-0.03752,0.041851,...,-0.057716,0.022631,-0.04238,-0.03396,0.072685,-0.023678,0.03329,0.037176,-0.050067,0.02151
4,0.001204,-0.049092,-0.025826,0.013156,-0.005906,-0.053592,0.005288,-0.02439,-0.037525,0.041851,...,-0.057716,0.022637,-0.042375,-0.033959,0.072695,-0.023687,0.033293,0.03718,-0.050067,0.021501


In [31]:
chemical_emb_df.shape

(5546, 100)

In [32]:
# Wrap the species embedding array in a DataFrame and preview its first rows.
species_emb_df = pd.DataFrame(species_emb)
species_emb_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.052899,-0.007933,-0.033996,-0.010222,0.001377,-0.018764,-0.018992,0.014222,-0.053999,0.015202,...,-0.000699,0.006391,0.019961,0.053436,0.030129,-0.052179,0.060046,-0.020806,0.036969,-0.02161
1,-0.0529,-0.007933,-0.033996,-0.010221,0.001377,-0.018764,-0.018993,0.014223,-0.054,0.015202,...,-0.0007,0.006391,0.019961,0.053435,0.03013,-0.052179,0.060046,-0.020806,0.036969,-0.021609
2,-0.052901,-0.007933,-0.033997,-0.01022,0.001377,-0.018764,-0.018994,0.014222,-0.053999,0.015202,...,-0.0007,0.006392,0.019961,0.053436,0.03013,-0.052179,0.060046,-0.020805,0.036968,-0.02161
3,-0.0529,-0.007932,-0.033996,-0.010221,0.001377,-0.018764,-0.018992,0.014222,-0.054,0.015202,...,-0.0007,0.006392,0.01996,0.053435,0.03013,-0.052179,0.060046,-0.020806,0.036969,-0.02161
4,-0.0529,-0.007933,-0.033996,-0.010221,0.001377,-0.018764,-0.018992,0.014222,-0.054,0.015202,...,-0.0007,0.006392,0.01996,0.053436,0.03013,-0.052179,0.060047,-0.020806,0.036968,-0.02161


In [33]:
species_emb_df.shape

(592, 100)

In [34]:
# Stack the four per-entity embedding tables row-wise (gene, disease, chemical,
# species, in that order) and renumber the rows consecutively from 0.
emb_df = pd.concat(
    [gene_emb_df, disease_emb_df, chemical_emb_df, species_emb_df],
    axis=0,
    ignore_index=True,
)
emb_df.shape

(13087, 100)

In [35]:
emb_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.012586,-0.008959,0.033649,0.038172,0.047238,-0.019642,0.027918,-0.046837,0.035617,0.037527,...,0.014126,-0.00385,-0.083101,0.044,0.000114,-0.038394,0.007904,0.015998,-0.027901,-0.022369
1,0.012552,-0.008971,0.033674,0.038215,0.047258,-0.019656,0.027943,-0.046863,0.035648,0.037529,...,0.014116,-0.003859,-0.083104,0.044027,0.000111,-0.038383,0.007912,0.015982,-0.027914,-0.022379
2,0.012498,-0.008996,0.033681,0.038279,0.047285,-0.019693,0.027974,-0.046895,0.035672,0.037519,...,0.014109,-0.003855,-0.083101,0.044064,0.000125,-0.038385,0.007916,0.015981,-0.027953,-0.022416
3,0.012526,-0.00898,0.033693,0.038247,0.047274,-0.019666,0.027961,-0.046883,0.03567,0.03753,...,0.014109,-0.003867,-0.083107,0.044048,0.000108,-0.038375,0.007919,0.01597,-0.027923,-0.022387
4,0.012526,-0.00898,0.033693,0.038247,0.047274,-0.019667,0.027961,-0.046883,0.03567,0.03753,...,0.014109,-0.003866,-0.083107,0.044048,0.000108,-0.038374,0.00792,0.01597,-0.027923,-0.022387


In [36]:
# Export all embeddings to a text file: an empty first line, then one line per
# row of emb_df formatted as "<row index>\t<space-separated float32 values>".
with open(f"emb_PubMed_sample_{sample_no}.dat", "w") as out_file:
    out_file.write("\n")
    for node_idx, node_row in emb_df.iterrows():
        values_as_text = node_row[:].astype(np.float32).astype(str)
        joined_values = ' '.join(values_as_text)
        out_file.write(f'{node_idx}\t{joined_values}\n')
    