## Integrate simulated Cancer Omics dataset; load the saved model and perform feature importance extraction

In [1]:
# Import packages and IntegrAO code
import numpy as np
import pandas as pd
import snf
from sklearn.cluster import spectral_clustering
from sklearn.metrics import v_measure_score
import matplotlib.pyplot as plt

import sys
import os
import argparse
import torch

import umap
from sklearn.model_selection import train_test_split

# Add the parent directory of "integrao" to the Python path
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from integrao.dataset import GraphDataset
from integrao.main import dist2
from integrao.integrater import integrao_integrater, integrao_predictor

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Hyperparameters for this run.
# The first group is forwarded as keyword arguments to both
# integrao_integrater and integrao_predictor further down; the two objects
# must be built with identical values.
embedding_dims = 64
neighbor_size = 20
fusing_iteration = 30
alighment_epochs = 1000  # spelling matches the keyword name used in the IntegrAO calls
normalization_factor = 1.0
mu = 0.5
beta = 1.0

# Run identity (names the results sub-directory) and the number of clusters
# requested from spectral clustering.
cluster_number = 15
dataset_name = 'unsupervised_integration_feature_importance'

In [3]:
# Create the output directory for this run (<module_path>/results/<dataset_name>).
# exist_ok=True keeps the cell idempotent on re-runs and removes the
# check-then-create race of testing os.path.exists() first.
result_dir = os.path.join(
    module_path, "results/{}".format(dataset_name)
)
os.makedirs(result_dir, exist_ok=True)

## Read data

In [4]:
# Paths of the three simulated omics matrices and the ground-truth cluster table.
testdata_dir = os.path.join(module_path, "data/omics/")
methyl_, expr_, protein_ = (
    os.path.join(testdata_dir, file_name)
    for file_name in ("omics1.txt", "omics2.txt", "omics3.txt")
)
truelabel = os.path.join(testdata_dir, "clusters.txt")

# Every file is tab-separated with its first column used as the index.
# The omics matrices are transposed after loading; in the recorded output each
# one then has 500 rows (presumably one row per sample -- the later cells
# select samples with .iloc on rows).
methyl = pd.read_csv(methyl_, sep="\t", index_col=0).T
expr = pd.read_csv(expr_, sep="\t", index_col=0).T
protein = pd.read_csv(protein_, sep="\t", index_col=0).T
truelabel = pd.read_csv(truelabel, sep="\t", index_col=0)

# Report the shape of each loaded table as a quick sanity check.
for loaded_table in (methyl, expr, protein, truelabel):
    print(loaded_table.shape)
print("finish loading data!")

(500, 367)
(500, 131)
(500, 160)
(500, 2)
finish loading data!


## Randomly sub-sample the omics dataset to create an incomplete dataset

In [5]:
# Build an incomplete multi-omics dataset: `ratio` of the samples are kept in
# all three modalities ("common"), and every remaining sample is assigned to
# exactly one modality.
ratio = 0.7

# Fixed seed so that Restart & Run All draws the same subsets every time;
# without it each run trains on a different incomplete dataset.
split_seed = 0

full_indices = range(len(methyl))
unique_indices, common_indices = train_test_split(
    full_indices, test_size=ratio, random_state=split_seed
)

# Split the modality-specific samples into three groups (w3 takes test_size=0.33
# of them, the rest is halved between w1 and w2).
w1w2_indices, w3_indices = train_test_split(
    unique_indices, test_size=0.33, random_state=split_seed
)
w1_indices, w2_indices = train_test_split(
    w1w2_indices, test_size=0.5, random_state=split_seed
)

# list(...) makes the intent (concatenation) explicit and guards against
# element-wise addition should the splits ever come back as arrays.
w1_full_indices = list(common_indices) + list(w1_indices)
w2_full_indices = list(common_indices) + list(w2_indices)
w3_full_indices = list(common_indices) + list(w3_indices)

# Positional row selection: each modality keeps the common samples plus its own.
methyl_temp = methyl.iloc[w1_full_indices]
expr_temp = expr.iloc[w2_full_indices]
protein_temp = protein.iloc[w3_full_indices]


## IntegrAO integration

In [6]:
# Seed the global RNGs so that model training and clustering are repeatable.
# NOTE(review): assumes IntegrAO draws from the global NumPy / PyTorch
# generators and does not re-seed internally -- confirm against the library.
integration_seed = 0
np.random.seed(integration_seed)
torch.manual_seed(integration_seed)

# Initialize integrater on the incomplete (sub-sampled) modalities
integrater = integrao_integrater(
    [methyl_temp, expr_temp, protein_temp],
    dataset_name,
    neighbor_size=neighbor_size,
    embedding_dims=embedding_dims,
    fusing_iteration=fusing_iteration,
    normalization_factor=normalization_factor,
    alighment_epochs=alighment_epochs,
    beta=beta,
    mu=mu,
)
# network_diffusion() indexes the inputs and applies diffusion (see the logged
# output); unsupervised_alignment() then trains the model and returns the
# embeddings, the final similarity matrix and the trained model.
fused_networks = integrater.network_diffusion()
embeds_final, S_final, model = integrater.unsupervised_alignment()

# random_state pins the stochastic parts of spectral clustering.
labels = spectral_clustering(
    S_final, n_clusters=cluster_number, random_state=integration_seed
)

# NOTE(review): this assumes the rows of S_final follow the same order as the
# 'subjects' column sorted ascending -- confirm against the integrater.
true_labels = truelabel.sort_values('subjects')['cluster.id'].tolist()

# v_measure_score is reported under the label "NMI" in the message below.
score_all = v_measure_score(true_labels, labels)
print("IntegrAO for clustering union 500 samples NMI score: ", score_all)

Start indexing input expression matrices!
Common sample between view0 and view1: 350
Common sample between view0 and view2: 350
Common sample between view1 and view2: 350
Neighbor size: 20
Start applying diffusion!
Diffusion ends! Times: 4.647532224655151s
Starting unsupervised exmbedding extraction!
Dataset 0: (400, 367)
Dataset 1: (400, 131)
Dataset 2: (400, 160)
epoch 0: loss 30.287778854370117, align_loss:0.747223
epoch 100: loss 20.88467025756836, align_loss:0.178473
epoch 200: loss 1.1458323001861572, align_loss:0.092696
epoch 300: loss 1.144501805305481, align_loss:0.091768
epoch 400: loss 1.1429835557937622, align_loss:0.090755
epoch 500: loss 1.1412932872772217, align_loss:0.089755
epoch 600: loss 1.1394864320755005, align_loss:0.088606
epoch 700: loss 1.1375226974487305, align_loss:0.087426
epoch 800: loss 1.135468602180481, align_loss:0.086274
epoch 900: loss 1.1333705186843872, align_loss:0.085182
Manifold alignment ends! Times: 7.6544740200042725s
IntegrAO for clustering u

In [7]:
# Persist only the trained weights (state_dict) inside the run's result directory.
checkpoint_path = os.path.join(result_dir, "model_integrao_unsupervised.pth")
torch.save(model.state_dict(), checkpoint_path)

## Now load the saved model and perform embedding extraction using the trained model

In [8]:
# Build the predictor on the complete (non-sub-sampled) modalities.
# It must receive exactly the same hyperparameters as the integrater that
# produced the saved model.
predictor = integrao_predictor(
    [methyl, expr, protein],
    dataset_name,
    modalities_name_list=["methyl", "expr", "protein"],
    mu=mu,
    beta=beta,
    alighment_epochs=alighment_epochs,
    normalization_factor=normalization_factor,
    fusing_iteration=fusing_iteration,
    embedding_dims=embedding_dims,
    neighbor_size=neighbor_size,
)
# Network fusion over the whole graph (indexing followed by diffusion, per the log).
fused_networks = predictor.network_diffusion()

Start indexing input expression matrices!
Common sample between view0 and view1: 500
Common sample between view0 and view2: 500
Common sample between view1 and view2: 500
Neighbor size: 20
Start applying diffusion!
Diffusion ends! Times: 5.997241497039795s


In [9]:
# Load the saved weights and run inference to obtain the patient embeddings
# together with the final similarity matrix.
model_path = os.path.join(result_dir, "model_integrao_unsupervised.pth")
final_embedding_df, S_final = predictor.inference_unsupervised(
    model_path,
    new_datasets=[methyl, expr, protein],
    modalities_names=["methyl", "expr", "protein"],
)

# random_state pins the stochastic parts of spectral clustering so the
# reported score is the same on every run.
labels = spectral_clustering(S_final, n_clusters=cluster_number, random_state=0)

# NOTE(review): this assumes the rows of S_final follow the same order as the
# 'subjects' column sorted ascending -- confirm against the predictor.
true_labels = truelabel.sort_values('subjects')['cluster.id'].tolist()

# v_measure_score is reported under the label "NMI" in the message below.
score_all = v_measure_score(true_labels, labels)
print("IntegrAO for clustering union 500 samples NMI score: ", score_all)

Loaded pre-trained model with success.
IntegrAO for clustering union 500 samples NMI score:  1.0000000000000002


## Now extract the feature importance for the unsupervised integration; the extracted feature importance will be saved in the result dir

In [None]:
# Run feature-importance extraction with the saved model on the full datasets.
# result_dir is handed to the call as the output location; the return value is
# kept in df_list (presumably one table per modality -- TODO confirm).
df_list = predictor.interpret_unsupervised(
    model_path=model_path,
    result_dir=result_dir,
    new_datasets=[methyl, expr, protein],
    modalities_names=["methyl", "expr", "protein"],
)