# SNF with PCA to remove s100b batch effect
s100b is a biological covariate that causes noise in clustering.\
PCA is applied to each dataset. Principal components whose Spearman correlation with s100b exceeds 0.3 in absolute value are removed, and SNF is run on the remaining principal components.

In [1]:
import os
import numpy as np
import pandas as pd
import patsy
import scipy.stats as stats

import snf
from snf import metrics
from sklearn.cluster import spectral_clustering
from sklearn.metrics import v_measure_score
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from scipy.stats import spearmanr

### Load and format data

In [2]:
# Load data
def load_data():
    """Read the three omics tables and the sample metadata from disk.

    Each CSV is re-indexed by its 'Unnamed: 0' column.

    Returns:
        list: [prot, tran, epig, meta] DataFrames, in that order.
    """
    # Proteomics
    prot = pd.read_csv('/nfs/answer/fraenkel_internal/current_omics/proteomics/normalized_imp_corrected_proteomics204_03292022.csv')
    prot = prot.set_index('Unnamed: 0')

    # Transcriptomics
    tran = pd.read_csv('/nfs/answer/fraenkel_internal/current_omics/transcriptomics/rna_norm_196_02222023.csv.gz')
    tran = tran.set_index('Unnamed: 0')

    # Epigenomics
    epig = pd.read_csv('/nfs/answer/fraenkel_internal/current_omics/epigenomics/atac_norm_196_02222023.csv.gz')
    epig = epig.set_index('Unnamed: 0')

    # Metadata
    # `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='warn'`
    # keeps its behaviour: malformed rows are skipped and a warning is emitted.
    meta = pd.read_csv('/nfs/answer/fraenkel_internal/current_omics/proteomics/full_prot_metadata203_03082023.csv',
                       on_bad_lines='warn')
    meta = meta.set_index('Unnamed: 0')

    return [prot, tran, epig, meta]

In [3]:
# Switch rows and columns to match snf format
def transpose(data):
    """Flip rows and columns of every table except the last one.

    Args:
        data: sequence of DataFrames whose final element (the metadata) must
            be passed through unchanged.

    Returns:
        list: the transposed tables in their original order, followed by the
            untouched final element.
    """
    flipped = [table.T for table in data[:-1]]
    flipped.append(data[-1])
    return flipped

In [4]:
# Keep only samples present in all datasets
def filter_and_sort(data):
    """Keep only the samples present in every table and sort rows by sample ID.

    Args:
        data: list of DataFrames indexed by sample ID (one row per sample);
            the last element is the metadata table.

    Returns:
        list: the tables in their original order, each restricted to the
            samples found in all input tables and sorted by index.
    """
    # Start from the metadata samples and narrow them with every other table,
    # so that a sample missing from any single table is dropped everywhere
    # and all returned tables share the same row set.
    shared = data[-1].index
    for table in data[:-1]:
        shared = shared[shared.isin(table.index)]

    return [table[table.index.isin(shared)].sort_index() for table in data]

In [5]:
# Remove samples (rows) that have no s100b value in the metadata
def remove_s100b_na(data):
    """Discard every sample whose metadata row has a missing 's100b' value.

    Args:
        data: sequence of DataFrames indexed by sample ID; the last element is
            the metadata and must contain an 's100b' column.

    Returns:
        list: the leading tables restricted to the samples that still appear
            in the filtered metadata, followed by the filtered metadata.
    """
    metadata = data[-1]
    has_s100b = metadata['s100b'].notna()
    metadata = metadata[has_s100b]

    kept = [table[table.index.isin(metadata.index)] for table in data[:-1]]
    kept.append(metadata)
    return kept

In [6]:
# Build the analysis inputs step by step: read the tables, put samples on
# rows, restrict to common samples in sorted order, then drop samples that
# have no s100b value. The four resulting tables are used by later cells.
data = load_data()
data = transpose(data)
data = filter_and_sort(data)
prot, tran, epig, meta = remove_s100b_na(data)

### PCA

In [7]:
# PCA is applied to each dataset
def do_pca(data, n_components=50, corr_threshold=0.3):
    """Run PCA on each omics table and drop the components that track s100b.

    For every table except the last one: features are mean-centred, the half
    of the features with the largest variance is kept, PCA is fitted, and each
    component whose Spearman correlation with s100b exceeds `corr_threshold`
    in absolute value is removed.

    Args:
        data: list of DataFrames with one row per sample; the last element is
            the metadata and must contain an 's100b' column. The correlation
            pairs values by position, so every table is assumed to list its
            samples in the same order as the metadata.
        n_components: number of principal components computed per table.
        corr_threshold: absolute Spearman correlation above which a component
            is discarded.

    Returns:
        list: one DataFrame of retained components per omics table (indexed
            by sample, columns are component numbers), followed by the
            unchanged metadata.
    """
    meta = data[-1]
    s100b = meta['s100b']

    pca = PCA(n_components=n_components, random_state=5)
    # Centre only (no scaling), so the variance ranking below reflects the
    # spread of the original features.
    scaler = StandardScaler(with_mean=True, with_std=False)

    pca_data = []
    for df in data[:-1]:
        df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

        # Keep the most variable half of the features
        keep = df.var().sort_values(ascending=False)
        df = df[keep.index[:round(len(keep) / 2)]]

        principal_df = pd.DataFrame(data=pca.fit_transform(df), index=df.index)

        # Principal components that are most correlated to s100b are removed.
        # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
        drop = [
            col
            for col, col_data in principal_df.items()
            if abs(spearmanr(s100b, col_data)[0]) > corr_threshold
        ]
        pca_data.append(principal_df.drop(columns=drop))

    pca_data.append(meta)

    # Remaining principal components are returned
    return pca_data

In [8]:
# Reduce each omics table to its retained principal components; `meta` is
# rebound to the metadata table that do_pca returns as its last element.
prot_pca, tran_pca, epig_pca, meta = do_pca([prot, tran, epig, meta])

### Label

In [9]:
# Case
# Reference labels taken from the 'Case' metadata column: 'CASE' -> 1, 'CTRL' -> 0
actual_labels = meta['Case'].map({'CASE': 1, 'CTRL': 0}).array

### Apply SNF

In [10]:
def apply_snf(prot, tran, epig, K=14, mu=0.5, random_state=None):
    '''
    Apply SNF to data.
    Args:
        prot: DataFrame of proteomics data, one row per sample
        tran: DataFrame of transcriptomics data, one row per sample
        epig: DataFrame of epigenomics data, one row per sample
        K: number of nearest neighbors used both when constructing the
            affinity matrices and during fusion (default 14; a common
            rule of thumb is sqrt(N) for N samples)
        mu: scaling factor that weights the affinity matrices (default 0.5)
        random_state: forwarded to spectral_clustering; pass an int for
            reproducible cluster labels (default None keeps the previous,
            unseeded behaviour)
    Returns:
        best: best number of clusters estimated from the fused network
        affinity_networks: array with coefficients of similarity
            between each pair of patients for a single modality
        fused_network: array with coefficients of fused similarity
            between each pair of patients
        fused_labels: array with one cluster label per patient, taking
            `best` distinct values, determined by SNF and spectral
            clustering
    '''
    data = [prot, tran, epig]

    # One affinity matrix per modality
    affinity_networks = snf.make_affinity(data, metric='euclidean', K=K, mu=mu)

    # Run SNF algorithm
    fused_network = snf.snf(affinity_networks, K=K)

    # Estimate the number of clusters in the data via the "eigengap" method;
    # the second-best estimate is not used.
    best, _ = snf.get_n_clusters(fused_network)

    # Get labels from clusters
    fused_labels = spectral_clustering(fused_network, n_clusters=best,
                                       random_state=random_state)

    return best, affinity_networks, fused_network, fused_labels

# Earlier variant kept for reference: inputs normalized in code. It is not
# runnable as written: prot_z, tran_z and epig_z are not defined in the cells
# shown here, and it unpacks three values while apply_snf returns four.
#best, fused_network, fused_labels = apply_snf(prot_z, tran_z, epig_z)
# Current run: fuse the principal components returned by do_pca.
best, affinity_networks, fused_network, fused_labels = apply_snf(prot_pca,
                                                                 tran_pca,
                                                                 epig_pca)
# Display the estimated number of clusters.
best

2

In [11]:
# Display the cluster label assigned to each sample by spectral clustering.
fused_labels

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0], dtype=int32)

In [12]:
# Scores to evaluate performance of SNF
def score(best, affinity_networks, fused_network, fused_labels, actual_labels):
    '''
    Scores to evaluate performance of SNF.
    Args:
        best: number of clusters; accepted for call compatibility, not used
        affinity_networks: per-modality affinity arrays; accepted for call
            compatibility, not used
        fused_network: fused similarity array. NOTE: its diagonal is set
            to 0 IN PLACE, so the caller's array is modified
        fused_labels: cluster labels derived from the fused network
        actual_labels: reference labels to compare against
    Returns:
        tuple (v_measure, nmi, sil):
            v_measure: v_measure_score of fused vs. actual labels
            nmi: normalized mutual information of fused vs. actual labels
            sil: silhouette score of the fused labels on the fused network
    '''
    # v_measure_score ranges from 0 to 1, where 1 indicates perfect overlap
    # between the derived and true labels and 0 indicates no overlap.
    # Arguments follow sklearn's (labels_true, labels_pred) order; the score
    # is symmetric, so the value does not depend on the order.
    v_measure = v_measure_score(actual_labels, fused_labels)

    # Normalized mutual information score (NMI) between the true labels and
    # the labels generated by SNF. 0 indicates no overlap and 1 indicates a
    # perfect correspondence between the two sets of labels.
    nmi = normalized_mutual_info_score(actual_labels, fused_labels)

    # Silhouette score ranges from -1 to 1, where -1 indicates a poor
    # clustering solution and 1 indicates a fantastic solution. The diagonal
    # (self-similarity) is zeroed first; this mutates `fused_network`.
    np.fill_diagonal(fused_network, 0)
    sil = metrics.silhouette_score(fused_network, fused_labels)

    return v_measure, nmi, sil

# Score the fused clustering against the case/control labels and store the
# three returned values under readable names.
scores = score(best, affinity_networks, fused_network, fused_labels, actual_labels)
scores_dict = {'v_measure_score': scores[0], 'nmi': scores[1], 'silhouette score': scores[2]}
# Display the scores.
scores_dict

{'v_measure_score': 0.03134589907277121,
 'nmi': array([[1.        , 0.0313459 , 0.00294143, 0.00492028, 0.02259586],
        [0.0313459 , 1.        , 0.01125853, 0.0662375 , 0.21249449],
        [0.00294143, 0.01125853, 1.        , 0.00127733, 0.0029285 ],
        [0.00492028, 0.0662375 , 0.00127733, 1.        , 0.03545742],
        [0.02259586, 0.21249449, 0.0029285 , 0.03545742, 1.        ]]),
 'silhouette score': 0.037547142466343754}

### Network visualization
Visualize connections within/between clusters generated by SNF.

In [13]:
import networkx as nx 
from pyvis import network as net

import matplotlib.cm as cm
import matplotlib as mpl

In [14]:
def add_node_colors(network, col_attribute, cmap_name="Accent"):
    """
    Function to add node colors based on existing node attribute.

    Each distinct attribute value is assigned one color from the colormap,
    and the resulting hex string (with alpha) is stored on every node that
    has the attribute, under the node attribute 'color'. The network is
    modified in place; nothing is returned.

    network: networkx object
    col_attribute: Name of attribute to use for determining node colors
    cmap_name: Name of matplotlib colormap to use (default: 'Accent')
    """
    source_att = nx.get_node_attributes(network, col_attribute)
    if not source_att:
        # No node carries the attribute, so there is nothing to color
        return

    # Distinct attribute values, computed once
    categories = list(set(source_att.values()))

    # Get colors. pyplot.get_cmap is used because matplotlib.cm.get_cmap was
    # deprecated in matplotlib 3.7 and removed in 3.9.
    cmap = plt.get_cmap(cmap_name, len(categories))
    category_index = {value: i for i, value in enumerate(categories)}
    node_colors = {
        node: mpl.colors.rgb2hex(cmap(category_index[value]), keep_alpha=True)
        for node, value in source_att.items()
    }

    # Set colors
    nx.set_node_attributes(network, node_colors, 'color')

In [15]:
n = len(fused_network)
fused_network_vis = fused_network.copy()

# Sparsify the copy for plotting: every similarity below the 75th percentile
# of all entries is set to 0, which avoids too many network connections.
cutoff = np.percentile(fused_network_vis, 75)
fused_network_vis[fused_network_vis < cutoff] = 0
fused_network_vis

array([[0.        , 0.        , 0.        , ..., 0.00676185, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00564674],
       ...,
       [0.00676185, 0.        , 0.        , ..., 0.        , 0.        ,
        0.00568299],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00564674, ..., 0.00568299, 0.        ,
        0.        ]])

In [16]:
# Create networkx object
# Create networkx object from the thresholded similarity array.
# from_numpy_array replaces from_numpy_matrix, which networkx 3.0 removed.
nt = nx.from_numpy_array(fused_network_vis)
# Rename the integer node IDs to the sample IDs of the proteomics index
nt = nx.relabel_nodes(nt, dict(zip(np.arange(n), prot.index.values)))

# Set some attributes from metadata (on a copy, so `meta` itself is unchanged)
meta_net = meta.copy()
meta_net['fused_labels'] = fused_labels
nx.set_node_attributes(nt, meta_net.loc[:, ['Case', 'Sex.x', 'fused_labels']].to_dict('index'))

# Set node colors
add_node_colors(nt, col_attribute='fused_labels')
# Display the attributes of one example node
nt.nodes['NEUAB000NKC']

{'Case': 'CASE', 'Sex.x': 'Female', 'fused_labels': 0, 'color': '#7fc97fff'}

In [17]:
# Subgraph for ease of visualization
# Subgraph for ease of visualization: only the first 75 nodes of the graph
sub_nt = nt.subgraph(list(nt.nodes)[:75])

# 750px x 750px pyvis canvas configured for notebook use
ntw = net.Network('750px', '750px', notebook=True)
# Turn the physics simulation off
ntw.toggle_physics(False)
# Copy nodes, edges and their attributes from the networkx subgraph
ntw.from_nx(sub_nt)

# Render the network to 'nx.html'
ntw.show('nx.html')

nx.html
