In [1]:
import os
import numpy as np
import pandas as pd
import patsy
import scipy.stats as stats

import snf
from snf import metrics
from sklearn.cluster import spectral_clustering
from sklearn.metrics import v_measure_score
from sklearn.metrics.cluster import normalized_mutual_info_score

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from scipy.stats import spearmanr

### Load and format data

In [2]:
# Load data
def load_data(data_dir='/nfs/answer/fraenkel_internal/current_omics'):
    '''
    Load the normalized omics tables and the sample metadata from disk.
    Every CSV is indexed by its 'Unnamed: 0' column.
    Args:
        data_dir: root directory containing the 'proteomics',
            'transcriptomics' and 'epigenomics' sub-directories.
            Defaults to the location that used to be hard-coded, so
            existing `load_data()` calls behave as before.
    Returns:
        [prot, tran, epig, meta]: list of four DataFrames, in that order
    '''
    # The raw (un-normalized) counterparts that used to be referenced here are
    # 'proteomics/proteomics_corr_raw_203.csv',
    # 'transcriptomics/rna_raw_196_02222023.csv' and
    # 'epigenomics/atac_raw_196_02222023.csv'.

    def read_indexed(*parts, **read_kwargs):
        # Read one CSV below data_dir and use its 'Unnamed: 0' column as index
        table = pd.read_csv(os.path.join(data_dir, *parts), **read_kwargs)
        return table.set_index('Unnamed: 0')

    # Proteomics
    prot = read_indexed('proteomics', 'normalized_imp_corrected_proteomics204_03292022.csv')

    # Transcriptomics
    tran = read_indexed('transcriptomics', 'rna_norm_196_02222023.csv.gz')

    # Epigenomics
    epig = read_indexed('epigenomics', 'atac_norm_196_02222023.csv.gz')

    # Metadata: malformed lines are skipped (with a warning) instead of raising
    meta_parts = ('proteomics', 'full_prot_metadata203_03082023.csv')
    try:
        # `error_bad_lines` was removed in pandas 2.0; `on_bad_lines` replaces it
        meta = read_indexed(*meta_parts, on_bad_lines='warn')
    except TypeError:
        # Older pandas releases do not know `on_bad_lines` yet
        meta = read_indexed(*meta_parts, error_bad_lines=False)

    return [prot, tran, epig, meta]

In [3]:
# Switch rows and columns to match snf format
def transpose(data):
    '''
    Transpose every table except the last one.
    Args:
        data: sequence of DataFrames; the last element (the metadata) is
            passed through untouched
    Returns:
        list with `d.T` for every element but the last, followed by the
        last element itself
    '''
    return [table.T for table in data[:-1]] + [data[-1]]

In [4]:
# Keep only samples present in all datasets
def filter_and_sort(data):
    '''
    Keep only the samples present in ALL tables and sort each table by index.
    The previous version only intersected the metadata with the epigenomics
    index, so a sample missing from another table left the outputs with
    different row sets. Here every index takes part in the intersection,
    which guarantees that all returned tables share the same index labels.
    Args:
        data: list of DataFrames indexed by sample ID (omics tables followed
            by the metadata)
    Returns:
        list of DataFrames in the same order as `data`, each restricted to
        the shared sample IDs and sorted by index; [] if `data` is empty
    '''
    if not data:
        return []

    # Sample IDs common to every table
    shared = data[0].index
    for table in data[1:]:
        shared = shared.intersection(table.index)

    return [table[table.index.isin(shared)].sort_index() for table in data]

In [5]:
# Load -> transpose -> restrict to the retained samples, sorted by sample ID
data = transpose(load_data())
prot, tran, epig, meta = filter_and_sort(data)
#prot_raw, prot, tran_raw, tran, epig_raw, epig, meta = filter_and_sort(data)

### Normalize data

In [6]:
def normalize(prot_raw, tran_raw, epig_raw):
    '''
    Drop all-zero columns and z-score every remaining column.
    Args:
        prot_raw: DataFrame of raw proteomics data
        tran_raw: DataFrame of raw transcriptomics data
        epig_raw: DataFrame of raw epigenomics data
    Returns:
        (prot_z, tran_z, epig_z): float DataFrames with the same index as
        the inputs; the inputs themselves are not modified
    '''
    def zscore_columns(df):
        # Keep only the columns that contain at least one non-zero entry
        kept = df.loc[:, (df != 0).any(axis=0)].astype(float)
        # One vectorized call instead of a Python loop over columns
        # (DataFrame.iteritems no longer exists in pandas >= 2.0)
        return pd.DataFrame(stats.zscore(kept.to_numpy(), axis=0),
                            index=kept.index, columns=kept.columns)

    prot_z, tran_z, epig_z = (zscore_columns(df) for df in (prot_raw, tran_raw, epig_raw))
    return prot_z, tran_z, epig_z

#prot_z, tran_z, epig_z = normalize(prot_raw, tran_raw, epig_raw)

### Define labels to predict

In [7]:
# List the metadata columns that are available as candidate labels
meta.columns

Index(['GUID_orig', 'GUID_vial', 'GUID', 'attribute_ExperimentalGroup',
       'attribute_SampleType', 'attribute_TechnicalGroup', 'attribute_Species',
       'attribute_DataType', 'attribute_BiologicalGroup', 'Level2', 'Level3',
       'order', 'MS_batch', 'Digestion_batch', 'Differentiation_batch', 'Case',
       'Sex.x', 'Race', 'Site.of.Onset', 'ALSFRS.R.Baseline', 'nefh', 'isl1',
       'nkx6', 'tuj1', 's100b', 'nestin', 'Primary.Tissue', 'SOD1', 'C9',
       'PBMC', 'CHMP7', 'Group', 'Cell.Line', 'Site', 'shipment', 'Batch #',
       'Sex.y', 'Mean DAPI', '%SMI32', '%ISL1', '%NKX6.1', '%TUJ1', '%S100b',
       '% Nestin', 'Number of Visits', 'Subject Group', 'Age At Symptom Onset',
       'Age At Death', 'ALSFRS-R Baseline', 'ALSFRS-R Latest',
       'ALSFRS-R Progression Slope', 'Sex', 'Primary Tissue', 'progressor',
       'estimated_slope_alsfrsr', 'SOD1_gen'],
      dtype='object')

In [8]:
# Inspect the distinct values of the 'Case' column before encoding it
meta['Case'].unique()

array(['CASE', 'CTRL'], dtype=object)

In [9]:
# progressor: integer-encode the progression class (missing values -> 3)
actual_labels = meta['progressor'].map({
    'AMBIGUOUS': 0,
    'fast': 1,
    'slow': 2,
    np.nan: 3,
}).array

In [10]:
# Site.of.Onset: integer-encode the onset site (missing values -> 5)
actual_labels = meta['Site.of.Onset'].map({
    'Limb': 0,
    'Bulbar': 1,
    'Axial': 2,
    'Multiple': 3,
    'Other': 4,
    np.nan: 5,
}).array

In [11]:
# Case: 1 for 'CASE', 0 for 'CTRL'
actual_labels = meta['Case'].map({
    'CASE': 1,
    'CTRL': 0,
}).array

In [12]:
# Sex.x: 1 for 'Male', 0 for 'Female'
actual_labels = meta['Sex.x'].map({
    'Male': 1,
    'Female': 0,
}).array

In [13]:
# ALSFRS-R Progression Slope: missing slopes are replaced by 0.0.
# fillna returns a new Series, so `meta` is not modified and pandas no longer
# emits SettingWithCopyWarning (the old code assigned through a column of `meta`).
# NOTE(review): these labels are continuous, while the scoring cell uses
# clustering metrics on label sets - confirm this is the intended use.
actual_labels = meta['ALSFRS-R Progression Slope'].fillna(0.0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [14]:
# C9: integer-encode the C9 status
actual_labels = meta['C9'].map({
    'POS': 0,
    'NEG': 1,
    'CTRL': 2,
    'UNK': 3,
}).array
# Alternative that drops the unknown samples first:
#meta_f = meta_f[meta_f.C9 != 'UNK']
#actual_labels = meta_f['C9'].map({'POS': 0, 'NEG': 1, 'CTRL': 2}).array

In [15]:
# SOD1: integer-encode the SOD1 status
actual_labels = meta['SOD1'].map({
    'POS': 0,
    'NEG': 1,
    'CTRL': 2,
    'UNK': 3,
}).array

In [16]:
# PBMC: integer-encode the PBMC cell type (missing values -> 2)
actual_labels = meta['PBMC'].map({
    'PBMC/T-Cell': 0,
    'PBMC/NT-Cell': 1,
    np.nan: 2,
}).array

### Apply SNF

In [25]:
def apply_snf(prot, tran, epig, K=14, mu=0.5, metric='euclidean', random_state=None):
    '''
    Apply SNF to data.
    Args:
        prot: DataFrame of normalized proteomics data
        tran: DataFrame of normalized transcriptomics data
        epig: DataFrame of normalized epigenomics data
        K: number of nearest neighbors used both when building the affinity
            matrices and when fusing them (a good value for K is sqrt(N))
        mu: scaling factor that weights the affinity matrix
        metric: distance metric passed to snf.make_affinity
        random_state: forwarded to spectral_clustering; pass an int to make
            the cluster labels reproducible across runs. The default (None)
            keeps the previous, unseeded behavior.
    Returns:
        best: best number of clusters according to snf.get_n_clusters
        affinity_networks: arrays with coefficients of similarity
            between each pair of patients, one per modality
        fused_network: array with coefficients of fused similarity
            between each pair of patients
        fused_labels: array with one cluster label per patient, obtained by
            spectral clustering of the fused network into `best` clusters
    '''
    data = [prot, tran, epig]

    # One affinity matrix per modality
    affinity_networks = snf.make_affinity(data, metric=metric, K=K, mu=mu)

    # Run SNF algorithm with the same neighborhood size
    fused_network = snf.snf(affinity_networks, K=K)

    # Estimate the number of clusters in the data via the "eigengap" method;
    # the second-best estimate is not used
    best, _second = snf.get_n_clusters(fused_network)

    # Get labels from clusters
    fused_labels = spectral_clustering(fused_network, n_clusters=best,
                                       random_state=random_state)

    return best, affinity_networks, fused_network, fused_labels

# Alternative: run on data z-scored by normalize() (apply_snf returns four values)
#best, affinity_networks, fused_network, fused_labels = apply_snf(prot_z, tran_z, epig_z)
# Run on the already normalized data loaded above
best, affinity_networks, fused_network, fused_labels = apply_snf(prot, tran, epig)
best

2

In [18]:
# Cluster label assigned to each sample by apply_snf
fused_labels

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1],
      dtype=int32)

In [19]:
def score(best, affinity_networks, fused_network, fused_labels, actual_labels):
    '''
    Scores to evaluate performance of SNF against a reference labelling.
    Args:
        best: number of clusters (not used by the scores; kept so existing
            calls keep working)
        affinity_networks: per-modality affinity matrices (not used by the
            scores; kept so existing calls keep working)
        fused_network: fused similarity matrix. NOTE: its diagonal is set
            to 0 IN PLACE before the silhouette score is computed, so the
            caller's array is modified.
        fused_labels: cluster labels derived from the fused network
        actual_labels: reference labels to compare against
    Returns:
        (v_measure, nmi, sil): tuple with
            v_measure: ranges from 0 to 1, where 1 indicates perfect overlap
                between the derived and true labels and 0 indicates no overlap
            nmi: normalized mutual information between actual_labels and
                fused_labels; 0 indicates no overlap and 1 a perfect
                correspondence (in the recorded run it equals v_measure)
            sil: silhouette score, from -1 (poor clustering solution) to
                1 (fantastic solution)
    '''
    v_measure = v_measure_score(fused_labels, actual_labels)

    nmi = normalized_mutual_info_score(actual_labels, fused_labels)

    # Kept in place on purpose: the network-visualization cell currently
    # reads fused_network after this call.
    np.fill_diagonal(fused_network, 0)
    sil = metrics.silhouette_score(fused_network, fused_labels)

    return v_measure, nmi, sil

# Score the fused clustering against the currently selected `actual_labels`
scores = score(best, affinity_networks, fused_network, fused_labels, actual_labels)
scores_dict = dict(zip(('v_measure_score', 'nmi', 'silhouette score'), scores))
scores_dict

{'v_measure_score': 0.007730743558228625,
 'nmi': 0.007730743558228624,
 'silhouette score': 0.23434490987229858}

### Network visualization

In [20]:
import networkx as nx 
from pyvis import network as net

import matplotlib.cm as cm
import matplotlib as mpl

In [21]:
def add_node_colors(network, col_attribute, cmap_name="Accent"):
    """
    Function to add node colors based on existing node attribute.
    The colors are stored in place as the 'color' attribute of each node that
    carries `col_attribute`; nothing is returned.
    
    network: networkx object
    col_attribute: Name of attribute to use for determining node colors
    cmap_name: Name of matplotlib colormap to use (default: 'Accent')
    """
    source_att = nx.get_node_attributes(network, col_attribute)
    if not source_att:
        # No node carries the attribute: nothing to color
        return

    # Order the distinct values so that the value -> color assignment does not
    # depend on set iteration order
    distinct = set(source_att.values())
    try:
        ordered = sorted(distinct)
    except TypeError:
        # Values of mixed, non-comparable types: fall back to their repr
        ordered = sorted(distinct, key=repr)
    color_index = {value: i for i, value in enumerate(ordered)}

    # Get colors (pyplot.get_cmap replaces matplotlib.cm.get_cmap, which was
    # removed in matplotlib 3.9)
    cmap = plt.get_cmap(cmap_name, len(ordered))
    node_colors = {node: mpl.colors.rgb2hex(cmap(color_index[value]), keep_alpha=True)
                   for node, value in source_att.items()}
    # Set colors
    nx.set_node_attributes(network, node_colors, 'color')

In [22]:
n = len(fused_network)
fused_network_vis = fused_network.copy()

# Zero the diagonal on the copy so that no self-loops are drawn. This cell no
# longer depends on the scoring cell having zeroed it on fused_network.
np.fill_diagonal(fused_network_vis, 0)

# Set all entries below the 75th percentile to 0 to avoid too many network connections
cutoff = np.percentile(fused_network_vis, 75)
fused_network_vis[fused_network_vis < cutoff] = 0
fused_network_vis

array([[0.        , 0.00543871, 0.        , ..., 0.00657465, 0.        ,
        0.        ],
       [0.00543871, 0.        , 0.        , ..., 0.00545585, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.00647958,
        0.        ],
       ...,
       [0.00657465, 0.00545585, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00647958, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [23]:
# Create networkx object (from_numpy_array replaces from_numpy_matrix, which
# was removed in networkx 3.0)
nt = nx.from_numpy_array(fused_network_vis)
# Rename the integer nodes 0..n-1 to the sample IDs of the proteomics table
nt = nx.relabel_nodes(nt, dict(zip(np.arange(n), prot.index.values)))

# Set some attributes from metadata
# NOTE(review): fused_labels is assigned positionally - this assumes `meta` has
# the same row order as the tables passed to apply_snf; confirm.
meta_net = meta.copy()
meta_net['fused_labels'] = fused_labels
nx.set_node_attributes(nt, meta_net.loc[:, ['Case', 'Sex.x', 'fused_labels']].to_dict('index'))

# Set node colors 
add_node_colors(nt, col_attribute='fused_labels')
#nt.nodes['NEUAB000NKC']

In [24]:
# Subgraph for ease of visualization: keep only the first 100 nodes
sub_nt = nt.subgraph(list(nt.nodes)[:100])

# Render the subgraph with pyvis, physics simulation switched off
ntw = net.Network(height='750px', width='750px', notebook=True)
ntw.toggle_physics(False)
ntw.from_nx(sub_nt)

ntw.show('nx.html')

nx.html
