<h2> Scripts for clustering chromosome structures from chromatin tracing </h2>

<h4> Import packages for scripts </h4>

In [None]:
import sys
import os
import copy
import numpy as np
import anndata as ad
from scipy import sparse
import h5py
import snapatac2 as snap
import matplotlib.pyplot as plt

sys.path.append(r'/path/to/CommonTools')

import PipelineFunctions as pipeline
import matplotlib.pylab as plt
from scipy.spatial.distance import squareform,pdist,cdist
from scipy.stats import ranksums

from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Dimension reduction and clustering libraries
import umap
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['font.size'] = 15
matplotlib.rcParams['font.family']='Arial'

%matplotlib inline

<h4> Default processing to convert into h5ad for SnapATAC2 </h4>

In [None]:
'''
Here, we take in linearly interpolated (no NaN values allowed) pairwise distance vectors, one per trace
(interpolated_and_linearized_pairwise_traces.npy). Every trace needs to be the same length and ordered in the same way. In addition, we want to
have a list of identifiers for the traces, which is loaded later and called `categories`. Each identifier is a concatenation of
the allele ID ("CAST" or "129") and the experiment ID (a number in this case, though the exact identifier doesn't matter much).
'''

In [None]:
# Load the interpolated, linearized pairwise-distance vectors.
# assumes a 2-D array with one row per trace and one column per pairwise distance — TODO confirm
npy = np.load('/path/to/interpolated_and_linearized_pairwise_traces.npy')
# Median of every column (axis=0), ignoring NaNs. Zero entries are still part of
# the data at this point, so they contribute to these medians.
median_npy = np.nanmedian(npy, axis=0)

In [None]:
# Replace every exact zero with the median of its column, in place.
# Done as a single vectorized assignment instead of a Python loop over every
# element: np.nonzero yields the (row, column) positions of the zeros and each
# position receives the median of its own column.
zero_rows, zero_cols = np.nonzero(npy == 0)
npy[zero_rows, zero_cols] = median_npy[zero_cols]

In [None]:
# Wrap the cleaned matrix in an AnnData object and write it to disk as .h5ad
# so it can be read back with SnapATAC2 in the next section.
data = ad.AnnData(npy)
data.write('/path/to/your_name.h5ad')

<h3> Choose the h5ad file you want to look at here as well as the save location </h3>

In [None]:
# Read the h5ad file to analyse.
# NOTE(review): backed=None presumably loads the whole file into memory — confirm against the SnapATAC2 docs
data = snap.read('/path/to/your_name.h5ad', backed=None)
# Per-trace identifiers (allele + experiment ID). They are later assigned to data.obs
# positionally, so they must be in the same order as the rows of `data`.
categories = np.load('/path/to/category_file.npy')
# Folder that receives every figure saved below.
cluster_folder = '/path/to/save/folder'

<h4>Start running SnapATAC2 from feature selection, PCA, etc</h4>

In [None]:
# Run SnapATAC2 feature selection with n_features=100.
snap.pp.select_features(data, n_features=100)

In [None]:
%%time
# Convert X to a CSR sparse matrix, then compute the spectral embedding.
# NOTE(review): the conversion is presumably needed because snap.tl.spectral expects sparse input — confirm
data.X = sparse.csr_matrix(data.X)
snap.tl.spectral(data)

In [None]:
# Plot the eigenvalues of the spectral embedding computed in the previous cell.
snap.pl.spectral_eigenvalues(data)

In [None]:
# Display the AnnData summary to check which fields exist so far.
data

In [None]:
%%time
# Two-component UMAP; the coordinates are read back later from data.obsm['X_umap'].
snap.tl.umap(data, n_comps=2)

In [None]:
%%time
# Build the k-nearest-neighbour graph, then cluster with Leiden.
# The resulting labels are plotted further down via color='leiden'.
snap.pp.knn(data)
snap.tl.leiden(data)

In [None]:
# Inspect the raw trace identifiers.
categories

In [None]:
# Remove the allele tag from every identifier so that only the experiment ID is left.
# NOTE(review): only lowercase "cast" is stripped here — confirm the identifiers are not spelled "CAST".
experiment_labels = [ label.replace("129","").replace("cast","") for label in categories ]
data.obs['experiment'] = experiment_labels

In [None]:
# Integer allele flag per trace: 1 when the identifier contains '129', otherwise 0 (CAST).
# Alleles were saved in order (CAST first, then 129), which is why a plain substring test is enough.
data.obs['allele'] = np.array([ '129' in label for label in categories ]).astype(int)

In [None]:
# Check the per-trace experiment labels assigned above.
data.obs['experiment']

In [None]:
%matplotlib inline
# Interactive scatter coloured by obs['log_gyr'], drawn from the 'X_spectral'
# representation (use_rep) rather than the UMAP coordinates.
# NOTE(review): `savefile` is set but never passed as out_file, so nothing is saved here,
# and its name refers to allele labels rather than log_gyr.
# NOTE(review): obs['log_gyr'] is not assigned anywhere in the visible notebook — confirm it exists.
savefile = cluster_folder+os.sep+r'allele_labels_umap.png'
snap.pl.umap(data, color='log_gyr',
             interactive=True, height=800, width=1000, marker_size=4, show=True, use_rep='X_spectral')
                                                                                 

In [None]:
%matplotlib inline
# Static UMAP coloured by allele flag (0 = CAST, 1 = 129), saved to allele_labels_umap.png.
savefile = cluster_folder+os.sep+r'allele_labels_umap.png'
snap.pl.umap(data, color='allele',
             interactive=False, height=1600, width=2000, marker_size=5, show=True, out_file=savefile)
                                                                                 

In [None]:
%matplotlib inline
# Interactive UMAP coloured by Leiden cluster, also written to leiden_labels_umap.png.
savefile = cluster_folder+os.sep+r'leiden_labels_umap.png'
snap.pl.umap(data, color='leiden',
             interactive=True, height=800, width=1000, marker_size=3, show=True, out_file=savefile)
                                                                                 

In [None]:
%matplotlib inline
# Interactive UMAP coloured by experiment label, also written to experiment_labels_umap.png.
savefile = cluster_folder+os.sep+r'experiment_labels_umap.png'
snap.pl.umap(data, color='experiment',
             interactive=True, height=800, width=1000, marker_size=4, out_file=savefile)

In [None]:
%matplotlib inline
# Static UMAP coloured by obs['rad_gyr'], written to rad_gyr_labels_umap.png.
# NOTE(review): obs['rad_gyr'] is not assigned anywhere in the visible notebook — confirm it exists.
savefile = cluster_folder+os.sep+r'rad_gyr_labels_umap.png'
snap.pl.umap(data, color='rad_gyr',
             interactive=False, height=1600, width=2000, marker_size=10, out_file=savefile)

In [None]:
%matplotlib inline
# Static UMAP coloured by obs['log_gyr'], written to log_rad_gyr_labels_umap.png.
# NOTE(review): obs['log_gyr'] is not assigned anywhere in the visible notebook — confirm it exists.
savefile = cluster_folder+os.sep+r'log_rad_gyr_labels_umap.png'
snap.pl.umap(data, color='log_gyr',
             interactive=False, height=1600, width=2000, marker_size=10, out_file=savefile)

In [None]:
def pixellated_umap_by_expt(anndata, num_pixels=100):
    """Rasterize the UMAP, labelling each pixel with its dominant experiment.

    The UMAP plane is divided into a ``num_pixels`` x ``num_pixels`` grid. In
    every occupied pixel the number of traces from each experiment is divided
    by that experiment's total trace count (so large experiments do not win by
    size alone) and the experiment with the highest normalized share is chosen.
    The per-experiment totals are printed.

    Parameters
    ----------
    anndata : AnnData-like
        Must provide ``obsm['X_umap']`` (two coordinates per trace) and
        ``obs['experiment']`` (one string label per trace).
    num_pixels : int, optional
        Grid resolution along each axis.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_pixels, num_pixels)``. Occupied pixels hold
        the index of the dominant experiment within the sorted unique
        experiment labels (the same index means the same experiment in every
        pixel); empty pixels are NaN. Rows follow the second UMAP coordinate
        with the largest values in row 0; columns follow the first coordinate.
    """
    umap_coords = np.asarray(anndata.obsm['X_umap'])

    # Strip allele tags so both alleles of one experiment share a label.
    # NOTE(review): this also removes "129"/"cast" if they occur inside an experiment ID.
    labels = np.array([x.replace("129", "").replace("cast", "")
                       for x in anndata.obs['experiment']])

    # Totals per experiment, used to normalize the per-pixel counts.
    uniq_expts, label_idx, expt_cts = np.unique(
        labels, return_inverse=True, return_counts=True)
    experiment_counts = dict(zip(uniq_expts, expt_cts))
    print(experiment_counts)

    # Grid extent, rounded outwards to whole numbers.
    dim1, dim2 = umap_coords[:, 0], umap_coords[:, 1]
    x_min, x_max = np.floor(np.amin(dim2)), np.ceil(np.amax(dim2))
    y_min, y_max = np.floor(np.amin(dim1)), np.ceil(np.amax(dim1))

    # Fall back to a unit bin when all points share one integer coordinate.
    binsize_x = (x_max - x_min) / num_pixels or 1.0
    binsize_y = (y_max - y_min) / num_pixels or 1.0

    # Clip so that points lying exactly on the outer edges land in the border
    # pixels instead of being dropped or indexing past the grid.
    rows = np.clip(((x_max - dim2) // binsize_x).astype(int), 0, num_pixels - 1)
    cols = np.clip(((dim1 - y_min) // binsize_y).astype(int), 0, num_pixels - 1)

    # counts[r, c, k] = number of traces of experiment k in pixel (r, c).
    counts = np.zeros((num_pixels, num_pixels, len(uniq_expts)), dtype=int)
    np.add.at(counts, (rows, cols, np.ravel(label_idx)), 1)

    normalized = counts / expt_cts
    occupied = counts.sum(axis=-1) > 0

    pixellated_matrix = np.full((num_pixels, num_pixels), np.nan)
    # argmax over the global experiment axis keeps the value comparable
    # between pixels; ties go to the first experiment in sorted order.
    pixellated_matrix[occupied] = np.argmax(normalized[occupied], axis=-1)

    return pixellated_matrix

In [None]:
def pixellated_umap_by_log_gyr(anndata, num_pixels=100):
    """Rasterize the UMAP, storing the median ``obs['log_gyr']`` of each pixel.

    The UMAP plane is divided into a ``num_pixels`` x ``num_pixels`` grid and
    every occupied pixel receives the NaN-ignoring median of ``log_gyr`` over
    the traces that fall into it. The per-experiment trace totals are printed
    as a diagnostic.

    Parameters
    ----------
    anndata : AnnData-like
        Must provide ``obsm['X_umap']`` (two coordinates per trace),
        ``obs['log_gyr']`` (one number per trace) and ``obs['experiment']``
        (one string label per trace, used only for the printed totals).
    num_pixels : int, optional
        Grid resolution along each axis.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_pixels, num_pixels)``; empty pixels are
        NaN. Rows follow the second UMAP coordinate with the largest values in
        row 0; columns follow the first coordinate.
    """
    umap_coords = np.asarray(anndata.obsm['X_umap'])

    # Pull the column out once as a plain array: integer indexing on the
    # string-indexed obs Series relies on a deprecated positional fallback.
    values = np.asarray(anndata.obs['log_gyr'])

    # Diagnostic only: number of traces per experiment (allele tag stripped).
    stripped = [x.replace("129", "").replace("cast", "")
                for x in anndata.obs['experiment']]
    uniq_expts, expt_cts = np.unique(stripped, return_counts=True)
    print(dict(zip(uniq_expts, expt_cts)))

    # Grid extent, rounded outwards to whole numbers.
    dim1, dim2 = umap_coords[:, 0], umap_coords[:, 1]
    x_min, x_max = np.floor(np.amin(dim2)), np.ceil(np.amax(dim2))
    y_min, y_max = np.floor(np.amin(dim1)), np.ceil(np.amax(dim1))

    # Fall back to a unit bin when all points share one integer coordinate.
    binsize_x = (x_max - x_min) / num_pixels or 1.0
    binsize_y = (y_max - y_min) / num_pixels or 1.0

    # Clip so that points lying exactly on the outer edges land in the border
    # pixels instead of being dropped or indexing past the grid.
    rows = np.clip(((x_max - dim2) // binsize_x).astype(int), 0, num_pixels - 1)
    cols = np.clip(((dim1 - y_min) // binsize_y).astype(int), 0, num_pixels - 1)

    # Collect the values that fall into each pixel.
    pixels = {}
    for r, c, v in zip(rows.tolist(), cols.tolist(), values):
        pixels.setdefault((r, c), []).append(v)

    pixellated_matrix = np.full((num_pixels, num_pixels), np.nan)
    for (r, c), pixel_values in pixels.items():
        pixellated_matrix[r, c] = np.nanmedian(pixel_values)

    return pixellated_matrix

In [None]:
def pixellated_umap_by_rad_gyr(anndata, num_pixels=100):
    """Rasterize the UMAP, storing the median ``obs['rad_gyr']`` of each pixel.

    The UMAP plane is divided into a ``num_pixels`` x ``num_pixels`` grid and
    every occupied pixel receives the NaN-ignoring median of ``rad_gyr`` over
    the traces that fall into it. The per-experiment trace totals are printed
    as a diagnostic.

    Parameters
    ----------
    anndata : AnnData-like
        Must provide ``obsm['X_umap']`` (two coordinates per trace),
        ``obs['rad_gyr']`` (one number per trace) and ``obs['experiment']``
        (one string label per trace, used only for the printed totals).
    num_pixels : int, optional
        Grid resolution along each axis.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_pixels, num_pixels)``; empty pixels are
        NaN. Rows follow the second UMAP coordinate with the largest values in
        row 0; columns follow the first coordinate.
    """
    umap_coords = np.asarray(anndata.obsm['X_umap'])

    # Pull the column out once as a plain array: integer indexing on the
    # string-indexed obs Series relies on a deprecated positional fallback.
    values = np.asarray(anndata.obs['rad_gyr'])

    # Diagnostic only: number of traces per experiment (allele tag stripped).
    stripped = [x.replace("129", "").replace("cast", "")
                for x in anndata.obs['experiment']]
    uniq_expts, expt_cts = np.unique(stripped, return_counts=True)
    print(dict(zip(uniq_expts, expt_cts)))

    # Grid extent, rounded outwards to whole numbers.
    dim1, dim2 = umap_coords[:, 0], umap_coords[:, 1]
    x_min, x_max = np.floor(np.amin(dim2)), np.ceil(np.amax(dim2))
    y_min, y_max = np.floor(np.amin(dim1)), np.ceil(np.amax(dim1))

    # Fall back to a unit bin when all points share one integer coordinate.
    binsize_x = (x_max - x_min) / num_pixels or 1.0
    binsize_y = (y_max - y_min) / num_pixels or 1.0

    # Clip so that points lying exactly on the outer edges land in the border
    # pixels instead of being dropped or indexing past the grid.
    rows = np.clip(((x_max - dim2) // binsize_x).astype(int), 0, num_pixels - 1)
    cols = np.clip(((dim1 - y_min) // binsize_y).astype(int), 0, num_pixels - 1)

    # Collect the values that fall into each pixel.
    pixels = {}
    for r, c, v in zip(rows.tolist(), cols.tolist(), values):
        pixels.setdefault((r, c), []).append(v)

    pixellated_matrix = np.full((num_pixels, num_pixels), np.nan)
    for (r, c), pixel_values in pixels.items():
        pixellated_matrix[r, c] = np.nanmedian(pixel_values)

    return pixellated_matrix

In [None]:
def pixellated_umap_by_allele(anndata, num_pixels=100):
    """Rasterize the UMAP, storing the allele balance of each pixel.

    The UMAP plane is divided into a ``num_pixels`` x ``num_pixels`` grid and
    every occupied pixel receives ``(# CAST traces) - (# 129 traces)``, so a
    CAST-dominant pixel is positive and a 129-dominant pixel is negative.
    The overall allele totals are printed.

    Parameters
    ----------
    anndata : AnnData-like
        Must provide ``obsm['X_umap']`` (two coordinates per trace) and
        ``obs['allele']`` with one label per trace: 0 for CAST and 1 for 129,
        stored either as integers or as the strings ``'0'`` / ``'1'``.
    num_pixels : int, optional
        Grid resolution along each axis.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_pixels, num_pixels)``; pixels without any
        CAST or 129 trace are NaN. Rows follow the second UMAP coordinate with
        the largest values in row 0; columns follow the first coordinate.
    """
    umap_coords = np.asarray(anndata.obsm['X_umap'])

    # Pull the labels out once as a plain array: integer indexing on the
    # string-indexed obs Series relies on a deprecated positional fallback.
    raw_labels = np.asarray(anndata.obs['allele'])

    uniq_alleles, allele_cts = np.unique(raw_labels, return_counts=True)
    allele_counts = dict(zip(uniq_alleles, allele_cts))
    print(allele_counts)

    # Compare on the string form so integer labels and '0'/'1' strings both
    # match; comparing integer labels to '0'/'1' directly never succeeds.
    labels = np.array([str(v) for v in raw_labels])
    is_cast = labels == '0'
    is_129 = labels == '1'

    # Grid extent, rounded outwards to whole numbers.
    dim1, dim2 = umap_coords[:, 0], umap_coords[:, 1]
    x_min, x_max = np.floor(np.amin(dim2)), np.ceil(np.amax(dim2))
    y_min, y_max = np.floor(np.amin(dim1)), np.ceil(np.amax(dim1))

    # Fall back to a unit bin when all points share one integer coordinate.
    binsize_x = (x_max - x_min) / num_pixels or 1.0
    binsize_y = (y_max - y_min) / num_pixels or 1.0

    # Clip so that points lying exactly on the outer edges land in the border
    # pixels instead of being dropped or indexing past the grid.
    rows = np.clip(((x_max - dim2) // binsize_x).astype(int), 0, num_pixels - 1)
    cols = np.clip(((dim1 - y_min) // binsize_y).astype(int), 0, num_pixels - 1)

    # Per-pixel trace counts for each allele.
    n_cast = np.zeros((num_pixels, num_pixels), dtype=int)
    n_129 = np.zeros((num_pixels, num_pixels), dtype=int)
    np.add.at(n_cast, (rows[is_cast], cols[is_cast]), 1)
    np.add.at(n_129, (rows[is_129], cols[is_129]), 1)

    occupied = (n_cast + n_129) > 0

    # CAST dominant is positive, 129 dominant is negative.
    pixellated_matrix = np.full((num_pixels, num_pixels), np.nan)
    pixellated_matrix[occupied] = (n_cast - n_129)[occupied]

    return pixellated_matrix

In [None]:
import scipy
import seaborn as sns

# Heatmap of the median radius of gyration per UMAP pixel (default grid size).
matrix = pixellated_umap_by_rad_gyr(data)
plt.figure(figsize=(10,7))
ax = sns.heatmap(matrix, cmap='Reds')

plt.axis('on')
# Drop tick labels and tick marks on both axes; only the outline stays.
ax.tick_params(axis='x', labelbottom=False)
ax.tick_params(axis='y', labelleft=False)
ax.set_xticks([])
ax.set_yticks([])

# Save a raster and a vector copy of the figure.
plt.savefig(cluster_folder+os.sep+'rad_gyr_downsample.png', bbox_inches='tight')
plt.savefig(cluster_folder+os.sep+'rad_gyr_downsample.pdf', bbox_inches='tight')

In [None]:
import scipy
import seaborn as sns

# Heatmap of the median log_gyr per UMAP pixel on a 30 x 30 grid.
matrix = pixellated_umap_by_log_gyr(data, 30)
plt.figure(figsize=(10,7))
ax = sns.heatmap(matrix, cmap='magma_r')

plt.axis('on')
# Drop tick labels and tick marks on both axes; only the outline stays.
ax.tick_params(axis='x', labelbottom=False)
ax.tick_params(axis='y', labelleft=False)
ax.set_xticks([])
ax.set_yticks([])

# Save a raster and a vector copy of the figure.
plt.savefig(cluster_folder+os.sep+'log_rad_gyr_downsample.png', bbox_inches='tight')
plt.savefig(cluster_folder+os.sep+'log_rad_gyr_downsample.pdf', bbox_inches='tight')

In [None]:
import scipy
import seaborn as sns

# Heatmap of the per-pixel allele balance on a 50 x 50 grid, drawn with a diverging colormap.
matrix = pixellated_umap_by_allele(data, 50)
plt.figure(figsize=(10,7))
ax = sns.heatmap(matrix, cmap='seismic')

plt.axis('on')
# Drop tick labels and tick marks on both axes; only the outline stays.
ax.tick_params(axis='x', labelbottom=False)
ax.tick_params(axis='y', labelleft=False)
ax.set_xticks([])
ax.set_yticks([])

# Save a raster and a vector copy of the figure.
plt.savefig(cluster_folder+os.sep+'dominant_allele_downsample.png', bbox_inches='tight')
plt.savefig(cluster_folder+os.sep+'dominant_allele_downsample.pdf', bbox_inches='tight')

In [None]:
import scipy
import seaborn as sns

# Heatmap of the dominant experiment per UMAP pixel (default grid size), drawn with a qualitative colormap.
matrix = pixellated_umap_by_expt(data)
plt.figure(figsize=(10,7))
ax = sns.heatmap(matrix, cmap='tab20b')

plt.axis('on')
# Drop tick labels and tick marks on both axes; only the outline stays.
ax.tick_params(axis='x', labelbottom=False)
ax.tick_params(axis='y', labelleft=False)
ax.set_xticks([])
ax.set_yticks([])

# Save a raster and a vector copy of the figure.
plt.savefig(cluster_folder+os.sep+'dominant_cluster_downsample.png', bbox_inches='tight')
plt.savefig(cluster_folder+os.sep+'dominant_cluster_downsample.pdf', bbox_inches='tight')

In [None]:
def pixellated_umap_by_single_expt(anndata, experiment_num, num_pixels=100):
    """Rasterize the UMAP, storing one experiment's share of traces per pixel.

    The UMAP plane is divided into a ``num_pixels`` x ``num_pixels`` grid.
    Every occupied pixel receives the number of traces from ``experiment_num``
    inside it divided by that experiment's total trace count. The
    per-experiment totals are printed.

    Parameters
    ----------
    anndata : AnnData-like
        Must provide ``obsm['X_umap']`` (two coordinates per trace) and
        ``obs['experiment']`` (one string label per trace).
    experiment_num : str
        Experiment label to map, written without its allele tag.
    num_pixels : int, optional
        Grid resolution along each axis.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_pixels, num_pixels)``. Pixels holding
        traces from other experiments only are 0; pixels without any trace
        are NaN. Rows follow the second UMAP coordinate with the largest
        values in row 0; columns follow the first coordinate.

    Raises
    ------
    KeyError
        If ``experiment_num`` is not among the experiment labels.
    """
    umap_coords = np.asarray(anndata.obsm['X_umap'])

    # Strip allele tags so both alleles of one experiment share a label.
    # NOTE(review): this also removes "129"/"cast" if they occur inside an experiment ID.
    labels = [x.replace("129", "").replace("cast", "")
              for x in anndata.obs['experiment']]

    uniq_expts, expt_cts = np.unique(labels, return_counts=True)
    experiment_counts = dict(zip(uniq_expts, expt_cts))
    print(experiment_counts)

    # Looked up once, before any binning, so an unknown label fails early.
    experiment_total = experiment_counts[experiment_num]
    is_selected = np.array([label == experiment_num for label in labels],
                           dtype=bool)

    # Grid extent, rounded outwards to whole numbers.
    dim1, dim2 = umap_coords[:, 0], umap_coords[:, 1]
    x_min, x_max = np.floor(np.amin(dim2)), np.ceil(np.amax(dim2))
    y_min, y_max = np.floor(np.amin(dim1)), np.ceil(np.amax(dim1))

    # Fall back to a unit bin when all points share one integer coordinate.
    binsize_x = (x_max - x_min) / num_pixels or 1.0
    binsize_y = (y_max - y_min) / num_pixels or 1.0

    # Clip so that points lying exactly on the outer edges land in the border
    # pixels instead of being dropped or indexing past the grid.
    rows = np.clip(((x_max - dim2) // binsize_x).astype(int), 0, num_pixels - 1)
    cols = np.clip(((dim1 - y_min) // binsize_y).astype(int), 0, num_pixels - 1)

    # Pixels containing at least one trace of any experiment.
    occupied = np.zeros((num_pixels, num_pixels), dtype=bool)
    occupied[rows, cols] = True

    # Number of traces of the selected experiment in each pixel.
    selected_counts = np.zeros((num_pixels, num_pixels), dtype=int)
    np.add.at(selected_counts, (rows[is_selected], cols[is_selected]), 1)

    pixellated_matrix = np.full((num_pixels, num_pixels), np.nan)
    pixellated_matrix[occupied] = selected_counts[occupied] / experiment_total

    return pixellated_matrix

In [None]:
import scipy
import seaborn as sns
import warnings

# Silence all warnings from here on.
# NOTE(review): this filter is global and stays active for the rest of the session.
warnings.filterwarnings('ignore')

# One density heatmap per experiment on a 30 x 30 grid.
for e in np.unique(data.obs['experiment']):
    matrix = pixellated_umap_by_single_expt(data, e, 30)
    plt.figure(figsize=(10,7))
    ax = sns.heatmap(matrix, cmap='Reds')

    plt.axis('on')
    ax.set_title("Downsampled distribution of chromosomes for Experiment "+e)

    # Drop tick labels and tick marks on both axes; only the outline stays.
    ax.tick_params(axis='x', labelbottom=False)
    ax.tick_params(axis='y', labelleft=False)
    ax.set_xticks([])
    ax.set_yticks([])

    # Save a raster and a vector copy of each figure.
    plt.savefig(cluster_folder+os.sep+'cluster_'+e+'downsample.png', bbox_inches='tight')
    plt.savefig(cluster_folder+os.sep+'cluster_'+e+'downsample.pdf', bbox_inches='tight')