In [1]:
# base
import scipy as sc
from scipy import sparse, io
import numpy as np
import pandas as pd
# plotting
import seaborn as sns
from matplotlib import pyplot as plt
# clustering and umap
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import AgglomerativeClustering
import umap
from itertools import compress

In [2]:
import matplotlib
matplotlib.rcParams['figure.figsize'] = [12, 12]
from pandas.plotting import scatter_matrix

In [3]:
import scanpy as scp
import anndata

In [4]:
def read_mtx(prefix):
    """Load a count matrix and its row/column labels from three sibling files.

    Parameters
    ----------
    prefix : str or path-like
        Common path prefix. The files ``<prefix>.counts.mtx``,
        ``<prefix>.colnames.txt`` and ``<prefix>.rownames.txt`` are read.

    Returns
    -------
    tuple
        ``(mtx, rownames, colnames)``: the Matrix Market file converted to CSR
        format, then the lines of the row-name file and of the column-name file
        (line endings stripped).
    """
    # Build every path from the argument; the previous version ignored `prefix`
    # and silently depended on a notebook-level `infolder` variable.
    mtx = io.mmread(f"{prefix}.counts.mtx").tocsr()
    # The `with` blocks close the files, so no explicit close() is needed.
    with open(f"{prefix}.colnames.txt") as col:
        colnames = col.read().splitlines()
    with open(f"{prefix}.rownames.txt") as row:
        rownames = row.read().splitlines()
    return mtx, rownames, colnames

In [5]:
# from mtx
def preprocess_mtx(sparse_mtx, rownames, colnames, min_cell_sum=1000, min_region_sum=10):
    """Binarize a sparse count matrix, drop low-count rows/columns and wrap it in AnnData.

    Parameters
    ----------
    sparse_mtx : scipy sparse matrix
        Counts with one row per entry of `rownames` and one column per entry
        of `colnames`. It is not modified.
    rownames, colnames : sequence
        Labels for the rows and columns of `sparse_mtx`.
    min_cell_sum : int
        Columns whose binarized sum is below this value are removed.
    min_region_sum : int
        Rows whose binarized sum is below this value are removed.

    Returns
    -------
    anndata.AnnData
        The filtered binary matrix, transposed so that the kept columns become
        observations. The kept column labels are stored as the single column of
        `.obs` and the kept row labels as the single column of `.var`.
    """
    # Work on a private CSR copy so the caller's matrix is left untouched
    binary = sparse.csr_matrix(sparse_mtx, copy=True)

    ## binarize: only stored values above 1 need to change
    binary.data[binary.data > 1] = 1

    ## filter low counts
    # both masks are computed on the unfiltered binarized matrix
    colmask = np.asarray(binary.sum(axis=0)).ravel() >= min_cell_sum
    rowmask = np.asarray(binary.sum(axis=1)).ravel() >= min_region_sum
    binary = binary[rowmask, :][:, colmask]

    ## create anndata
    row_subset = list(compress(rownames, rowmask))
    col_subset = list(compress(colnames, colmask))
    adata = anndata.AnnData(binary.transpose(),
                            obs=pd.DataFrame(col_subset),
                            var=pd.DataFrame(row_subset))

    return adata

# from anndata
def preprocess_adata(adata, min_cell_sum=1000, min_region_sum=10):
    """Binarize `adata.X`, compute scanpy QC metrics and filter cells and regions.

    Parameters
    ----------
    adata : anndata.AnnData
        Object whose `.X` holds the counts. `.X` is binarized and the QC columns
        are added in place.
    min_cell_sum : int
        Observations with `n_genes_by_counts` below this value are dropped.
    min_region_sum : int
        Variables with `n_cells_by_counts` below this value are dropped.

    Returns
    -------
    anndata.AnnData
        The subset of `adata` that passes both filters. Both filters use the QC
        metrics computed before any filtering.
    """
    # binarize
    counts = adata.X
    if sparse.issparse(counts):
        if counts.format not in ("csr", "csc"):
            counts = counts.tocsr()
        # only the stored values above 1 need to change
        counts.data[counts.data > 1] = 1
    else:
        counts[counts > 1] = 1
    # assign back to `X`; the previous `adata.x` only created an unrelated attribute
    adata.X = counts
    # save some QC
    scp.pp.calculate_qc_metrics(adata, inplace=True)
    # filter
    adata = adata[adata.obs.n_genes_by_counts >= min_cell_sum, :]
    adata = adata[:, adata.var.n_cells_by_counts >= min_region_sum]

    return adata

In [6]:
def runLSA(sparse_mtx, nPCs, scaleFactor):
    """TF-IDF transform a sparse matrix and fit a truncated SVD on the result.

    Parameters
    ----------
    sparse_mtx : scipy sparse matrix
        Input matrix. `lsa_anndata` passes it with observations as columns.
    nPCs : int
        Number of SVD components to fit.
    scaleFactor : number
        Multiplier applied to the term frequencies before `log1p`.

    Returns
    -------
    tuple
        ``(pca, tfidf)``: the fitted `TruncatedSVD` object and the TF-IDF matrix.
        The TF-IDF matrix is returned as computed, so it may still contain NaN
        where a column or row sum is zero.
    """
    # term frequency: every column divided by its column sum, scaled and log-transformed
    tf = sparse_mtx / sparse_mtx.sum(axis=0)
    tf = np.log1p(tf * scaleFactor)

    # inverse document frequency: log1p(number of columns / row sum)
    idf = np.log1p(sparse_mtx.shape[1] / sparse_mtx.sum(axis=1))
    tfidf = np.multiply(tf, idf)
    svd = TruncatedSVD(n_components=nPCs)
    # Dividing a scipy sparse matrix by a dense one yields np.matrix, which recent
    # scikit-learn releases refuse as estimator input, so fit on a plain ndarray.
    pca = svd.fit(np.nan_to_num(np.asarray(tfidf), nan=0.0))

    return pca, tfidf

## for anndata

def lsa_anndata(adata, n_pcs, scale_factor=None):
    """Run `runLSA` on `adata.X` and store the results on the same AnnData object.

    Parameters
    ----------
    adata : anndata.AnnData
        Object whose `.X` holds the counts. It is modified in place.
    n_pcs : int
        Number of SVD components.
    scale_factor : number, optional
        Multiplier applied to the term frequencies. ``None`` is treated as 1
        (no extra scaling); previously ``None`` raised a TypeError in `runLSA`.

    Returns
    -------
    anndata.AnnData
        The input object with `.X` replaced by the TF-IDF matrix, the SVD
        components in ``obsm['X_pca']``, the explained variance and its ratio in
        ``uns`` and the original matrix in ``layers['raw_counts']``.
    """
    if scale_factor is None:
        scale_factor = 1

    mtx = sparse.csr_matrix(adata.X)
    # runLSA is called with observations as columns, hence the transpose
    lsa_out, tfidf = runLSA(mtx.transpose(), n_pcs, scale_factor)

    # No np.squeeze here: it would collapse a length-1 axis and break the 2-D shape of X
    adata.X = np.asarray(tfidf.transpose())
    adata.obsm['X_pca'] = lsa_out.components_.transpose()
    adata.uns['pca_variance'] = lsa_out.explained_variance_
    adata.uns['pca_variance_ratio'] = lsa_out.explained_variance_ratio_
    adata.layers['raw_counts'] = mtx

    return adata


In [15]:
def UMAP_clustering(adata, random_state=None):
    """Embed the LSA components with UMAP and cluster the cells with Louvain.

    Parameters
    ----------
    adata : anndata.AnnData
        Object with the LSA components in ``obsm['X_pca']``. It is modified in place.
    random_state : int, optional
        Seed passed to `umap.UMAP`. The default ``None`` leaves UMAP unseeded,
        as before.

    Returns
    -------
    anndata.AnnData
        The same object, with the embedding in ``obsm['X_umap']`` and the
        neighbour graph and Louvain labels added by scanpy.
    """
    # make umap on PCA, leaving out the first component
    # NOTE(review): presumably dropped because it tracks sequencing depth - confirm
    lsa_coords = adata.obsm['X_pca'][:, 1:]
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, spread=1.0, metric='euclidean',
                        init='random', random_state=random_state)
    adata.obsm['X_umap'] = reducer.fit_transform(lsa_coords)

    # louvain (from scanpy)
    scp.pp.neighbors(adata)
    scp.tl.louvain(adata)

    # return
    return adata


In [None]:
## ESC
# Path prefix of the ESC-NPC count files; the .counts.mtx, .colnames.txt and
# .rownames.txt suffixes are appended inside read_mtx.
infolder="/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/04_ESC_NPC/04_workflow_outputs/downstream/ESC_NPC_clustering/scDeepTools_filtered_ESC-NPC_peaks_counts"
# read_mtx returns (matrix, rownames, colnames)
mtx, row, col = read_mtx(infolder)
# binarize, filter with the default thresholds and wrap in AnnData
esc_adata = preprocess_mtx(mtx, row, col)

Works for the ESC-NPC data.

## Run it on bone marrow

In [28]:
# Path prefix of the bone-marrow count files (50k windows, going by the file name)
infolder="/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/05_bone_marrow/03_ctcf_k4me3_dChIC_run-1/counts/scDeepTools_filtered_counts_50kWindows"
# read_mtx returns (matrix, rownames, colnames); they are passed on by position
mtx_bm = read_mtx(infolder)
ad_bm_mtx = preprocess_mtx(mtx_bm[0], mtx_bm[1], mtx_bm[2])



In [29]:
# .X has one row per cell, so summing its transpose over axis 0 gives one total per
# cell; describe() summarizes the distribution of those totals
pd.DataFrame(np.array(np.sum(ad_bm_mtx.X.transpose(), axis = 0))[0]).describe()

Unnamed: 0,0
count,649.0
mean,6468.325195
std,3608.01001
min,1034.0
25%,3636.0
50%,5947.0
75%,8511.0
max,18895.0


This matches the summary of the corresponding data frame in R.

In [30]:
## check the same for tfIDF df
# runLSA is given the transposed matrix, the same orientation lsa_anndata uses;
# it returns the fitted SVD object and the TF-IDF matrix
lsa_out, tfidf_bm = runLSA(ad_bm_mtx.X.transpose(), nPCs = 10, scaleFactor=100000)

In [31]:
# Column sums of the TF-IDF matrix (one per cell), summarized with describe()
pd.DataFrame(np.array(np.sum(tfidf_bm, axis = 0))[0]).describe()

Unnamed: 0,0
count,649.0
mean,32207.909679
std,13130.870718
min,8278.38967
25%,22171.198522
50%,30921.609861
75%,40584.246169
max,72222.733708


Match!

In [32]:
# Replaces .X with the TF-IDF matrix, stores the SVD results in obsm/uns and keeps
# the counts in layers['raw_counts']
ad_bm_mtx = lsa_anndata(ad_bm_mtx, n_pcs=30, scale_factor=100000)

In [33]:
# Show the AnnData summary to check which slots lsa_anndata filled
ad_bm_mtx

AnnData object with n_obs × n_vars = 649 × 49801
    obs: 0
    var: 0
    uns: 'pca_variance', 'pca_variance_ratio'
    obsm: 'X_pca'
    layers: 'raw_counts'

In [34]:
# Adds the UMAP embedding (obsm['X_umap']) and the Louvain labels (obs['louvain'])
ad_bm_mtx = UMAP_clustering(ad_bm_mtx)

  adata.uns['neighbors']['distances'] = neighbors.distances
  adata.uns['neighbors']['connectivities'] = neighbors.connectivities
  adjacency = adata.uns['neighbors']['connectivities']


In [35]:
# Gather the Louvain labels (as integers) and the UMAP coordinates into one table
embeddings = ad_bm_mtx.obsm['X_umap']
cluster_id = [int(label) for label in ad_bm_mtx.obs.louvain.to_list()]

df = pd.DataFrame({
    'Cluster': cluster_id,
    'UMAP1': embeddings[:, 0],
    'UMAP2': embeddings[:, 1],
})

In [None]:
# Label the rows with the cell names. preprocess_mtx stores them as the first column
# of .obs (not as its index). The previous `df.index = ad_bm` referred to an object
# that is only created further down and is not an index anyway.
df.index = ad_bm_mtx.obs.iloc[:, 0].to_numpy()

In [36]:
def plotUMAP(embeddings, cluster_id, plotFile="test.png", plotFileFormat='png',
             plotWidth=10, plotHeight=10):
    """Save a scatter plot of a 2-D embedding, coloured by cluster.

    Parameters
    ----------
    embeddings : array
        Two-dimensional array; column 0 is plotted on x and column 1 on y.
    cluster_id : sequence of int
        One cluster label per row of `embeddings`, used to pick the point colour.
    plotFile : str
        Output path (default "test.png", the previously hard-coded value).
    plotFileFormat : str
        Format passed to `savefig`.
    plotWidth, plotHeight : number
        Figure size in centimetres.

    Returns
    -------
    None
        The figure is written to `plotFile` and closed.
    """
    ## plotting mods
    plt.rcParams['font.size'] = 8.0
    # Look the palette up once and cycle through it, so that cluster ids beyond the
    # palette length reuse colours instead of raising IndexError.
    palette = sns.color_palette()
    colors = [palette[x % len(palette)] for x in cluster_id]
    # convert cm values to inches
    fig = plt.figure(figsize=(plotWidth / 2.54, plotHeight / 2.54))
    fig.suptitle('LSA-UMAP', y=(1 - (0.06 / plotHeight)))
    plt.scatter(embeddings[:, 0], embeddings[:, 1], c=colors)
    plt.tight_layout()
    plt.savefig(plotFile, dpi=200, format=plotFileFormat)
    plt.close(fig)

    return None


## LSA-clustering from anndata

In [45]:
# Load the bone-marrow counts from the .h5ad file with the same path prefix as the
# .mtx input used above
ad_bm = anndata.read_h5ad("/hpc/hub_oudenaarden/vbhardwaj/2019_scHiC/05_bone_marrow/03_ctcf_k4me3_dChIC_run-1/counts/scDeepTools_filtered_counts_50kWindows.h5ad")

In [46]:
# Binarize, compute the QC metrics and filter with the default thresholds
ad_bm = preprocess_adata(ad_bm)

In [47]:
# LSA with 20 components and a scale factor of 100000
ad_bm = lsa_anndata(ad_bm, 20, 100000)

In [49]:
# UMAP_clustering modifies and returns the object it is given, so umap_bm_ad and
# ad_bm refer to the same AnnData
umap_bm_ad = UMAP_clustering(ad_bm)

  adata.uns['neighbors']['distances'] = neighbors.distances
  adata.uns['neighbors']['connectivities'] = neighbors.connectivities
  adjacency = adata.uns['neighbors']['connectivities']


In [55]:
# Gather the UMAP coordinates and the Louvain labels (as integers) into one table
embeddings = ad_bm.obsm['X_umap']
cluster_id = [int(label) for label in ad_bm.obs.louvain.to_list()]

df = pd.DataFrame({
    'UMAP1': embeddings[:, 0],
    'UMAP2': embeddings[:, 1],
    'Cluster': cluster_id,
})

# rows are labelled with the observation names
df.index = ad_bm.obs.index

In [56]:
# Display the per-cell UMAP coordinates and cluster labels
df

Unnamed: 0_level_0,UMAP1,UMAP2,Cluster
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
i1_ACACACTA,6.225847,-3.453841,0
i1_ACACATAG,6.743467,-3.158327,0
i1_ACACTGAT,6.492131,-3.022453,0
i1_ACAGTACG,8.007348,-2.739637,0
i1_ACATCACA,6.822153,0.185938,2
...,...,...,...
i2_CGCGGTGG,6.919651,-2.140476,0
i2_GGTCCGTA,8.103642,-2.315047,0
i2_AGGTCACC,8.518637,-0.693374,4
i2_GAACGCAA,8.417742,-1.313372,4


**Match with R**

In [None]:
# .X now holds the TF-IDF values written by lsa_anndata; transposed so that cells are columns.
# Note that this rebinds the name `mtx` used by the ESC cell above.
mtx = pd.DataFrame(ad_bm.X.transpose())

In [None]:
# Matrix that lsa_anndata saved in layers['raw_counts'] before overwriting .X,
# in the same orientation (cells as columns)
mtx_original = pd.DataFrame.sparse.from_spmatrix(ad_bm.layers['raw_counts'].transpose())

In [None]:
# Distribution of the per-column (per-cell) sums of the saved counts
mtx_original.sum(axis=0).describe()

In [None]:
# Distribution of the per-column (per-cell) sums of the TF-IDF values
mtx.sum(axis = 0).describe()

In [None]:
# Re-run on the same object; this overwrites obsm['X_umap'] and the Louvain labels
ad_bm = UMAP_clustering(ad_bm)

In [None]:
# UMAP coordinates of the first observation only
ad_bm[0].obsm['X_umap']

In [None]:
## umap and louvain using scanpy
# Same object as above, but with scanpy's own UMAP instead of umap.UMAP;
# scp.tl.umap overwrites obsm['X_umap']
scp.pp.neighbors(ad_bm)
scp.tl.umap(ad_bm)
scp.tl.louvain(ad_bm)
scp.pl.umap(ad_bm, color='louvain')
## not so great output