In [1]:
import logging
import os
import warnings

import numpy as np
import pandas as pd
import pyranges as pr
import scanpy as sc
from scipy.sparse import issparse

In [2]:
import palantir 
import phenograph
import harmony

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

findfont: Font family ['Raleway'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Lato'] not found. Falling back to DejaVu Sans.


In [3]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Plot styling defaults used by every figure in this notebook.
sns.set_style('white')
matplotlib.rcParams['figure.figsize'] = [4, 4]
matplotlib.rcParams['image.cmap'] = 'Spectral_r'

# Silence the "findfont: Font family ... not found" noise.
# The warnings filter only catches messages raised through the `warnings`
# module; the messages printed by this environment carry no "UserWarning"
# prefix, i.e. they come from the `matplotlib.font_manager` logger, which a
# warnings filter cannot reach. Both channels are therefore handled.
warnings.filterwarnings(action="ignore", module="matplotlib", message="findfont")
logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR)

In [3]:
def log_transform(ad, ps=0.1):
    """Log2-transform ``ad.X`` in place using a pseudocount.

    Every value ``x`` becomes ``log2(x + ps) - log2(ps)``, so a value of
    zero maps to zero.

    Parameters
    ----------
    ad : object exposing its data matrix as ``ad.X``
        ``ad.X`` may be a scipy sparse matrix or a dense array.
    ps : float, default 0.1
        Pseudocount added before taking the logarithm; must be positive.

    Returns
    -------
    None
        ``ad.X`` is modified / replaced on ``ad``.
    """
    if issparse(ad.X):
        # Zeros map to zero, so only the explicitly stored entries need to be
        # transformed and the sparsity pattern is left untouched.
        ad.X.data = np.log2(ad.X.data + ps) - np.log2(ps)
    else:
        # Dense input has no sparse `.data` payload; transform element-wise.
        ad.X = np.log2(ad.X + ps) - np.log2(ps)

In [4]:
def pyranges_from_strings(pos_list):
    """Build a PyRanges object from strings shaped like ``"chrom:start-end"``.

    Parameters
    ----------
    pos_list : pandas Series or Index of str
        One interval per entry; the text before ``':'`` is the chromosome and
        the text after it is split on ``'-'`` into start and end.

    Returns
    -------
    The object produced by ``pr.PyRanges`` for those chromosomes, starts and
    ends. Start and end are handed over as the substrings found in the input.
    """
    # Split "chrom:start-end" once and reuse both halves.
    fields = pos_list.str.split(':')
    chromosomes = fields.str.get(0)

    # Wrap in a Series so the second split works for Index input as well.
    coordinates = pd.Series(fields.str.get(1)).str.split('-')

    return pr.PyRanges(
        chromosomes=chromosomes,
        starts=coordinates.str.get(0),
        ends=coordinates.str.get(1),
    )

In [5]:
# Directory holding the input files read below.
# expanduser only rewrites a leading '~'; this relative path has none, so the
# value is used unchanged.
data_dir = os.path.expanduser('MERGED_PEER/')

In [6]:
# Peaks data
from scipy.io import mmread
# Build the path with os.path.join so it stays valid whether or not data_dir
# ends with a path separator.
counts = mmread(os.path.join(data_dir, 'AML7_MERGE_counts.mtx'))

In [7]:
# Cell and peak information
# Paths are built with os.path.join so they stay valid whether or not
# data_dir ends with a path separator.
# `cells`: first column after the index column of the cells table.
cells = pd.read_csv(os.path.join(data_dir, 'AML7_MERGE_cells.csv'), index_col=0).iloc[:, 0]
peaks = pd.read_csv(os.path.join(data_dir, 'AML7_MERGE_peaks.csv'), index_col=0)
# Re-index peaks by their interval string "<seqnames>:<start>-<end>".
peaks.index = peaks['seqnames'] + ':' + peaks['start'].astype(str) + '-' + peaks['end'].astype(str)
peaks.head()

Unnamed: 0,seqnames,start,end,width,strand,score,replicateScoreQuantile,groupScoreQuantile,Reproducibility,GroupReplicate,distToGeneStart,nearestGene,peakType,distToTSS,nearestTSS,GC,idx,N
chr1:817109-817609,chr1,817109,817609,501,*,43.6,0.913,0.838,2,C4._.AML7_CD34,11,FAM87B,Promoter,11,uc057aum.1,0.483,1,0
chr1:826659-827159,chr1,826659,827159,501,*,3.52005,0.401,0.123,2,C3._.AML7_CD133,612,LINC00115,Exonic,76,uc031tlo.2,0.5888,2,0
chr1:827267-827767,chr1,827267,827767,501,*,116.064,0.911,0.833,2,C6._.AML7_CD133,4,LINC00115,Promoter,4,uc031tlo.2,0.6926,3,0
chr1:838190-838690,chr1,838190,838690,501,*,2.5847,0.627,0.262,2,C9._.Rep2,10917,LINC00115,Intronic,6834,uc031tlo.2,0.4631,4,0
chr1:858545-859045,chr1,858545,859045,501,*,1.25495,0.28,0.04,2,C15._.AML7_CD34,18107,FAM41C,Exonic,7446,uc057aux.1,0.5629,5,0


In [8]:
# Peak annotation columns carried over into ad.var.
peak_annotation_cols = [
    'distToGeneStart', 'nearestGene', 'peakType',
    'distToTSS', 'nearestTSS', 'replicateScoreQuantile',
]

# The count matrix is transposed before being wrapped; observations are then
# named from `cells` and variables from the peak interval strings.
ad = sc.AnnData(counts.T)
ad.obs_names = cells
ad.var_names = peaks.index

for col in peak_annotation_cols:
    ad.var[col] = peaks[col]

In [9]:
# Display the AnnData summary (dimensions and annotation columns).
ad

AnnData object with n_obs × n_vars = 22404 × 183768
    var: 'distToGeneStart', 'nearestGene', 'peakType', 'distToTSS', 'nearestTSS', 'replicateScoreQuantile'

In [10]:
# Load AML7_MERGE_svd.csv, order its rows like ad.obs_names and store the
# values under obsm['X_svd']. os.path.join keeps the path valid whether or not
# data_dir ends with a path separator.
ad.obsm['X_svd'] = pd.read_csv(os.path.join(data_dir, 'AML7_MERGE_svd.csv'), index_col=0).loc[ad.obs_names, : ].values

In [11]:
# UMAP
# Rows are ordered like ad.obs_names before the values are stored.
# os.path.join keeps the path valid whether or not data_dir ends with a
# path separator.
ad.obsm['UMAP'] = pd.read_csv(os.path.join(data_dir, 'AML7_MERGE_umap.csv'), index_col=0).loc[ad.obs_names, :].values

In [12]:
# Metadata
# os.path.join keeps the path valid whether or not data_dir ends with a
# path separator.
meta = pd.read_csv(os.path.join(data_dir, 'AML7_MERGE_cell_metadata.csv'), index_col=0).loc[ad.obs_names, :]
# Copy by position (.values): `meta` rows were already ordered like
# ad.obs_names by the .loc selection above.
for col in meta.columns:
    ad.obs[col] = meta[col].values

In [13]:
# Split each 'Sample' label on '_' once; the first piece is stored as
# 'AP-GFP' and the second as 'Replicate'.
sample_parts = ad.obs['Sample'].str.split('_')
ad.obs['AP-GFP'] = sample_parts.str.get(0)
ad.obs['Replicate'] = sample_parts.str.get(1)

In [14]:
# Gene scores
# The table is transposed after reading. os.path.join keeps the path valid
# whether or not data_dir ends with a path separator.
gene_scores = pd.read_csv(os.path.join(data_dir, 'AML7_MERGE_all_cells_gene_scores.csv'), index_col=0).T

In [15]:
# Store the gene-score values, rows ordered like ad.obs_names, together with
# the column labels they belong to.
# NOTE(review): the uns key is spelled 'GeneScoresColums' (sic). It is kept
# exactly as-is because anything reading this object back would have to look
# it up under that name -- confirm with consumers before renaming.
scores_by_cell = gene_scores.loc[ad.obs_names, :]
ad.obsm['GeneScores'] = scores_by_cell.values
ad.uns['GeneScoresColums'] = gene_scores.columns.values

In [19]:
#warnings.filterwarnings('ignore')
#dm_res = palantir.utils.run_diffusion_maps(pd.DataFrame(ad.obsm['X_svd'], index=ad.obs_names))
#warnings.filterwarnings('default')

In [20]:
#ad.obsm['FDL'] = harmony.plot.force_directed_layout(dm_res['kernel']).values

In [21]:
#fdl = pd.DataFrame(ad.obsm['FDL'], index=ad.obs_names, columns=['x', 'y'])
#umap = pd.DataFrame(ad.obsm['UMAP'], index=ad.obs_names, columns=['x', 'y'])

In [16]:
# Dense copy of the count matrix. toarray() yields a plain numpy ndarray,
# whereas todense() yields the discouraged numpy.matrix type.
# NOTE(review): this materialises every n_obs x n_vars entry in memory
# (22404 x 183768 per the AnnData summary printed earlier in this notebook).
dense = ad.X.toarray()

In [17]:
# Replace the data matrix on `ad` with the dense copy.
ad.X=dense

In [18]:
# Write the assembled object to an .h5ad file; the path is relative, so it
# lands in the current working directory.
ad.write('AML7_MERGE_ATAC.h5ad')

... storing 'Sample' as categorical
... storing 'Harmony_Clusters' as categorical
... storing 'LSI_Clusters' as categorical
... storing 'AP-GFP' as categorical
... storing 'Replicate' as categorical
... storing 'nearestGene' as categorical
... storing 'peakType' as categorical
... storing 'nearestTSS' as categorical
