In [1]:
import pegasus as pg
from readers import auto_reader 

In [2]:
# Alternative loader kept for reference (reads an .h5sc file directly):
# adata = pg.read_input("MantonBM_nonmix_subset.h5sc")
# Load the dataset through the local `readers.auto_reader` helper.
# NOTE(review): the meaning of the arguments ("mc_tm", 11, 4) is defined in
# readers.py, which is not visible here — confirm before changing them.
adata = auto_reader("mc_tm", 11, 4)
# Display the loaded object's summary as the cell output.
adata

2020-05-31 17:39:20,531 - pegasus - INFO - Time spent on 'read_input' = 3.90s.


AnnData object with n_obs × n_vars = 737280 × 54446 
    obs: 'Channel'
    var: 'gene_ids'
    uns: 'genome'

In [3]:
# Show the repr of the expression matrix (storage format, dtype, stored elements).
adata.X

<737280x54446 sparse matrix of type '<class 'numpy.float32'>'
	with 4644645 stored elements in Compressed Sparse Row format>

In [4]:
# Preview the first rows of the per-observation annotation table.
adata.obs.head()

Unnamed: 0,Channel
AAACCTGAGAAACCAT,
AAACCTGAGAAACCGC,
AAACCTGAGAAACCTA,
AAACCTGAGAAACGAG,
AAACCTGAGAAACGCC,


In [5]:
# Count observations per distinct 'Channel' value (before any filtering).
adata.obs['Channel'].value_counts()

    737280
Name: Channel, dtype: int64

In [6]:
# Preview the first rows of the per-variable (gene) annotation table.
adata.var.head()

Unnamed: 0,gene_ids
4933401J01Rik,ENSMUSG00000102693.1
Gm26206,ENSMUSG00000064842.1
Xkr4,ENSMUSG00000051951.5
Gm18956,ENSMUSG00000102851.1
Gm37180,ENSMUSG00000103377.1


In [7]:
# Genome/annotation label stored alongside the dataset.
adata.uns['genome']

'gencode.vM19'

In [9]:
# Compute per-observation QC metrics on adata; genes whose names start with
# "mt-" are passed as the mitochondrial prefix. Later cells read 'n_genes',
# 'n_counts', 'percent_mito' and 'passed_qc' from adata.obs.
pg.qc_metrics(adata, mito_prefix="mt-")

In [10]:
# Collect filtration summaries: one table per sample/channel, one per gene.
stats_samples, stats_genes = pg.get_filter_stats(adata)

In [11]:
# Per-channel filtration summary (kept/filtered counts and medians).
stats_samples

Unnamed: 0_level_0,kept,median_n_genes,median_n_umis,median_percent_mito,filt,total,median_n_genes_before,median_n_umis_before,median_percent_mito_before
Channel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,330,1811,3909.0,1.897263,736950,737280,0,0.0,0.0


In [12]:
# Per-gene filtration summary (number and percentage of cells per gene).
stats_genes

Unnamed: 0_level_0,n_cells,percent_cells
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
4933401J01Rik,0,0.0
Ighv2-9-1,0,0.0
Gm37976,0,0.0
Gm37418,0,0.0
Ighv5-9-1,0,0.0
...,...,...
Gm38901,0,0.0
Gm23098,0,0.0
Gm42811,0,0.0
Mir7233,0,0.0


In [13]:
# Violin plots of the three QC metrics, grouped by QC pass/fail status.
pg.violin(
    adata,
    keys=['n_genes', 'n_counts', 'percent_mito'],
    by='passed_qc',
)

In [14]:
# Genes detected vs. total counts, coloured by QC pass/fail status.
pg.scatter(
    adata,
    'n_genes',
    'n_counts',
    color='passed_qc',
)

In [15]:
# Genes detected vs. mitochondrial percentage, coloured by QC pass/fail status.
pg.scatter(
    adata,
    'n_genes',
    'percent_mito',
    color='passed_qc',
)

In [16]:
# Apply the QC filter. adata is modified in place (it is not reassigned), so
# re-running earlier cells after this one will see the reduced object.
pg.filter_data(adata)
# Display the filtered object's summary.
adata

2020-05-31 17:41:48,953 - pegasus - INFO - After filtration, 330/737280 cells and 19607/54446 genes are kept. Among 19607 genes, 19607 genes are robust.


AnnData object with n_obs × n_vars = 330 × 19607 
    obs: 'Channel', 'passed_qc', 'n_genes', 'n_counts', 'percent_mito'
    var: 'gene_ids', 'n_cells', 'percent_cells', 'robust', 'highly_variable_features'
    uns: 'genome'

In [17]:
# Count observations per 'Channel' value after filtering.
adata.obs['Channel'].value_counts()

    330
Name: Channel, dtype: int64

In [18]:
# Run pegasus' log-normalization step on adata (operates on the object passed in).
pg.log_norm(adata)

2020-05-31 17:42:00,672 - pegasus - INFO - Time spent on 'log_norm' = 0.02s.


In [20]:
# Select highly variable features without accounting for batch.
pg.highly_variable_features(
    adata,
    consider_batch=False,
)

2020-05-31 17:42:03,164 - pegasus - INFO - 2000 highly variable features have been selected.
2020-05-31 17:42:03,165 - pegasus - INFO - Time spent on 'highly_variable_features' = 0.07s.


In [21]:
# Plot the outcome of the highly-variable-feature selection.
pg.variable_feature_plot(adata)

In [22]:
# Top-ranked highly variable genes: keep the flagged rows of adata.var,
# order them by 'hvf_rank', and show the first few.
hvf_mask = adata.var['highly_variable_features']
hvf_table = adata.var.loc[hvf_mask]
hvf_table.sort_values(by='hvf_rank').head()

Unnamed: 0,gene_ids,n_cells,percent_cells,robust,highly_variable_features,mean,var,hvf_loess,hvf_rank
Tmsb4x,ENSMUSG00000049775.16,278,84.242424,True,True,4.989697,6.677295,2.367642,1
Malat1,ENSMUSG00000092341.2,308,93.333333,True,True,7.050563,5.879161,2.696411,5
Cd74,ENSMUSG00000024610.14,199,60.30303,True,True,2.885039,8.463235,4.053889,5
B2m,ENSMUSG00000060802.8,259,78.484848,True,True,4.157108,6.137618,3.099602,20
H2-Eb1,ENSMUSG00000060586.10,115,34.848485,True,True,1.623785,6.508124,3.335874,21


In [23]:
# Run PCA with pegasus defaults; later cells read adata.uns['PCs'] and
# adata.obsm['X_pca'].
pg.pca(adata)

2020-05-31 17:42:15,040 - pegasus - INFO - PCA is done. Time spent = 0.07s.


In [24]:
# First column of the array stored under adata.uns['PCs'].
# NOTE(review): presumably the per-gene loadings of PC1 — confirm against pegasus docs.
coord_pc1 = adata.uns['PCs'][:, 0]
coord_pc1

array([ 0.03533416, -0.03569239,  0.01421165, ..., -0.00224235,
       -0.02503295, -0.02423959], dtype=float32)

In [25]:
# Names of the genes flagged as highly variable, as a NumPy array.
is_hvf = adata.var['highly_variable_features']
adata.var.loc[is_hvf].index.values

array(['Sox17', 'Adhfe1', '2610203C22Rik', ..., 'G530011O06Rik',
       'mt-Atp6', 'mt-Nd3'], dtype=object)

In [26]:
# Shape of the PCA embedding: (observations, principal components).
adata.obsm['X_pca'].shape

(330, 50)

In [27]:
# Build the nearest-neighbour structure with pegasus defaults; later cells read
# adata.uns['pca_knn_indices'] and adata.uns['pca_knn_distances'].
pg.neighbors(adata)

2020-05-31 17:42:24,461 - pegasus - INFO - Time spent on 'get_neighbors' = 0.08s.
2020-05-31 17:42:24,494 - pegasus - INFO - Time spent on 'calculate_affinity_matrix' = 0.03s.


In [28]:
# Report how many neighbours are stored per cell, then show the index matrix.
knn_indices = adata.uns['pca_knn_indices']
n_neighbors_stored = knn_indices.shape[1]
print("Get {} nearest neighbors (excluding itself) for each cell.".format(n_neighbors_stored))
knn_indices

Get 99 nearest neighbors (excluding itself) for each cell.


array([[ 33, 163, 328, ...,  86, 179, 197],
       [ 27, 309, 262, ..., 213, 116, 111],
       [ 20, 115, 167, ...,  67,  78,  27],
       ...,
       [220, 213, 267, ..., 282,  78,  41],
       [163,  33,   0, ..., 306,  59,  47],
       [ 94,  24, 178, ...,   8, 305, 263]])

In [29]:
# Distances stored alongside the neighbour indices (one row per cell).
adata.uns['pca_knn_distances']

array([[13.88835774, 14.71252133, 15.82173284, ..., 48.31180939,
        48.34574911, 48.36195769],
       [11.3513599 , 11.81008558, 11.83060678, ..., 34.67007114,
        34.67173742, 34.67490296],
       [11.79335417, 12.12376532, 13.13446569, ..., 43.79980084,
        43.82367665, 43.83419703],
       ...,
       [23.46120252, 38.39218252, 38.56335242, ..., 43.12982685,
        43.15372681, 43.17177146],
       [15.03050143, 15.25247899, 15.82173284, ..., 47.38351439,
        47.3852429 , 47.38575873],
       [ 5.1965437 ,  6.0513311 ,  7.35642022, ..., 35.87432979,
        36.08086406, 36.43645722]])

In [30]:
# Louvain clustering with pegasus defaults; labels are read later from
# adata.obs['louvain_labels'].
pg.louvain(adata)

2020-05-31 17:42:34,135 - pegasus - INFO - Time spent on 'construct_graph' = 0.01s.
2020-05-31 17:42:34,169 - pegasus - INFO - Louvain clustering is done. Get 8 clusters. Time spent = 0.05s.


In [31]:
# Cluster sizes: number of cells assigned to each Louvain label.
adata.obs['louvain_labels'].value_counts()

1    92
2    58
3    56
4    54
5    44
6    24
8     1
7     1
Name: louvain_labels, dtype: int64

In [32]:
# Composition of each Louvain cluster broken down by 'Channel'.
pg.composition_plot(
    adata,
    by='louvain_labels',
    condition='Channel',
)

In [33]:
# Compute the FIt-SNE embedding with pegasus defaults (plotted below via basis='fitsne').
pg.fitsne(adata)

2020-05-31 17:42:57,386 - pegasus - INFO - Time spent on 'fitsne' = 14.95s.


In [34]:
# FIt-SNE embedding coloured by cluster label and by channel.
pg.embedding(
    adata,
    basis='fitsne',
    keys=['louvain_labels', 'Channel'],
)



In [None]:
# Compute the UMAP embedding with pegasus defaults (plotted below via basis='umap').
pg.umap(adata)

In [None]:
# UMAP embedding coloured by cluster label and by channel.
pg.embedding(
    adata,
    basis='umap',
    keys=['louvain_labels', 'Channel'],
)

In [None]:
# Differential expression per Louvain cluster; only the t-test is enabled,
# the AUC, Fisher and Mann-Whitney U options are switched off.
pg.de_analysis(
    adata,
    cluster='louvain_labels',
    auc=False,
    t=True,
    fisher=False,
    mwu=False,
    temp_folder="/tmp",
)

In [None]:
# Extract per-cluster marker genes from the differential-expression results.
# The notebook only imports pegasus as `pg`; the previous `scc` alias was never
# defined here and raised NameError.
marker_dict = pg.markers(adata)

In [None]:
# Entry stored under the 'up' key for cluster '1' of the marker dictionary.
marker_dict['1']['up']

In [None]:
# Volcano plot of the differential-expression results for cluster '1'.
pg.volcano(adata, cluster_ids = ['1'])

In [None]:
# Export the per-cluster marker tables to an Excel workbook.
# NOTE(review): the file name still carries the "MantonBM_nonmix_subset" prefix
# although this notebook loads its data via auto_reader("mc_tm", ...) — confirm
# the intended output name.
pg.write_results_to_excel(marker_dict, "MantonBM_nonmix_subset.louvain_labels.de.xlsx")

In [None]:
# Score clusters against pegasus' built-in mouse immune marker set. The data is
# mouse (genome 'gencode.vM19', title-case gene symbols, "mt-" mito prefix), so
# the human marker set would not match the gene names in adata.
celltype_dict = pg.infer_cell_types(adata, markers = 'mouse_immune', de_test = 't')
# Derive one display name per cluster from the inferred cell types.
cluster_names = pg.infer_cluster_names(celltype_dict)

In [None]:
# Replace the numeric Louvain labels with the inferred cluster names (in place).
adata.rename_categories('louvain_labels', cluster_names)

In [None]:
# FIt-SNE embedding coloured by the 'louvain_labels' column.
pg.embedding(
    adata,
    basis='fitsne',
    keys=['louvain_labels'],
)

In [None]:
# Same FIt-SNE plot, with the legend option set to 'data'.
pg.embedding(
    adata,
    basis='fitsne',
    keys=['louvain_labels'],
    legend='data',
)

In [None]:
# Marker genes to visualise per cluster. The dataset is mouse (gene names such
# as 'Xkr4', 'Cd74', 'mt-Atp6'), so mouse gene symbols are used; upper-case
# human symbols would not match adata.var_names.
# NOTE(review): 'Fcgr3', 'H2-Aa' and 'Cd8b1' stand in for the human FCGR3A,
# HLA-DPA1 and CD8B markers — confirm the intended orthologs.
candidate_markers = ['Cd38', 'Jchain', 'Fcgr3', 'H2-Aa', 'Cd14', 'Cd79a', 'Ms4a1', 'Cd34', 'Trac', 'Cd3d', 'Cd8a',
                     'Cd8b1', 'Gypa', 'Nkg7', 'Cd4', 'Sell', 'Ccr7']
# Keep only genes still present after filtering so plotting does not fail on a
# missing key.
marker_genes = [gene for gene in candidate_markers if gene in adata.var_names]

pg.dotplot(adata, keys = marker_genes, by = 'louvain_labels')

In [None]:
# Heatmap of the marker genes, grouped by the 'louvain_labels' column.
pg.heatmap(
    adata,
    keys=marker_genes,
    by='louvain_labels',
)

In [None]:
# Violin plot of the T-cell receptor alpha constant gene across clusters.
# Uses the mouse symbol 'Trac': gene names in this dataset are mouse-style
# (e.g. 'Cd74', 'Tmsb4x'), so the human symbol 'TRAC' would not be found.
pg.violin(adata, keys = ['Trac'], by = 'louvain_labels', width = 900, height = 450)

In [None]:
# FIt-SNE embedding coloured by expression of four marker genes.
# Mouse symbols are used because gene names in this dataset are mouse-style
# (e.g. 'Cd74', 'Tmsb4x'); the upper-case human symbols would not be found.
pg.embedding(adata, basis = 'fitsne', keys = ['Trac', 'Cd79a', 'Cd14', 'Cd34'])

In [None]:
# Persist the analysed object under the output name 'tutorial_results'.
pg.write_output(adata, 'tutorial_results')