In [13]:
import os
import re
import subprocess
import sys

import numpy as np
import pandas as pd
import pegasus as pg
import pegasusio as io

In [14]:
# Run configuration: values read by the loading and clustering cells of this notebook.
tissue = "Adipose"  # sample name written into the aggregation csv
task_id = 0  # index embedded in the temporary csv filename
res = 1.4  # this resolution gives results closest to seurat
INF = int(1e10)  # stand-in for infinity, used for cutoffs where no filtering is required

In [15]:
def calculate_percent_ribo(adata, ribo_prefix):
    """Add a ``percent_ribo`` column to ``adata.obs``.

    For every row of ``adata.X`` the counts of all genes whose name matches
    one of the given patterns are summed and expressed as a percentage of
    that row's ``n_counts``.

    Parameters
    ----------
    adata
        Object exposing ``X`` (count matrix whose columns line up with
        ``var_names``; scipy sparse or dense), ``var_names`` (gene names) and
        ``obs`` (per-row table that already contains an ``n_counts`` column).
    ribo_prefix : str
        One or more comma-separated regular expressions. Each one is matched
        case-insensitively against the start of every gene name. Because the
        string is split on ",", a single pattern cannot itself contain a comma.

    Returns
    -------
    The same ``adata`` object, with ``obs["percent_ribo"]`` added.
    """
    # compile once instead of re-parsing every pattern for every gene name
    patterns = [re.compile(prefix, flags=re.IGNORECASE) for prefix in ribo_prefix.split(",")]
    is_ribo = np.array(
        [any(pattern.match(name) for pattern in patterns) for name in adata.var_names],
        dtype=bool,
    )
    ribo_genes = np.flatnonzero(is_ribo)  # column indices of all genes that match a pattern

    # np.asarray(...).ravel() flattens the per-row sums for scipy sparse matrices
    # (np.matrix result), scipy sparse arrays and dense ndarrays alike; the
    # previous `.A1` attribute only exists on np.matrix.
    ribo_counts = np.asarray(adata.X[:, ribo_genes].sum(axis=1)).ravel()

    # clamp the denominator to 1 so rows with zero total counts give 0% instead of dividing by zero
    total_counts = np.maximum(adata.obs["n_counts"].values, 1.0)
    adata.obs["percent_ribo"] = (ribo_counts / total_counts) * 100
    return adata


# basic qc that is performed for each method
def initial_qc(adata, n_genes, n_cells, is_human):
    """Run the basic QC that is shared by every method.

    Computes pegasus QC metrics, adds ``obs["percent_ribo"]`` and
    ``var["n_cells"]``, then applies ``pg.filter_data`` and
    ``pg.identify_robust_genes``.

    Parameters
    ----------
    adata
        Data object accepted by the pegasus functions used below.
    n_genes : int
        Passed to ``pg.qc_metrics`` as ``min_genes``.
    n_cells : int
        Currently unused: the gene filter on ``var["n_cells"]`` is commented
        out below. Kept so existing callers keep working.
    is_human : bool
        Selects the mitochondrial gene prefix: ``"MT-"`` when true, ``"mt-"``
        otherwise.

    Returns
    -------
    The ``adata`` object after QC.
    """
    mt_prefix = "MT-" if is_human else "mt-"
    # default PG filtering with custom cutoffs; -INF / INF leave the UMI bounds and max_genes unrestricted
    pg.qc_metrics(
        adata,
        mito_prefix=mt_prefix,
        min_umis=-INF,
        max_umis=INF,
        min_genes=n_genes,
        max_genes=INF,
        percent_mito=80,
    )
    # raw string: "\d" in a normal literal is an invalid escape sequence (warning today, error in future Pythons)
    adata = calculate_percent_ribo(adata, r"^Rp[sl]\d")  # calculate percent ribo
    adata.var["n_cells"] = adata.X.getnnz(axis=0)  # number of stored non-zero entries per column of X
    # filtering based on nCells is intentionally left disabled:
    # adata = adata[:, adata.var.n_cells > n_cells]
    pg.filter_data(adata)  # filtering based on the parameters from qc_metrics
    pg.identify_robust_genes(adata)
    return adata


In [20]:
def cluster_data(adata, resolution=1, compute_markers=False, compute_reductions=False):
    """Run the pegasus normalisation -> HVF -> PCA -> kNN -> Louvain pipeline on ``adata``.

    Parameters
    ----------
    adata
        Data object handed to every pegasus call below.
    resolution
        Forwarded to ``pg.louvain``.
    compute_markers : bool
        When true, also run ``pg.de_analysis`` on ``louvain_labels`` and
        collect the result of ``pg.markers``.
    compute_reductions : bool
        When true, also run ``pg.umap``.

    Returns
    -------
    ``adata`` alone when ``compute_markers`` is false, otherwise the tuple
    ``(adata, marker_dict)``.
    """
    # The steps run strictly in this order on the same object; their return values are not used.
    pg.log_norm(adata)
    pg.highly_variable_features(adata, consider_batch=False)
    pg.pca(adata)
    pg.neighbors(adata, K=20)  # K=20 to make it closer to seurat
    pg.louvain(adata, resolution=resolution)

    if compute_reductions:
        pg.umap(adata)

    if not compute_markers:
        return adata

    pg.de_analysis(adata, cluster="louvain_labels", t=True, fisher=False, temp_folder="/tmp")
    markers_by_cluster = pg.markers(adata, alpha=1)
    return adata, markers_by_cluster

In [22]:
is_human = False  # this is mouse data
data_path = "/Volumes/scqc/data/pegasus_data/dataset_1(ebi_tm)/adipose.h5ad"
filename = "read_info_{}_{}.csv".format("ebi_tm", task_id)  # filename of csv used by aggregate_matrices
try:
    # `with` closes (and flushes) the csv before it is read back, even if a write fails
    with open(filename, "w") as read_info:  # csv for aggregate_matrices
        read_info.write("Sample,Location,Reference,\n")
        read_info.write("{},{},{},\n".format(tissue, data_path, "GRCm38"))  # add the file info to csv
    adata = io.aggregate_matrices(filename)  # read data
finally:
    # remove the info csv even when loading fails, so no stale file is left in the working directory
    if os.path.exists(filename):
        os.remove(filename)

2020-11-20 17:22:04,626 - pegasusio.readwrite - INFO - h5ad file '/Volumes/scqc/data/pegasus_data/dataset_1(ebi_tm)/adipose.h5ad' is loaded.
2020-11-20 17:22:04,627 - pegasusio.readwrite - INFO - Function 'read_input' finished in 14.79s.
2020-11-20 17:22:04,771 - pegasusio.aggr_data - INFO - Function 'aggregate' finished in 0.14s.
2020-11-20 17:22:04,775 - pegasusio.data_aggregation - INFO - Aggregated 1 files.
2020-11-20 17:22:04,775 - pegasusio.data_aggregation - INFO - Function 'aggregate_matrices' finished in 14.95s.


In [23]:
# perform initial qc with min 100 genes; n_cells=3 is passed along, but the min-cells gene filter inside initial_qc is commented out
adata = initial_qc(adata, n_genes=100, n_cells=3, is_human=is_human)

2020-11-20 17:22:05,238 - pegasusio.qc_utils - INFO - After filtration, 3993 out of 4056 cell barcodes are kept in UnimodalData object GRCm38-rna.
2020-11-20 17:22:05,505 - pegasus.tools.preprocessing - INFO - After filtration, 30405/36251 genes are kept. Among 30405 genes, 27471 genes are robust.


In [24]:
# cluster at the configured resolution, also requesting the UMAP step and the marker tables
adata, marker_dict = cluster_data(
    adata,
    resolution=res,
    compute_markers=True,
    compute_reductions=True,
)

2020-11-20 17:22:05,979 - pegasus.tools.preprocessing - INFO - Function 'log_norm' finished in 0.47s.
2020-11-20 17:22:06,388 - pegasus.tools.hvf_selection - INFO - 2000 highly variable features have been selected.
2020-11-20 17:22:06,388 - pegasus.tools.hvf_selection - INFO - Function 'highly_variable_features' finished in 0.41s.
2020-11-20 17:22:06,916 - pegasus.tools.preprocessing - INFO - Function 'pca' finished in 0.53s.
2020-11-20 17:22:07,464 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.55s.
2020-11-20 17:22:07,544 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 0.08s.
2020-11-20 17:22:07,575 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 0.03s.
2020-11-20 17:22:07,849 - pegasus.tools.clustering - INFO - Louvain clustering is done. Get 23 clusters.
2020-11-20 17:22:07,858 - pegasus.tools.clustering - INFO - Function 'louvain' finished in 0.31s.
UMAP(min_dist=0.5, ra

In [28]:
# Display the "up" table stored under key "1" of marker_dict
# (presumably the up-regulated markers of louvain cluster 1 — confirm against pg.markers).
marker_dict["1"]["up"]

Unnamed: 0_level_0,log2Mean,log2Mean_other,log2FC,percentage,percentage_other,percentage_fold_change,auroc,mwu_U,mwu_pval,mwu_qval,t_tstat,t_pval,t_qval
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Cd79a,8.323681,0.284697,8.038984,100.000000,8.682301,11.517685,0.987185,1453336.0,0.000000e+00,0.000000,185.835419,0.000000,0.000000
Ighm,8.889214,1.314677,7.574537,98.783455,32.551647,3.034668,0.977514,1439098.0,0.000000e+00,0.000000,85.764305,0.000000,0.000000
H2-DMb2,7.776985,0.473927,7.303058,98.053528,16.945841,5.786289,0.974452,1434590.0,0.000000e+00,0.000000,83.304474,0.000000,0.000000
Igkc,8.119233,0.270578,7.848655,97.566910,8.403127,11.610786,0.972991,1432440.0,0.000000e+00,0.000000,94.936020,0.000000,0.000000
Ighd,6.629440,0.180809,6.448632,95.620438,5.555555,17.211679,0.967845,1424863.0,0.000000e+00,0.000000,56.360153,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plk4,0.358766,0.325434,0.033332,28.710463,39.921833,0.719167,0.460204,677513.0,2.566415e-03,0.009283,0.735939,0.462127,0.543852
Nek4,0.491978,0.490306,0.001672,40.389294,52.400894,0.770775,0.457020,672826.5,2.357907e-03,0.008594,0.034445,0.972536,0.979495
Ppcdc,0.657136,0.576418,0.080718,28.467154,40.145172,0.709105,0.453894,668223.0,4.846683e-04,0.002013,1.031293,0.302939,0.404798
Hinfp,0.571304,0.493277,0.078027,31.630171,44.919041,0.704160,0.450653,663452.5,2.901044e-04,0.001253,1.229162,0.219632,0.353853
