In [1]:
from math import ceil
import re
import subprocess
import sys

import pandas as pd
import pegasus as pg
import numpy as np

import paths
from config import do_counts, do_genes, do_mito, do_ribo, OUTPUT_DIR, DATA_DIR
from filters import filter_cells, initial_qc
from local_config import local
from readers import auto_reader
from utils import cluster_data, safe_mkdir, add_cd_scores, title

TASKS_PER_TISS = 1  # how many different methods per one tissue. Used to determine method and param from task id
#DATA_DIR = "/Users/michaelalperovich/Documents/primes_storage/data/"
DATA_DIR = "/Volumes/easystore/primes_storage/data/"

project = "mc_ebi_tm"
task_id = 3
res = 1.4

In [2]:
def get_ncols(arr, n_rows=4):
    return max(3, ceil(len(arr) / n_rows))

def make_plots(markers):
    pg.dotplot(adata, genes=markers, groupby='louvain_labels', switch_axes=True)
    pg.scatter(adata, attrs="clusters", basis='umap', legend_loc='on data')
    pg.scatter(adata, attrs="clusters", basis='umap')
    for i in range(0, len(markers) + len(markers) % 2, 2):
        pg.scatter(adata, attrs=markers[i:min(i + 2, len(markers))], basis='umap', ncols=2, cmap="cool")

In [3]:
tissue, is_human, adata = auto_reader(project, task_id, TASKS_PER_TISS)  # read the data for current task id
method, param = "mad", 2
adata = initial_qc(adata, 100, 3, is_human)
pg.log_norm(adata)

2020-12-27 11:58:32,926 - pegasusio.readwrite - INFO - h5ad file '/Volumes/scqc/data/mouse/ebi_tm/Cerebellum/Cerebellum.h5ad' is loaded.
2020-12-27 11:58:32,927 - pegasusio.readwrite - INFO - Function 'read_input' finished in 19.05s.
2020-12-27 11:58:33,017 - pegasusio.aggr_data - INFO - Function 'aggregate' finished in 0.08s.
2020-12-27 11:58:33,029 - pegasusio.data_aggregation - INFO - Aggregated 1 files.
2020-12-27 11:58:33,030 - pegasusio.data_aggregation - INFO - Function 'aggregate_matrices' finished in 19.19s.
2020-12-27 11:58:33,485 - pegasusio.qc_utils - INFO - After filtration, 2426 out of 2427 cell barcodes are kept in UnimodalData object GRCm38-rna.
2020-12-27 11:58:33,643 - pegasus.tools.preprocessing - INFO - After filtration, 28407/36251 genes are kept. Among 28407 genes, 25011 genes are robust.
2020-12-27 11:58:33,737 - pegasus.tools.preprocessing - INFO - Function 'log_norm' finished in 0.09s.


In [7]:
task_directory = "filtered_cells_plots/no_outlier/"
results_dir = "/Users/michaelalperovich/Documents/primes_storage/output_pg/" + project + "/" + tissue + "/" + task_directory   # directory for saving output
cells = pd.read_csv(results_dir + "!cells.csv")
clusters = pd.read_csv(results_dir + "!clusters.csv")

adata.obs["retained"] = False
adata.obs["retained"][cells["barcodekey"]] = True
adata.obs["passed_qc"] = (adata.obs.retained)
pg.filter_data(adata)

2020-12-27 11:59:58,147 - pegasusio.qc_utils - INFO - After filtration, 2234 out of 2426 cell barcodes are kept in UnimodalData object GRCm38-rna.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs["retained"][cells["barcodekey"]] = True


In [8]:
cells.index = cells.barcodekey
cells.louvain_labels = [t - 1 for t in cells.louvain_labels]
cells = cells.reindex(adata.obs.index)
adata.obs["louvain_labels"] = cells.louvain_labels
adata.obs["clusters"] = [str(t) for t in adata.obs["louvain_labels"]]
adata.obs["scale"] = cells.scale
x_umap = np.array([[cells.umap1[i], cells.umap2[i]] for i in range(len(cells.umap1))], dtype="float32")
adata.obsm["X_umap"] = x_umap

In [None]:
cl = 22

In [None]:
genes = clusters["markers"][cl].split("; ")
top10 = []
for gene in genes:
    if not re.match("^Rp[sl]\d", gene, flags=re.IGNORECASE):
        top10.append(gene)
    if len(top10) == 10:
        break
    
make_plots(top10)

In [None]:
low_genes = pd.read_csv(results_dir + "!low_percent_DE.csv")
low_genes_cluster = low_genes[low_genes.cluster == cl]
top = []
for i in range(len(low_genes_cluster["gene"])):
    gene = list(low_genes_cluster.gene)[i]
    if not re.match("^Rp[sl]\d", gene, flags=re.IGNORECASE):
        top.append(gene)
    
make_plots(top)

In [None]:
pg.de_analysis(adata, cluster='clusters', auc=False, t=True, fisher=False, mwu=False,
                       temp_folder="/tmp", subset="4,5")

In [None]:
marker_dict = pg.markers(adata)

In [None]:
def write_markers(marker_dict, min_log_fc=0.25, min_pct=25):
    frames = []
    # iterate through all keys in the markers dict
    for cl in marker_dict.keys():
        for d in marker_dict[cl].keys():
            df = marker_dict[cl][d]
            df['cluster'] = cl
            df['up/down'] = d

            # filter markers based on log_fc and pct
            df = df[(df["mean_logExpr"] >= min_log_fc) & (
                    (df["percentage"] >= min_pct) | (df["percentage_other"] >= min_pct))]

            frames.append(df)
    result = pd.concat(frames)  # merge all marker data frames together
    with open("/Users/michaelalperovich/Desktop/markers.csv", "w") as fout:
        fout.write(result.to_csv())

In [None]:
write_markers(marker_dict)

In [None]:
marker_dict