Most of the steps below have been copied from the scanpy tutorial: https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import os

In [None]:
# Configure scanpy's logging and plotting defaults for the rest of the notebook.
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Emit scanpy's logging header (presumably version information -- see the scanpy docs).
sc.logging.print_header()
# Figure defaults: 150 dpi, white background, `vector_friendly` explicitly switched off.
sc.settings.set_figure_params(dpi=150, facecolor="white", vector_friendly=False)

To reproduce our results exactly, make sure that your installed package versions match the ones listed below. If you only want to analyse your own dataset, later versions will most likely work as well.

In [None]:
pip list

# drop-seq

In [None]:
import glob

In [None]:
# Collect every `*Solo.out/Gene/raw/` directory below the demultiplexed folder and
# put the paths in lexicographic order so that samples are processed deterministically.
mtx_dirs = glob.glob('demultiplexed_dropseq_sub_whitelist/*Solo.out/Gene/raw/')
mtx_dirs.sort()
mtx_dirs

In [None]:
# Number of cells to keep per sequencing run (keyed by run accession). The per-sample
# loop uses these values to decide how many top-ranked barcodes are retained.
# NOTE(review): the origin of these numbers is not shown here -- confirm the source.
ncells_dict = dict([
    ('SRR1853178', 6600),
    ('SRR1853179', 9000),
    ('SRR1853180', 6120),
    ('SRR1853181', 7650),
    ('SRR1853182', 7650),
    ('SRR1853183', 8280),
    ('SRR1853184', 4000),
])


In [None]:
# For every sample: load the raw count matrix, compute per-cell QC metrics, keep the
# `ncells` barcodes with the most detected genes, and write their QC table to disk.
os.makedirs('tables', exist_ok=True)  # `to_csv` does not create missing directories
for mtx_dir in mtx_dirs:
    # Derive the sample name from the directory path, e.g.
    # 'demultiplexed_dropseq_sub_whitelist/SRR1853178.Solo.out/Gene/raw/' -> 'SRR1853178'.
    sample = mtx_dir.replace('demultiplexed_dropseq_sub_whitelist/', '')
    sample = sample.replace('.Solo.out/Gene/raw/', '')
    adata = sc.read_10x_mtx(
        mtx_dir,  # the directory with the `.mtx` file
        var_names="gene_symbols",  # use gene symbols for the variable names (variables-axis index)
        cache=True,  # write a cache file for faster subsequent reading
    )
    # Prefix barcodes with the sample name so they stay unique after merging samples.
    adata.obs_names = sample + '_' + adata.obs_names
    adata.var_names_make_unique()
    sc.pp.calculate_qc_metrics(adata, inplace=True)
    ncells = ncells_dict[sample]
    # Rank the barcodes on a sorted *copy* of `obs`: sorting `adata.obs` in place would
    # reorder the annotation table without reordering the count matrix, leaving the
    # two misaligned. Subsetting `adata` by barcode name keeps them in sync.
    # `[:ncells]` keeps exactly `ncells` cells (the previous `0:ncells-1` slice kept
    # one cell too few).
    top_barcodes = adata.obs.sort_values(
        by='n_genes_by_counts', ascending=False
    ).index[:ncells]
    adata = adata[top_barcodes]
    filename = 'tables/dropseq_whitelisted_' + sample + '_adata.tsv'
    # Existing tables are never overwritten; delete stale files to regenerate them.
    if not os.path.exists(filename):
        adata.obs.to_csv(filename, sep='\t', header=True)

Merge the per-sample tables into a single DataFrame:

In [None]:
# Read every per-sample QC table and stack them into one DataFrame (rows = cells).
# The tables are concatenated in a single `pd.concat` call: `DataFrame.append` was
# removed in pandas 2.0 and re-copies the accumulated frame on every iteration.
# The file list is sorted because `glob` returns paths in arbitrary order.
sample_tables = [
    pd.read_csv(file, sep='\t', index_col=0)
    for file in sorted(glob.glob('tables/dropseq_whitelisted_SRR18531*_adata.tsv'))
]
# `pd.concat` raises on an empty list, so fall back to an empty frame when no table exists.
df_merged = pd.concat(sample_tables) if sample_tables else pd.DataFrame()

In [None]:
# Display the merged per-cell QC table.
df_merged

In [None]:
# Median of every QC metric across all retained cells. `numeric_only=True` is passed
# explicitly so the result is the same across pandas versions: without it, pandas >= 2.0
# raises on a non-numeric column, whereas older versions silently dropped such columns.
df_merged.median(numeric_only=True)