Most of the steps below have been copied from the scanpy tutorial: https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import os

In [2]:
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=150, facecolor="white", vector_friendly=False)

scanpy==1.8.1 anndata==0.7.5 umap==0.5.1 numpy==1.20.2 scipy==1.6.2 pandas==1.3.3 scikit-learn==0.24.1 statsmodels==0.12.2 python-igraph==0.9.0 leidenalg==0.8.3 pynndescent==0.5.2


Make sure that the packages that you have match the ones below IF you want to verify our results and obtain exactly the same results as described below. If you want to simply analyse your own dataset, then most likely later versions will also be fine.

In [3]:
pip list

Package                            Version
---------------------------------- -----------------
adjustText                         0.7.3
aiohttp                            3.7.4.post0
aiohttp-cors                       0.7.0
aioredis                           1.3.1
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-navigator                 1.9.7
anaconda-project                   0.8.3
anndata                            0.7.5
annoy                              1.17.0
ansiwrap                           0.8.4
anyio                              2.2.0
appdirs                            1.4.4
arboreto                           0.1.6
argcomplete                        1.12.2
argh                               0.26.2
argon2-cffi                        20.1.0
asciitree                          0.3.3
asn1crypto                         1.4.0
astroid                            2.5
astropy                            3.2.1
async-timeout                     

# drop-seq

In [4]:
import glob

In [5]:
mtx_dirs = sorted(glob.glob('demultiplexed_dropseq_sub_whitelist/*Solo.out/Gene/raw/'))
mtx_dirs

['demultiplexed_dropseq_sub_whitelist/SRR1853178.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub_whitelist/SRR1853179.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub_whitelist/SRR1853180.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub_whitelist/SRR1853181.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub_whitelist/SRR1853182.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub_whitelist/SRR1853183.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub_whitelist/SRR1853184.Solo.out/Gene/raw/']

In [6]:
ncells_dict = {
    'SRR1853178': 6600,
    'SRR1853179': 9000,
    'SRR1853180': 6120,
    'SRR1853181': 7650,
    'SRR1853182': 7650,
    'SRR1853183': 8280,
    'SRR1853184': 4000,
}


In [9]:
for mtx_dir in mtx_dirs:
    sample=mtx_dir.replace('demultiplexed_dropseq_sub_whitelist/', '')
    sample=sample.replace('.Solo.out/Gene/raw/', '')
    adata = sc.read_10x_mtx(
        mtx_dir,  # the directory with the `.mtx` file
        var_names="gene_symbols",  # use gene symbols for the variable names (variables-axis index)
        cache=True,
        )  # write a cache file for faster subsequent reading
    adata.obs_names = sample + '_' + adata.obs_names
    adata.var_names_make_unique()
    sc.pp.calculate_qc_metrics(adata, inplace=True)
    adata.obs.sort_values(by='n_genes_by_counts', ascending=False, inplace=True)
    ncells = ncells_dict[sample]
    adata = adata[0:ncells-1]
    filename = 'tables/dropseq_whitelisted_' + sample + '_adata.tsv'
    if not os.path.exists(filename):
        adata.obs.to_csv(filename, sep='\t', header=True)

... reading from cache file cache/demultiplexed_dropseq_sub_whitelist-SRR1853178.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub_whitelist-SRR1853179.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub_whitelist-SRR1853180.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub_whitelist-SRR1853181.Solo.out-Gene-raw-matrix.h5ad
... writing an h5ad cache file to speedup reading next time
... writing an h5ad cache file to speedup reading next time
... writing an h5ad cache file to speedup reading next time


Merge into one nice dataframe:

In [10]:
df_merged = pd.DataFrame()
for file in glob.glob('tables/dropseq_whitelisted_SRR18531*_adata.tsv'):
    df = pd.read_csv(file, sep='\t', index_col=0)
    df_merged = df_merged.append(df)

In [11]:
df_merged

Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes
SRR1853182_ACAGATTGGAAC,7526,8.926252,34639.0,10.452765,12.685124,17.841162,25.251884,38.892578
SRR1853182_CAGAATAGGCTT,7144,8.874168,29813.0,10.302733,13.376715,18.709959,26.069164,39.831617
SRR1853182_GAAAAGAACGAG,6971,8.849657,38458.0,10.557348,14.696552,21.995424,31.652712,47.605180
SRR1853182_GCTGGAGTTTCA,6906,8.840291,28311.0,10.251041,13.772032,19.261065,26.713998,40.591996
SRR1853182_GAAATCAACAGT,6504,8.780326,24016.0,10.086517,13.216189,18.479347,25.845270,39.765157
...,...,...,...,...,...,...,...,...
SRR1853183_ATTTATCAGGCA,366,5.905362,510.0,6.236370,33.529412,47.843137,67.450980,100.000000
SRR1853183_AGGGCCATGCAT,366,5.905362,515.0,6.246107,34.368932,48.349515,67.766990,100.000000
SRR1853183_CATTCGCTGGCT,366,5.905362,498.0,6.212606,31.726908,46.586345,66.666667,100.000000
SRR1853183_TGACGATCGCGT,366,5.905362,495.0,6.206576,30.505051,46.262626,66.464646,100.000000


In [12]:
df_merged.median()

n_genes_by_counts               743.000000
log1p_n_genes_by_counts           6.612041
total_counts                   1071.000000
log1p_total_counts                6.977282
pct_counts_in_top_50_genes       22.564612
pct_counts_in_top_100_genes      32.941176
pct_counts_in_top_200_genes      47.530864
pct_counts_in_top_500_genes      77.340677
dtype: float64