Most of the steps below have been copied from the scanpy tutorial: https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html

In [27]:
import numpy as np
import pandas as pd
import scanpy as sc

In [12]:
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=150, facecolor="white", vector_friendly=False)

scanpy==1.8.1 anndata==0.7.5 umap==0.5.1 numpy==1.20.2 scipy==1.6.2 pandas==1.3.3 scikit-learn==0.24.1 statsmodels==0.12.2 python-igraph==0.9.0 leidenalg==0.8.3 pynndescent==0.5.2


Make sure that the packages that you have match the ones below IF you want to verify our results and obtain exactly the same results as described below. If you want to simply analyse your own dataset, then most likely later versions will also be fine.

In [13]:
pip list

Package                            Version
---------------------------------- -----------------
adjustText                         0.7.3
aiohttp                            3.7.4.post0
aiohttp-cors                       0.7.0
aioredis                           1.3.1
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-navigator                 1.9.7
anaconda-project                   0.8.3
anndata                            0.7.5
annoy                              1.17.0
ansiwrap                           0.8.4
anyio                              2.2.0
appdirs                            1.4.4
arboreto                           0.1.6
argcomplete                        1.12.2
argh                               0.26.2
argon2-cffi                        20.1.0
asciitree                          0.3.3
asn1crypto                         1.4.0
astroid                            2.5
astropy                            3.2.1
async-timeout                     

# drop-seq

In [14]:
import glob

In [15]:
mtx_dirs = sorted(glob.glob('demultiplexed_dropseq_sub/*Solo.out/Gene/raw/'))
mtx_dirs

['demultiplexed_dropseq_sub/SRR1853178.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub/SRR1853179.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub/SRR1853180.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub/SRR1853181.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub/SRR1853182.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub/SRR1853183.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub/SRR1853184.Solo.out/Gene/raw/']

In [None]:
GSM1626793_P14Retina_1.digital_expression.txt.bcs: 6600
GSM1626794_P14Retina_2.digital_expression.txt.bcs: 9000
GSM1626795_P14Retina_3.digital_expression.txt.bcs: 6120
GSM1626796_P14Retina_4.digital_expression.txt.bcs: 7650
GSM1626797_P14Retina_5.digital_expression.txt.bcs: 7650
GSM1626798_P14Retina_6.digital_expression.txt.bcs: 8280
GSM1626799_P14Retina_7.digital_expression.txt.bcs: 4000

In [18]:
ncells_dict = {
    'SRR1853178': 6600,
    'SRR1853179': 9000,
    'SRR1853180': 6120,
    'SRR1853181': 7650,
    'SRR1853182': 7650,
    'SRR1853183': 8280,
    'SRR1853184': 4000,
}


In [26]:
for mtx_dir in mtx_dirs:
    sample=mtx_dir.replace('demultiplexed_dropseq_sub/', '')
    sample=sample.replace('.Solo.out/Gene/raw/', '')
    adata = sc.read_10x_mtx(
        mtx_dir,  # the directory with the `.mtx` file
        var_names="gene_symbols",  # use gene symbols for the variable names (variables-axis index)
        cache=True,
        )  # write a cache file for faster subsequent reading
    adata.obs_names = sample + '_' + adata.obs_names
    adata.var_names_make_unique()
    sc.pp.calculate_qc_metrics(adata, inplace=True)
    adata.obs.sort_values(by='n_genes_by_counts', ascending=False, inplace=True)
    ncells = ncells_dict[sample]
    adata = adata[0:ncells-1]
    adata.obs.to_csv(sample + '_adata.tsv', sep='\t', header=True)

... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853178.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853179.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853180.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853181.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853182.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853183.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853184.Solo.out-Gene-raw-matrix.h5ad


Merge into one nice dataframe:

In [35]:
df_merged = pd.DataFrame()
for file in glob.glob('SRR18531*_adata.tsv'):
    df = pd.read_csv(file, sep='\t', index_col=0)
    df_merged = df_merged.append(df)

In [37]:
df_merged

Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes
SRR1853181_AACATTGAATTC,8777,9.080004,48706.0,10.793578,9.902271,15.080278,22.331951,35.981193
SRR1853181_ATCGGCTACCGA,7812,8.963544,38197.0,10.550538,12.676388,17.899311,25.195696,38.772678
SRR1853181_TGCCACATGGGC,7213,8.883779,41429.0,10.631761,19.211181,25.499047,32.911728,45.767458
SRR1853181_TAGATATCTTAT,7067,8.863333,29729.0,10.299912,15.806115,20.922332,28.305695,41.703387
SRR1853181_CCTGGATTGTAC,6983,8.851377,29947.0,10.307218,13.784352,19.237319,26.687147,40.424750
...,...,...,...,...,...,...,...,...
SRR1853184_TAAGCGGACGTC,543,6.298949,839.0,6.733402,28.605483,41.477950,59.117998,94.874851
SRR1853184_ATGCGATCAGCC,542,6.297109,810.0,6.698268,31.728395,44.074074,57.777778,94.814815
SRR1853184_TCAGGCTTCCGA,542,6.297109,647.0,6.473891,22.565688,31.684699,47.140649,93.508501
SRR1853184_CGTATCGGACTT,542,6.297109,676.0,6.517671,25.443787,34.615385,49.408284,93.786982


In [38]:
df_merged.median()

n_genes_by_counts              671.000000
log1p_n_genes_by_counts          6.510258
total_counts                   952.000000
log1p_total_counts               6.859615
pct_counts_in_top_50_genes      23.401950
pct_counts_in_top_100_genes     34.064213
pct_counts_in_top_200_genes     49.536178
pct_counts_in_top_500_genes     81.986900
dtype: float64

I can now also write a drop-seq barcode list!

In [59]:
df_merged['barcode'] = [df_merged.index[x].split('_')[1] for x in range(len(df_merged.index))]

In [67]:
df_merged['barcode'].to_csv('drop-seq_macosko_whitelist.txt', index=False, header=None)

Using uniq -u $whitelist | wc -l, I found that this barcode list is unique. Remap using this whitelist now.

# indrop

For indrop, we take a slightly different approach. We mapped all the pooled data. We read the large unfiltered matrix. We then take the top 1 428 898 458 reads / 52738 = 27094 cells to match hydrop's read depth.

In [46]:
mtx_dir = 'demultiplexed_indrop_index/merged.Solo.out/Gene/raw/'

In [50]:
sample = 'merged'
adata = sc.read_10x_mtx(
    mtx_dir,  # the directory with the `.mtx` file
    var_names="gene_symbols",  # use gene symbols for the variable names (variables-axis index)
    cache=True,
    )  # write a cache file for faster subsequent reading
adata.obs_names = sample + '_' + adata.obs_names
adata.var_names_make_unique()
sc.pp.calculate_qc_metrics(adata, inplace=True)
adata.obs.sort_values(by='n_genes_by_counts', ascending=False, inplace=True)

... reading from cache file cache/demultiplexed_indrop_index-merged.Solo.out-Gene-raw-matrix.h5ad


In [51]:
adata = adata[0:27094-1]
adata.obs.to_csv('indrop' + sample + '_adata.tsv', sep='\t', header=True)

In [53]:
adata.obs.median()

n_genes_by_counts              1321.000000
log1p_n_genes_by_counts           7.186901
total_counts                   1920.000000
log1p_total_counts                7.560601
pct_counts_in_top_50_genes       17.385621
pct_counts_in_top_100_genes      25.818918
pct_counts_in_top_200_genes      37.254902
pct_counts_in_top_500_genes      58.153242
dtype: float64