Most of the steps below have been copied from the scanpy tutorial: https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html

In [3]:
import numpy as np
import pandas as pd
import scanpy as sc

In [4]:
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=150, facecolor="white", vector_friendly=False)

scanpy==1.8.1 anndata==0.7.5 umap==0.5.1 numpy==1.20.2 scipy==1.6.2 pandas==1.3.3 scikit-learn==0.24.1 statsmodels==0.12.2 python-igraph==0.9.0 leidenalg==0.8.3 pynndescent==0.5.2


Make sure that the packages that you have match the ones below IF you want to verify our results and obtain exactly the same results as described below. If you want to simply analyse your own dataset, then most likely later versions will also be fine.

In [5]:
pip list

Package                            Version
---------------------------------- -----------------
adjustText                         0.7.3
aiohttp                            3.7.4.post0
aiohttp-cors                       0.7.0
aioredis                           1.3.1
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-navigator                 1.9.7
anaconda-project                   0.8.3
anndata                            0.7.5
annoy                              1.17.0
ansiwrap                           0.8.4
anyio                              2.2.0
appdirs                            1.4.4
arboreto                           0.1.6
argcomplete                        1.12.2
argh                               0.26.2
argon2-cffi                        20.1.0
asciitree                          0.3.3
asn1crypto                         1.4.0
astroid                            2.5
astropy                            3.2.1
async-timeout                     

# drop-seq

In [6]:
import glob

In [7]:
mtx_dirs = sorted(glob.glob('demultiplexed_dropseq_sub/*Solo.out/Gene/raw/'))
mtx_dirs

['demultiplexed_dropseq_sub/SRR1853178.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub/SRR1853179.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub/SRR1853180.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub/SRR1853181.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub/SRR1853182.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub/SRR1853183.Solo.out/Gene/raw/',
 'demultiplexed_dropseq_sub/SRR1853184.Solo.out/Gene/raw/']

In [8]:
ncells_dict = {
    'SRR1853178': 6600,
    'SRR1853179': 9000,
    'SRR1853180': 6120,
    'SRR1853181': 7650,
    'SRR1853182': 7650,
    'SRR1853183': 8280,
    'SRR1853184': 4000,
}


In [9]:
for mtx_dir in mtx_dirs:
    sample=mtx_dir.replace('demultiplexed_dropseq_sub/', '')
    sample=sample.replace('.Solo.out/Gene/raw/', '')
    adata = sc.read_10x_mtx(
        mtx_dir,  # the directory with the `.mtx` file
        var_names="gene_symbols",  # use gene symbols for the variable names (variables-axis index)
        cache=True,
        )  # write a cache file for faster subsequent reading
    adata.obs_names = sample + '_' + adata.obs_names
    adata.var_names_make_unique()
    sc.pp.calculate_qc_metrics(adata, inplace=True)
    adata.obs.sort_values(by='n_genes_by_counts', ascending=False, inplace=True)
    ncells = ncells_dict[sample]
    adata = adata[0:ncells-1]
    adata.obs.to_csv(sample + '_adata.tsv', sep='\t', header=True)

... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853178.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853179.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853180.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853181.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853182.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853183.Solo.out-Gene-raw-matrix.h5ad
... reading from cache file cache/demultiplexed_dropseq_sub-SRR1853184.Solo.out-Gene-raw-matrix.h5ad


Merge into one nice dataframe:

In [10]:
df_merged = pd.DataFrame()
for file in glob.glob('tables/dropseq_SRR18531*_adata.tsv'):
    df = pd.read_csv(file, sep='\t', index_col=0)
    df_merged = df_merged.append(df)

In [11]:
df_merged

Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes
SRR1853184_CTTTACTTGGTC,7713,8.950792,38554.0,10.559841,14.229393,19.248327,26.280023,39.741661
SRR1853184_CCGCTCTAAAAG,7506,8.923591,45529.0,10.726127,21.280942,27.332030,34.402249,46.945903
SRR1853184_CGGCACAGTGGA,7316,8.897956,33693.0,10.425076,14.982341,20.223785,27.658564,41.041166
SRR1853184_GATGCCGGAATG,6887,8.837536,23715.0,10.073905,10.221379,15.306768,22.230656,35.686275
SRR1853184_AGGGCTCGCGAC,6351,8.756525,29061.0,10.277186,20.395031,26.520078,34.011218,47.338357
...,...,...,...,...,...,...,...,...
SRR1853181_GTCCGCGTCTCT,383,5.950643,547.0,6.306275,33.638026,48.263254,66.544790,100.000000
SRR1853181_GGAAAACGACTG,383,5.950643,574.0,6.354370,33.972125,50.696864,68.118467,100.000000
SRR1853181_GCCCAGAAATCT,383,5.950643,454.0,6.120297,26.651982,37.665198,59.691630,100.000000
SRR1853181_TGGTGGGATCAC,383,5.950643,520.0,6.255750,31.730769,45.576923,64.807692,100.000000


In [12]:
df_merged.median()

n_genes_by_counts              671.000000
log1p_n_genes_by_counts          6.510258
total_counts                   952.000000
log1p_total_counts               6.859615
pct_counts_in_top_50_genes      23.401950
pct_counts_in_top_100_genes     34.064213
pct_counts_in_top_200_genes     49.536178
pct_counts_in_top_500_genes     81.986900
dtype: float64

I can now also write a drop-seq barcode list!

In [13]:
df_merged['barcode'] = [df_merged.index[x].split('_')[1] for x in range(len(df_merged.index))]

In [14]:
df_merged['barcode'].to_csv('drop-seq_macosko_whitelist.txt', index=False, header=None)

Using uniq -u $whitelist | wc -l, I found that this barcode list is unique. Remap using this whitelist now.

# indrop

For indrop, we take a slightly different approach. We mapped all the pooled data. We read the large unfiltered matrix. We then take the top 1 428 898 458 reads / 52738 = 27094 cells to match hydrop's read depth.

In [15]:
mtx_dir = 'demultiplexed_indrop_index/merged.Solo.out/Gene/raw/'

In [18]:
!gzip demultiplexed_indrop_index/merged.Solo.out/Gene/raw/*

In [34]:
sample = 'merged'
adata = sc.read_10x_mtx(
    mtx_dir,  # the directory with the `.mtx` file
    var_names="gene_symbols",  # use gene symbols for the variable names (variables-axis index)
    cache=True,
    )  # write a cache file for faster subsequent reading
adata.obs_names = sample + '_' + adata.obs_names
adata.var_names_make_unique()
sc.pp.calculate_qc_metrics(adata, inplace=True)
adata.obs.sort_values(by='n_genes_by_counts', ascending=False, inplace=True)

... reading from cache file cache/demultiplexed_indrop_index-merged.Solo.out-Gene-raw-matrix.h5ad


In [35]:
adata[0:31293-1].obs

Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes
merged_CTCCTCCA_ATTAGACG_TGTTCCAG,8136,9.004177,41203.0,10.626290,8.564910,14.045094,21.556683,35.363930
merged_GGCGACAA_TCTTACGC_GCGCATTC,8129,9.003316,38021.0,10.545920,8.784619,14.284211,21.929986,36.009048
merged_CAAGGTAC_ATTAGACG_ATCCGCTA,7448,8.915835,27611.0,10.226006,9.836659,14.975915,21.889827,35.478614
merged_AAGGGACC_ATAGCCTT_CCATTTGA,6867,8.834628,25914.0,10.162578,10.963186,16.975380,24.507988,38.666358
merged_GGCCCAAT_ACTCTAGG_TTGTGACT,6837,8.830251,24172.0,10.092992,11.662254,17.160351,24.652490,38.528049
...,...,...,...,...,...,...,...,...
merged_GGTTGAGA_CGGAGAGA_TCGGTTTA,418,6.037871,476.0,6.167517,22.689076,33.193277,54.201681,100.000000
merged_TAGTCTCT_TCTTACGC_ACAGCGGA,418,6.037871,491.0,6.198479,25.050916,35.234216,55.600815,100.000000
merged_AAGAAGGT_ATTAGACG_TAACCCGT,418,6.037871,479.0,6.173786,23.173278,33.611691,54.488518,100.000000
merged_CGGTGAGT_ATAGCCTT_AGTGAAAG,418,6.037871,479.0,6.173786,23.173278,33.611691,54.488518,100.000000


65945800 UMIs/1428898458 reads = 4.46%

In [37]:
adata[27094:31293-1].obs

Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes
merged_AAGGATGA_CTAGTCGA_CCGCAACT,489,6.194405,573.0,6.352629,20.593368,32.111693,49.563700,100.0
merged_TGGCTAGT_CGGAGAGA_AAAGTCGG,489,6.194405,592.0,6.385194,22.804054,34.290541,51.182432,100.0
merged_CAATTCTC_TAAGGCTC_AAACCTCC,489,6.194405,591.0,6.383507,23.350254,34.179357,51.099831,100.0
merged_ACATCTCG_ACTCTAGG_TACAAACT,489,6.194405,579.0,6.363028,20.725389,32.815199,50.086356,100.0
merged_CAGTTTAA_ATTAGACG_TCTAGCAG,489,6.194405,570.0,6.347389,21.929825,31.754386,49.298246,100.0
...,...,...,...,...,...,...,...,...
merged_TAGTCTCT_TCTTACGC_ACAGCGGA,418,6.037871,491.0,6.198479,25.050916,35.234216,55.600815,100.0
merged_AAGAAGGT_ATTAGACG_TAACCCGT,418,6.037871,479.0,6.173786,23.173278,33.611691,54.488518,100.0
merged_CGGTGAGT_ATAGCCTT_AGTGAAAG,418,6.037871,479.0,6.173786,23.173278,33.611691,54.488518,100.0
merged_GGACTTCT_TAAGGCTC_TAGCCTCG,418,6.037871,484.0,6.184149,23.966942,34.297521,54.958678,100.0


63744212 UMIs/1428898458 reads = 4.46%

In [21]:
adata = adata[0:27094-1]
adata.obs.to_csv('indrop' + sample + '_adata.tsv', sep='\t', header=True)

In [22]:
adata.obs.median()

n_genes_by_counts              1321.000000
log1p_n_genes_by_counts           7.186901
total_counts                   1920.000000
log1p_total_counts                7.560601
pct_counts_in_top_50_genes       17.385621
pct_counts_in_top_100_genes      25.818918
pct_counts_in_top_200_genes      37.254902
pct_counts_in_top_500_genes      58.153242
dtype: float64