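Download the filtered gene-barcode matrices and the metrics summaries of these two 10x Genomics datasets into the `10x/` directory, and unpack the matrix archives there:

- https://support.10xgenomics.com/single-cell-gene-expression/datasets/2.1.0/nuclei_2k
- https://support.10xgenomics.com/single-cell-gene-expression/datasets/2.1.0/nuclei_900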

In [1]:
# List the contents of the 10x/ download directory to check that both datasets are present.
!ls 10x

nuclei_2k_filtered_gene_bc_matrices
nuclei_2k_filtered_gene_bc_matrices.tar.gz
nuclei_2k_metrics_summary.csv
nuclei_900_filtered_gene_bc_matrices
nuclei_900_filtered_gene_bc_matrices.tar.gz
nuclei_900_metrics_summary.csv


In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import os

In [3]:
# Configure scanpy logging and figure defaults for the rest of the notebook.
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of scanpy and its key dependencies so the environment is recorded in the output.
sc.logging.print_header()
# Figure defaults: 150 dpi, white figure background, vector_friendly turned off.
sc.settings.set_figure_params(dpi=150, facecolor="white", vector_friendly=False)

scanpy==1.8.1 anndata==0.7.5 umap==0.5.1 numpy==1.20.2 scipy==1.6.2 pandas==1.3.3 scikit-learn==0.24.1 statsmodels==0.12.2 python-igraph==0.9.0 leidenalg==0.8.3 pynndescent==0.5.2


To reproduce the results below exactly, make sure that your installed package versions match the ones listed here. If you only want to analyse your own dataset, later versions will most likely work as well.

In [4]:
pip list

Package                            Version
---------------------------------- -----------------
adjustText                         0.7.3
aiohttp                            3.7.4.post0
aiohttp-cors                       0.7.0
aioredis                           1.3.1
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-navigator                 1.9.7
anaconda-project                   0.8.3
anndata                            0.7.5
annoy                              1.17.0
ansiwrap                           0.8.4
anyio                              2.2.0
appdirs                            1.4.4
arboreto                           0.1.6
argcomplete                        1.12.2
argh                               0.26.2
argon2-cffi                        20.1.0
asciitree                          0.3.3
asn1crypto                         1.4.0
astroid                            2.5
astropy                            3.2.1
async-timeout                     

# 10x Genomics nuclei datasets

In [5]:
import glob

In [8]:
# Find the mm10 matrix directory of every sample under 10x/ and put the
# paths in sorted order so the samples are always processed in the same sequence.
mtx_dirs = glob.glob('10x/*/mm10/')
mtx_dirs.sort()
mtx_dirs

['10x/nuclei_2k_filtered_gene_bc_matrices/mm10/',
 '10x/nuclei_900_filtered_gene_bc_matrices/mm10/']

In [11]:
# Export the per-barcode QC metrics of every 10x sample to tables/10x_<sample>.tsv.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before the first write.
os.makedirs('tables', exist_ok=True)

for mtx_dir in mtx_dirs:
    # '10x/<sample>/mm10/' -> '<sample>': the name of the directory that contains mm10/.
    # os.path is used instead of string replacement so the result does not depend
    # on the path separator or on '10x/' appearing elsewhere in the path.
    sample = os.path.basename(os.path.dirname(os.path.normpath(mtx_dir)))

    adata = sc.read_10x_mtx(
        mtx_dir,  # the directory with the `.mtx` file
        var_names="gene_symbols",  # use gene symbols for the variable names (variables-axis index)
        cache=True,  # write a cache file for faster subsequent reading
    )
    # Prefix each barcode with the sample name so barcodes stay distinguishable
    # once the tables of several samples are combined.
    adata.obs_names = sample + '_' + adata.obs_names
    adata.var_names_make_unique()
    sc.pp.calculate_qc_metrics(adata, inplace=True)

    # Sort a copy of the metrics table instead of sorting adata.obs in place:
    # an in-place sort reorders the obs rows but not the rows of the count
    # matrix, which would leave `adata` with mismatched barcodes and counts.
    qc_table = adata.obs.sort_values(by='n_genes_by_counts', ascending=False)

    filename = 'tables/10x_' + sample + '.tsv'
    # An existing table is never overwritten; delete the file to regenerate it.
    if not os.path.exists(filename):
        qc_table.to_csv(filename, sep='\t', header=True)

... reading from cache file cache/10x-nuclei_2k_filtered_gene_bc_matrices-mm10-matrix.h5ad
... writing an h5ad cache file to speedup reading next time


In [22]:
# Load the exported QC table of the nuclei_2k sample.
# The first column of the TSV (the sample-prefixed barcode) becomes the index.
file = 'tables/10x_nuclei_2k_filtered_gene_bc_matrices.tsv'
df = pd.read_csv(file, sep='\t', index_col=0)

In [23]:
# Display the QC table currently held in `df` (one row per barcode).
df

Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes
nuclei_2k_filtered_gene_bc_matrices_TCTATTGCATGCCACG-1,7635,8.940629,25324.0,10.139547,17.370873,21.213079,26.658506,37.395356
nuclei_2k_filtered_gene_bc_matrices_TACGGTACAGCTGTTA-1,4255,8.356085,8921.0,9.096275,18.731084,23.136420,29.402533,42.484026
nuclei_2k_filtered_gene_bc_matrices_AAACGGGTCGGAAACG-1,3221,8.077758,5680.0,8.644882,20.809859,25.228873,31.373239,44.559859
nuclei_2k_filtered_gene_bc_matrices_ATAACGCTCCAAACTG-1,2735,7.914252,6916.0,8.841738,46.515327,49.768652,54.525737,64.213418
nuclei_2k_filtered_gene_bc_matrices_GGAAAGCTCTAACTGG-1,2654,7.884200,5382.0,8.591002,29.059829,34.039391,41.062802,54.589372
...,...,...,...,...,...,...,...,...
nuclei_2k_filtered_gene_bc_matrices_CCTTCCCAGACAATAC-1,43,3.784190,407.0,6.011267,100.000000,100.000000,100.000000,100.000000
nuclei_2k_filtered_gene_bc_matrices_CTTGGCTGTCGGATCC-1,42,3.761200,397.0,5.986452,100.000000,100.000000,100.000000,100.000000
nuclei_2k_filtered_gene_bc_matrices_TTCTACATCCCTAACC-1,41,3.737670,402.0,5.998937,100.000000,100.000000,100.000000,100.000000
nuclei_2k_filtered_gene_bc_matrices_TCAGGATGTGCTAGCC-1,40,3.713572,531.0,6.276643,100.000000,100.000000,100.000000,100.000000


In [24]:
# Sum of the `total_counts` column over all barcodes in the table currently held in `df`.
df['total_counts'].sum()

3014623.0

In [20]:
# Load the exported QC table of the nuclei_900 sample (this rebinds `file` and `df`)
# and sum its `total_counts` column over all barcodes.
# The first column of the TSV (the sample-prefixed barcode) becomes the index.
file = 'tables/10x_nuclei_900_filtered_gene_bc_matrices.tsv'
df = pd.read_csv(file, sep='\t', index_col=0)
df['total_counts'].sum()

10203353.0

In [21]:
# Display the QC table currently held in `df` (one row per barcode).
df

Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes
nuclei_900_filtered_gene_bc_matrices_GGTGCGTAGCTAAACA-1,7109,8.869258,55824.0,10.929977,23.876827,35.144382,46.449556,60.117512
nuclei_900_filtered_gene_bc_matrices_CGCTATCCAACACCTA-1,6887,8.837536,50523.0,10.830204,24.711518,36.872316,47.550621,60.483344
nuclei_900_filtered_gene_bc_matrices_TTGACTTGTGTGGCTC-1,6725,8.813736,42758.0,10.663335,24.825764,36.042378,46.723420,59.478928
nuclei_900_filtered_gene_bc_matrices_CAAGTTGAGGTCGGAT-1,6621,8.798153,46334.0,10.743653,22.875210,33.126861,45.012302,59.258860
nuclei_900_filtered_gene_bc_matrices_TGCCCATCACATCTTT-1,6567,8.789965,45220.0,10.719316,24.862893,37.450243,48.268465,61.393189
...,...,...,...,...,...,...,...,...
nuclei_900_filtered_gene_bc_matrices_TGCCCATAGACGCACA-1,130,4.875197,5722.0,8.652248,98.601887,99.475708,100.000000,100.000000
nuclei_900_filtered_gene_bc_matrices_TTGAACGAGTACGATA-1,126,4.844187,5465.0,8.606302,98.609332,99.524245,100.000000,100.000000
nuclei_900_filtered_gene_bc_matrices_AAACGGGAGTAGGTGC-1,113,4.736198,5123.0,8.541691,98.770252,99.746242,100.000000,100.000000
nuclei_900_filtered_gene_bc_matrices_TACGGATCACGGTAAG-1,105,4.663439,4167.0,8.335192,98.680106,99.880010,100.000000,100.000000


Merge into one nice dataframe:

In [16]:
# Stack every exported per-sample QC table into one DataFrame.
# The files are visited in sorted order because glob.glob returns matches in
# an unspecified order, which would make the row order of the merged table
# differ between machines.
frames = []
for file in sorted(glob.glob('tables/10x_*.tsv')):
    print(file)
    df = pd.read_csv(file, sep='\t', index_col=0)
    frames.append(df)

# Concatenate once at the end: DataFrame.append copied the whole accumulated
# frame on every iteration and no longer exists in pandas >= 2.0.
# pd.concat raises on an empty list, so fall back to an empty frame when no
# table was found (the same result the loop produced before).
df_merged = pd.concat(frames) if frames else pd.DataFrame()

tables/10x_nuclei_2k_filtered_gene_bc_matrices.tsv
tables/10x_nuclei_900_filtered_gene_bc_matrices.tsv


In [15]:
# Display the merged QC table that holds the rows of all per-sample tables.
df_merged

Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes
nuclei_2k_filtered_gene_bc_matrices_TCTATTGCATGCCACG-1,7635,8.940629,25324.0,10.139547,17.370873,21.213079,26.658506,37.395356
nuclei_2k_filtered_gene_bc_matrices_TACGGTACAGCTGTTA-1,4255,8.356085,8921.0,9.096275,18.731084,23.136420,29.402533,42.484026
nuclei_2k_filtered_gene_bc_matrices_AAACGGGTCGGAAACG-1,3221,8.077758,5680.0,8.644882,20.809859,25.228873,31.373239,44.559859
nuclei_2k_filtered_gene_bc_matrices_ATAACGCTCCAAACTG-1,2735,7.914252,6916.0,8.841738,46.515327,49.768652,54.525737,64.213418
nuclei_2k_filtered_gene_bc_matrices_GGAAAGCTCTAACTGG-1,2654,7.884200,5382.0,8.591002,29.059829,34.039391,41.062802,54.589372
...,...,...,...,...,...,...,...,...
nuclei_900_filtered_gene_bc_matrices_TGCCCATAGACGCACA-1,130,4.875197,5722.0,8.652248,98.601887,99.475708,100.000000,100.000000
nuclei_900_filtered_gene_bc_matrices_TTGAACGAGTACGATA-1,126,4.844187,5465.0,8.606302,98.609332,99.524245,100.000000,100.000000
nuclei_900_filtered_gene_bc_matrices_AAACGGGAGTAGGTGC-1,113,4.736198,5123.0,8.541691,98.770252,99.746242,100.000000,100.000000
nuclei_900_filtered_gene_bc_matrices_TACGGATCACGGTAAG-1,105,4.663439,4167.0,8.335192,98.680106,99.880010,100.000000,100.000000
