In [1]:
%matplotlib inline

import os
import gzip
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import random
from scMVP.dataset import LoadData, SnareDemo, SciCarDemo, PairedDemo

[2020-09-21 09:15:10,916] INFO - scMVP._settings | Added StreamHandler with custom formatter to 'scMVP' logger.


## Load published joint profiling data
----
- You can download the SNARE-seq, sci-CAR and Paired-seq datasets from GEO and uncompress them into a local folder. <br>
- You can load these datasets in their original format with the scMVP built-in demo loaders. <br>

In [2]:
# List the dataset names accepted by the SNARE-seq demo loader: when it is
# instantiated without arguments, the loader logs the available choices.
tell_me_input_choice = SnareDemo()

[2020-09-21 09:15:11,647] INFO - scMVP.dataset.scMVP_dataloader | Please select from "CellLineMixture", "AdBrainCortex" or "P0_BrainCortex" dataset.


In [3]:
# Load the SNARE-seq cell-line mixture dataset into scMVP, then echo the
# resulting dataset object as the cell output.
snare_cellline_demo = SnareDemo(
    dataset_name="CellLineMixture",
    data_path="dataset/snare_seq/",
)
snare_cellline_demo

[2020-09-21 09:15:11,700] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2020-09-21 09:15:24,996] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2020-09-21 09:15:25,046] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2020-09-21 09:15:25,048] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2020-09-21 09:15:25,711] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2020-09-21 09:15:25,731] INFO - scMVP.dataset.dataset | Downsampled from 1047 to 1047 cells


GeneExpressionDataset object with n_cells x nb_genes = 1047 x 18666
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'atac_expression', 'barcodes', 'local_means', 'local_vars', 'labels', 'batch_indices'
    cell_categorical_attribute_names: 'labels', 'batch_indices'
    cell_measurements_columns: {'atac_expression': 'atac_names'}

In [4]:
# Load the SNARE-seq mouse brain P0 dataset into scMVP.
snare_p0_demo = SnareDemo(dataset_name="P0_BrainCortex", data_path="dataset/snare_seq/")
# Use display() explicitly: with IPython's default settings only the LAST
# expression of a cell is echoed, so a bare `snare_p0_demo` in the middle of
# the cell would show nothing.
display(snare_p0_demo)

# Load the SNARE-seq adult mouse brain dataset into scMVP.
snare_ad_demo = SnareDemo(dataset_name="AdBrainCortex", data_path="dataset/snare_seq/")
display(snare_ad_demo)

[2020-09-21 09:15:25,751] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2020-09-21 09:16:04,429] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2020-09-21 09:16:04,698] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2020-09-21 09:16:04,700] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2020-09-21 09:16:07,151] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2020-09-21 09:16:07,245] INFO - scMVP.dataset.dataset | Downsampled from 5081 to 5081 cells


GeneExpressionDataset object with n_cells x nb_genes = 5081 x 19322
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'atac_expression', 'barcodes', 'local_means', 'local_vars', 'labels', 'batch_indices'
    cell_categorical_attribute_names: 'labels', 'batch_indices'
    cell_measurements_columns: {'atac_expression': 'atac_names'}

[2020-09-21 09:16:07,267] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2020-09-21 09:17:38,938] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2020-09-21 09:17:39,951] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2020-09-21 09:17:39,952] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2020-09-21 09:17:45,953] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2020-09-21 09:17:46,493] INFO - scMVP.dataset.dataset | Downsampled from 10309 to 10309 cells


GeneExpressionDataset object with n_cells x nb_genes = 10309 x 33160
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'atac_expression', 'barcodes', 'local_means', 'local_vars', 'labels', 'batch_indices'
    cell_categorical_attribute_names: 'labels', 'batch_indices'
    cell_measurements_columns: {'atac_expression': 'atac_names'}

In [5]:
# Load the sci-CAR cell-line mixture dataset into scMVP.
scicar_demo = SciCarDemo(dataset_name="CellLineMixture", data_path="dataset/sci_car/")
# Use display() explicitly: with IPython's default settings only the LAST
# expression of a cell is echoed, so a bare `scicar_demo` in the middle of
# the cell would show nothing.
display(scicar_demo)

# Load the Paired-seq cell-line mixture dataset into scMVP.
# NOTE(review): "paried_seq" looks like a misspelling of "paired_seq"; it is
# kept unchanged because it has to match the folder name on disk -- confirm.
paired_demo = PairedDemo(dataset_name="CellLineMixture", data_path="dataset/paried_seq/Paired_seq_Cell_Mix_matrix/")
display(paired_demo)

[2020-09-21 09:17:47,147] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2020-09-21 09:18:02,326] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2020-09-21 09:18:03,956] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2020-09-21 09:18:03,958] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2020-09-21 09:18:05,690] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2020-09-21 09:18:06,489] INFO - scMVP.dataset.dataset | Downsampled from 4825 to 4825 cells


GeneExpressionDataset object with n_cells x nb_genes = 4825 x 113153
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'atac_expression', 'barcodes', 'local_means', 'local_vars', 'labels', 'batch_indices'
    cell_categorical_attribute_names: 'labels', 'batch_indices'
    cell_measurements_columns: {'atac_expression': 'atac_names'}

[2020-09-21 09:18:06,591] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2020-09-21 09:18:16,598] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2020-09-21 09:18:17,101] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2020-09-21 09:18:17,102] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2020-09-21 09:18:17,416] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2020-09-21 09:18:17,640] INFO - scMVP.dataset.dataset | Downsampled from 3359 to 2986 cells


GeneExpressionDataset object with n_cells x nb_genes = 2986 x 50695
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'atac_expression', 'barcodes', 'local_means', 'local_vars', 'labels', 'batch_indices'
    cell_categorical_attribute_names: 'labels', 'batch_indices'
    cell_measurements_columns: {'atac_expression': 'atac_names'}

In [6]:
# Manually load a large joint profiling dataset from its raw matrix files.
# The dict names the RNA and ATAC input files (feature names, count matrix,
# cell barcodes); the paths are presumably resolved relative to `data_path`
# in the LoadData call below -- TODO confirm against LoadData.
my_demo_data_set = {
    "gene_names": "Adult_CTX_RNA/genes.tsv",
    "gene_expression": "Adult_CTX_RNA/matrix.mtx",
    "gene_barcodes": "Adult_CTX_RNA/barcodes.tsv",
    "atac_names": "Adult_CTX_DNA/genes.tsv",
    "atac_expression": "Adult_CTX_DNA/matrix.mtx",
    "atac_barcodes": "Adult_CTX_DNA/barcodes.tsv",
}

# NOTE(review): `atac_threshold` and `cell_threshold` look like filtering
# cut-offs; their exact semantics are defined by LoadData -- confirm there.
# NOTE(review): the folder names "paried_seq" / "Adult_Cerebrail_Cortex" look
# misspelled but must match the directories on disk, so they are left as is.
manual_demo = LoadData(
    dataset=my_demo_data_set,
    data_path="dataset/paried_seq/Adult_Cerebrail_Cortex/",
    dense=False,
    gzipped=False,
    atac_threshold=0.005,
    cell_threshold=100,
)

[2020-09-21 09:18:17,651] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2020-09-21 09:18:59,822] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2020-09-21 09:19:01,118] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2020-09-21 09:19:01,120] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2020-09-21 09:19:01,958] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2020-09-21 09:19:02,623] INFO - scMVP.dataset.dataset | Downsampled from 15191 to 15191 cells
