In [1]:
%matplotlib inline

import os
import gzip
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import random
from scMVP.dataset import LoadData, SnareDemo, SciCarDemo, PairedDemo

[2020-09-23 22:01:14,502] INFO - scMVP._settings | Added StreamHandler with custom formatter to 'scMVP' logger.


## Load published joint profiling data
----
- You can download the SNARE-seq, sci-CAR and Paired-seq datasets from GEO and uncompress them into your data folder. <br>
- You can then load these datasets in their original format with the scMVP built-in demo loaders. <br>

In [2]:
# List the built-in SNARE-seq dataset choices: calling SnareDemo() with no
# arguments logs the dataset names that can be selected.
tell_me_input_choice = SnareDemo()

[2020-09-23 22:01:15,248] INFO - scMVP.dataset.scMVP_dataloader | Please select from "CellLineMixture", "AdBrainCortex" or "P0_BrainCortex" dataset.


In [3]:
# Load the SNARE-seq cell line mixture dataset into scMVP.
snare_cellline_demo = SnareDemo(
    dataset_name="CellLineMixture",
    data_path="dataset/snare_seq/",
)
# Bare last expression so the notebook renders the dataset summary.
snare_cellline_demo

[2020-09-23 22:01:15,294] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2020-09-23 22:01:27,940] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2020-09-23 22:01:27,991] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2020-09-23 22:01:27,993] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2020-09-23 22:01:28,658] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2020-09-23 22:01:28,678] INFO - scMVP.dataset.dataset | Downsampled from 1047 to 1047 cells


GeneExpressionDataset object with n_cells x nb_genes = 1047 x 18666
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'batch_indices', 'barcodes', 'local_vars', 'labels', 'local_means', 'atac_expression'
    cell_categorical_attribute_names: 'batch_indices', 'labels'
    cell_measurements_columns: {'atac_expression': 'atac_names'}

In [4]:
# Load the SNARE-seq mouse brain P0 dataset into scMVP.
snare_p0_demo = SnareDemo(
    dataset_name="P0_BrainCortex",
    data_path="dataset/snare_seq/",
)
# A bare expression in the middle of a cell is not rendered under IPython's
# default `ast_node_interactivity="last_expr"` setting, so show the summary
# explicitly. `display` is provided by the IPython/Jupyter kernel.
display(snare_p0_demo)

# Load the SNARE-seq adult mouse brain dataset into scMVP.
snare_ad_demo = SnareDemo(
    dataset_name="AdBrainCortex",
    data_path="dataset/snare_seq/",
)
# Last expression of the cell: rendered as the cell's output.
snare_ad_demo

[2020-09-23 22:01:28,699] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2020-09-23 22:02:03,537] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2020-09-23 22:02:03,810] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2020-09-23 22:02:03,811] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2020-09-23 22:02:06,268] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2020-09-23 22:02:06,360] INFO - scMVP.dataset.dataset | Downsampled from 5081 to 5081 cells


GeneExpressionDataset object with n_cells x nb_genes = 5081 x 19322
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'batch_indices', 'barcodes', 'local_vars', 'labels', 'local_means', 'atac_expression'
    cell_categorical_attribute_names: 'batch_indices', 'labels'
    cell_measurements_columns: {'atac_expression': 'atac_names'}

[2020-09-23 22:02:06,382] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2020-09-23 22:03:31,221] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2020-09-23 22:03:32,393] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2020-09-23 22:03:32,394] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2020-09-23 22:03:39,031] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2020-09-23 22:03:39,671] INFO - scMVP.dataset.dataset | Downsampled from 10309 to 10309 cells


GeneExpressionDataset object with n_cells x nb_genes = 10309 x 33160
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'batch_indices', 'barcodes', 'local_vars', 'labels', 'local_means', 'atac_expression'
    cell_categorical_attribute_names: 'batch_indices', 'labels'
    cell_measurements_columns: {'atac_expression': 'atac_names'}

In [5]:
# Load the sci-CAR cell line mixture dataset into scMVP.
scicar_demo = SciCarDemo(
    dataset_name="CellLineMixture",
    data_path="dataset/sci_car/",
)
# Bare last expression so the notebook renders the dataset summary.
scicar_demo

[2020-09-23 22:03:40,320] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2020-09-23 22:03:55,615] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2020-09-23 22:03:57,254] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2020-09-23 22:03:57,256] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2020-09-23 22:03:59,003] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2020-09-23 22:03:59,807] INFO - scMVP.dataset.dataset | Downsampled from 4825 to 4825 cells


GeneExpressionDataset object with n_cells x nb_genes = 4825 x 113153
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'batch_indices', 'barcodes', 'local_vars', 'labels', 'local_means', 'atac_expression'
    cell_categorical_attribute_names: 'batch_indices', 'labels'
    cell_measurements_columns: {'atac_expression': 'atac_names'}

In [6]:
# Load the demo Paired-seq cell line mixture dataset into scMVP.
# NOTE: the "paried_seq" directory name is kept exactly as it is on disk.
paired_demo = PairedDemo(
    dataset_name="CellLineMixture",
    data_path="dataset/paried_seq/Paired_seq_Cell_Mix_matrix/",
)
# Bare last expression so the notebook renders the dataset summary.
paired_demo

[2020-09-23 22:03:59,870] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2020-09-23 22:04:09,341] INFO - scMVP.dataset.scMVP_dataloader | Loading Label into dataset.
[2020-09-23 22:04:09,344] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2020-09-23 22:04:09,845] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2020-09-23 22:04:09,847] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2020-09-23 22:04:10,159] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2020-09-23 22:04:10,382] INFO - scMVP.dataset.dataset | Downsampled from 3359 to 2986 cells


GeneExpressionDataset object with n_cells x nb_genes = 2986 x 50695
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'batch_indices', 'barcodes', 'local_vars', 'labels', 'local_means', 'atac_expression', 'Label_label'
    cell_categorical_attribute_names: 'batch_indices', 'labels'
    cell_measurements_columns: {'atac_expression': 'atac_names', 'Label_label': 'Label'}

In [7]:
# Manually load the Paired-seq dataset with custom thresholds and a cell
# embedding file instead of using the built-in PairedDemo loader.

# Input files for the RNA and ATAC modalities
# (presumably resolved relative to `data_path` below -- confirm in LoadData).
my_demo_data_set = {
    "gene_names": "Cell_Mix_RNA/genes.tsv",
    "gene_expression": "Cell_Mix_RNA/matrix.mtx",
    "gene_barcodes": "Cell_Mix_RNA/barcodes.tsv",
    "atac_names": "Cell_Mix_DNA/genes.tsv",
    "atac_expression": "Cell_Mix_DNA/matrix.mtx",
    "atac_barcodes": "Cell_Mix_DNA/barcodes.tsv",
}

# Tab-separated cell embedding table; only its first two columns are handed
# to the loader as cell metadata.
cell_embeds = pd.read_csv(
    "dataset/paried_seq/Paired_seq_Cell_Mix_matrix/Cell_embeddings.xls",
    sep="\t",
)
cell_embeds_avail = cell_embeds.iloc[:, :2]

# NOTE(review): the exact meaning of `atac_threshold` and `cell_threshold`
# is defined by scMVP's LoadData -- confirm there before tuning them.
manual_demo = LoadData(
    dataset=my_demo_data_set,
    data_path="dataset/paried_seq/Paired_seq_Cell_Mix_matrix/",
    dense=False,
    gzipped=False,
    atac_threshold=0.0005,
    cell_threshold=1,
    cell_meta=cell_embeds_avail,
)

[2020-09-23 22:04:10,399] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2020-09-23 22:04:23,384] INFO - scMVP.dataset.scMVP_dataloader | Loading Ident into dataset.
[2020-09-23 22:04:23,387] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2020-09-23 22:04:23,973] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2020-09-23 22:04:23,974] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2020-09-23 22:04:45,621] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2020-09-23 22:04:45,946] INFO - scMVP.dataset.dataset | Downsampled from 3359 to 3259 cells
