In [1]:
import os
import sys 
import json
import pickle

import torch
import anndata
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
def save_pickle(data, file_name):
    f = open(file_name, "wb")
    pickle.dump(data, f)
    f.close()

def load_pickle(file_name):
    f = open(file_name, "rb+")
    data = pickle.load(f)
    f.close()
    return data



In [3]:
import collections
import scipy.sparse as sp_sparse
import tables
 
CountMatrix = collections.namedtuple('CountMatrix', ['feature_ref', 'barcodes', 'matrix'])
 
def get_matrix_from_h5(filename):
    with tables.open_file(filename, 'r') as f:
        mat_group = f.get_node(f.root, 'matrix')
        barcodes = f.get_node(mat_group, 'barcodes').read()
        data = getattr(mat_group, 'data').read()
        indices = getattr(mat_group, 'indices').read()
        indptr = getattr(mat_group, 'indptr').read()
        shape = getattr(mat_group, 'shape').read()
        matrix = sp_sparse.csc_matrix((data, indices, indptr), shape=shape)
         
        feature_ref = {}
        feature_group = f.get_node(mat_group, 'features')
        feature_ids = getattr(feature_group, 'id').read()
        feature_names = getattr(feature_group, 'name').read()
        feature_types = getattr(feature_group, 'feature_type').read()
        feature_ref['id'] = feature_ids
        feature_ref['name'] = feature_names
        feature_ref['feature_type'] = feature_types
        tag_keys = getattr(feature_group, '_all_tag_keys').read()
        for key in tag_keys:
            key = key.decode("utf-8")
            feature_ref[key] = getattr(feature_group, key).read()
            
        return CountMatrix(feature_ref, barcodes, matrix)

In [4]:
def concatenate_csc_matrices_by_columns(matrix1, matrix2):
    new_data = np.concatenate((matrix1.data, matrix2.data))
    new_indices = np.concatenate((matrix1.indices, matrix2.indices))
    new_ind_ptr = matrix2.indptr + len(matrix1.data)
    new_ind_ptr = new_ind_ptr[1:]
    new_ind_ptr = np.concatenate((matrix1.indptr, new_ind_ptr))

    return scipy.sparse.csc_matrix((new_data, new_indices, new_ind_ptr))

# Data Manipulation: convert h5ad to mtx

## RNA + ATAC

In [8]:
RNA_PATH = "/home/wsg/BM/data/SHARE/GB/shareseq_mouse_skin_rna.h5ad"
ATAC_PATH = "/home/wsg/BM/data/SHARE/GB/shareseq_mouse_skin_atac.h5ad"

In [9]:
import scanpy as sc
from scipy import io

rna = sc.read_h5ad(RNA_PATH)
atac = sc.read_h5ad(ATAC_PATH)

In [10]:
print(rna)
print(atac)

AnnData object with n_obs × n_vars = 42948 × 23296
    obs: 'rna.bc'
    var: 'gene'
AnnData object with n_obs × n_vars = 34774 × 344592
    obs: 'rna.bc'


In [21]:
rna.obs
# adata.obs['bc'] = 
rna.obs['rna.bc'].str.replace(',', '.')

0        R1.01.R2.01.R3.06.P1.55
1        R1.01.R2.01.R3.36.P1.53
2        R1.01.R2.01.R3.42.P1.55
3        R1.01.R2.01.R3.43.P1.56
4        R1.01.R2.01.R3.64.P1.53
                  ...           
42943    R1.96.R2.96.R3.01.P1.55
42944    R1.96.R2.96.R3.05.P1.55
42945    R1.96.R2.96.R3.12.P1.55
42946    R1.96.R2.96.R3.23.P1.53
42947    R1.96.R2.96.R3.88.P1.53
Name: rna.bc, Length: 42948, dtype: object

In [12]:
atac.obs

Unnamed: 0_level_0,rna.bc
rna.bc,Unnamed: 1_level_1
R1.01.R2.01.R3.06.P1.55,R1.01.R2.01.R3.06.P1.55
R1.01.R2.03.R3.68.P1.55,R1.01.R2.03.R3.68.P1.55
R1.01.R2.05.R3.15.P1.53,R1.01.R2.05.R3.15.P1.53
R1.01.R2.05.R3.40.P1.55,R1.01.R2.05.R3.40.P1.55
R1.01.R2.05.R3.49.P1.55,R1.01.R2.05.R3.49.P1.55
...,...
R1.92.R2.79.R3.05.P1.56,R1.92.R2.79.R3.05.P1.56
R1.93.R2.20.R3.18.P1.53,R1.93.R2.20.R3.18.P1.53
R1.93.R2.80.R3.62.P1.55,R1.93.R2.80.R3.62.P1.55
R1.93.R2.91.R3.82.P1.56,R1.93.R2.91.R3.82.P1.56


In [18]:
rna.obs['rna.bc'].values.isin(atac.obs.index)

AttributeError: 'numpy.ndarray' object has no attribute 'isin'

In [24]:
atac.obs.index.isin(rna.obs['rna.bc'].str.replace(',', '.'))

array([ True,  True,  True, ...,  True,  True,  True])

In [76]:
test = sc.read_h5ad("/home/wsg/BM/data/SHARE/GB/mouse_skin_shareseq_atac_10k.h5ad")

In [77]:
test.var

Unnamed: 0,features,feature
chrX-143482906-143483206,chrX-143482906-143483206,chrX-143482906-143483206
chr6-3200976-3201276,chr6-3200976-3201276,chr6-3200976-3201276
chr9-123461850-123462150,chr9-123461850-123462150,chr9-123461850-123462150
chr1-56782095-56782395,chr1-56782095-56782395,chr1-56782095-56782395
chr9-56223668-56223968,chr9-56223668-56223968,chr9-56223668-56223968
...,...,...
chr10-107887768-107888068,chr10-107887768-107888068,chr10-107887768-107888068
chr10-107038880-107039180,chr10-107038880-107039180,chr10-107038880-107039180
chr10-10549625-10549925,chr10-10549625-10549925,chr10-10549625-10549925
chr10-105270865-105271165,chr10-105270865-105271165,chr10-105270865-105271165


In [4]:
BMMC_rna.layers['processed'] = BMMC_rna.X.copy()
BMMC_rna.X = BMMC_rna.layers['counts'].copy()
BMMC_rna.X.todense()

[[ 0.          0.          0.         ...  4.410295    0.
   0.        ]
 [ 0.          0.          0.         ... 13.168547    0.
   0.        ]
 [ 0.41061893  0.          0.         ...  3.2849514   0.
   0.        ]
 ...
 [ 0.          0.          0.         ...  4.6001105   0.
   0.        ]
 [ 0.          0.          0.         ...  1.2580403   0.
   0.        ]
 [ 0.          0.          0.         ...  4.014935    0.
   0.        ]]
[[0. 0. 0. ... 2. 0. 0.]
 [0. 0. 0. ... 6. 0. 0.]
 [1. 0. 0. ... 8. 0. 0.]
 ...
 [0. 0. 0. ... 2. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 2. 0. 0.]]


In [5]:
BMMC_atac.layers['binarized'] = BMMC_atac.X.copy()
BMMC_atac.X = BMMC_atac.layers['counts'].copy()
BMMC_atac.X.todense()

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 2. 0.]
 [0. 0. 0. ... 0. 2. 0.]
 [0. 0. 0. ... 0. 4. 0.]]


In [11]:
# print(BMMC_atac.X.T)
print(BMMC_atac.layers['binarized'].T)

  (6, 0)	1.0
  (46, 0)	1.0
  (55, 0)	1.0
  (80, 0)	1.0
  (92, 0)	1.0
  (150, 0)	1.0
  (162, 0)	1.0
  (177, 0)	1.0
  (200, 0)	1.0
  (203, 0)	1.0
  (220, 0)	1.0
  (224, 0)	1.0
  (240, 0)	1.0
  (242, 0)	1.0
  (250, 0)	1.0
  (285, 0)	1.0
  (321, 0)	1.0
  (359, 0)	1.0
  (383, 0)	1.0
  (452, 0)	1.0
  (710, 0)	1.0
  (749, 0)	1.0
  (754, 0)	1.0
  (827, 0)	1.0
  (829, 0)	1.0
  :	:
  (115445, 69248)	1.0
  (115465, 69248)	1.0
  (115480, 69248)	1.0
  (115542, 69248)	1.0
  (115585, 69248)	1.0
  (115601, 69248)	1.0
  (115764, 69248)	1.0
  (115767, 69248)	1.0
  (115924, 69248)	1.0
  (115927, 69248)	1.0
  (115940, 69248)	1.0
  (115951, 69248)	1.0
  (116081, 69248)	1.0
  (116102, 69248)	1.0
  (116157, 69248)	1.0
  (116192, 69248)	1.0
  (116200, 69248)	1.0
  (116251, 69248)	1.0
  (116270, 69248)	1.0
  (116278, 69248)	1.0
  (116342, 69248)	1.0
  (116383, 69248)	1.0
  (116404, 69248)	1.0
  (116425, 69248)	1.0
  (116488, 69248)	1.0


In [6]:
output_path = "/Data/wangsg/BM/pipeline/data/BMMC/RNA+ATAC/multiome/RawData"

# save hd5
BMMC_rna.write_h5ad("{}/BMMC-multiome-raw-RNA-counts.h5ad".format(output_path))
# save hd5
BMMC_atac.write_h5ad("{}/BMMC-multiome-raw-ATAC-peaks.h5ad".format(output_path))

In [8]:
# Save RNA
## makr dir
!mkdir RNA+ATAC/multiome/RawData/BMMC-multiome-raw-RNA-counts.mtx
## save X to mtx
io.mmwrite('RNA+ATAC/multiome/RawData/BMMC-multiome-raw-RNA-counts.mtx/matrix', BMMC_rna.X.T)
## save barcodes
with open('RNA+ATAC/multiome/RawData/BMMC-multiome-raw-RNA-counts.mtx/barcodes.tsv', 'w') as f:
    for item in BMMC_rna.obs_names:
        f.write(item + '\n')      
## save features
with open('RNA+ATAC/multiome/RawData/BMMC-multiome-raw-RNA-counts.mtx/features.tsv', 'w') as f:
    for item in BMMC_rna.var_names:
        f.write(item + '\n')
## gzip file
!gzip RNA+ATAC/multiome/RawData/BMMC-multiome-raw-RNA-counts.mtx/*
## save metadata
BMMC_rna.obs.to_csv('RNA+ATAC/multiome/RawData/BMMC-multiome-raw-RNA-counts.mtx/metadata.csv')

mkdir: cannot create directory ‘RNA+ATAC/multiome/RawData/BMMC-multiome-raw-RNA-counts.mtx’: File exists


In [16]:
# Save ATAC
## makr dir
!mkdir RNA+ATAC/multiome/RawData/BMMC-multiome-raw-ATAC-peaks.mtx
## save X to mtx
io.mmwrite('RNA+ATAC/multiome/RawData/BMMC-multiome-raw-ATAC-peaks.mtx/matrix', BMMC_atac.X.T)
## save barcodes
with open('RNA+ATAC/multiome/RawData/BMMC-multiome-raw-ATAC-peaks.mtx/barcodes.tsv', 'w') as f:
    for item in BMMC_atac.obs_names:
        f.write(item + '\n')      
## save features
with open('RNA+ATAC/multiome/RawData/BMMC-multiome-raw-ATAC-peaks.mtx/features.tsv', 'w') as f:
    for item in BMMC_atac.var_names:
        f.write(item + '\n')
## gzip file
!gzip RNA+ATAC/multiome/RawData/BMMC-multiome-raw-ATAC-peaks.mtx/*
## save metadata
BMMC_atac.obs.to_csv('RNA+ATAC/multiome/RawData/BMMC-multiome-raw-ATAC-peaks.mtx/metadata.csv')

mkdir: cannot create directory ‘RNA+ATAC/multiome/RawData/BMMC-multiome-raw-ATAC-peaks.mtx’: File exists


In [12]:
# Save binarized ATAC
## makr dir
!mkdir RNA+ATAC/multiome/RawData/BMMC-multiome-binarized-ATAC-peaks.mtx
## save X to mtx
io.mmwrite('RNA+ATAC/multiome/RawData/BMMC-multiome-binarized-ATAC-peaks.mtx/matrix', BMMC_atac.layers['binarized'].T)
## save barcodes
with open('RNA+ATAC/multiome/RawData/BMMC-multiome-binarized-ATAC-peaks.mtx/barcodes.tsv', 'w') as f:
    for item in BMMC_atac.obs_names:
        f.write(item + '\n')      
## save features
with open('RNA+ATAC/multiome/RawData/BMMC-multiome-binarized-ATAC-peaks.mtx/features.tsv', 'w') as f:
    for item in BMMC_atac.var_names:
        f.write(item + '\n')
## gzip file
!gzip RNA+ATAC/multiome/RawData/BMMC-multiome-binarized-ATAC-peaks.mtx/*
## save metadata
BMMC_atac.obs.to_csv('RNA+ATAC/multiome/RawData/BMMC-multiome-binarized-ATAC-peaks.mtx/metadata.csv')

mkdir: cannot create directory ‘RNA+ATAC/multiome/RawData/BMMC-multiome-binarized-ATAC-peaks.mtx’: File exists


In [14]:
# print(BMMC_multi.obs.Site.value_counts())
# print(BMMC_multi.obs.Samplename.value_counts())
# print(BMMC_multi.obs.batch.value_counts())
# print(BMMC_multi.obs.Modality.value_counts())
# print(BMMC_multi.obs.cell_type.value_counts())

# print(BMMC_multi.obs.DonorAge.value_counts())
# print(BMMC_multi.obs.DonorBloodType.value_counts())
# print(BMMC_multi.obs.DonorBMI.value_counts())
# print(BMMC_multi.obs.DonorGender.value_counts())
# print(BMMC_multi.obs.DonorID.value_counts())
# print(BMMC_multi.obs.DonorNumber.value_counts())
# print(BMMC_multi.obs.DonorRace.value_counts())
# print(BMMC_multi.obs.DonorSmoker.value_counts())

In [9]:
import scanpy as sc
from scipy import io
BMMC_multi = sc.read_h5ad(BMMC_multi_h5ad)

# print(BMMC_multi.obs.DonorSmoker.value_counts())
BMMC_rna = BMMC_multi[:, BMMC_multi.var["feature_types"] == "GEX"]
BMMC_atac = BMMC_multi[:, BMMC_multi.var["feature_types"] == "ATAC"]
print(BMMC_rna.var.feature_types)
print(BMMC_atac.var.feature_types)

AL627309.5    GEX
LINC01409     GEX
LINC01128     GEX
NOC2L         GEX
KLHL17        GEX
             ... 
MT-ND5        GEX
MT-ND6        GEX
MT-CYB        GEX
AL592183.1    GEX
AC240274.1    GEX
Name: feature_types, Length: 13431, dtype: category
Categories (1, object): ['GEX']
chr1-9776-10668            ATAC
chr1-180726-181005         ATAC
chr1-181117-181803         ATAC
chr1-191133-192055         ATAC
chr1-267562-268456         ATAC
                           ... 
GL000219.1-90062-90937     ATAC
GL000219.1-99257-100160    ATAC
KI270726.1-27152-28034     ATAC
KI270713.1-21434-22336     ATAC
KI270713.1-29629-30491     ATAC
Name: feature_types, Length: 116490, dtype: category
Categories (1, object): ['ATAC']


In [10]:
print(BMMC_rna.X.todense())
BMMC_rna.layers['processed'] = BMMC_rna.X.copy()
BMMC_rna.X = BMMC_rna.layers['counts'].copy()
print(BMMC_rna.X.todense())

[[ 0.          0.          0.         ...  4.410295    0.
   0.        ]
 [ 0.          0.          0.         ... 13.168547    0.
   0.        ]
 [ 0.41061893  0.          0.         ...  3.2849514   0.
   0.        ]
 ...
 [ 0.          0.          0.         ...  4.6001105   0.
   0.        ]
 [ 0.          0.          0.         ...  1.2580403   0.
   0.        ]
 [ 0.          0.          0.         ...  4.014935    0.
   0.        ]]
[[0. 0. 0. ... 2. 0. 0.]
 [0. 0. 0. ... 6. 0. 0.]
 [1. 0. 0. ... 8. 0. 0.]
 ...
 [0. 0. 0. ... 2. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 2. 0. 0.]]


In [11]:
print(BMMC_atac.X.todense())
BMMC_atac.layers['processed'] = BMMC_atac.X.copy()
BMMC_atac.X = BMMC_atac.layers['counts'].copy()
print(BMMC_atac.X.todense())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 2. 0.]
 [0. 0. 0. ... 0. 2. 0.]
 [0. 0. 0. ... 0. 4. 0.]]


In [18]:
!ls /Data/wangsg/BM/pipeline/results/BMMC/data_preprocess

BMMC-raw-pair-ATAC-peaks.h5ad	   BMMC-raw-pair-RNA-counts.mtx
BMMC-raw-pair-ATAC-peaks.h5Seurat  BMMC-raw-pair-RNA-counts.rds
BMMC-raw-pair-ATAC-peaks.mtx	   BMMC-scglue-pair-RNA-barcodes.tsv
BMMC-raw-pair-ATAC-peaks.rds	   BMMC-scglue-pair-RNA-genes.tsv
BMMC-raw-pair-RNA-counts.h5ad	   BMMC-scglue-pair-RNA-matrix.mtx
BMMC-raw-pair-RNA-counts.h5Seurat


In [20]:
output_path = "/Data/wangsg/BM/pipeline/results/BMMC/data_preprocess"

# save hd5
BMMC_rna.write_h5ad("{}/BMMC-raw-pair-RNA-counts.h5ad".format(output_path))

# save hd5
BMMC_atac.write_h5ad("{}/BMMC-raw-pair-ATAC-peaks.h5ad".format(output_path))

In [26]:
# Save RNA
## makr dir
!mkdir BMMC-raw-pair-RNA-counts.mtx
## save X to mtx
io.mmwrite('BMMC-raw-pair-RNA-counts.mtx/matrix', BMMC_rna.X.T)
## save barcodes
with open('BMMC-raw-pair-RNA-counts.mtx/barcodes.tsv', 'w') as f:
    for item in BMMC_rna.obs_names:
        f.write(item + '\n')      
## save features
with open('BMMC-raw-pair-RNA-counts.mtx/features.tsv', 'w') as f:
    for item in BMMC_rna.var_names:
        f.write(item + '\n')
## gzip file
!gzip BMMC-raw-pair-RNA-counts.mtx/*
## save metadata
BMMC_rna.obs.to_csv('BMMC-raw-pair-RNA-counts.mtx/metadata.csv')

In [27]:
# Save ATAC
## makr dir
!mkdir BMMC-raw-pair-ATAC-peaks.mtx
## save X to mtx
io.mmwrite('BMMC-raw-pair-ATAC-peaks.mtx/matrix', BMMC_atac.X.T)
## save barcodes
with open('BMMC-raw-pair-ATAC-peaks.mtx/barcodes.tsv', 'w') as f:
    for item in BMMC_atac.obs_names:
        f.write(item + '\n')      
## save features
with open('BMMC-raw-pair-ATAC-peaks.mtx/features.tsv', 'w') as f:
    for item in BMMC_atac.var_names:
        f.write(item + '\n')
## gzip file
!gzip BMMC-raw-pair-ATAC-peaks.mtx/*
## save metadata
BMMC_atac.obs.to_csv('BMMC-raw-pair-ATAC-peaks.mtx/metadata.csv')

In [12]:
# Save ATAC
BMMC_atac.X = BMMC_atac.layers['processed'].copy()

## makr dir
!mkdir BMMC-raw-pair-ATAC-peaks-binarized.mtx
## save X to mtx
io.mmwrite('BMMC-raw-pair-ATAC-peaks-binarized.mtx/matrix', BMMC_atac.X.T)
## save barcodes
with open('BMMC-raw-pair-ATAC-peaks-binarized.mtx/barcodes.tsv', 'w') as f:
    for item in BMMC_atac.obs_names:
        f.write(item + '\n')      
## save features
with open('BMMC-raw-pair-ATAC-peaks-binarized.mtx/features.tsv', 'w') as f:
    for item in BMMC_atac.var_names:
        f.write(item + '\n')
## gzip file
!gzip BMMC-raw-pair-ATAC-peaks-binarized.mtx/*
## save metadata
BMMC_atac.obs.to_csv('BMMC-raw-pair-ATAC-peaks-binarized.mtx/metadata.csv')

## CITE-seq

In [3]:
BMMC_multi_h5ad = "/Data/wangsg/BM/pipeline/data/BMMC/RawData/GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad"

In [4]:
import scanpy as sc
from scipy import io
BMMC_multi = sc.read_h5ad(BMMC_multi_h5ad)

print(BMMC_multi.var.feature_types)
print(BMMC_multi.var.feature_types.value_counts())

BMMC_rna = BMMC_multi[:, BMMC_multi.var["feature_types"] == "GEX"]
BMMC_adt = BMMC_multi[:, BMMC_multi.var["feature_types"] == "ADT"]

print(BMMC_rna.var.feature_types)
print(BMMC_adt.var.feature_types)

AL627309.5    GEX
LINC01409     GEX
LINC01128     GEX
LINC00115     GEX
FAM41C        GEX
             ... 
HLA-E         ADT
CD82          ADT
CD101         ADT
CD88          ADT
CD224         ADT
Name: feature_types, Length: 14087, dtype: category
Categories (2, object): ['ADT', 'GEX']
GEX    13953
ADT      134
Name: feature_types, dtype: int64


  utils.warn_names_duplicates("var")


In [7]:
print(BMMC_rna.X.todense())
BMMC_rna.layers['processed'] = BMMC_rna.X.copy()
BMMC_rna.X = BMMC_rna.layers['counts'].copy()
print(BMMC_rna.X.todense())

[[0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        1.5111434 0.       ]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]]


In [10]:
print(BMMC_adt.X.todense())
BMMC_adt.layers['processed'] = BMMC_adt.X.copy()
BMMC_adt.X = BMMC_adt.layers['counts'].copy()
print(BMMC_adt.X.todense())

[[0.15437832 0.15437832 1.3004414  ... 0.15437832 0.91724646 0.6068599 ]
 [1.2411374  0.27233985 0.9930009  ... 2.0187593  0.72912055 1.9254438 ]
 [0.         0.22376126 1.3568352  ... 0.         0.7554061  0.4064944 ]
 ...
 [0.09068501 1.3225017  1.030889   ... 0.6674399  0.5649686  1.0964116 ]
 [0.13610515 1.5667104  1.2673818  ... 0.45945412 0.9285848  0.7388458 ]
 [0.11087249 1.3386939  1.3689729  ... 0.7758898  0.53269356 0.7758898 ]]
[[  1.   1.  16. ...   1.   9.   5.]
 [ 55.   7.  38. ... 146.  24. 131.]
 [  0.   2.  23. ...   0.   9.   4.]
 ...
 [  1.  29.  19. ...  10.   8.  21.]
 [  2.  52.  35. ...   8.  21.  15.]
 [  1.  24.  25. ...  10.   6.  10.]]


In [13]:
output_path = "/Data/wangsg/BM/pipeline/data/BMMC/RNA+ADT/CITE-seq"

# save hd5
BMMC_rna.write_h5ad("{}/BMMC-raw-pair-RNA-counts.h5ad".format(output_path))
# save hd5
BMMC_adt.write_h5ad("{}/BMMC-raw-pair-ADT-counts.h5ad".format(output_path))

In [8]:
# Save RNA
## makr dir
!mkdir RNA+ADT/CITE-seq/BMMC-raw-pair-RNA-counts.mtx
## save X to mtx
io.mmwrite('RNA+ADT/CITE-seq/BMMC-raw-pair-RNA-counts.mtx/matrix', BMMC_rna.X.T)
## save barcodes
with open('RNA+ADT/CITE-seq/BMMC-raw-pair-RNA-counts.mtx/barcodes.tsv', 'w') as f:
    for item in BMMC_rna.obs_names:
        f.write(item + '\n')      
## save features
with open('RNA+ADT/CITE-seq/BMMC-raw-pair-RNA-counts.mtx/features.tsv', 'w') as f:
    for item in BMMC_rna.var_names:
        f.write(item + '\n')
## gzip file
!gzip RNA+ADT/CITE-seq/BMMC-raw-pair-RNA-counts.mtx/*
## save metadata
BMMC_rna.obs.to_csv('RNA+ADT/CITE-seq/BMMC-raw-pair-RNA-counts.mtx/metadata.csv')

In [11]:
# Save ADT
## makr dir
!mkdir RNA+ADT/CITE-seq/BMMC-raw-pair-ADT-counts.mtx
## save X to mtx
io.mmwrite('RNA+ADT/CITE-seq/BMMC-raw-pair-ADT-counts.mtx/matrix', BMMC_adt.X.T)
## save barcodes
with open('RNA+ADT/CITE-seq/BMMC-raw-pair-ADT-counts.mtx/barcodes.tsv', 'w') as f:
    for item in BMMC_adt.obs_names:
        f.write(item + '\n')      
## save features
with open('RNA+ADT/CITE-seq/BMMC-raw-pair-ADT-counts.mtx/features.tsv', 'w') as f:
    for item in BMMC_adt.var_names:
        f.write(item + '\n')
## gzip file
!gzip RNA+ADT/CITE-seq/BMMC-raw-pair-ADT-counts.mtx/*
## save metadata
BMMC_adt.obs.to_csv('RNA+ADT/CITE-seq/BMMC-raw-pair-ADT-counts.mtx/metadata.csv')

# Function: create pkl data of cobolt

In [17]:
def cobolt_pkl_data(input_path,
                   output_path,
                   dataset):
    from cobolt.utils import SingleData, MultiomicDataset
    from cobolt.model import Cobolt
    
    # Load Data
    rna = SingleData.from_file(path=input_path,
                               dataset_name=dataset["data_name"],
                               feature_name="GeneExpr",
                               count_file=dataset["gene_expression"],
                               barcode_file=dataset["gene_barcodes"],
                               feature_file=dataset["gene_names"])
    atac = SingleData.from_file(path=input_path,
                                dataset_name=dataset["data_name"],
                                feature_name="ChromAccess",
                                count_file=dataset["atac_expression"],
                                barcode_file=dataset["atac_barcodes"],
                                feature_file=dataset["atac_names"])
    # Filter Data
    # rna.filter_features(upper_quantile=0.99, lower_quantile=0.7)
    atac.filter_features(upper_quantile=0.99, lower_quantile=0.7)

    # Merge Data
    multi_dt = MultiomicDataset.from_singledata(rna, atac)

    # Save Data
    save_pickle(multi_dt,
                "{}/{}-{}-{}-multi-filtered.pkl".format(output_path, dataset["data_name"],
                                                "cobolt", dataset["task_type"]))

In [20]:
input_path = "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess"
output_path = "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess"
dataset = json.load(open("/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess/BMMC.json"))

In [21]:
cobolt_pkl_data(input_path, output_path, dataset)

# Function: create h5 data of multivi¶

In [22]:
def multivi_h5ad_data(input_path,
                     output_path,
                     dataset):
    # Load Data
    rna = sc.read_h5ad("{}/{}-{}-{}-RNA-counts.h5ad".format(input_path, 
                                                               dataset["data_name"], 
                                                               "raw", 
                                                               dataset["task_type"]))

    atac = sc.read_h5ad("{}/{}-{}-{}-ATAC-peaks.h5ad".format(input_path, 
                                                               dataset["data_name"], 
                                                               "raw", 
                                                               dataset["task_type"]))


    # Filter Data
    ## RNA
    sc.pp.filter_genes(rna, min_cells=int(rna.shape[0] * 0.001))
    # sc.pp.filter_cells(rna, min_genes=3)
    ## ATAC
    sc.pp.filter_genes(atac, min_cells=int(rna.shape[0] * 0.001))
    sc.pp.filter_genes(atac, max_cells=int(rna.shape[0] * 0.1))
    # sc.pp.filter_cells(atac, min_genes=3)

    # Merge Data
    multi = anndata.concat([rna, atac], axis=1, join="outer")
    multi.var_names_make_unique()

    # Add Metadata
    multi.var['modality'] = np.repeat(["Gene Expression", "Peaks"], [rna.shape[1], atac.shape[1]], axis=0)

    # Save Data
    del multi.raw
    multi.write_h5ad("{}/{}-{}-{}-multi-filtered.h5ad".format(output_path, dataset["data_name"],
                                                "multivi", dataset["task_type"]))

In [23]:
input_path = "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess"
output_path = "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess"
dataset = json.load(open("/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess/BMMC.json"))

In [24]:
multivi_h5ad_data(input_path, output_path, dataset)

  utils.warn_names_duplicates("var")


# Function: create pkl of scMVP

In [25]:
def scMVP_pkl_data(input_path,
                   output_path, 
                   dataset, 
                   annotation=None, 
                   annotation_column=None):
    
    from scMVP.dataset import LoadData, GeneExpressionDataset, CellMeasurement

    # Load Data
    ## prepare dataset
    input_path = "{}/".format(input_path)
    output_path = "{}/".format(output_path)
    dataset_sub = {
        k: dataset.get(k, None)
        for k in ("gene_names", "gene_barcodes", "gene_expression",
                  "atac_names", "atac_barcodes", "atac_expression")
    }
    ## multi
    if annotation:
        cell_embeddings = pd.read_csv(input_path + annotation,
                                      sep="\t",
                                      index_col=None).iloc[:,
                                                           annotation_column]
        scmvp_multi_data = LoadData(dataset=dataset_sub,
                                   data_path=input_path,
                                   dense=False,
                                   gzipped=False,
                                   atac_threshold=0.001,
                                   cell_threshold=1,
                                   cell_meta=cell_embeddings)
    else:
        scmvp_multi_data = LoadData(dataset=dataset_sub,
                                   data_path=input_path,
                                   dense=False,
                                   gzipped=False,
                                   atac_threshold=0.001,
                                   cell_threshold=1)

    # Save Data
    save_pickle(scmvp_multi_data,
                "{}/{}-{}-{}-multi-filtered.pkl".format(output_path, dataset["data_name"],
                                                "scMVP", dataset["task_type"]))

In [27]:
input_path = "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess"
output_path = "/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess"
dataset = json.load(open("/Data/wangsg/BM/pipeline/results/BMMC/pair/data_preprocess/BMMC.json"))

In [28]:
scMVP_pkl_data(input_path, output_path, dataset)

[2022-11-15 16:09:31,861] INFO - scMVP._settings | Added StreamHandler with custom formatter to 'scMVP' logger.
[2022-11-15 16:09:31,960] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2022-11-15 16:18:58,143] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2022-11-15 16:19:01,398] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2022-11-15 16:19:01,402] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2022-11-15 16:24:13,193] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2022-11-15 16:24:16,110] INFO - scMVP.dataset.dataset | Downsampled from 69249 to 69249 cells


In [30]:
annotation=None
annotation_column=None

In [31]:
from scMVP.dataset import LoadData, GeneExpressionDataset, CellMeasurement

# Load Data
## prepare dataset
input_path = "{}/".format(input_path)
output_path = "{}/".format(output_path)
dataset_sub = {
    k: dataset.get(k, None)
    for k in ("gene_names", "gene_barcodes", "gene_expression",
              "atac_names", "atac_barcodes", "atac_expression")
}
## multi
if annotation:
    cell_embeddings = pd.read_csv(input_path + annotation,
                                  sep="\t",
                                  index_col=None).iloc[:,
                                                       annotation_column]
    scmvp_multi_data = LoadData(dataset=dataset_sub,
                               data_path=input_path,
                               dense=False,
                               gzipped=False,
                               atac_threshold=0.001,
                               cell_threshold=1,
                               cell_meta=cell_embeddings)
else:
    scmvp_multi_data = LoadData(dataset=dataset_sub,
                               data_path=input_path,
                               dense=False,
                               gzipped=False,
                               atac_threshold=0.001,
                               cell_threshold=1)

# # Save Data
# save_pickle(scmvp_multi_data,
#             "{}/{}-{}-{}-multi-filtered.pkl".format(output_path, dataset["data_name"],
#                                             "scMVP", dataset["task_type"]))

[2022-11-15 16:31:07,966] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2022-11-15 16:40:41,058] INFO - scMVP.dataset.scMVP_dataloader | Finished preprocessing dataset
[2022-11-15 16:40:46,553] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2022-11-15 16:40:46,557] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2022-11-15 16:42:10,676] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2022-11-15 16:42:12,429] INFO - scMVP.dataset.dataset | Downsampled from 69249 to 69249 cells
