In [1]:
import os
import sys 
import json
import pickle

import torch
import anndata
import numpy as np
import pandas as pd
import scanpy as sc

from scipy import io

# Data Manipulation: convert h5ad to mtx

## multiome

In [2]:
HSPC_METADATA = '/home/wsg/BM/data/HSPC/RawData/Multimodal_Single-Cell_Integration/metadata.csv'
metadata = pd.read_csv(HSPC_METADATA)

metadata_multi = metadata[metadata.technology=="multiome"]
metadata_cite = metadata[metadata.technology=="citeseq"]

In [3]:
metadata_multi.shape, metadata_cite.shape

((161877, 5), (119651, 5))

In [4]:
# we only use train_data because test_data only have ATAC modality

# HSPC train_data ATAC-seq peak counts transformed
HSPC_multi_ATAC_path ='/home/wsg/BM/data/HSPC/RawData/Raw_Counts/train_multi_inputs_raw.h5'

# HSPC train_data RNA gene expression levels as library-size normalized and log1p transformed counts for the same cells
HSPC_multi_RNA_path ='/home/wsg/BM/data/HSPC/RawData/Raw_Counts/train_multi_targets_raw.h5'

In [6]:
# HSPC_multi_RNA_counts = pd.read_hdf(HSPC_multi_RNA_path)
HSPC_multi_RNA_counts.shape

(105933, 23418)

In [7]:
# HSPC_multi_ATAC_counts = pd.read_hdf(HSPC_multi_ATAC_path)
HSPC_multi_ATAC_counts.shape

(105933, 228942)

In [12]:
sum(HSPC_multi_RNA_counts.index == HSPC_multi_ATAC_counts.index)

105933

In [13]:
metadata_multi_train = metadata_multi[metadata_multi['cell_id'].isin(HSPC_multi_RNA_counts.index)]
metadata_multi_train.shape

metadata_multi_train['barcode'] = metadata_multi_train['cell_id']
metadata_multi_train.set_index('cell_id', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_multi_train['barcode'] = metadata_multi_train['cell_id']


In [14]:
print(metadata_multi_train.shape)
print(metadata_multi_train['cell_type'].value_counts())

(105868, 5)
HSC     34624
NeuP    21556
EryP    17132
MasP    16733
MkP     13200
MoP      2058
BP        565
Name: cell_type, dtype: int64


In [15]:
np.random.seed(1234)
metadata_multi_train_p10 = metadata_multi_train.sample(frac=0.1)
metadata_multi_train_p10['cell_type'].value_counts()

HSC     3464
NeuP    2112
EryP    1734
MasP    1705
MkP     1293
MoP      216
BP        63
Name: cell_type, dtype: int64

In [16]:
p10_condition = HSPC_multi_RNA_counts.index.isin(metadata_multi_train_p10['barcode'])
HSPC_multi_RNA_counts_p10 = HSPC_multi_RNA_counts[p10_condition]

In [17]:
adata_rna = sc.AnnData(X=HSPC_multi_RNA_counts_p10.values, 
                           obs=metadata_multi_train_p10,
                           var=pd.DataFrame(index=HSPC_multi_RNA_counts_p10.columns))
adata_rna

AnnData object with n_obs × n_vars = 10587 × 23418
    obs: 'day', 'donor', 'cell_type', 'technology', 'barcode'

In [18]:
output_path = "/home/wsg/BM/data/HSPC/RNA+ATAC/p10"
# save hd5
adata_rna.write_h5ad("{}/HSPC-multiome-p10-RNA-counts.h5ad".format(output_path))

In [19]:
HSPC_multi_ATAC_counts_p10 = HSPC_multi_ATAC_counts[p10_condition]

In [20]:
adata_atac = sc.AnnData(X=HSPC_multi_ATAC_counts_p10.values, 
                           obs=metadata_multi_train_p10,
                           var=pd.DataFrame(index=HSPC_multi_ATAC_counts_p10.columns))
adata_atac

AnnData object with n_obs × n_vars = 10587 × 228942
    obs: 'day', 'donor', 'cell_type', 'technology', 'barcode'

In [21]:
output_path = "/home/wsg/BM/data/HSPC/RNA+ATAC/p10"
# save hd5
adata_atac.write_h5ad("{}/HSPC-multiome-p10-ATAC-peaks.h5ad".format(output_path))

## CITE-seq

In [2]:
HSPC_METADATA = '/home/wsg/BM/data/HSPC/RawData/Multimodal_Single-Cell_Integration/metadata.csv'
metadata = pd.read_csv(HSPC_METADATA)

metadata_multi = metadata[metadata.technology=="multiome"]
metadata_cite = metadata[metadata.technology=="citeseq"]

In [3]:
metadata_multi.shape, metadata_cite.shape

((161877, 5), (119651, 5))

In [4]:
# we only use train_data because test_data only have RNA modality

# HSPC train_data RNA gene expression levels as library-size normalized and log1p transformed counts (gene expression levels)
HSPC_cite_RNA_path ='/home/wsg/BM/data/HSPC/RawData/Raw_Counts/train_cite_inputs_raw.h5'

# HSPC train_data surface protein levels for the same cells that have been dsb normalized
HSPC_cite_ADT_path ='/home/wsg/BM/data/HSPC/RawData/Raw_Counts/train_cite_targets_raw.h5'

In [5]:
HSPC_cite_RNA_counts = pd.read_hdf(HSPC_cite_RNA_path)
HSPC_cite_RNA_counts.shape

(70988, 22085)

In [6]:
HSPC_cite_ADT_counts = pd.read_hdf(HSPC_cite_ADT_path)
HSPC_cite_ADT_counts.shape

(70988, 140)

In [7]:
sum(HSPC_cite_RNA_counts.index == HSPC_cite_ADT_counts.index)

70988

In [8]:
metadata_cite_train = metadata_cite[metadata_cite['cell_id'].isin(HSPC_cite_RNA_counts.index)]
metadata_cite_train.shape

metadata_cite_train['barcode'] = metadata_cite_train['cell_id']
metadata_cite_train.set_index('cell_id', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_cite_train['barcode'] = metadata_cite_train['cell_id']


In [9]:
print(metadata_cite_train.shape)
print(metadata_cite_train['cell_type'].value_counts())

(70988, 5)
HSC     29879
EryP    14241
NeuP    12493
MasP     8242
MkP      5382
MoP       591
BP        160
Name: cell_type, dtype: int64


In [10]:
np.random.seed(1234)
metadata_cite_train_p10 = metadata_cite_train.sample(frac=0.1)
metadata_cite_train_p10['cell_type'].value_counts()

HSC     3014
EryP    1453
NeuP    1242
MasP     788
MkP      533
MoP       54
BP        15
Name: cell_type, dtype: int64

In [11]:
p10_condition = HSPC_cite_RNA_counts.index.isin(metadata_cite_train_p10['barcode'])
HSPC_cite_RNA_counts_p10 = HSPC_cite_RNA_counts[p10_condition]

In [12]:
adata_rna = sc.AnnData(X=HSPC_cite_RNA_counts_p10.values, 
                       obs=metadata_cite_train_p10,
                       var=pd.DataFrame(index=HSPC_cite_RNA_counts_p10.columns))
adata_rna

AnnData object with n_obs × n_vars = 7099 × 22085
    obs: 'day', 'donor', 'cell_type', 'technology', 'barcode'

In [14]:
output_path = "/home/wsg/BM/data/HSPC/RNA+ADT/p10"
# save hd5
adata_rna.write_h5ad("{}/HSPC-cite-p10-RNA-counts.h5ad".format(output_path))

In [15]:
adata_rna.obs.to_csv(output_path + '/metadata.csv')

In [16]:
HSPC_cite_ADT_counts_p10 = HSPC_cite_ADT_counts[p10_condition]

In [17]:
adata_adt = sc.AnnData(X=HSPC_cite_ADT_counts_p10.values, 
                        obs=metadata_cite_train_p10,
                        var=pd.DataFrame(index=HSPC_cite_ADT_counts_p10.columns))
adata_adt

AnnData object with n_obs × n_vars = 7099 × 140
    obs: 'day', 'donor', 'cell_type', 'technology', 'barcode'

In [20]:
output_path = "/home/wsg/BM/data/HSPC/RNA+ADT/p10"
# save hd5
adata_adt.write_h5ad("{}/HSPC-multiome-p10-ADT-counts.h5ad".format(output_path))