In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import ot
import sys
import scanpy as sc
import anndata as ad
import os
import tarfile
import scipy.io
from scipy.sparse import issparse
from scipy.sparse import csr_matrix

In [4]:
data_path = "/workspace/ImputationOT/data"

# 加载RNA数据
rna_3p_barcodes = pd.read_csv(os.path.join(data_path, 'GSM5008737_RNA_3P-barcodes.tsv.gz'), header=None)
rna_3p_features = pd.read_csv(os.path.join(data_path, 'GSM5008737_RNA_3P-features.tsv.gz'), header=None)
rna_3p_matrix = scipy.io.mmread(os.path.join(data_path, 'GSM5008737_RNA_3P-matrix.mtx.gz')).tocsr()

# 加载ADT数据
adt_3p_barcodes = pd.read_csv(os.path.join(data_path, 'GSM5008738_ADT_3P-barcodes.tsv.gz'), header=None)
adt_3p_features = pd.read_csv(os.path.join(data_path, 'GSM5008738_ADT_3P-features.tsv.gz'), header=None)
adt_3p_matrix = scipy.io.mmread(os.path.join(data_path, 'GSM5008738_ADT_3P-matrix.mtx.gz')).tocsr()

# 加载HTO数据
hto_3p_barcodes = pd.read_csv(os.path.join(data_path, 'GSM5008739_HTO_3P-barcodes.tsv.gz'), header=None)
hto_3p_features = pd.read_csv(os.path.join(data_path, 'GSM5008739_HTO_3P-features.tsv.gz'), header=None)
hto_3p_matrix = scipy.io.mmread(os.path.join(data_path, 'GSM5008739_HTO_3P-matrix.mtx.gz')).tocsr()

# 加载元数据
meta_data_3p = pd.read_csv(os.path.join(data_path, 'GSE164378_sc.meta.data_3P.csv.gz'))

In [14]:
rna_adata = ad.AnnData(X=rna_3p_matrix.T,
                       obs=pd.DataFrame(index=rna_3p_barcodes[0]),
                       var=pd.DataFrame(index=rna_3p_features[0]))

# adt_adata = ad.AnnData(X=adt_3p_matrix.T,
#                        obs=pd.DataFrame(index=adt_3p_barcodes[0]),
#                        var=pd.DataFrame(index=adt_3p_features[0]))

# rna_adata.obsm['ADT'] = adt_adata.X.toarray()
rna_adata.obs.index.name = None
meta_data_3p.index.name = None

if 'Unnamed: 0' in rna_adata.obs.columns:
    rna_adata.obs.drop(columns=['Unnamed: 0'], inplace=True)

for col in rna_adata.obs.columns:
    if rna_adata.obs[col].dtype == object:
        rna_adata.obs[col] = rna_adata.obs[col].astype(str)

rna_adata.obs = rna_adata.obs.merge(meta_data_3p, left_index=True, right_index=True, how='left')
rna_adata.write(os.path.join(data_path, '3p.h5ad'))

TypeError: Can't implicitly convert non-string objects to strings

In [3]:
adata = ad.read_h5ad('/workspace/ImputationOT/data/vento_pbmc_processed.h5ad')
print(adata)

AnnData object with n_obs × n_vars = 97499 × 22572
    obs: 'n_genes', 'n_counts', 'Batch', 'Donor_ID', 'Sample_ID', 'SARS-CoV-2_PCR', 'Group', 'Annotation'
    var: 'gene_ids', 'feature_types', 'n_cells'
    obsm: 'X_umap'


In [15]:
X = adata.X
print("GEX data")
print("Matrix Shape:", X.shape)
# print("Density:", np.count_nonzero(X) / (X.shape[0] * X.shape[1]))
print("Density:", X.nnz / (X.shape[0] * X.shape[1]))
print("Minimum Value:", X.min())
print("Maximum Value:", X.max())

GEX data
Matrix Shape: (97499, 22572)
Density: 0.07210065361484998
Minimum Value: 0.0
Maximum Value: 9.150005


In [4]:
batch_counts = adata.obs['Batch'].value_counts()
print(batch_counts)

Batch
1    13086
8    12745
7    11752
6    11170
5    11017
4    10553
9    10315
2     8618
3     8243
Name: count, dtype: int64


In [17]:
print(len(adata.obs['Annotation'].unique()))
print(adata.obs['Annotation'])

28
RV8919578_AAACCTGAGAAACCTA-1    NK CD56(bright)
RV8919578_AAACCTGAGAGCAATT-1            Naïve B
RV8919578_AAACCTGAGAGCTGGT-1                pDC
RV8919578_AAACCTGAGCGAAGGG-1       NK CD56(dim)
RV8919578_AAACCTGAGCGATGAC-1    NK CD56(bright)
                                     ...       
RV8959686_TTTGTCAGTATGAATG-1        CD4 naïve T
RV8959686_TTTGTCAGTCGGCATC-1            Naïve B
RV8959686_TTTGTCAGTGACAAAT-1    NK CD56(bright)
RV8959686_TTTGTCAGTTGCCTCT-1        CD4 naïve T
RV8959686_TTTGTCATCAGTCCCT-1          Platelets
Name: Annotation, Length: 97499, dtype: category
Categories (28, object): ['CD4 memory T', 'CD4 naïve T', 'CD8 memory T', 'CD8 naïve T', ..., 'cDC2', 'cDC3', 'pDC', 'γδT']


In [16]:
print(len(adata.obs['Batch'].unique()))
print(adata.obs['Batch'])

9
RV8919578_AAACCTGAGAAACCTA-1    1
RV8919578_AAACCTGAGAGCAATT-1    1
RV8919578_AAACCTGAGAGCTGGT-1    1
RV8919578_AAACCTGAGCGAAGGG-1    1
RV8919578_AAACCTGAGCGATGAC-1    1
                               ..
RV8959686_TTTGTCAGTATGAATG-1    9
RV8959686_TTTGTCAGTCGGCATC-1    9
RV8959686_TTTGTCAGTGACAAAT-1    9
RV8959686_TTTGTCAGTTGCCTCT-1    9
RV8959686_TTTGTCATCAGTCCCT-1    9
Name: Batch, Length: 97499, dtype: category
Categories (9, object): ['1', '2', '3', '4', ..., '6', '7', '8', '9']
