In [50]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import torch
import torch.nn as nn
import torch.optim as optim
import ot
import sys
import os
import tarfile
import scipy.io
import h5py
import gzip
from scipy.sparse import issparse
from scipy.sparse import csr_matrix

In [41]:
# 读取细胞和基因信息
rna_cell = pd.read_csv("/workspace/ImputationOT/data/GSM3271040_RNA_sciCAR_A549_cell.txt.gz")
rna_gene = pd.read_csv("/workspace/ImputationOT/data/GSM3271040_RNA_sciCAR_A549_gene.txt.gz")

print(rna_cell.head())
print()
print(rna_gene.head())

mm_file_path = "/workspace/ImputationOT/data/GSM3271040_RNA_sciCAR_A549_gene_count.txt.gz"
gene_count_matrix = scipy.io.mmread(mm_file_path)

print(gene_count_matrix.shape)

rna_adata = ad.AnnData(X=gene_count_matrix.T.tocsr(), obs=rna_cell, var=rna_gene)
# rna_adata.write("/workspace/ImputationOT/data/rna_sciCAR_A549.h5ad")
print(rna_adata)

                     sample cell_name experiment  treatment_time
0  sci-RNA-A-001.CGCCAGGCAT      293T    coassay             NaN
1  sci-RNA-A-001.AAGTACGTTA      A549    coassay             3.0
2  sci-RNA-A-001.GCCATCAACT       3T3    coassay             NaN
3  sci-RNA-A-001.TCTCTCATCC      A549    coassay             0.0
4  sci-RNA-A-001.TCCGCCGGTC      A549    coassay             3.0

             gene_id   gene_type gene_short_name
0  ENSG00000223972.4  pseudogene         DDX11L1
1  ENSG00000227232.4  pseudogene          WASH7P
2  ENSG00000243485.2     lincRNA      MIR1302-11
3  ENSG00000237613.2     lincRNA         FAM138A
4  ENSG00000268020.2  pseudogene          OR4G4P
(113153, 6093)
AnnData object with n_obs × n_vars = 6093 × 113153
    obs: 'sample', 'cell_name', 'experiment', 'treatment_time'
    var: 'gene_id', 'gene_type', 'gene_short_name'




In [45]:
adata = ad.read_h5ad('/workspace/ImputationOT/data/rna_sciCAR_A549.h5ad')
print(adata)

AnnData object with n_obs × n_vars = 6093 × 113153
    obs: 'sample', 'cell_name', 'experiment', 'treatment_time'
    var: 'gene_id', 'gene_type', 'gene_short_name'


In [49]:
print(adata.obs['experiment'])

0       coassay
1       coassay
2       coassay
3       coassay
4       coassay
         ...   
6088    coassay
6089    coassay
6090    coassay
6091    coassay
6092    coassay
Name: experiment, Length: 6093, dtype: category
Categories (1, object): ['coassay']


In [46]:
print(adata.var['gene_type'])

0             pseudogene
1             pseudogene
2                lincRNA
3                lincRNA
4             pseudogene
               ...      
113148    protein_coding
113149           Mt_tRNA
113150    protein_coding
113151           Mt_tRNA
113152           Mt_tRNA
Name: gene_type, Length: 113153, dtype: category
Categories (46, object): ['3prime_overlapping_ncRNA', '3prime_overlapping_ncrna', 'IG_C_gene', 'IG_C_pseudogene', ..., 'transcribed_unitary_pseudogene', 'transcribed_unprocessed_pseudogene', 'unitary_pseudogene', 'unprocessed_pseudogene']


In [29]:
X = adata.X
print("Matrix Shape:", X.shape)
print("Density:", np.count_nonzero(X) / (X.shape[0] * X.shape[1]))
# print("Density:", X.nnz / (X.shape[0] * X.shape[1]))
print("Minimum Value:", X.min())
print("Maximum Value:", X.max())

Matrix Shape: (9409, 60970)
Density: 0.04190173273600859
Minimum Value: 0.0
Maximum Value: 13.540438


In [38]:
atac_cell = pd.read_csv("/workspace/ImputationOT/data/GSM3271041_ATAC_sciCAR_A549_cell.txt.gz")
atac_peak = pd.read_csv("/workspace/ImputationOT/data/GSM3271041_ATAC_sciCAR_A549_peak.txt.gz")

print(rna_cell.head())
print()
print(atac_peak.head())

mm_file_path = "/workspace/ImputationOT/data/GSM3271041_ATAC_sciCAR_A549_peak_count.txt.gz"
peak_count_matrix = scipy.io.mmread(mm_file_path)

print(peak_count_matrix.shape)

atac_peak = atac_peak.astype(str)
atac_adata = ad.AnnData(X=peak_count_matrix.T.tocsr(), obs=atac_cell, var=atac_peak)
# atac_adata.write("/workspace/ImputationOT/data/atac_sciCAR_A549.h5ad")
print(atac_adata)

  atac_peak = pd.read_csv("/workspace/ImputationOT/data/GSM3271041_ATAC_sciCAR_A549_peak.txt.gz")


                     sample cell_name experiment  treatment_time
0  sci-RNA-A-001.CGCCAGGCAT      293T    coassay             NaN
1  sci-RNA-A-001.AAGTACGTTA      A549    coassay             3.0
2  sci-RNA-A-001.GCCATCAACT       3T3    coassay             NaN
3  sci-RNA-A-001.TCTCTCATCC      A549    coassay             0.0
4  sci-RNA-A-001.TCCGCCGGTC      A549    coassay             3.0

   id           peak chr  start    end
0   1   1-9963-10665   1   9963  10665
1   2  1-11369-12010   1  11369  12010
2   3  1-24886-25386   1  24886  25386
3   4  1-29054-30366   1  29054  30366
4   5  1-36073-36581   1  36073  36581
(189603, 6085)
AnnData object with n_obs × n_vars = 6085 × 189603
    obs: 'sample', 'source', 'group', 'experiment'
    var: 'id', 'peak', 'chr', 'start', 'end'




In [42]:
# 读取细胞和基因信息
rna_cell = pd.read_csv("/workspace/ImputationOT/data/GSM3271042_RNA_only_A549_cell.txt.gz")
rna_gene = pd.read_csv("/workspace/ImputationOT/data/GSM3271042_RNA_only_A549_gene.txt.gz")

print(rna_cell.head())
print()
print(rna_gene.head())

mm_file_path = "/workspace/ImputationOT/data/GSM3271042_RNA_only_A549_gene_count.txt.gz"
gene_count_matrix = scipy.io.mmread(mm_file_path)

print(gene_count_matrix.shape)

rna_adata = ad.AnnData(X=gene_count_matrix.T.tocsr(), obs=rna_cell, var=rna_gene)
# rna_adata.write("/workspace/ImputationOT/data/rna_sciCAR_A549.h5ad")
print(rna_adata)

                       sample cell_name experiment  treatment_time
0  co-RNA-only-001.ACTCGACGCC      A549   RNA_only               0
1  co-RNA-only-001.AAGTACGTTA      A549   RNA_only               3
2  co-RNA-only-001.TCTCTCATCC      A549   RNA_only               0
3  co-RNA-only-001.GATCCAGCGT      A549   RNA_only               3
4  co-RNA-only-001.CTGGTTGGTT      A549   RNA_only               0

             gene_id   gene_type gene_short_name
0  ENSG00000223972.4  pseudogene         DDX11L1
1  ENSG00000227232.4  pseudogene          WASH7P
2  ENSG00000243485.2     lincRNA      MIR1302-11
3  ENSG00000237613.2     lincRNA         FAM138A
4  ENSG00000268020.2  pseudogene          OR4G4P
(113153, 1873)
AnnData object with n_obs × n_vars = 1873 × 113153
    obs: 'sample', 'cell_name', 'experiment', 'treatment_time'
    var: 'gene_id', 'gene_type', 'gene_short_name'




In [40]:
atac_cell = pd.read_csv("/workspace/ImputationOT/data/GSM3271043_ATAC_only_A549_cell.txt.gz")
atac_peak = pd.read_csv("/workspace/ImputationOT/data/GSM3271043_ATAC_only_A549_peak.txt.gz")

print(rna_cell.head())
print()
print(atac_peak.head())

mm_file_path = "/workspace/ImputationOT/data/GSM3271043_ATAC_only_A549_peak_count.txt.gz"
peak_count_matrix = scipy.io.mmread(mm_file_path)

print(peak_count_matrix.shape)

atac_peak = atac_peak.astype(str)
atac_adata = ad.AnnData(X=peak_count_matrix.T.tocsr(), obs=atac_cell, var=atac_peak)
# atac_adata.write("/workspace/ImputationOT/data/atac_sciCAR_A549.h5ad")
print(atac_adata)

  atac_peak = pd.read_csv("/workspace/ImputationOT/data/GSM3271043_ATAC_only_A549_peak.txt.gz")


                     sample cell_name experiment  treatment_time
0  sci-RNA-A-001.CGCCAGGCAT      293T    coassay             NaN
1  sci-RNA-A-001.AAGTACGTTA      A549    coassay             3.0
2  sci-RNA-A-001.GCCATCAACT       3T3    coassay             NaN
3  sci-RNA-A-001.TCTCTCATCC      A549    coassay             0.0
4  sci-RNA-A-001.TCCGCCGGTC      A549    coassay             3.0

   id           peak chr  start    end
0   1   1-9963-10665   1   9963  10665
1   2  1-11369-12010   1  11369  12010
2   3  1-24886-25386   1  24886  25386
3   4  1-29054-30366   1  29054  30366
4   5  1-36073-36581   1  36073  36581
(189603, 2006)




AnnData object with n_obs × n_vars = 2006 × 189603
    obs: 'sample', 'source', 'group', 'experiment'
    var: 'id', 'peak', 'chr', 'start', 'end'


In [43]:
# 读取细胞和基因信息
rna_cell = pd.read_csv("/workspace/ImputationOT/data/GSM3271044_RNA_mouse_kidney_cell.txt.gz")
rna_gene = pd.read_csv("/workspace/ImputationOT/data/GSM3271044_RNA_mouse_kidney_gene.txt.gz")

print(rna_cell.head())
print()
print(rna_gene.head())

mm_file_path = "/workspace/ImputationOT/data/GSM3271044_RNA_mouse_kidney_gene_count.txt.gz"
gene_count_matrix = scipy.io.mmread(mm_file_path)

print(gene_count_matrix.shape)

rna_adata = ad.AnnData(X=gene_count_matrix.T.tocsr(), obs=rna_cell, var=rna_gene)
# rna_adata.write("/workspace/ImputationOT/data/rna_sciCAR_A549.h5ad")
print(rna_adata)

                            sample source    replicate experiment     tsne_1  \
0  coRNA-RNA-plate1-001.TCGGCGTCGT  Mouse  Replicate 1    coassay        NaN   
1  coRNA-RNA-plate1-001.CATGACTCAA  Mouse  Replicate 1    coassay   2.808426   
2  coRNA-RNA-plate1-001.CGCCAGGCAT  Mouse  Replicate 1    coassay  -4.660380   
3  coRNA-RNA-plate1-001.CTGGTTGGTT  Mouse  Replicate 1    coassay        NaN   
4  coRNA-RNA-plate1-001.CCAGGCTCTT  Mouse  Replicate 2    coassay  10.309332   

      tsne_2                    cell_name  
0        NaN                          NaN  
1 -45.948924         Loop of Henle cells   
2 -41.004499         Loop of Henle cells   
3        NaN                          NaN  
4  11.143745  Proximal tubule S1/S2 cells  

                gene_id             gene_type gene_short_name
0  ENSMUSG00000102693.1                   TEC   4933401J01Rik
1  ENSMUSG00000064842.1                 snRNA         Gm26206
2  ENSMUSG00000051951.5        protein_coding            Xkr4
3  ENS



In [44]:
atac_cell = pd.read_csv("/workspace/ImputationOT/data/GSM3271045_ATAC_mouse_kidney_cell.txt.gz")
atac_peak = pd.read_csv("/workspace/ImputationOT/data/GSM3271045_ATAC_mouse_kidney_peak.txt.gz")

print(rna_cell.head())
print()
print(atac_peak.head())

mm_file_path = "/workspace/ImputationOT/data/GSM3271045_ATAC_mouse_kidney_peak_count.txt.gz"
peak_count_matrix = scipy.io.mmread(mm_file_path)

print(peak_count_matrix.shape)

atac_peak = atac_peak.astype(str)
atac_adata = ad.AnnData(X=peak_count_matrix.T.tocsr(), obs=atac_cell, var=atac_peak)
# atac_adata.write("/workspace/ImputationOT/data/atac_sciCAR_A549.h5ad")
print(atac_adata)

                            sample source    replicate experiment     tsne_1  \
0  coRNA-RNA-plate1-001.TCGGCGTCGT  Mouse  Replicate 1    coassay        NaN   
1  coRNA-RNA-plate1-001.CATGACTCAA  Mouse  Replicate 1    coassay   2.808426   
2  coRNA-RNA-plate1-001.CGCCAGGCAT  Mouse  Replicate 1    coassay  -4.660380   
3  coRNA-RNA-plate1-001.CTGGTTGGTT  Mouse  Replicate 1    coassay        NaN   
4  coRNA-RNA-plate1-001.CCAGGCTCTT  Mouse  Replicate 2    coassay  10.309332   

      tsne_2                    cell_name  
0        NaN                          NaN  
1 -45.948924         Loop of Henle cells   
2 -41.004499         Loop of Henle cells   
3        NaN                          NaN  
4  11.143745  Proximal tubule S1/S2 cells  

   id                      peak         chr   start     end
0   1    GL456210.1-58882-59082  GL456210.1   58882   59082
1   2  GL456210.1-110303-110503  GL456210.1  110303  110503
2   3  GL456210.1-123592-123792  GL456210.1  123592  123792
3   4  GL45621



In [67]:
file_path = "/workspace/ImputationOT/data/PBMC.h5"
with h5py.File(file_path, 'r') as f:
    def print_attrs(name, obj):
        print(name)
    
    f.visititems(print_attrs)

matrix
matrix/barcodes
matrix/data
matrix/features
matrix/features/_all_tag_keys
matrix/features/feature_type
matrix/features/genome
matrix/features/id
matrix/features/name
matrix/indices
matrix/indptr
matrix/shape


In [64]:
file_path = "/workspace/ImputationOT/data/PBMC.h5"
with h5py.File(file_path, 'r') as f:
    # 读取条形码（细胞ID）
    barcodes = f['matrix/barcodes'][:]
    
    # 读取特征（基因或其他特征）
    features = f['matrix/features/name'][:]  # 或者读取其他特征相关的数据集
    
    # 读取稀疏矩阵的主数据（非零值）
    data = f['matrix/data'][:]
    
    # 读取稀疏矩阵的行索引
    indices = f['matrix/indices'][:]
    
    # 读取稀疏矩阵的列指针
    indptr = f['matrix/indptr'][:]
    
    # 读取矩阵的形状（行数和列数）
    shape = f['matrix/shape'][:]

# 打印读取的数据的基本信息
print(f"Barcodes: {barcodes[:5]}")  # 打印前5个条形码
print(f"Features: {features[:5]}")  # 打印前5个特征
print(f"Data shape: {shape}")  # 打印矩阵的形状
print(data.shape)

Barcodes: [b'AAACCCAGTCGGCCTA-1' b'AAACCCATCAGATGCT-1' b'AAACGAAAGATTAGCA-1'
 b'AAACGAAAGTGCTACT-1' b'AAACGAAGTCGTAATC-1']
Features: [b'MIR1302-2HG' b'FAM138A' b'OR4F5' b'AL627309.1' b'AL627309.3']
Data shape: [36601  5140]
(14285737,)


In [75]:
obs = pd.DataFrame(index=barcodes)
var = pd.DataFrame(index=features)
sparse_matrix = csr_matrix((data, indices, indptr), shape=(5140, 36601))

adata = ad.AnnData(X=sparse_matrix, obs=obs, var=var)
print(adata)

# 保存为 .h5ad 文件
# adata.write("/workspace/ImputationOT/data/PBMC.h5ad")

AnnData object with n_obs × n_vars = 5140 × 36601


  utils.warn_names_duplicates("var")


In [76]:
file_path = "/workspace/ImputationOT/data/PBMC_info.h5"
with h5py.File(file_path, 'r') as f:
    def print_attrs(name, obj):
        print(name)
    
    f.visititems(print_attrs)

barcode_idx
barcode_info
barcode_info/genomes
barcode_info/pass_filter
barcodes
count
feature_idx
features
features/_all_tag_keys
features/feature_type
features/genome
features/id
features/name
gem_group
library_idx
library_info
metrics_json
umi
umi_type


In [86]:
file_path = "/workspace/ImputationOT/data/PBMC_info.h5"
with h5py.File(file_path, 'r') as f:
    barcode_info_genomes = f['barcode_info/genomes'][:]  # 读取 barcode_info/genomes
    barcode_info_pass_filter = f['barcode_info/pass_filter'][:]  # 读取 barcode_info/pass_filter
    barcodes = f['barcodes'][:]  # 读取 barcodes
    features_info = f['features/name'][:]
    umi_type = f['umi_type'][:]

# adata.obs['genomes'] = barcode_info
print(barcode_info_genomes)
print(barcode_info_pass_filter)
print(barcodes)
adata.var['feature_name'] = [feature.decode('utf-8') for feature in features_info]
adata.uns['umi_type'] = umi_type

[b'GRCh38']
[[   575      0      0]
 [   748      0      0]
 [  1031      0      0]
 ...
 [909282      0      0]
 [909554      0      0]
 [909568      0      0]]
[b'AAACCCAAGAAACCCG' b'AAACCCAAGAAAGCGA' b'AAACCCAAGAAATTCG' ...
 b'TTTGTTGTCTTTCGAT' b'TTTGTTGTCTTTGATC' b'TTTGTTGTCTTTGCAT']


In [99]:
barcodes_file = '/workspace/ImputationOT/data/GSM5008737_RNA_3P-barcodes.tsv.gz'
features_file = '/workspace/ImputationOT/data/GSM5008737_RNA_3P-features.tsv.gz'
matrix_file = '/workspace/ImputationOT/data/GSM5008737_RNA_3P-matrix.mtx.gz'

barcodes = pd.read_csv(barcodes_file, header=None, sep='\t')
features = pd.read_csv(features_file, header=None, sep='\t')
matrix = scipy.io.mmread(matrix_file)

adata = ad.AnnData(X=matrix.T)

adata.obs['barcodes'] = barcodes[0].values
adata.var['symbols'] = features[0].values

print(adata)

AnnData object with n_obs × n_vars = 161764 × 33538
    obs: 'barcodes'
    var: 'symbols'


In [103]:
print(adata)

AnnData object with n_obs × n_vars = 161764 × 33538
    obs: 'barcodes', 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'Batch'
    var: 'symbols'


In [100]:
data_3P = pd.read_csv('/workspace/ImputationOT/data/data_3P.csv.gz')
print(data_3P.head())
print(data_3P.shape)

            Unnamed: 0  nCount_ADT  nFeature_ADT  nCount_RNA  nFeature_RNA  \
0  L1_AAACCCAAGAAACTCA        7535           217       10823          2915   
1  L1_AAACCCAAGACATACA        6013           209        5864          1617   
2  L1_AAACCCACAACTGGTT        6620           213        5067          1381   
3  L1_AAACCCACACGTACTA        3567           202        4786          1890   
4  L1_AAACCCACAGCATACT        6402           215        6505          1621   

      orig.ident lane donor  time celltype.l1 celltype.l2 celltype.l3 Phase  \
0  SeuratProject   L1    P2     7        Mono   CD14 Mono   CD14 Mono    G1   
1  SeuratProject   L1    P1     7       CD4 T     CD4 TCM   CD4 TCM_1    G1   
2  SeuratProject   L1    P4     2       CD8 T   CD8 Naive   CD8 Naive     S   
3  SeuratProject   L1    P3     7          NK          NK        NK_2    G1   
4  SeuratProject   L1    P4     7       CD8 T   CD8 Naive   CD8 Naive    G1   

    Batch  
0  Batch1  
1  Batch1  
2  Batch1  
3  Batch

In [102]:
data_3P.set_index('Unnamed: 0', inplace=True)
adata.obs = adata.obs.merge(data_3P, left_on='barcodes', right_index=True, how='left')
print(adata.obs.head())

              barcodes  nCount_ADT  nFeature_ADT  nCount_RNA  nFeature_RNA  \
0  L1_AAACCCAAGAAACTCA        7535           217       10823          2915   
1  L1_AAACCCAAGACATACA        6013           209        5864          1617   
2  L1_AAACCCACAACTGGTT        6620           213        5067          1381   
3  L1_AAACCCACACGTACTA        3567           202        4786          1890   
4  L1_AAACCCACAGCATACT        6402           215        6505          1621   

      orig.ident lane donor  time celltype.l1 celltype.l2 celltype.l3 Phase  \
0  SeuratProject   L1    P2     7        Mono   CD14 Mono   CD14 Mono    G1   
1  SeuratProject   L1    P1     7       CD4 T     CD4 TCM   CD4 TCM_1    G1   
2  SeuratProject   L1    P4     2       CD8 T   CD8 Naive   CD8 Naive     S   
3  SeuratProject   L1    P3     7          NK          NK        NK_2    G1   
4  SeuratProject   L1    P4     7       CD8 T   CD8 Naive   CD8 Naive    G1   

    Batch  
0  Batch1  
1  Batch1  
2  Batch1  
3  Batch