In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import torch
import torch.nn as nn
import torch.optim as optim
import ot
import sys
import os
import tarfile
import scipy.io
import h5py
import gzip
from scipy.sparse import issparse
from scipy.sparse import csr_matrix

In [41]:
# 读取细胞和基因信息
rna_cell = pd.read_csv("/workspace/ImputationOT/data/GSM3271040_RNA_sciCAR_A549_cell.txt.gz")
rna_gene = pd.read_csv("/workspace/ImputationOT/data/GSM3271040_RNA_sciCAR_A549_gene.txt.gz")

print(rna_cell.head())
print()
print(rna_gene.head())

mm_file_path = "/workspace/ImputationOT/data/GSM3271040_RNA_sciCAR_A549_gene_count.txt.gz"
gene_count_matrix = scipy.io.mmread(mm_file_path)

print(gene_count_matrix.shape)

rna_adata = ad.AnnData(X=gene_count_matrix.T.tocsr(), obs=rna_cell, var=rna_gene)
# rna_adata.write("/workspace/ImputationOT/data/rna_sciCAR_A549.h5ad")
print(rna_adata)

                     sample cell_name experiment  treatment_time
0  sci-RNA-A-001.CGCCAGGCAT      293T    coassay             NaN
1  sci-RNA-A-001.AAGTACGTTA      A549    coassay             3.0
2  sci-RNA-A-001.GCCATCAACT       3T3    coassay             NaN
3  sci-RNA-A-001.TCTCTCATCC      A549    coassay             0.0
4  sci-RNA-A-001.TCCGCCGGTC      A549    coassay             3.0

             gene_id   gene_type gene_short_name
0  ENSG00000223972.4  pseudogene         DDX11L1
1  ENSG00000227232.4  pseudogene          WASH7P
2  ENSG00000243485.2     lincRNA      MIR1302-11
3  ENSG00000237613.2     lincRNA         FAM138A
4  ENSG00000268020.2  pseudogene          OR4G4P
(113153, 6093)
AnnData object with n_obs × n_vars = 6093 × 113153
    obs: 'sample', 'cell_name', 'experiment', 'treatment_time'
    var: 'gene_id', 'gene_type', 'gene_short_name'




In [29]:
adata = ad.read_h5ad('/workspace/ImputationOT/data/facs_normal_lung_blood_scanpy.20200205.RC4.h5ad')
print(adata)

AnnData object with n_obs × n_vars = 9409 × 60970
    obs: 'nGene', 'nReads', 'orig.ident', 'plate.barcode', 'cell.id', 'organism', 'tissue', 'region', 'label', 'sorter', 'sort.location', 'sample', 'location', 'patient', 'percent.ercc', 'percent.ribo', 'gating', 'free_annotation', 'Number of input reads', 'Uniquely mapped reads number', 'Number of splices: Total', 'Number of splices: Annotated (sjdb)', 'Number of splices: GT/AG', 'Number of splices: GC/AG', 'Number of splices: AT/AC', 'Number of splices: Non-canonical', 'Number of reads mapped to multiple loci', 'Number of reads mapped to too many loci', 'Number of chimeric reads', 'Mapping speed, Million of reads per hour', 'Average input read length', 'Uniquely mapped reads %', 'Average mapped length', 'Mismatch rate per base, %', 'Deletion rate per base', 'Deletion average length', 'Insertion rate per base', 'Insertion average length', '% of reads mapped to multiple loci', '% of reads mapped to too many loci', '% of reads unmapped: 

In [33]:
print(adata.obs['patient'])

index
A1_B002014.gencode.vH29     2
A1_B003138.gencode.vH29     2
A1_B003140.gencode.vH29     2
A10_B003138.gencode.vH29    2
A12_B003141.gencode.vH29    2
                           ..
M5_B001771.gencode.vH29     3
N2_B001769.gencode.vH29     3
N2_B002460.gencode.vH29     3
O2_B001774.gencode.vH29     3
O7_B001774.gencode.vH29     3
Name: patient, Length: 9409, dtype: int64


In [46]:
print(adata.var['gene_type'])

0             pseudogene
1             pseudogene
2                lincRNA
3                lincRNA
4             pseudogene
               ...      
113148    protein_coding
113149           Mt_tRNA
113150    protein_coding
113151           Mt_tRNA
113152           Mt_tRNA
Name: gene_type, Length: 113153, dtype: category
Categories (46, object): ['3prime_overlapping_ncRNA', '3prime_overlapping_ncrna', 'IG_C_gene', 'IG_C_pseudogene', ..., 'transcribed_unitary_pseudogene', 'transcribed_unprocessed_pseudogene', 'unitary_pseudogene', 'unprocessed_pseudogene']


In [29]:
X = adata.X
print("Matrix Shape:", X.shape)
print("Density:", np.count_nonzero(X) / (X.shape[0] * X.shape[1]))
# print("Density:", X.nnz / (X.shape[0] * X.shape[1]))
print("Minimum Value:", X.min())
print("Maximum Value:", X.max())

Matrix Shape: (9409, 60970)
Density: 0.04190173273600859
Minimum Value: 0.0
Maximum Value: 13.540438


In [38]:
atac_cell = pd.read_csv("/workspace/ImputationOT/data/GSM3271041_ATAC_sciCAR_A549_cell.txt.gz")
atac_peak = pd.read_csv("/workspace/ImputationOT/data/GSM3271041_ATAC_sciCAR_A549_peak.txt.gz")

print(rna_cell.head())
print()
print(atac_peak.head())

mm_file_path = "/workspace/ImputationOT/data/GSM3271041_ATAC_sciCAR_A549_peak_count.txt.gz"
peak_count_matrix = scipy.io.mmread(mm_file_path)

print(peak_count_matrix.shape)

atac_peak = atac_peak.astype(str)
atac_adata = ad.AnnData(X=peak_count_matrix.T.tocsr(), obs=atac_cell, var=atac_peak)
# atac_adata.write("/workspace/ImputationOT/data/atac_sciCAR_A549.h5ad")
print(atac_adata)

  atac_peak = pd.read_csv("/workspace/ImputationOT/data/GSM3271041_ATAC_sciCAR_A549_peak.txt.gz")


                     sample cell_name experiment  treatment_time
0  sci-RNA-A-001.CGCCAGGCAT      293T    coassay             NaN
1  sci-RNA-A-001.AAGTACGTTA      A549    coassay             3.0
2  sci-RNA-A-001.GCCATCAACT       3T3    coassay             NaN
3  sci-RNA-A-001.TCTCTCATCC      A549    coassay             0.0
4  sci-RNA-A-001.TCCGCCGGTC      A549    coassay             3.0

   id           peak chr  start    end
0   1   1-9963-10665   1   9963  10665
1   2  1-11369-12010   1  11369  12010
2   3  1-24886-25386   1  24886  25386
3   4  1-29054-30366   1  29054  30366
4   5  1-36073-36581   1  36073  36581
(189603, 6085)
AnnData object with n_obs × n_vars = 6085 × 189603
    obs: 'sample', 'source', 'group', 'experiment'
    var: 'id', 'peak', 'chr', 'start', 'end'




In [42]:
# 读取细胞和基因信息
rna_cell = pd.read_csv("/workspace/ImputationOT/data/GSM3271042_RNA_only_A549_cell.txt.gz")
rna_gene = pd.read_csv("/workspace/ImputationOT/data/GSM3271042_RNA_only_A549_gene.txt.gz")

print(rna_cell.head())
print()
print(rna_gene.head())

mm_file_path = "/workspace/ImputationOT/data/GSM3271042_RNA_only_A549_gene_count.txt.gz"
gene_count_matrix = scipy.io.mmread(mm_file_path)

print(gene_count_matrix.shape)

rna_adata = ad.AnnData(X=gene_count_matrix.T.tocsr(), obs=rna_cell, var=rna_gene)
# rna_adata.write("/workspace/ImputationOT/data/rna_sciCAR_A549.h5ad")
print(rna_adata)

                       sample cell_name experiment  treatment_time
0  co-RNA-only-001.ACTCGACGCC      A549   RNA_only               0
1  co-RNA-only-001.AAGTACGTTA      A549   RNA_only               3
2  co-RNA-only-001.TCTCTCATCC      A549   RNA_only               0
3  co-RNA-only-001.GATCCAGCGT      A549   RNA_only               3
4  co-RNA-only-001.CTGGTTGGTT      A549   RNA_only               0

             gene_id   gene_type gene_short_name
0  ENSG00000223972.4  pseudogene         DDX11L1
1  ENSG00000227232.4  pseudogene          WASH7P
2  ENSG00000243485.2     lincRNA      MIR1302-11
3  ENSG00000237613.2     lincRNA         FAM138A
4  ENSG00000268020.2  pseudogene          OR4G4P
(113153, 1873)
AnnData object with n_obs × n_vars = 1873 × 113153
    obs: 'sample', 'cell_name', 'experiment', 'treatment_time'
    var: 'gene_id', 'gene_type', 'gene_short_name'




In [40]:
atac_cell = pd.read_csv("/workspace/ImputationOT/data/GSM3271043_ATAC_only_A549_cell.txt.gz")
atac_peak = pd.read_csv("/workspace/ImputationOT/data/GSM3271043_ATAC_only_A549_peak.txt.gz")

print(rna_cell.head())
print()
print(atac_peak.head())

mm_file_path = "/workspace/ImputationOT/data/GSM3271043_ATAC_only_A549_peak_count.txt.gz"
peak_count_matrix = scipy.io.mmread(mm_file_path)

print(peak_count_matrix.shape)

atac_peak = atac_peak.astype(str)
atac_adata = ad.AnnData(X=peak_count_matrix.T.tocsr(), obs=atac_cell, var=atac_peak)
# atac_adata.write("/workspace/ImputationOT/data/atac_sciCAR_A549.h5ad")
print(atac_adata)

  atac_peak = pd.read_csv("/workspace/ImputationOT/data/GSM3271043_ATAC_only_A549_peak.txt.gz")


                     sample cell_name experiment  treatment_time
0  sci-RNA-A-001.CGCCAGGCAT      293T    coassay             NaN
1  sci-RNA-A-001.AAGTACGTTA      A549    coassay             3.0
2  sci-RNA-A-001.GCCATCAACT       3T3    coassay             NaN
3  sci-RNA-A-001.TCTCTCATCC      A549    coassay             0.0
4  sci-RNA-A-001.TCCGCCGGTC      A549    coassay             3.0

   id           peak chr  start    end
0   1   1-9963-10665   1   9963  10665
1   2  1-11369-12010   1  11369  12010
2   3  1-24886-25386   1  24886  25386
3   4  1-29054-30366   1  29054  30366
4   5  1-36073-36581   1  36073  36581
(189603, 2006)




AnnData object with n_obs × n_vars = 2006 × 189603
    obs: 'sample', 'source', 'group', 'experiment'
    var: 'id', 'peak', 'chr', 'start', 'end'


In [43]:
# 读取细胞和基因信息
rna_cell = pd.read_csv("/workspace/ImputationOT/data/GSM3271044_RNA_mouse_kidney_cell.txt.gz")
rna_gene = pd.read_csv("/workspace/ImputationOT/data/GSM3271044_RNA_mouse_kidney_gene.txt.gz")

print(rna_cell.head())
print()
print(rna_gene.head())

mm_file_path = "/workspace/ImputationOT/data/GSM3271044_RNA_mouse_kidney_gene_count.txt.gz"
gene_count_matrix = scipy.io.mmread(mm_file_path)

print(gene_count_matrix.shape)

rna_adata = ad.AnnData(X=gene_count_matrix.T.tocsr(), obs=rna_cell, var=rna_gene)
# rna_adata.write("/workspace/ImputationOT/data/rna_sciCAR_A549.h5ad")
print(rna_adata)

                            sample source    replicate experiment     tsne_1  \
0  coRNA-RNA-plate1-001.TCGGCGTCGT  Mouse  Replicate 1    coassay        NaN   
1  coRNA-RNA-plate1-001.CATGACTCAA  Mouse  Replicate 1    coassay   2.808426   
2  coRNA-RNA-plate1-001.CGCCAGGCAT  Mouse  Replicate 1    coassay  -4.660380   
3  coRNA-RNA-plate1-001.CTGGTTGGTT  Mouse  Replicate 1    coassay        NaN   
4  coRNA-RNA-plate1-001.CCAGGCTCTT  Mouse  Replicate 2    coassay  10.309332   

      tsne_2                    cell_name  
0        NaN                          NaN  
1 -45.948924         Loop of Henle cells   
2 -41.004499         Loop of Henle cells   
3        NaN                          NaN  
4  11.143745  Proximal tubule S1/S2 cells  

                gene_id             gene_type gene_short_name
0  ENSMUSG00000102693.1                   TEC   4933401J01Rik
1  ENSMUSG00000064842.1                 snRNA         Gm26206
2  ENSMUSG00000051951.5        protein_coding            Xkr4
3  ENS



In [44]:
atac_cell = pd.read_csv("/workspace/ImputationOT/data/GSM3271045_ATAC_mouse_kidney_cell.txt.gz")
atac_peak = pd.read_csv("/workspace/ImputationOT/data/GSM3271045_ATAC_mouse_kidney_peak.txt.gz")

print(rna_cell.head())
print()
print(atac_peak.head())

mm_file_path = "/workspace/ImputationOT/data/GSM3271045_ATAC_mouse_kidney_peak_count.txt.gz"
peak_count_matrix = scipy.io.mmread(mm_file_path)

print(peak_count_matrix.shape)

atac_peak = atac_peak.astype(str)
atac_adata = ad.AnnData(X=peak_count_matrix.T.tocsr(), obs=atac_cell, var=atac_peak)
# atac_adata.write("/workspace/ImputationOT/data/atac_sciCAR_A549.h5ad")
print(atac_adata)

                            sample source    replicate experiment     tsne_1  \
0  coRNA-RNA-plate1-001.TCGGCGTCGT  Mouse  Replicate 1    coassay        NaN   
1  coRNA-RNA-plate1-001.CATGACTCAA  Mouse  Replicate 1    coassay   2.808426   
2  coRNA-RNA-plate1-001.CGCCAGGCAT  Mouse  Replicate 1    coassay  -4.660380   
3  coRNA-RNA-plate1-001.CTGGTTGGTT  Mouse  Replicate 1    coassay        NaN   
4  coRNA-RNA-plate1-001.CCAGGCTCTT  Mouse  Replicate 2    coassay  10.309332   

      tsne_2                    cell_name  
0        NaN                          NaN  
1 -45.948924         Loop of Henle cells   
2 -41.004499         Loop of Henle cells   
3        NaN                          NaN  
4  11.143745  Proximal tubule S1/S2 cells  

   id                      peak         chr   start     end
0   1    GL456210.1-58882-59082  GL456210.1   58882   59082
1   2  GL456210.1-110303-110503  GL456210.1  110303  110503
2   3  GL456210.1-123592-123792  GL456210.1  123592  123792
3   4  GL45621



In [67]:
file_path = "/workspace/ImputationOT/data/PBMC.h5"
with h5py.File(file_path, 'r') as f:
    def print_attrs(name, obj):
        print(name)
    
    f.visititems(print_attrs)

matrix
matrix/barcodes
matrix/data
matrix/features
matrix/features/_all_tag_keys
matrix/features/feature_type
matrix/features/genome
matrix/features/id
matrix/features/name
matrix/indices
matrix/indptr
matrix/shape


In [64]:
file_path = "/workspace/ImputationOT/data/PBMC.h5"
with h5py.File(file_path, 'r') as f:
    # 读取条形码（细胞ID）
    barcodes = f['matrix/barcodes'][:]
    
    # 读取特征（基因或其他特征）
    features = f['matrix/features/name'][:]  # 或者读取其他特征相关的数据集
    
    # 读取稀疏矩阵的主数据（非零值）
    data = f['matrix/data'][:]
    
    # 读取稀疏矩阵的行索引
    indices = f['matrix/indices'][:]
    
    # 读取稀疏矩阵的列指针
    indptr = f['matrix/indptr'][:]
    
    # 读取矩阵的形状（行数和列数）
    shape = f['matrix/shape'][:]

# 打印读取的数据的基本信息
print(f"Barcodes: {barcodes[:5]}")  # 打印前5个条形码
print(f"Features: {features[:5]}")  # 打印前5个特征
print(f"Data shape: {shape}")  # 打印矩阵的形状
print(data.shape)

Barcodes: [b'AAACCCAGTCGGCCTA-1' b'AAACCCATCAGATGCT-1' b'AAACGAAAGATTAGCA-1'
 b'AAACGAAAGTGCTACT-1' b'AAACGAAGTCGTAATC-1']
Features: [b'MIR1302-2HG' b'FAM138A' b'OR4F5' b'AL627309.1' b'AL627309.3']
Data shape: [36601  5140]
(14285737,)


In [75]:
obs = pd.DataFrame(index=barcodes)
var = pd.DataFrame(index=features)
sparse_matrix = csr_matrix((data, indices, indptr), shape=(5140, 36601))

adata = ad.AnnData(X=sparse_matrix, obs=obs, var=var)
print(adata)

# 保存为 .h5ad 文件
# adata.write("/workspace/ImputationOT/data/PBMC.h5ad")

AnnData object with n_obs × n_vars = 5140 × 36601


  utils.warn_names_duplicates("var")


In [76]:
file_path = "/workspace/ImputationOT/data/PBMC_info.h5"
with h5py.File(file_path, 'r') as f:
    def print_attrs(name, obj):
        print(name)
    
    f.visititems(print_attrs)

barcode_idx
barcode_info
barcode_info/genomes
barcode_info/pass_filter
barcodes
count
feature_idx
features
features/_all_tag_keys
features/feature_type
features/genome
features/id
features/name
gem_group
library_idx
library_info
metrics_json
umi
umi_type


In [86]:
file_path = "/workspace/ImputationOT/data/PBMC_info.h5"
with h5py.File(file_path, 'r') as f:
    barcode_info_genomes = f['barcode_info/genomes'][:]  # 读取 barcode_info/genomes
    barcode_info_pass_filter = f['barcode_info/pass_filter'][:]  # 读取 barcode_info/pass_filter
    barcodes = f['barcodes'][:]  # 读取 barcodes
    features_info = f['features/name'][:]
    umi_type = f['umi_type'][:]

# adata.obs['genomes'] = barcode_info
print(barcode_info_genomes)
print(barcode_info_pass_filter)
print(barcodes)
adata.var['feature_name'] = [feature.decode('utf-8') for feature in features_info]
adata.uns['umi_type'] = umi_type

[b'GRCh38']
[[   575      0      0]
 [   748      0      0]
 [  1031      0      0]
 ...
 [909282      0      0]
 [909554      0      0]
 [909568      0      0]]
[b'AAACCCAAGAAACCCG' b'AAACCCAAGAAAGCGA' b'AAACCCAAGAAATTCG' ...
 b'TTTGTTGTCTTTCGAT' b'TTTGTTGTCTTTGATC' b'TTTGTTGTCTTTGCAT']


In [15]:
barcodes_file = '/workspace/ImputationOT/data/GSM5008740_RNA_5P-barcodes.tsv.gz'
features_file = '/workspace/ImputationOT/data/GSM5008740_RNA_5P-features.tsv.gz'
matrix_file = '/workspace/ImputationOT/data/GSM5008740_RNA_5P-matrix.mtx.gz'

barcodes = pd.read_csv(barcodes_file, header=None, sep='\t')
features = pd.read_csv(features_file, header=None, sep='\t')
matrix = scipy.io.mmread(matrix_file)

adata_rna = ad.AnnData(X=matrix.T)

adata_rna.obs['barcodes'] = barcodes[0].values
adata_rna.var['symbols'] = features[0].values

data_5P = pd.read_csv('/workspace/ImputationOT/data/meta_data_5P.csv.gz')
data_5P.set_index('Unnamed: 0', inplace=True)
adata_rna.obs = adata_rna.obs.merge(data_5P, left_on='barcodes', right_index=True, how='left')
adata_rna.var['feature_type'] = 'RNA'

print(adata_rna)

AnnData object with n_obs × n_vars = 49147 × 33538
    obs: 'barcodes', 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Batch'
    var: 'symbols', 'feature_type'


In [16]:
barcodes_file = '/workspace/ImputationOT/data/GSM5008741_ADT_5P-barcodes.tsv.gz'
features_file = '/workspace/ImputationOT/data/GSM5008741_ADT_5P-features.tsv.gz'
matrix_file = '/workspace/ImputationOT/data/GSM5008741_ADT_5P-matrix.mtx.gz'

barcodes = pd.read_csv(barcodes_file, header=None, sep='\t')
features = pd.read_csv(features_file, header=None, sep='\t')
matrix = scipy.io.mmread(matrix_file)

adata_adt = ad.AnnData(X=matrix.T)

adata_adt.obs['barcodes'] = barcodes[0].values
adata_adt.var['symbols'] = features[0].values

data_5P = pd.read_csv('/workspace/ImputationOT/data/meta_data_5P.csv.gz')
data_5P.set_index('Unnamed: 0', inplace=True)
adata_adt.obs = adata_adt.obs.merge(data_5P, left_on='barcodes', right_index=True, how='left')
adata_adt.var['feature_type'] = 'ADT'
print(adata_adt)

AnnData object with n_obs × n_vars = 49147 × 54
    obs: 'barcodes', 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Batch'
    var: 'symbols', 'feature_type'


In [17]:
adata_combined = ad.concat([adata_rna, adata_adt], axis=1, merge='same')
adata_combined.write("/workspace/ImputationOT/data/pbmc2.h5ad")
print(adata_combined)

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 49147 × 33592
    obs: 'barcodes', 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Batch'
    var: 'symbols', 'feature_type'


In [27]:
print(adata_combined.obs["donor"].value_counts())

donor
P4    9006
P1    7160
P8    6764
P7    6169
P3    5407
P5    5228
P6    4746
P2    4667
Name: count, dtype: int64


In [28]:
print(adata_combined.obs["celltype.l1"])

0        CD8 T
1         Mono
2        CD8 T
3         Mono
4        CD8 T
         ...  
49142     Mono
49143     Mono
49144     Mono
49145     Mono
49146        B
Name: celltype.l1, Length: 49147, dtype: category
Categories (8, object): ['B', 'CD4 T', 'CD8 T', 'DC', 'Mono', 'NK', 'other', 'other T']


In [121]:
file_path = '/workspace/ImputationOT/data/GSM2230757_human1_umifm_counts.csv.gz'

data_df = pd.read_csv(file_path)

print(data_df.head())
print(data_df.shape)

                    Unnamed: 0              barcode assigned_cluster  A1BG  \
0  human1_lib1.final_cell_0001  GATGACGGAC-GGTGGGAT           acinar     0   
1  human1_lib1.final_cell_0002  GAGCGTTGCT-ACCTTCTT           acinar     0   
2  human1_lib1.final_cell_0003    CTTACGGG-CCATTACT           acinar     0   
3  human1_lib1.final_cell_0004  GATGTACACG-TTAAACTG           acinar     0   
4  human1_lib1.final_cell_0005  GAGATTGCGA-GTCGTCGT           acinar     0   

   A1CF  A2M  A2ML1  A4GALT  A4GNT  AA06  ...  ZWILCH  ZWINT  ZXDA  ZXDB  \
0     4    0      0       0      0     0  ...       0      0     0     0   
1     0    0      0       0      0     0  ...       0      0     0     0   
2     0    0      0       0      0     0  ...       0      0     0     0   
3     0    0      0       0      0     0  ...       1      0     0     0   
4     0    0      0       0      0     0  ...       0      0     0     0   

   ZXDC  ZYG11B  ZYX  ZZEF1  ZZZ3  pk  
0     0       0    2      0     0 

In [58]:
file_path = '/workspace/ImputationOT/data/GSM2230757_human1_umifm_counts.csv.gz'
data_df = pd.read_csv(file_path)
data_df = data_df.drop(columns=['Unnamed: 0'])
obs_data = data_df[['barcode', 'assigned_cluster']]
gene_expression_data = data_df.drop(columns=['barcode', 'assigned_cluster'])
adata1 = ad.AnnData(X=gene_expression_data.values)
adata1.obs = obs_data
adata1.var['gene_names'] = gene_expression_data.columns
adata1.obs['Batch'] = '1'
print(adata1)

AnnData object with n_obs × n_vars = 1937 × 20125
    obs: 'barcode', 'assigned_cluster', 'Batch'
    var: 'gene_names'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata1.obs['Batch'] = '1'


In [59]:
file_path = '/workspace/ImputationOT/data/GSM2230758_human2_umifm_counts.csv.gz'
data_df = pd.read_csv(file_path)
data_df = data_df.drop(columns=['Unnamed: 0'])
obs_data = data_df[['barcode', 'assigned_cluster']]
gene_expression_data = data_df.drop(columns=['barcode', 'assigned_cluster'])
adata2 = ad.AnnData(X=gene_expression_data.values)
adata2.obs = obs_data
adata2.var['gene_names'] = gene_expression_data.columns
adata2.obs['Batch'] = '2'
print(adata2)

AnnData object with n_obs × n_vars = 1724 × 20125
    obs: 'barcode', 'assigned_cluster', 'Batch'
    var: 'gene_names'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata2.obs['Batch'] = '2'


In [60]:
file_path = '/workspace/ImputationOT/data/GSM2230759_human3_umifm_counts.csv.gz'
data_df = pd.read_csv(file_path)
data_df = data_df.drop(columns=['Unnamed: 0'])
obs_data = data_df[['barcode', 'assigned_cluster']]
gene_expression_data = data_df.drop(columns=['barcode', 'assigned_cluster'])
adata3 = ad.AnnData(X=gene_expression_data.values)
adata3.obs = obs_data
adata3.var['gene_names'] = gene_expression_data.columns
adata3.obs['Batch'] = '3'
print(adata3)

AnnData object with n_obs × n_vars = 3605 × 20125
    obs: 'barcode', 'assigned_cluster', 'Batch'
    var: 'gene_names'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata3.obs['Batch'] = '3'


In [61]:
file_path = '/workspace/ImputationOT/data/GSM2230760_human4_umifm_counts.csv.gz'
data_df = pd.read_csv(file_path)
data_df = data_df.drop(columns=['Unnamed: 0'])
obs_data = data_df[['barcode', 'assigned_cluster']]
gene_expression_data = data_df.drop(columns=['barcode', 'assigned_cluster'])
adata4 = ad.AnnData(X=gene_expression_data.values)
adata4.obs = obs_data
adata4.var['gene_names'] = gene_expression_data.columns
adata4.obs['Batch'] = '4'
print(adata4)

AnnData object with n_obs × n_vars = 1303 × 20125
    obs: 'barcode', 'assigned_cluster', 'Batch'
    var: 'gene_names'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata4.obs['Batch'] = '4'


In [62]:
adata_combined = ad.concat([adata1, adata2, adata3, adata4], merge='same')
print(adata_combined)

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 8569 × 20125
    obs: 'barcode', 'assigned_cluster', 'Batch'
    var: 'gene_names'


OSError: [Errno 28] Can't synchronously write data (file write failed: time = Sun Aug 25 07:21:26 2024
, filename = '/workspace/ImputationOT/data/hmp.h5ad', file descriptor = 70, errno = 28, error message = 'No space left on device', buf = 0x7d67621a7810, total write size = 150749608, bytes this sub-write = 150749608, bytes actually written = 18446744073709551615, offset = 0)

In [63]:
adata_combined.write("/workspace/ImputationOT/data/hmp.h5ad")

OSError: [Errno 28] Can't synchronously write data (file write failed: time = Sun Aug 25 07:24:47 2024
, filename = '/workspace/ImputationOT/data/hmp.h5ad', file descriptor = 71, errno = 28, error message = 'No space left on device', buf = 0x7d666f3a3810, total write size = 150421928, bytes this sub-write = 150421928, bytes actually written = 18446744073709551615, offset = 0)