In [None]:
import pandas as pd
import scanpy as sc
import scipy.io

# File paths for Replicate 1 (ATAC and RNA)
atac_matrix_file_1 = "GSM8780558_K562_DMSO_ATAC_1_matrix.mtx.gz"
atac_barcodes_file_1 = "GSM8780558_K562_DMSO_ATAC_1_barcodes.tsv.gz"
atac_fragments_file_1 = "GSM8780558_K562_DMSO_ATAC_1_atac_fragments.tsv.gz"

rna_matrix_file_1 = "GSM8780566_K562_DMSO_RNA_1_matrix.mtx.gz"
rna_barcodes_file_1 = "GSM8780566_K562_DMSO_RNA_1_barcodes.tsv.gz"
rna_features_file_1 = "GSM8780566_K562_DMSO_RNA_1_features.tsv.gz"

# File paths for Replicate 2 (ATAC and RNA)
atac_matrix_file_2 = "GSM8780559_K562_DMSO_ATAC_2_matrix.mtx.gz"
atac_barcodes_file_2 = "GSM8780559_K562_DMSO_ATAC_2_barcodes.tsv.gz"
atac_fragments_file_2 = "GSM8780559_K562_DMSO_ATAC_2_atac_fragments.tsv.gz"

rna_matrix_file_2 = "GSM8780567_K562_DMSO_RNA_2_matrix.mtx.gz"
rna_barcodes_file_2 = "GSM8780567_K562_DMSO_RNA_2_barcodes.tsv.gz"
rna_features_file_2 = "GSM8780567_K562_DMSO_RNA_2_features.tsv.gz"

OUTPUT_FILE = "K562_DMSO_ATAC_RNA_combined_replicates.h5ad"

# Names given to the first three columns of the features table
# (the table inspected further down shows id, name, feature type, then coordinates).
FEATURE_COLUMNS = ["feature_id", "feature_name", "feature_type"]


def load_multiome_replicate(matrix_file, barcodes_file, features_file):
    """Load one matrix/barcodes/features triplet as a cells x features AnnData.

    Parameters
    ----------
    matrix_file : str
        Matrix Market file; rows are features and columns are cells.
    barcodes_file : str
        Tab-separated file without header; the first column holds one barcode
        per matrix column.
    features_file : str
        Tab-separated file without header; the first three columns are read as
        feature id, feature name and feature type, one row per matrix row.

    Returns
    -------
    AnnData
        Counts transposed to cells x features, with barcodes as ``obs_names``,
        feature ids as ``var_names`` and ``feature_name`` / ``feature_type``
        columns in ``var``.

    Raises
    ------
    ValueError
        If the matrix shape disagrees with the barcode/feature tables or the
        feature ids are not unique.
    """
    counts = scipy.io.mmread(matrix_file).T.tocsr()  # transpose: features x cells -> cells x features
    barcodes = pd.read_csv(barcodes_file, header=None, sep="\t", dtype=str).iloc[:, 0]
    features = pd.read_csv(features_file, header=None, sep="\t", usecols=[0, 1, 2], dtype=str)
    features.columns = FEATURE_COLUMNS

    if counts.shape != (len(barcodes), len(features)):
        raise ValueError(
            f"{matrix_file}: matrix is {counts.shape[0]} cells x {counts.shape[1]} features, "
            f"but found {len(barcodes)} barcodes and {len(features)} features"
        )
    if not features["feature_id"].is_unique:
        raise ValueError(f"{features_file}: feature ids are not unique")

    replicate = sc.AnnData(counts)
    replicate.obs_names = barcodes.to_numpy()
    replicate.var_names = features["feature_id"].to_numpy()
    replicate.var["feature_name"] = features["feature_name"].to_numpy()
    replicate.var["feature_type"] = features["feature_type"].to_numpy()
    return replicate


# 1-4. Load each replicate.
# The features table lists both "Gene Expression" and "Peaks" rows, and the
# ATAC-sample matrix of replicate 1 was found identical to the RNA-sample matrix
# (see the comparison at the end of this notebook), so one triplet per replicate
# already carries both modalities. The *_atac_fragments.tsv.gz files hold one
# row per fragment (chrom, start, end, barcode, count) and are not matrix
# features, so they are not loaded here.
replicate_keys = ["rep1", "rep2"]
replicates = [
    load_multiome_replicate(rna_matrix_file_1, rna_barcodes_file_1, rna_features_file_1),
    load_multiome_replicate(rna_matrix_file_2, rna_barcodes_file_2, rna_features_file_2),
]

# 5. Combine the replicates along the cell axis.
# The replicates do not share one feature set (255,992 vs 246,390 matrix rows in
# the outputs below), so stacking the raw matrices is impossible. An outer join
# aligns features by id and zero-fills features missing from a replicate.
# NOTE(review): a zero-filled peak means "not called in that replicate", not
# "measured as closed" - re-count fragments over a shared peak set before any
# quantitative cross-replicate ATAC comparison.
# `index_unique` suffixes every barcode with its replicate key because the same
# barcode can occur in both replicates.
combined = sc.concat(
    replicates,
    join="outer",
    label="replicate",
    keys=replicate_keys,
    index_unique="_",
    fill_value=0,
)

# Rebuild the feature annotation for the union of features.
feature_table = pd.concat([replicate.var for replicate in replicates])
feature_table = feature_table[~feature_table.index.duplicated(keep="first")]
for column in feature_table.columns:
    combined.var[column] = feature_table.loc[combined.var_names, column].to_numpy()

is_peak = (combined.var["feature_type"] == "Peaks").to_numpy()
is_gene = (combined.var["feature_type"] == "Gene Expression").to_numpy()
if not is_peak.any() or not is_gene.any():
    raise ValueError("Expected both 'Peaks' and 'Gene Expression' rows in the features tables")

# 6. Create the AnnData object for combined ATAC and RNA data (with two modalities)
adata = combined[:, is_peak].copy()  # X: cells x ATAC peaks

# Add RNA data to the AnnData object. Names must be set on the AnnData that is
# handed to `.raw`; they cannot be assigned on `adata.raw` afterwards.
adata.raw = combined[:, is_gene].copy()  # raw: cells x genes (unnormalized counts)

# Convenience lists under the names used by the original draft.
barcodes = adata.obs_names.tolist()
fragments = adata.var_names.tolist()  # ATAC peak ids
rna_features = adata.raw.var_names.tolist()  # gene ids

# 7. Save the combined AnnData object to an H5AD file (HDF5 format)
adata.write(OUTPUT_FILE)

print(f"HDF5 file created: {OUTPUT_FILE}")

In [2]:
import scanpy as sc
import scipy.io
import pandas as pd
import os

# Define input directory path. Set the K562_INPUT_DIR environment variable to
# point at another location; the original cluster path remains the default.
input_dir = os.environ.get("K562_INPUT_DIR", "/gpfs/home/asun/jin_lab/get/raw_data/k562")


def _input_path(filename):
    """Return the path of `filename` inside `input_dir`."""
    return os.path.join(input_dir, filename)


# File paths for Replicate 1 (ATAC and RNA)
atac_matrix_file_1 = _input_path("GSM8780558_K562_DMSO_ATAC_1_matrix.mtx.gz")
atac_barcodes_file_1 = _input_path("GSM8780558_K562_DMSO_ATAC_1_barcodes.tsv.gz")
atac_fragments_file_1 = _input_path("GSM8780558_K562_DMSO_ATAC_1_atac_fragments.tsv.gz")

rna_matrix_file_1 = _input_path("GSM8780566_K562_DMSO_RNA_1_matrix.mtx.gz")
rna_barcodes_file_1 = _input_path("GSM8780566_K562_DMSO_RNA_1_barcodes.tsv.gz")
rna_features_file_1 = _input_path("GSM8780566_K562_DMSO_RNA_1_features.tsv.gz")

# File paths for Replicate 2 (ATAC and RNA)
atac_matrix_file_2 = _input_path("GSM8780559_K562_DMSO_ATAC_2_matrix.mtx.gz")
atac_barcodes_file_2 = _input_path("GSM8780559_K562_DMSO_ATAC_2_barcodes.tsv.gz")
atac_fragments_file_2 = _input_path("GSM8780559_K562_DMSO_ATAC_2_atac_fragments.tsv.gz")

rna_matrix_file_2 = _input_path("GSM8780567_K562_DMSO_RNA_2_matrix.mtx.gz")
rna_barcodes_file_2 = _input_path("GSM8780567_K562_DMSO_RNA_2_barcodes.tsv.gz")
rna_features_file_2 = _input_path("GSM8780567_K562_DMSO_RNA_2_features.tsv.gz")

# Fail fast with a complete list of missing inputs before starting the slow
# matrix reads below.
required_files = [
    atac_matrix_file_1, atac_barcodes_file_1, atac_fragments_file_1,
    rna_matrix_file_1, rna_barcodes_file_1, rna_features_file_1,
    atac_matrix_file_2, atac_barcodes_file_2, atac_fragments_file_2,
    rna_matrix_file_2, rna_barcodes_file_2, rna_features_file_2,
]
missing_files = [path for path in required_files if not os.path.isfile(path)]
if missing_files:
    raise FileNotFoundError("Missing input files:\n" + "\n".join(missing_files))

# 1. Load the ATAC-seq data matrices (MTX format)
# The matrices are kept as stored on disk (not transposed); later cells
# transpose them when building AnnData objects.
atac_matrix_1 = scipy.io.mmread(atac_matrix_file_1).tocsc()
atac_matrix_2 = scipy.io.mmread(atac_matrix_file_2).tocsc()

# 2. Load the RNA-seq data matrices (MTX format)
# NOTE: for replicate 1 this matrix turns out to be identical to the ATAC one
# (see the comparison cell at the end of the notebook).
rna_matrix_1 = scipy.io.mmread(rna_matrix_file_1).tocsc()
rna_matrix_2 = scipy.io.mmread(rna_matrix_file_2).tocsc()



AttributeError: 'DataFrame' object has no attribute 'tolist'

In [3]:
# Inspect replicate 2's RNA-sample matrix: the sparse repr reports the storage
# format, dtype, number of stored elements and shape. The matrix has not been
# transposed at this point (presumably features x cells — confirm against the
# barcode/feature tables).
rna_matrix_2

<Compressed Sparse Column sparse matrix of dtype 'int64'
	with 135186850 stored elements and shape (246390, 13547)>

In [4]:
# Read replicate 1's ATAC barcodes as a DataFrame (tab-separated, no header row)
# so the raw table can be inspected before it is flattened into a list.
atac_barcodes_1 = pd.read_csv(atac_barcodes_file_1, header=None, sep="\t")

In [5]:
# Display the barcode table: a single column (named 0) with one barcode per row.
atac_barcodes_1

Unnamed: 0,0
0,AAACAGCCAAATATCC-1
1,AAACAGCCACCGGTAT-1
2,AAACAGCCACGCAACT-1
3,AAACAGCCATCACAGC-1
4,AAACAGCCATGTTGGC-1
...,...
12616,TTTGTGTTCGGTCAAT-1
12617,TTTGTTGGTCACGAAC-1
12618,TTTGTTGGTCCTTCAG-1
12619,TTTGTTGGTGGTTATG-1


In [None]:
# 3. Load the barcodes (cells)
def read_barcodes(barcodes_file):
    """Return the barcodes stored in `barcodes_file` as a list of strings.

    The file is read as a tab-separated table without a header row and its first
    column is returned. Selecting the column explicitly (rather than calling
    `.squeeze()`) still yields a list when the file has a single row, where
    `.squeeze()` would collapse the table to a scalar without `.tolist()`.
    """
    table = pd.read_csv(barcodes_file, header=None, sep="\t", dtype=str)
    return table.iloc[:, 0].tolist()


atac_barcodes_1 = read_barcodes(atac_barcodes_file_1)
atac_barcodes_2 = read_barcodes(atac_barcodes_file_2)

rna_barcodes_1 = read_barcodes(rna_barcodes_file_1)
rna_barcodes_2 = read_barcodes(rna_barcodes_file_2)



In [6]:
# Read replicate 1's fragments file in full as a DataFrame (tab-separated, no
# header row). comment="#" makes pandas ignore '#'-prefixed text, presumably the
# file's leading header lines — TODO confirm.
# NOTE: the resulting table is very large (see the row index in the output of
# the next cell); it is a per-fragment listing, not a per-feature one.
atac_fragments_1 = pd.read_csv(atac_fragments_file_1, header=None, sep="\t", comment="#")

In [7]:
# Display the fragments table. Its five columns look like chromosome, start,
# end, cell barcode and a count — presumably the standard fragments layout;
# verify against the data provider's documentation.
atac_fragments_1

Unnamed: 0,0,1,2,3,4
0,chr1,10001,10091,GCAGCCAGTATACTGG-1,1
1,chr1,10001,10097,GAGGTGAGTTGCACAA-1,1
2,chr1,10001,10198,GATGCATTCATGCCTC-1,1
3,chr1,10001,10222,GATCGAGCAATTAGGA-1,1
4,chr1,10005,10334,AGTAGCTTCACCTGCT-1,1
...,...,...,...,...,...
253517885,KI270713.1,39170,39410,TCTACCTCATCTTGAG-1,2
253517886,KI270713.1,39170,39410,TGTGAAACAGGAATCG-1,1
253517887,KI270713.1,39194,39519,AGGCCCAGTAGCTAGC-1,1
253517888,KI270713.1,39724,39939,CGCCAAATCCCTCACG-1,1


In [None]:
# 4. Load the features (one table per replicate)
# The features table lists both "Gene Expression" and "Peaks" rows, and the
# replicate-1 ATAC-sample matrix was found identical to the RNA-sample matrix
# (see the comparison at the end of this notebook), so the RNA-sample matrix,
# barcodes and features already describe both modalities. The
# *_atac_fragments.tsv.gz files hold one row per fragment (five columns) and are
# not matrix features; flattening them with `.squeeze().tolist()` raised the
# AttributeError seen above, so they are not loaded here.
FEATURE_COLUMNS = ["feature_id", "feature_name", "feature_type"]


def read_feature_table(features_file):
    """Read the first three columns of a features file as a DataFrame.

    Returns a DataFrame with string columns `feature_id`, `feature_name` and
    `feature_type`, one row per matrix row.

    Raises
    ------
    ValueError
        If the feature ids are not unique (they become `var_names`).
    """
    table = pd.read_csv(features_file, header=None, sep="\t", usecols=[0, 1, 2], dtype=str)
    table.columns = FEATURE_COLUMNS
    if not table["feature_id"].is_unique:
        raise ValueError(f"{features_file}: feature ids are not unique")
    return table


def build_replicate_adata(matrix, cell_barcodes, feature_table):
    """Wrap one features x cells matrix as a named cells x features AnnData.

    Raises
    ------
    ValueError
        If the matrix shape disagrees with the number of features or barcodes.
    """
    if matrix.shape != (len(feature_table), len(cell_barcodes)):
        raise ValueError(
            f"matrix is {matrix.shape[0]} features x {matrix.shape[1]} cells, but found "
            f"{len(feature_table)} features and {len(cell_barcodes)} barcodes"
        )
    replicate = sc.AnnData(matrix.T.tocsr())  # transpose to cells x features
    replicate.obs_names = cell_barcodes
    replicate.var_names = feature_table["feature_id"].to_numpy()
    replicate.var["feature_name"] = feature_table["feature_name"].to_numpy()
    replicate.var["feature_type"] = feature_table["feature_type"].to_numpy()
    return replicate


rna_features_1 = read_feature_table(rna_features_file_1)
rna_features_2 = read_feature_table(rna_features_file_2)

# 5. Combine the matrices, barcodes, and features
# The replicates do not share one feature set (255,992 vs 246,390 matrix rows in
# the outputs of this notebook), so `scipy.sparse.hstack` on the raw matrices
# cannot work. An outer join aligns features by id and zero-fills features
# missing from a replicate.
# NOTE(review): a zero-filled peak means "not called in that replicate", not
# "measured as closed" - re-count fragments over a shared peak set before any
# quantitative cross-replicate ATAC comparison.
# `index_unique` suffixes every barcode with its replicate key because the same
# barcode can occur in both replicates.
replicates = [
    build_replicate_adata(rna_matrix_1, rna_barcodes_1, rna_features_1),
    build_replicate_adata(rna_matrix_2, rna_barcodes_2, rna_features_2),
]
combined = sc.concat(
    replicates,
    join="outer",
    label="replicate",
    keys=["rep1", "rep2"],
    index_unique="_",
    fill_value=0,
)

# Rebuild the feature annotation for the union of features.
feature_table = pd.concat([replicate.var for replicate in replicates])
feature_table = feature_table[~feature_table.index.duplicated(keep="first")]
for column in feature_table.columns:
    combined.var[column] = feature_table.loc[combined.var_names, column].to_numpy()

is_peak = (combined.var["feature_type"] == "Peaks").to_numpy()
is_gene = (combined.var["feature_type"] == "Gene Expression").to_numpy()
if not is_peak.any() or not is_gene.any():
    raise ValueError("Expected both 'Peaks' and 'Gene Expression' rows in the features tables")

# 6. Create the AnnData object for combined ATAC and RNA data (with two modalities)
adata = combined[:, is_peak].copy()  # X: cells x ATAC peaks

# Add RNA data to the AnnData object. Names must be set on the AnnData that is
# handed to `.raw`; they cannot be assigned on `adata.raw` afterwards.
adata.raw = combined[:, is_gene].copy()  # raw: cells x genes (unnormalized counts)

# Convenience lists under the names used by the original draft.
barcodes = adata.obs_names.tolist()
fragments = adata.var_names.tolist()  # ATAC peak ids
rna_features = adata.raw.var_names.tolist()  # gene ids



In [10]:
# Wrap replicate 1's RNA-sample matrix in an AnnData for inspection.
adata = sc.AnnData(rna_matrix_1.T)  # transpose the loaded matrix; nothing is stored in `.raw` here


In [18]:
# Display the AnnData summary (n_obs x n_vars) to check the matrix dimensions.
adata

AnnData object with n_obs × n_vars = 12621 × 255992

In [17]:
# Read replicate 1's features file as a DataFrame (tab-separated, no header row)
# and display it. The table has six columns; the third one labels rows as
# "Gene Expression" or "Peaks", so this file describes both modalities.
# Remaining columns look like id, name, chromosome, start, end — TODO confirm.
rna_features_1 = pd.read_csv(rna_features_file_1, header=None, sep="\t")

rna_features_1

Unnamed: 0,0,1,2,3,4,5
0,ENSG00000243485,MIR1302-2HG,Gene Expression,chr1,29553,30267
1,ENSG00000237613,FAM138A,Gene Expression,chr1,36080,36081
2,ENSG00000186092,OR4F5,Gene Expression,chr1,65418,69055
3,ENSG00000238009,AL627309.1,Gene Expression,chr1,120931,133723
4,ENSG00000239945,AL627309.3,Gene Expression,chr1,91104,91105
...,...,...,...,...,...,...
255987,KI270713.1:15882-16673,KI270713.1:15882-16673,Peaks,KI270713.1,15882,16673
255988,KI270713.1:21469-22377,KI270713.1:21469-22377,Peaks,KI270713.1,21469,22377
255989,KI270713.1:26768-27527,KI270713.1:26768-27527,Peaks,KI270713.1,26768,27527
255990,KI270713.1:29577-30473,KI270713.1:29577-30473,Peaks,KI270713.1,29577,30473


In [None]:
# 7. Save the combined AnnData object to an H5AD file (HDF5 format)
OUTPUT_FILE = "K562_DMSO_ATAC_RNA_combined_replicates.h5ad"

# Writing stays opt-in, as in the original cell where the write call was
# commented out. The success message is now printed only when the file has
# actually been written, instead of unconditionally.
SAVE_COMBINED = False

if SAVE_COMBINED:
    adata.write(OUTPUT_FILE)
    print(f"HDF5 file created: {OUTPUT_FILE}")
else:
    print(f"Skipped writing {OUTPUT_FILE}: set SAVE_COMBINED = True to save `adata`.")

In [19]:
# Wrap replicate 1's ATAC-sample matrix in an AnnData and display its summary.
adata = sc.AnnData(atac_matrix_1.T)  # transpose the loaded ATAC-sample matrix (this is not RNA data and not `.raw`)
adata

AnnData object with n_obs × n_vars = 12621 × 255992

In [20]:
# Print the first five rows of the ATAC-sample matrix as a dense array
# (only this small slice is densified).
print(adata.X[:5, :].toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 2 0]]


In [21]:
# Wrap replicate 1's RNA-sample matrix the same way and print its first five
# rows for a side-by-side look against the ATAC-sample matrix.
rna_adata = sc.AnnData(rna_matrix_1.T)  # transpose the loaded RNA-sample matrix; nothing is stored in `.raw`
print(rna_adata.X[:5, :].toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 2 0]]


In [24]:
import numpy as np
# Compare the two count matrices without converting them to dense arrays:
# densifying two matrices of this size (12621 x 255992 in the output above)
# needs tens of gigabytes each.
atac_counts = adata.X
rna_counts = rna_adata.X
if atac_counts.shape != rna_counts.shape:
    data_identical = False
else:
    # The sparse elementwise "!=" stores only the entries that differ, so zero
    # stored elements means every count matches exactly.
    data_identical = (atac_counts != rna_counts).nnz == 0
print(f"Data matrices identical: {data_identical}")

Data matrices identical: True
