In [1]:
import scanpy as sc
import scipy.io
import pandas as pd
import os

# Define input directory path (change this to the path where your files are located)
input_dir = "/gpfs/home/asun/jin_lab/get/raw_data/k562"  # Set the appropriate directory path

rna_matrix_file = os.path.join(input_dir, "GSM8780566_K562_DMSO_RNA_matrix.mtx.gz")
rna_barcodes_file = os.path.join(input_dir, "GSM8780566_K562_DMSO_RNA_1_barcodes.tsv.gz")
rna_features_file_1 = os.path.join(input_dir, "GSM8780566_K562_DMSO_RNA_1_features.tsv.gz")

In [21]:
# 1. Load the RNA-seq data matrix (MTX format)
rna_matrix_1 = scipy.io.mmread(rna_matrix_file_1).tocsc()  # Sparse matrix format

# 2. Load the barcodes (cells)
rna_barcodes_1 = pd.read_csv(rna_barcodes_file_1, header=None, sep="\t").squeeze().tolist()

# 3. Load the features (genes)
rna_features_1 = pd.read_csv(rna_features_file_1, header=None, sep="\t")

rna_features_1.columns = ['gene_id', 'index', 'feature_type', 'chr', 'start', 'end']

rna_features_1['interval'] = rna_features_1.apply(lambda row: f"{row['chr']}:{row['start']}-{row['end']}", axis=1)

# 4. Create the AnnData object for RNA-seq data (cells x features)
adata_rna = sc.AnnData(rna_matrix_1.T)  # Transpose to make (cells x features)

adata_rna.obs_names = rna_barcodes_1  # Set the barcodes as obs (cells)
adata_rna.var_names = rna_features_1['index'].tolist()

rna_features_1.set_index('index', inplace=True)

# Store the feature metadata (gene_ids, feature_types, genome, interval) in `adata.var`
adata_rna.var['gene_id'] = rna_features_1['gene_id']
adata_rna.var['feature_type'] = rna_features_1['feature_type']
adata_rna.var['genome'] = "hg38"
adata_rna.var['interval'] = rna_features_1['interval']

# 5. Save the AnnData object to an H5AD file (HDF5 format)
#output_file = "K562_DMSO_RNA_combined_replicates.h5ad"
#adata_rna.write(output_file)

print(f"HDF5 file created: {output_file}")

HDF5 file created: K562_DMSO_RNA_combined_replicates.h5ad


In [None]:
output_file = "K562_DMSO_RNA_rep1.h5ad"
adata_rna.write(output_file)

In [15]:
adata_rna = sc.AnnData(rna_matrix_1.T)

In [22]:
adata_rna.var.head()

Unnamed: 0,gene_id,feature_type,genome,interval
MIR1302-2HG,ENSG00000243485,Gene Expression,hg38,chr1:29553-30267
FAM138A,ENSG00000237613,Gene Expression,hg38,chr1:36080-36081
OR4F5,ENSG00000186092,Gene Expression,hg38,chr1:65418-69055
AL627309.1,ENSG00000238009,Gene Expression,hg38,chr1:120931-133723
AL627309.3,ENSG00000239945,Gene Expression,hg38,chr1:91104-91105


In [20]:
rna_features_1

Unnamed: 0,gene_id,index,feature_type,chr,start,end,interval
0,ENSG00000243485,MIR1302-2HG,Gene Expression,chr1,29553,30267,chr1:29553-30267
1,ENSG00000237613,FAM138A,Gene Expression,chr1,36080,36081,chr1:36080-36081
2,ENSG00000186092,OR4F5,Gene Expression,chr1,65418,69055,chr1:65418-69055
3,ENSG00000238009,AL627309.1,Gene Expression,chr1,120931,133723,chr1:120931-133723
4,ENSG00000239945,AL627309.3,Gene Expression,chr1,91104,91105,chr1:91104-91105
...,...,...,...,...,...,...,...
255987,KI270713.1:15882-16673,KI270713.1:15882-16673,Peaks,KI270713.1,15882,16673,KI270713.1:15882-16673
255988,KI270713.1:21469-22377,KI270713.1:21469-22377,Peaks,KI270713.1,21469,22377,KI270713.1:21469-22377
255989,KI270713.1:26768-27527,KI270713.1:26768-27527,Peaks,KI270713.1,26768,27527,KI270713.1:26768-27527
255990,KI270713.1:29577-30473,KI270713.1:29577-30473,Peaks,KI270713.1,29577,30473,KI270713.1:29577-30473


In [13]:
adata_rna.var.head()

Unnamed: 0,gene_id,feature_type,genome,interval
MIR1302-2HG,,,hg38,
FAM138A,,,hg38,
OR4F5,,,hg38,
AL627309.1,,,hg38,
AL627309.3,,,hg38,


In [5]:
rna_barcodes_1 = pd.read_csv(rna_barcodes_file_1, header=None, sep="\t").tolist()
rna_barcodes_1

AttributeError: 'DataFrame' object has no attribute 'tolist'

In [6]:
rna_features_1

Unnamed: 0,0,1,2,3,4,5
0,ENSG00000243485,MIR1302-2HG,Gene Expression,chr1,29553,30267
1,ENSG00000237613,FAM138A,Gene Expression,chr1,36080,36081
2,ENSG00000186092,OR4F5,Gene Expression,chr1,65418,69055
3,ENSG00000238009,AL627309.1,Gene Expression,chr1,120931,133723
4,ENSG00000239945,AL627309.3,Gene Expression,chr1,91104,91105
...,...,...,...,...,...,...
255987,KI270713.1:15882-16673,KI270713.1:15882-16673,Peaks,KI270713.1,15882,16673
255988,KI270713.1:21469-22377,KI270713.1:21469-22377,Peaks,KI270713.1,21469,22377
255989,KI270713.1:26768-27527,KI270713.1:26768-27527,Peaks,KI270713.1,26768,27527
255990,KI270713.1:29577-30473,KI270713.1:29577-30473,Peaks,KI270713.1,29577,30473


In [3]:
import scanpy as sc
import scipy.io
import pandas as pd
import os

# Define input directory path (change this to the path where your files are located)
input_dir = "/gpfs/home/asun/jin_lab/get/raw_data/k562"  # Set the appropriate directory path

rna_matrix_file_2 = os.path.join(input_dir, "GSM8780567_K562_DMSO_RNA_2_matrix.mtx.gz")
rna_barcodes_file_2 = os.path.join(input_dir, "GSM8780567_K562_DMSO_RNA_2_barcodes.tsv.gz")
rna_features_file_2 = os.path.join(input_dir, "GSM8780567_K562_DMSO_RNA_2_features.tsv.gz")

In [4]:
# 1. Load the RNA-seq data matrix (MTX format)
rna_matrix_2 = scipy.io.mmread(rna_matrix_file_2).tocsc()  # Sparse matrix format

# 2. Load the barcodes (cells)
rna_barcodes_2 = pd.read_csv(rna_barcodes_file_2, header=None, sep="\t").squeeze().tolist()

# 3. Load the features (genes)
rna_features_2 = pd.read_csv(rna_features_file_2, header=None, sep="\t")

rna_features_2.columns = ['gene_id', 'index', 'feature_type', 'chr', 'start', 'end']

rna_features_2['interval'] = rna_features_2.apply(lambda row: f"{row['chr']}:{row['start']}-{row['end']}", axis=1)

# 4. Create the AnnData object for RNA-seq data (cells x features)
adata_rna = sc.AnnData(rna_matrix_2.T)  # Transpose to make (cells x features)

adata_rna.obs_names = rna_barcodes_2  # Set the barcodes as obs (cells)
adata_rna.var_names = rna_features_2['index'].tolist()

rna_features_2.set_index('index', inplace=True)

# Store the feature metadata (gene_ids, feature_types, genome, interval) in `adata.var`
adata_rna.var['gene_id'] = rna_features_2['gene_id']
adata_rna.var['feature_type'] = rna_features_2['feature_type']
adata_rna.var['genome'] = "hg38"
adata_rna.var['interval'] = rna_features_2['interval']

# 5. Save the AnnData object to an H5AD file (HDF5 format)
#output_file = "K562_DMSO_RNA_combined_replicates.h5ad"
#adata_rna.write(output_file)

print(f"HDF5 file created: {output_file}")

NameError: name 'output_file' is not defined

In [1]:
adata_rna

NameError: name 'adata_rna' is not defined