In [3]:
# Cell 1: Setup - Imports and Paths
import scanpy as sc
import pandas as pd
import numpy as np
import os
import warnings
import sys

# Ignore specific warnings often encountered with AnnData
warnings.filterwarnings("ignore", category=FutureWarning, module="anndata")
# Allow viewing more columns
pd.set_option('display.max_columns', 50)

# --- Define paths for the LATEST run ---
RUN_ID = "run_20250501_045924" # From the log file name
BASE_DIR = "/mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq"
STAGE_1_2_DIR = os.path.join(BASE_DIR, "standardized_data", RUN_ID)
STAGE_2_5_DIR = os.path.join(BASE_DIR, "preprocessed_data", RUN_ID)
COMBINED_FILE = os.path.join(STAGE_1_2_DIR, "combined_all_genes_sparse_standardized.h5ad")

print(f"Run ID: {RUN_ID}")
print(f"Stage 1&2 Output Directory: {STAGE_1_2_DIR}")
print(f"Stage 2.5 Output Directory: {STAGE_2_5_DIR}")
print(f"Combined File Path: {COMBINED_FILE}")

# Function to safely load h5ad
def load_h5ad_safe(filepath):
    print(f"\n--- Loading: {filepath} ---")
    if not os.path.exists(filepath):
        print(f"ERROR: File not found!")
        return None
    try:
        adata = sc.read_h5ad(filepath)
        print(f"Successfully loaded. Shape: {adata.shape}")
        return adata
    except Exception as e:
        print(f"ERROR loading file: {e}")
        return None

# Function to inspect adata
def inspect_adata(adata, name="Dataset"):
    if adata is None:
        return
    print(f"\n--- Inspecting {name} ---")
    print("Shape:", adata.shape)

    print("\nOBS (head):")
    try:
        display(adata.obs.head())
    except NameError: # If not in notebook
        print(adata.obs.head().to_string())
    print("\nOBS Columns:", list(adata.obs.columns))
    print("\nOBS Dtypes:\n", adata.obs.dtypes.value_counts())

    print("\nVAR (head):")
    try:
        display(adata.var.head())
    except NameError:
        print(adata.var.head().to_string())
    print("\nVAR Columns:", list(adata.var.columns))
    print("\nVAR Dtypes:\n", adata.var.dtypes.value_counts())


    print("\nUNS Keys:", list(adata.uns.keys()))
    # Look for specific problematic keys from logs
    problem_keys = ['metadata_sources', 'dataset_info']
    for key in problem_keys:
        if key in adata.uns:
             print(f"  UNS['{key}'] type: {type(adata.uns[key])}")
             # Print snippet if it's a dict or list
             if isinstance(adata.uns[key], (dict, list)):
                  print(f"  UNS['{key}'] (snippet): {str(adata.uns[key])[:200]}...")

    # Check specific obs columns known to cause issues
    problem_obs_cols = ['cell_type_info']
    for col in problem_obs_cols:
         if col in adata.obs.columns:
              print(f"  OBS Column '{col}' dtype: {adata.obs[col].dtype}")
              print(f"  OBS Column '{col}' (sample values): {adata.obs[col].unique()[:5]}")

Run ID: run_20250501_045924
Stage 1&2 Output Directory: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/standardized_data/run_20250501_045924
Stage 2.5 Output Directory: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/preprocessed_data/run_20250501_045924
Combined File Path: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/standardized_data/run_20250501_045924/combined_all_genes_sparse_standardized.h5ad


In [4]:
# Cell 2: Inspect Stage 1 Output (V1 files - before Stage 2 processing)
# Note: We expect ENCODE, GTEx, MAGE v1 files might be corrupted due to save errors.
# ADNI v1 should be okay.

adata_adni_v1 = load_h5ad_safe(os.path.join(STAGE_1_2_DIR, "adni_standardized_v1.h5ad"))
inspect_adata(adata_adni_v1, "ADNI_v1 (Stage 1 Output)")

adata_encode_v1 = load_h5ad_safe(os.path.join(STAGE_1_2_DIR, "encode_standardized_v1.h5ad"))
# inspect_adata(adata_encode_v1, "ENCODE_v1 (Stage 1 Output)") # This might fail or show incomplete data

adata_gtex_v1 = load_h5ad_safe(os.path.join(STAGE_1_2_DIR, "gtex_standardized_v1.h5ad"))
# inspect_adata(adata_gtex_v1, "GTEx_v1 (Stage 1 Output)") # This might fail or show incomplete data

adata_mage_v1 = load_h5ad_safe(os.path.join(STAGE_1_2_DIR, "mage_standardized_v1.h5ad"))
# inspect_adata(adata_mage_v1, "MAGE_v1 (Stage 1 Output)") # This might fail or show incomplete data

# You might need to uncomment the inspect_adata calls one by one if they fail due to corruption.
# Focus on whether the file loads at all, and if so, what .obs/.uns looks like.


--- Loading: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/standardized_data/run_20250501_045924/adni_standardized_v1.h5ad ---
Successfully loaded. Shape: (650, 17991)

--- Inspecting ADNI_v1 (Stage 1 Output) ---
Shape: (650, 17991)

OBS (head):


Unnamed: 0_level_0,sample_id,subject_id,dataset,data_type,expression_unit,tissue,platform,processing,sex,age,species,species_ontology,tissue_original,tissue_ontology,data_type_original,assay_ontology,age_original,developmental_stage_ontology
_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
002_S_0413_002_S_0413_gencode_v24_pruned,002_S_0413_002_S_0413_gencode_v24_pruned,002_S_0413,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,
002_S_0729_002_S_0729_gencode_v24_pruned,002_S_0729_002_S_0729_gencode_v24_pruned,002_S_0729,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,
002_S_1155_002_S_1155_gencode_v24_pruned,002_S_1155_002_S_1155_gencode_v24_pruned,002_S_1155,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,
002_S_1261_002_S_1261_gencode_v24_pruned,002_S_1261_002_S_1261_gencode_v24_pruned,002_S_1261,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,
002_S_1268_002_S_1268_gencode_v24_pruned,002_S_1268_002_S_1268_gencode_v24_pruned,002_S_1268,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,



OBS Columns: ['sample_id', 'subject_id', 'dataset', 'data_type', 'expression_unit', 'tissue', 'platform', 'processing', 'sex', 'age', 'species', 'species_ontology', 'tissue_original', 'tissue_ontology', 'data_type_original', 'assay_ontology', 'age_original', 'developmental_stage_ontology']

OBS Dtypes:
 category    4
object      2
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
Name: count, dtype: int64

VAR (head):


Unnamed: 0,gene_id,original_ids,gene_name,gene_type,chromosome,mapping_source
ENSG00000000003,ENSG00000000003,ENSG00000000003.14,TSPAN6,protein_coding,chrX,exact_match
ENSG00000000005,ENSG00000000005,ENSG00000000005.5,TNMD,protein_coding,chrX,exact_match
ENSG00000000419,ENSG00000000419,ENSG00000000419.12,DPM1,protein_coding,chr20,exact_match
ENSG00000000457,ENSG00000000457,ENSG00000000457.13,SCYL3,protein_coding,chr1,exact_match
ENSG00000000938,ENSG00000000938,ENSG00000000938.12,FGR,protein_coding,chr1,exact_match



VAR Columns: ['gene_id', 'original_ids', 'gene_name', 'gene_type', 'chromosome', 'mapping_source']

VAR Dtypes:
 object      2
category    1
category    1
category    1
category    1
Name: count, dtype: int64

UNS Keys: ['dataset_info', 'gencode_version', 'harmonized_gencode_version', 'harmonized_reference_genome', 'metadata_validation', 'ontology_mappings', 'original_gencode_version', 'original_reference_genome', 'processing_date', 'reference_genome', 'subject_demographics']
  UNS['dataset_info'] type: <class 'dict'>
  UNS['dataset_info'] (snippet): {'assay_ontology': 'EFO:0002695', 'data_type': 'microarray', 'description': "Alzheimer's Disease Neuroimaging Initiative microarray data with demographic information", 'expression_unit': 'quantile nor...

--- Loading: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/standardized_data/run_20250501_045924/encode_standardized_v1.h5ad ---
Successfully loaded. Shape: (7, 65586)

--- Loading: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq

In [5]:
# Cell 3: Inspect Stage 2 Output (V2 files)
# Note: We expect ENCODE, GTEx, MAGE v2 files might *not exist* or be corrupted
# because the save failed in standardize_metadata.py. ADNI v2 should exist.

adata_adni_v2 = load_h5ad_safe(os.path.join(STAGE_1_2_DIR, "adni_standardized_v2.h5ad"))
inspect_adata(adata_adni_v2, "ADNI_v2 (Stage 2 Output)")

# Check existence of others (likely False)
print("\nChecking for V2 files potentially missing due to save errors:")
print(f"Encode V2 exists: {os.path.exists(os.path.join(STAGE_1_2_DIR, 'encode_standardized_v2.h5ad'))}")
print(f"GTEx V2 exists:   {os.path.exists(os.path.join(STAGE_1_2_DIR, 'gtex_standardized_v2.h5ad'))}")
print(f"MAGE V2 exists:   {os.path.exists(os.path.join(STAGE_1_2_DIR, 'mage_standardized_v2.h5ad'))}")


--- Loading: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/standardized_data/run_20250501_045924/adni_standardized_v2.h5ad ---
Successfully loaded. Shape: (650, 17991)

--- Inspecting ADNI_v2 (Stage 2 Output) ---
Shape: (650, 17991)

OBS (head):


Unnamed: 0_level_0,sample_id,subject_id,dataset,data_type,expression_unit,tissue,platform,processing,sex,age,species,species_ontology,tissue_original,tissue_ontology,data_type_original,assay_ontology,age_original,developmental_stage_ontology,tissue_ontology_confidence
_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
002_S_0413_002_S_0413_gencode_v24_pruned,002_S_0413_002_S_0413_gencode_v24_pruned,002_S_0413,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium
002_S_0729_002_S_0729_gencode_v24_pruned,002_S_0729_002_S_0729_gencode_v24_pruned,002_S_0729,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium
002_S_1155_002_S_1155_gencode_v24_pruned,002_S_1155_002_S_1155_gencode_v24_pruned,002_S_1155,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium
002_S_1261_002_S_1261_gencode_v24_pruned,002_S_1261_002_S_1261_gencode_v24_pruned,002_S_1261,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium
002_S_1268_002_S_1268_gencode_v24_pruned,002_S_1268_002_S_1268_gencode_v24_pruned,002_S_1268,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium



OBS Columns: ['sample_id', 'subject_id', 'dataset', 'data_type', 'expression_unit', 'tissue', 'platform', 'processing', 'sex', 'age', 'species', 'species_ontology', 'tissue_original', 'tissue_ontology', 'data_type_original', 'assay_ontology', 'age_original', 'developmental_stage_ontology', 'tissue_ontology_confidence']

OBS Dtypes:
 category    4
object      2
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
Name: count, dtype: int64

VAR (head):


Unnamed: 0,gene_id,original_ids,gene_name,gene_type,chromosome,mapping_source
ENSG00000000003,ENSG00000000003,ENSG00000000003.14,TSPAN6,protein_coding,chrX,exact_match
ENSG00000000005,ENSG00000000005,ENSG00000000005.5,TNMD,protein_coding,chrX,exact_match
ENSG00000000419,ENSG00000000419,ENSG00000000419.12,DPM1,protein_coding,chr20,exact_match
ENSG00000000457,ENSG00000000457,ENSG00000000457.13,SCYL3,protein_coding,chr1,exact_match
ENSG00000000938,ENSG00000000938,ENSG00000000938.12,FGR,protein_coding,chr1,exact_match



VAR Columns: ['gene_id', 'original_ids', 'gene_name', 'gene_type', 'chromosome', 'mapping_source']

VAR Dtypes:
 object      2
category    1
category    1
category    1
category    1
Name: count, dtype: int64

UNS Keys: ['dataset_info', 'gencode_version', 'harmonized_gencode_version', 'harmonized_reference_genome', 'metadata_validation', 'ontology_mappings', 'original_gencode_version', 'original_reference_genome', 'processing_date', 'reference_genome', 'subject_demographics']
  UNS['dataset_info'] type: <class 'dict'>
  UNS['dataset_info'] (snippet): {'assay_ontology': 'EFO:0002695', 'data_type': 'microarray', 'description': "Alzheimer's Disease Neuroimaging Initiative microarray data with demographic information", 'expression_unit': 'quantile nor...

Checking for V2 files potentially missing due to save errors:
Encode V2 exists: True
GTEx V2 exists:   True
MAGE V2 exists:   True


In [6]:
# Cell 4: Inspect Stage 2.5 Output (Preprocessed files)
# These should exist and reflect the state before combining.

adata_adni_prep = load_h5ad_safe(os.path.join(STAGE_2_5_DIR, "adni_standardized_preprocessed.h5ad"))
inspect_adata(adata_adni_prep, "ADNI Preprocessed (Stage 2.5 Output)")

adata_encode_prep = load_h5ad_safe(os.path.join(STAGE_2_5_DIR, "encode_standardized_preprocessed.h5ad"))
inspect_adata(adata_encode_prep, "ENCODE Preprocessed (Stage 2.5 Output)")
# Check if placeholders were fixed in the *next* step (should still be present here)
if adata_encode_prep and 'gene_id' in adata_encode_prep.var.columns:
    placeholders = adata_encode_prep.var[adata_encode_prep.var['gene_id'].astype(str).str.startswith('PLACEHOLDER_')]
    print(f"\nPlaceholders found in ENCODE preprocessed var['gene_id']: {len(placeholders)}")

adata_gtex_prep = load_h5ad_safe(os.path.join(STAGE_2_5_DIR, "gtex_standardized_preprocessed.h5ad"))
inspect_adata(adata_gtex_prep, "GTEx Preprocessed (Stage 2.5 Output)")

adata_mage_prep = load_h5ad_safe(os.path.join(STAGE_2_5_DIR, "mage_standardized_preprocessed.h5ad"))
inspect_adata(adata_mage_prep, "MAGE Preprocessed (Stage 2.5 Output)")


--- Loading: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/preprocessed_data/run_20250501_045924/adni_standardized_preprocessed.h5ad ---
Successfully loaded. Shape: (650, 17991)

--- Inspecting ADNI Preprocessed (Stage 2.5 Output) ---
Shape: (650, 17991)

OBS (head):


Unnamed: 0_level_0,sample_id,subject_id,dataset,data_type,expression_unit,tissue,platform,processing,sex,age,species,species_ontology,tissue_original,tissue_ontology,data_type_original,assay_ontology,age_original,developmental_stage_ontology,tissue_ontology_confidence
_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
002_S_0413_002_S_0413_gencode_v24_pruned,002_S_0413_002_S_0413_gencode_v24_pruned,002_S_0413,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium
002_S_0729_002_S_0729_gencode_v24_pruned,002_S_0729_002_S_0729_gencode_v24_pruned,002_S_0729,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium
002_S_1155_002_S_1155_gencode_v24_pruned,002_S_1155_002_S_1155_gencode_v24_pruned,002_S_1155,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium
002_S_1261_002_S_1261_gencode_v24_pruned,002_S_1261_002_S_1261_gencode_v24_pruned,002_S_1261,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium
002_S_1268_002_S_1268_gencode_v24_pruned,002_S_1268_002_S_1268_gencode_v24_pruned,002_S_1268,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium



OBS Columns: ['sample_id', 'subject_id', 'dataset', 'data_type', 'expression_unit', 'tissue', 'platform', 'processing', 'sex', 'age', 'species', 'species_ontology', 'tissue_original', 'tissue_ontology', 'data_type_original', 'assay_ontology', 'age_original', 'developmental_stage_ontology', 'tissue_ontology_confidence']

OBS Dtypes:
 category    4
object      2
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
Name: count, dtype: int64

VAR (head):


Unnamed: 0,gene_id,original_ids,gene_name,gene_type,chromosome,mapping_source,original_gene_id,ensembl_id,mapping_confidence
ENSG00000000003,ENSG00000000003,ENSG00000000003.14,TSPAN6,protein_coding,chrX,reference_mapping,ENSG00000000003,ENSG00000000003,high
ENSG00000000005,ENSG00000000005,ENSG00000000005.5,TNMD,protein_coding,chrX,reference_mapping,ENSG00000000005,ENSG00000000005,high
ENSG00000000419,ENSG00000000419,ENSG00000000419.12,DPM1,protein_coding,chr20,reference_mapping,ENSG00000000419,ENSG00000000419,high
ENSG00000000457,ENSG00000000457,ENSG00000000457.13,SCYL3,protein_coding,chr1,reference_mapping,ENSG00000000457,ENSG00000000457,high
ENSG00000000938,ENSG00000000938,ENSG00000000938.12,FGR,protein_coding,chr1,reference_mapping,ENSG00000000938,ENSG00000000938,high



VAR Columns: ['gene_id', 'original_ids', 'gene_name', 'gene_type', 'chromosome', 'mapping_source', 'original_gene_id', 'ensembl_id', 'mapping_confidence']

VAR Dtypes:
 object      4
category    1
category    1
category    1
category    1
category    1
Name: count, dtype: int64

UNS Keys: ['dataset_info', 'gencode_version', 'gene_mapping_stats', 'harmonized_gencode_version', 'harmonized_reference_genome', 'metadata_validation', 'ontology_mappings', 'original_gencode_version', 'original_reference_genome', 'processing_date', 'reference_genome', 'subject_demographics']
  UNS['dataset_info'] type: <class 'dict'>
  UNS['dataset_info'] (snippet): {'assay_ontology': 'EFO:0002695', 'data_type': 'microarray', 'description': "Alzheimer's Disease Neuroimaging Initiative microarray data with demographic information", 'expression_unit': 'quantile nor...

--- Loading: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/preprocessed_data/run_20250501_045924/encode_standardized_preprocessed.h5ad ---
S

Unnamed: 0,_sample_id,age,cell_line,cell_type,cell_type_info,depletion,description,disease,ethnicity,geo_id,nucleic_acid_type,organism,original_sample_id,sex,size_range,strand_specificity,subject_id,tissue,data_type,assay_ontology,species,species_ontology,dataset,sample_id,tissue_ontology,developmental_stage_ontology,expression_unit,tissue_ontology_confidence
0,ENCFF244DNJ,58,A549,epithelial,,rRNA,Human lung carcinoma cell line,adenocarcinoma,European,SAMN05733878,polyadenylated mRNA,human,ENCFF244DNJ,male,>200,Strand-specific (reverse),ENCDO451RUA,lung,RNA-seq,EFO:0009922,human,NCBITaxon:9606,encode,0,UBERON:0002048,,unknown,medium
1,ENCFF685WJV,69,Caki2,epithelial,,rRNA,Human kidney clear cell carcinoma cell line,clear cell renal cell carcinoma,European,SAMN04284635,RNA,human,ENCFF685WJV,male,>200,Strand-specific (reverse),ENCDO869AAI,kidney,RNA-seq,EFO:0009922,human,NCBITaxon:9606,encode,1,UBERON:0002113,,unknown,medium
2,ENCFF640FPG,53,GM23248,Fibroblast,,rRNA,Human skin fibroblast cell line,normal,European,SAMN04284514,RNA,human,ENCFF640FPG,male,>200,Strand-specific (reverse),ENCDO467QPX,skin,RNA-seq,EFO:0009922,human,NCBITaxon:9606,encode,2,UBERON:0002097,,unknown,medium
3,ENCFF863QWG,15,HepG2,epithelial,,rRNA,Human liver hepatocellular carcinoma cell line,hepatocellular carcinoma,European,SAMN04284581,RNA,human,ENCFF863QWG,male,>200,Strand-specific (reverse),ENCDO886MPB,liver,RNA-seq,EFO:0009922,human,NCBITaxon:9606,encode,3,UBERON:0002107,,unknown,medium
4,ENCFF171FQU,53,K562,lymphoblast,,rRNA,Human chronic myelogenous leukemia cell line,chronic myelogenous leukemia,unknown,SAMN04284550,RNA,human,ENCFF171FQU,female,>200,Strand-specific (reverse),ENCDO000AAL,bone marrow,RNA-seq,EFO:0009922,human,NCBITaxon:9606,encode,4,UBERON:0002371,,unknown,medium



OBS Columns: ['_sample_id', 'age', 'cell_line', 'cell_type', 'cell_type_info', 'depletion', 'description', 'disease', 'ethnicity', 'geo_id', 'nucleic_acid_type', 'organism', 'original_sample_id', 'sex', 'size_range', 'strand_specificity', 'subject_id', 'tissue', 'data_type', 'assay_ontology', 'species', 'species_ontology', 'dataset', 'sample_id', 'tissue_ontology', 'developmental_stage_ontology', 'expression_unit', 'tissue_ontology_confidence']

OBS Dtypes:
 object      7
category    2
category    2
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
Name: count, dtype: int64

VAR (head):


Unnamed: 0,gene_id,original_gene_id,ensembl_id,gene_name,gene_type,chromosome,mapping_source,mapping_confidence
0,ENSG00000169045,ENST00000518548.5|ENSG00000169045.17|OTTHUMG00...,ENSG00000169045,HNRNPH1,protein_coding,chr5,encode_mapping,high
1,ENSG00000278290,ENSG00000278290.1,ENSG00000278290,AC099050.1,miRNA,chr3,encode_mapping,high
2,ENSG00000172428,ENSG00000172428.10,ENSG00000172428,MYEOV2,protein_coding,chr2,encode_mapping,high
3,ENSG00000231924,ENST00000595124.5|ENSG00000231924.9|OTTHUMG000...,ENSG00000231924,PSG1,protein_coding,chr19,encode_mapping,high
4,ENSG00000226137,ENSG00000226137.3,ENSG00000226137,BAIAP2-AS1,lincRNA,chr17,encode_mapping,high



VAR Columns: ['gene_id', 'original_gene_id', 'ensembl_id', 'gene_name', 'gene_type', 'chromosome', 'mapping_source', 'mapping_confidence']

VAR Dtypes:
 category    2
object      1
category    1
category    1
category    1
category    1
category    1
Name: count, dtype: int64

UNS Keys: ['assay_ontology', 'data_type', 'dataset_info', 'gencode_version', 'gene_mapping_stats', 'genome_info', 'harmonized_gencode_version', 'harmonized_reference_genome', 'platform', 'reference_genome', 'rna_seq_protocol']
  UNS['dataset_info'] type: <class 'dict'>
  UNS['dataset_info'] (snippet): {'cell_lines': {'A549': {'age': '58', 'cell_type': 'epithelial', 'cell_type_info': {'anatomical_entity_id': 'UBERON:0002048', 'cell_type_id': 'CL:0000066'}, 'depletion': 'rRNA', 'description': 'Human ...
  OBS Column 'cell_type_info' dtype: category
  OBS Column 'cell_type_info' (sample values): ['']
Categories (1, object): ['']

Placeholders found in ENCODE preprocessed var['gene_id']: 0

--- Loading: /mnt/czi-sci

Unnamed: 0_level_0,SMATSSCR,SMCENTER,SMPTHNTS,rna_integrity_number,broad_tissue,tissue,SMUBRID,ischemic_time,SMTSPAX,array_batch,SMNABTCHT,SMNABTCHD,batch,SMGEBTCHD,SMGEBTCHT,ANALYTE_TYPE,SMAFRZE,SMGTC,SMRDTTL,SMALTTL,SMALTALG,SMSUPALG,SMRDLGTH,SMVQCFL,SMLMAPQ,...,SMSMRTHQ,SMPRERDHQ,SMPRERTHQ,SMSMGNDT,SMPREGNDT,SMRDLNMN,SMRDLNMD,SMRDLNSD,subject_id,sex,age,DTHHRDY,dataset,original_sample_id,data_type,expression_unit,species,species_ontology,tissue_original,tissue_ontology,data_type_original,assay_ontology,age_original,developmental_stage_ontology,tissue_ontology_confidence
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
GTEX-1117F-0005-SM-HL9SH,,B1,,8.3,Blood,whole blood,UBERON:0013756,1188.0,,BP-38553,RNA isolation PAXgene Blood RNA (Manual),5/3/13,LCSET-13764,,TruSeq.v1,RNA:Total RNA,RNASEQ,,134604994.0,367633937.0,232956008.0,72935.0,76.0,16433925.0,304143172.0,...,,,,,,,,,GTEX-1117F,unknown,60-69,4.0,GTEx,,RNA-seq,,human,NCBITaxon:9606,Whole Blood,UBERON:0000178,,EFO:0009922,60-69,,medium
GTEX-1117F-0011-R10b-SM-GI4VE,,"B1, A1",,7.2,Brain,brain - frontal cortex (ba9),UBERON:0009834,1193.0,,BP-42319,RNA Isolation via QIAGEN Spin Column,8/14/13,LCSET-12010,,TruSeq.v1,RNA:Total RNA,RNASEQ,,67350184.0,75528103.0,8050459.0,127460.0,76.0,3574302.0,12367264.0,...,,,,,,,,,GTEX-1117F,unknown,60-69,4.0,GTEx,,RNA-seq,,human,NCBITaxon:9606,Brain - Frontal Cortex (BA9),UBERON:0013529,,EFO:0009922,60-69,,medium
GTEX-1117F-0011-R11b-SM-GIN8R,,"B1, A1",,6.0,Brain,brain - cerebellar hemisphere,UBERON:0002037,1193.0,,BP-42319,RNA Isolation via QIAGEN Spin Column,8/14/13,LCSET-12011,,TruSeq.v1,RNA:Total RNA,RNASEQ,,99021352.0,122532321.0,23266593.0,244376.0,76.0,4879614.0,33578345.0,...,,,,,,,,,GTEX-1117F,unknown,60-69,4.0,GTEx,,RNA-seq,,human,NCBITaxon:9606,Brain - Cerebellar Hemisphere,UBERON:0002245,,EFO:0009922,60-69,,medium
GTEX-1117F-0011-R2b-SM-GI4VL,,"B1, A1",,5.7,Brain,brain - substantia nigra,UBERON:0002038,1193.0,,BP-42208,RNA Isolation via QIAGEN Spin Column,8/12/13,LCSET-12010,,TruSeq.v1,RNA:Total RNA,RNASEQ,,102541796.0,126416358.0,23734034.0,140528.0,76.0,5688018.0,35903889.0,...,,,,,,,,,GTEX-1117F,unknown,60-69,4.0,GTEx,,RNA-seq,,human,NCBITaxon:9606,Brain - Substantia nigra,UBERON:0002038,,EFO:0009922,60-69,,medium
GTEX-1117F-0011-R3a-SM-GJ3PJ,,"B1, A1",,7.1,Brain,brain - anterior cingulate cortex (ba24),UBERON:0009835,1193.0,,BP-42208,RNA Isolation via QIAGEN Spin Column,8/12/13,LCSET-12012,,TruSeq.v1,RNA:Total RNA,RNASEQ,,87919278.0,104564329.0,16487319.0,157732.0,76.0,3586415.0,24566291.0,...,,,,,,,,,GTEX-1117F,unknown,60-69,4.0,GTEx,,RNA-seq,,human,NCBITaxon:9606,Brain - Anterior cingulate cortex (BA24),UBERON:0009835,,EFO:0009922,60-69,,medium



OBS Columns: ['SMATSSCR', 'SMCENTER', 'SMPTHNTS', 'rna_integrity_number', 'broad_tissue', 'tissue', 'SMUBRID', 'ischemic_time', 'SMTSPAX', 'array_batch', 'SMNABTCHT', 'SMNABTCHD', 'batch', 'SMGEBTCHD', 'SMGEBTCHT', 'ANALYTE_TYPE', 'SMAFRZE', 'SMGTC', 'SMRDTTL', 'SMALTTL', 'SMALTALG', 'SMSUPALG', 'SMRDLGTH', 'SMVQCFL', 'SMLMAPQ', 'SMUMPRD', 'SMUNPDRD', 'SMMPPD', 'SMMAPRT', 'SMMPPDUN', 'SMUNMPRT', 'SMMPDP', 'SMDPMPRT', 'SMMPPDXG', 'SMMPDPXG', 'SMDPRTXG', 'SMCHMRD', 'SMCHMRT', 'SMMPPDPR', 'SMMPHQRD', 'SMMPHQRT', 'SMMPLQRD', 'SMSPLTRT', 'SME1MPRD', 'SME2MPRD', 'SME1MPRT', 'SME2MPRT', 'SME1MMB', 'SME2MMB', 'SME1TTLB', 'SME2TTLB', 'SME1MMRT', 'SME2MMRT', 'SMTTLMM', 'SMTTLB', 'SMBSMMRT', 'SMESTLBS', 'SMEXNCRD', 'SMEXNCRT', 'SMEXPEFF', 'SMNTRNRD', 'SMNTRNRT', 'SMNTRARD', 'SMNTRART', 'SMNTERRD', 'SMNTERRT', 'SMAMBRD', 'SMAMBRT', 'SMNTEXC', 'SMDSCRT', 'SMEXNCRTHQ', 'SMNTRNRTHQ', 'SMNTRARTHQ', 'SMNTERRTHQ', 'SMAMBRTHQ', 'SME1SNSE', 'SME2SNSE', 'SME1ANTI', 'SME2ANTI', 'SME1PCTS', 'SME2PCTS', 'SMG

Unnamed: 0,gene_id,original_ids,gene_name,gene_type,chromosome,mapping_source,original_gene_id,ensembl_id,mapping_confidence
ENSG00000000003,ENSG00000000003,ENSG00000000003.15,TSPAN6,protein_coding,chrX,reference_mapping,ENSG00000000003,ENSG00000000003,high
ENSG00000000005,ENSG00000000005,ENSG00000000005.6,TNMD,protein_coding,chrX,reference_mapping,ENSG00000000005,ENSG00000000005,high
ENSG00000000419,ENSG00000000419,ENSG00000000419.14,DPM1,protein_coding,chr20,reference_mapping,ENSG00000000419,ENSG00000000419,high
ENSG00000000457,ENSG00000000457,ENSG00000000457.14,SCYL3,protein_coding,chr1,reference_mapping,ENSG00000000457,ENSG00000000457,high
ENSG00000000460,ENSG00000000460,ENSG00000000460.17,C1orf112,protein_coding,chr1,reference_mapping,ENSG00000000460,ENSG00000000460,high



VAR Columns: ['gene_id', 'original_ids', 'gene_name', 'gene_type', 'chromosome', 'mapping_source', 'original_gene_id', 'ensembl_id', 'mapping_confidence']

VAR Dtypes:
 object      4
category    1
category    1
category    1
category    1
category    1
Name: count, dtype: int64

UNS Keys: ['dataset_info', 'dataset_version', 'gencode_version', 'gene_mapping_stats', 'harmonized_gencode_version', 'harmonized_reference_genome', 'metadata_sources']
  UNS['metadata_sources'] type: <class 'numpy.ndarray'>
  UNS['dataset_info'] type: <class 'dict'>
  UNS['dataset_info'] (snippet): {'data_type': 'RNA-seq', 'expression_unit': 'TPM', 'gencode_version': np.int64(24), 'genes': np.int64(58988), 'samples': np.int64(19616), 'source': 'GTEx', 'subject_count': np.int64(946), 'tissue_coun...

--- Loading: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/preprocessed_data/run_20250501_045924/mage_standardized_preprocessed.h5ad ---
Successfully loaded. Shape: (731, 19428)

--- Inspecting MAGE Preprocess

Unnamed: 0_level_0,sample_id,donor_id,subject_id,tissue,dataset,data_type,expression_unit,is_gencode,is_ensembl,sex,age,species,species_ontology,tissue_original,tissue_ontology,data_type_original,assay_ontology,age_original,developmental_stage_ontology,tissue_ontology_confidence
_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
NA06985_NA06985,NA06985_NA06985,NA06985,NA06985,lymphoblast,MAGE,rna-seq,TPM,False,True,unknown,,human,NCBITaxon:9606,lymphoblast,,RNA-seq,EFO:0009922,,,none
NA07000_NA07000,NA07000_NA07000,NA07000,NA07000,lymphoblast,MAGE,rna-seq,TPM,False,True,unknown,,human,NCBITaxon:9606,lymphoblast,,RNA-seq,EFO:0009922,,,none
NA11919_NA11919,NA11919_NA11919,NA11919,NA11919,lymphoblast,MAGE,rna-seq,TPM,False,True,unknown,,human,NCBITaxon:9606,lymphoblast,,RNA-seq,EFO:0009922,,,none
NA11930_NA11930,NA11930_NA11930,NA11930,NA11930,lymphoblast,MAGE,rna-seq,TPM,False,True,unknown,,human,NCBITaxon:9606,lymphoblast,,RNA-seq,EFO:0009922,,,none
NA11932_NA11932,NA11932_NA11932,NA11932,NA11932,lymphoblast,MAGE,rna-seq,TPM,False,True,unknown,,human,NCBITaxon:9606,lymphoblast,,RNA-seq,EFO:0009922,,,none



OBS Columns: ['sample_id', 'donor_id', 'subject_id', 'tissue', 'dataset', 'data_type', 'expression_unit', 'is_gencode', 'is_ensembl', 'sex', 'age', 'species', 'species_ontology', 'tissue_original', 'tissue_ontology', 'data_type_original', 'assay_ontology', 'age_original', 'developmental_stage_ontology', 'tissue_ontology_confidence']

OBS Dtypes:
 category    4
object      3
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
Name: count, dtype: int64

VAR (head):


Unnamed: 0,gene_id,original_ids,gene_name,gene_type,chromosome,mapping_source,original_gene_id,ensembl_id,mapping_confidence
ENSG00000000003,ENSG00000000003,ENSG00000000003.14,TSPAN6,protein_coding,chrX,reference_mapping,ENSG00000000003,ENSG00000000003,high
ENSG00000000419,ENSG00000000419,ENSG00000000419.12,DPM1,protein_coding,chr20,reference_mapping,ENSG00000000419,ENSG00000000419,high
ENSG00000000457,ENSG00000000457,ENSG00000000457.13,SCYL3,protein_coding,chr1,reference_mapping,ENSG00000000457,ENSG00000000457,high
ENSG00000000460,ENSG00000000460,ENSG00000000460.16,C1orf112,protein_coding,chr1,reference_mapping,ENSG00000000460,ENSG00000000460,high
ENSG00000000938,ENSG00000000938,ENSG00000000938.12,FGR,protein_coding,chr1,reference_mapping,ENSG00000000938,ENSG00000000938,high



VAR Columns: ['gene_id', 'original_ids', 'gene_name', 'gene_type', 'chromosome', 'mapping_source', 'original_gene_id', 'ensembl_id', 'mapping_confidence']

VAR Dtypes:
 object      4
category    1
category    1
category    1
category    1
category    1
Name: count, dtype: int64

UNS Keys: ['dataset_info', 'gene_mapping_stats']
  UNS['dataset_info'] type: <class 'dict'>
  UNS['dataset_info'] (snippet): {'accession': 'PRJNA851328', 'cell_type': 'lymphoblastoid cell lines', 'continental_groups': np.int64(5), 'data_repository': array(['', ''], dtype=object)}...


In [7]:
# Cell 5: Inspect Final Combined Dataset
adata_combined = load_h5ad_safe(COMBINED_FILE)
inspect_adata(adata_combined, "Combined Dataset (Final Output)")

# Specific checks for combined dataset
if adata_combined:
    print("\n--- Combined Dataset Specific Checks ---")
    if 'dataset' in adata_combined.obs.columns:
        print("\nDataset Value Counts:")
        print(adata_combined.obs['dataset'].value_counts())
    else:
        print("\nERROR: 'dataset' column missing in combined obs!")

    if 'present_in_datasets' in adata_combined.var.columns:
         print("\nGene presence sample (first 5):")
         print(adata_combined.var['present_in_datasets'].head())
    else:
         print("\nERROR: 'present_in_datasets' column missing in combined var!")

    print("\nCombined UNS Keys:", list(adata_combined.uns.keys()))
    # Check structure of specific UNS keys if needed
    # print(adata_combined.uns.get('dataset_info'))
    # print(adata_combined.uns.get('dataset_overlap'))


--- Loading: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/standardized_data/run_20250501_045924/combined_all_genes_sparse_standardized.h5ad ---
Successfully loaded. Shape: (21004, 62324)

--- Inspecting Combined Dataset (Final Output) ---
Shape: (21004, 62324)

OBS (head):


Unnamed: 0,sample_id,subject_id,dataset,data_type,expression_unit,tissue,sex,age
002_S_0413_002_S_0413_gencode_v24_pruned,002_S_0413_002_S_0413_gencode_v24_pruned,002_S_0413,adni,Microarray,Normalized intensity,blood,unknown,
002_S_0729_002_S_0729_gencode_v24_pruned,002_S_0729_002_S_0729_gencode_v24_pruned,002_S_0729,adni,Microarray,Normalized intensity,blood,unknown,
002_S_1155_002_S_1155_gencode_v24_pruned,002_S_1155_002_S_1155_gencode_v24_pruned,002_S_1155,adni,Microarray,Normalized intensity,blood,unknown,
002_S_1261_002_S_1261_gencode_v24_pruned,002_S_1261_002_S_1261_gencode_v24_pruned,002_S_1261,adni,Microarray,Normalized intensity,blood,unknown,
002_S_1268_002_S_1268_gencode_v24_pruned,002_S_1268_002_S_1268_gencode_v24_pruned,002_S_1268,adni,Microarray,Normalized intensity,blood,unknown,



OBS Columns: ['sample_id', 'subject_id', 'dataset', 'data_type', 'expression_unit', 'tissue', 'sex', 'age']

OBS Dtypes:
 object      1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
Name: count, dtype: int64

VAR (head):


Unnamed: 0,gene_id,present_in_datasets,chromosome,mapping_source,gene_name,original_ids,gene_type
ENSG00000000003,ENSG00000000003,"adni,gtex,mage",chrX,reference_mapping,TSPAN6,ENSG00000000003,protein_coding
ENSG00000000005,ENSG00000000005,"adni,gtex",chrX,reference_mapping,TNMD,ENSG00000000005,protein_coding
ENSG00000000419,ENSG00000000419,"adni,gtex,mage",chr20,reference_mapping,DPM1,ENSG00000000419,protein_coding
ENSG00000000457,ENSG00000000457,"adni,encode,gtex,mage",chr1,reference_mapping,SCYL3,ENSG00000000457,protein_coding
ENSG00000000460,ENSG00000000460,"encode,gtex,mage",chr1,reference_mapping,C1orf112,ENSG00000000460,protein_coding



VAR Columns: ['gene_id', 'present_in_datasets', 'chromosome', 'mapping_source', 'gene_name', 'original_ids', 'gene_type']

VAR Dtypes:
 object      2
category    1
category    1
category    1
category    1
category    1
Name: count, dtype: int64

UNS Keys: ['dataset_info', 'dataset_overlap', 'gene_counts', 'harmonized_gencode_version', 'harmonized_reference_genome', 'sparsity_stats']
  UNS['dataset_info'] type: <class 'dict'>
  UNS['dataset_info'] (snippet): {'approach': 'all_genes_sparse', 'creation_date': '2025-05-01', 'datasets_combined': array(['adni', 'encode', 'gtex', 'mage'], dtype=object), 'description': 'Combined dataset containing all genes from...

--- Combined Dataset Specific Checks ---

Dataset Value Counts:
dataset
gtex      19616
mage        731
adni        650
encode        7
Name: count, dtype: int64

Gene presence sample (first 5):
ENSG00000000003           adni,gtex,mage
ENSG00000000005                adni,gtex
ENSG00000000419           adni,gtex,mage
ENSG000000004