In [2]:
# Cell 1: Setup - Imports and Paths
import scanpy as sc
import pandas as pd
import numpy as np
import os
import warnings
import sys
import json # Import json to potentially load/decode strings

# Silence FutureWarnings attributed to modules whose name starts with "anndata".
warnings.filterwarnings("ignore", category=FutureWarning, module="anndata")
# Let pandas show up to 50 columns when rendering wide tables.
pd.set_option('display.max_columns', 50)

# --- Define paths for the LATEST run ---
# Both values can be overridden through environment variables so the notebook
# can be pointed at another run or mount point without editing this cell; the
# defaults are the values this notebook was originally written against.
RUN_ID = os.environ.get("RNASEQ_RUN_ID", "run_20250501_193803")  # <--- must match the run directory
BASE_DIR = os.environ.get("RNASEQ_BASE_DIR", "/mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq")
PREPROCESSED_DIR = os.path.join(BASE_DIR, "preprocessed_data", RUN_ID)

print(f"Run ID: {RUN_ID}")
print(f"Inspecting Preprocessed Data Directory: {PREPROCESSED_DIR}")
# Surface a wrong RUN_ID / BASE_DIR here, once, instead of as a separate
# "File not found" message in every loading cell below.
if not os.path.isdir(PREPROCESSED_DIR):
    print("WARNING: preprocessed data directory does not exist - check RUN_ID / BASE_DIR.")

# Best-effort .h5ad loader: reports problems on stdout instead of raising
def load_h5ad_safe(filepath):
    """Read an ``.h5ad`` file with scanpy, never raising on failure.

    Prints a header with the path, then either the loaded object's shape or an
    error message.

    Args:
        filepath: Path of the ``.h5ad`` file to read.

    Returns:
        The object returned by ``sc.read_h5ad`` on success; ``None`` when the
        path does not exist or reading raised an exception.
    """
    print(f"\n--- Loading: {filepath} ---")
    loaded = None
    if os.path.exists(filepath):
        try:
            candidate = sc.read_h5ad(filepath)
            print(f"Successfully loaded. Shape: {candidate.shape}")
            loaded = candidate
        except Exception as exc:
            # Deliberately broad: one unreadable file must not stop the notebook.
            print(f"ERROR loading file: {exc}")
    else:
        print("ERROR: File not found!")
    return loaded

# Function to inspect adata (entry point: inspect_adata; helpers are private)
def _show_frame_head(frame):
    """Show the first rows of ``frame``: rich display in a notebook, plain text otherwise."""
    head = frame.head()
    try:
        display(head)
    except NameError:  # `display` is only defined inside IPython/Jupyter
        print(head.to_string())


def _looks_like_json(value):
    """Return True when ``value`` is a string that starts like a JSON array or object."""
    return isinstance(value, str) and value.strip().startswith(('[', '{'))


def _report_json_decoding(text, indent="  "):
    """Print whether ``text`` parses as JSON (the decoded value itself is not kept)."""
    try:
        json.loads(text)
    except json.JSONDecodeError:
        print(f"{indent}(Could not decode as JSON)")
    else:
        print(f"{indent}(Successfully decoded as JSON)")


def _print_unmapped(obs, ontology_col, label_col, header):
    """Print up to 10 distinct ``label_col`` values whose ``ontology_col`` is blank or missing.

    Does nothing when ``ontology_col`` is absent. When ``label_col`` is absent a
    note is printed instead of raising ``KeyError``.
    """
    if ontology_col not in obs.columns:
        return
    if label_col not in obs.columns:
        print(f"{header}: cannot list ('{label_col}' column is missing from obs)")
        return
    ontology = obs[ontology_col]
    # An entry counts as unmapped when it is NaN/None or an empty/whitespace string.
    unmapped = ontology.isna() | ontology.astype(str).str.strip().eq("")
    labels = obs.loc[unmapped, label_col].unique()
    print(f"{header}: {list(labels)[:10]}")


def inspect_adata(adata, name="Dataset"):
    """Print a summary of an AnnData-like object; return nothing.

    Reports the shape, the head and column names of ``.obs`` and ``.var``, the
    keys of ``.uns``, a closer look at a few selected ``.uns`` entries
    (including whether string values decode as JSON), and some targeted
    column checks (dataset counts, gene mapping sources, unmapped
    tissues/assays).

    Args:
        adata: Object exposing ``shape``, ``obs``, ``var`` (DataFrames) and
            ``uns`` (mapping). ``None`` is accepted and skipped silently so the
            result of a failed load can be passed straight in.
        name: Label used in the section header.
    """
    if adata is None:
        return
    print(f"\n--- Inspecting {name} ---")
    print("Shape:", adata.shape)

    print("\nOBS (head 5):")
    _show_frame_head(adata.obs)
    print("\nOBS Columns:", list(adata.obs.columns))

    print("\nVAR (head 5):")
    _show_frame_head(adata.var)
    print("\nVAR Columns:", list(adata.var.columns))

    print("\nUNS Keys:", list(adata.uns.keys()))
    # Keys that might be problematic or hold JSON-encoded strings
    keys_to_inspect = ['metadata_sources', 'dataset_info', 'gene_mapping_stats']
    for key in keys_to_inspect:
        if key not in adata.uns:
            continue
        value = adata.uns[key]
        print(f"\nUNS Key: '{key}'")
        print(f"  Type: {type(value)}")
        value_str = str(value)
        suffix = '...' if len(value_str) > 500 else ''
        print(f"  Value (stringified, max 500 chars): {value_str[:500]}{suffix}")
        if _looks_like_json(value):
            _report_json_decoding(value)
        elif isinstance(value, dict):
            # Only the first 5 entries are listed to keep the report short.
            for sub_key, sub_value in list(value.items())[:5]:
                print(f"    Sub-key '{sub_key}' type: {type(sub_value)}")
                # Nested values may themselves be JSON-encoded strings.
                if _looks_like_json(sub_value):
                    _report_json_decoding(sub_value, indent="      ")

    # Check specific obs/var columns
    print("\nSpecific Column Checks:")
    if 'dataset' in adata.obs.columns:
        print("Dataset counts:", adata.obs['dataset'].value_counts().to_dict())
    if 'mapping_source' in adata.var.columns:
        print("Gene mapping source counts:", adata.var['mapping_source'].value_counts().to_dict())
    _print_unmapped(adata.obs, 'tissue_ontology', 'tissue', "Unmapped Tissues (sample)")
    _print_unmapped(adata.obs, 'assay_ontology', 'data_type', "Unmapped Assay Types")

Run ID: run_20250501_193803
Inspecting Preprocessed Data Directory: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/preprocessed_data/run_20250501_193803


In [3]:
# Cell 2: ADNI - load the preprocessed file, then print its summary
adata_adni = load_h5ad_safe(
    os.path.join(PREPROCESSED_DIR, "adni_standardized_preprocessed.h5ad")
)
inspect_adata(adata_adni, name="ADNI Preprocessed")
# Expectation: looks good; var['mapping_source'] is mostly 'reference_mapping'.


--- Loading: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/preprocessed_data/run_20250501_193803/adni_standardized_preprocessed.h5ad ---
Successfully loaded. Shape: (650, 17991)

--- Inspecting ADNI Preprocessed ---
Shape: (650, 17991)

OBS (head 5):


Unnamed: 0_level_0,sample_id,subject_id,dataset,data_type,expression_unit,tissue,platform,processing,sex,age,species,species_ontology,tissue_original,tissue_ontology,data_type_original,assay_ontology,age_original,developmental_stage_ontology,tissue_ontology_confidence
_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
002_S_0413_002_S_0413_gencode_v24_pruned,002_S_0413_002_S_0413_gencode_v24_pruned,002_S_0413,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium
002_S_0729_002_S_0729_gencode_v24_pruned,002_S_0729_002_S_0729_gencode_v24_pruned,002_S_0729,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium
002_S_1155_002_S_1155_gencode_v24_pruned,002_S_1155_002_S_1155_gencode_v24_pruned,002_S_1155,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium
002_S_1261_002_S_1261_gencode_v24_pruned,002_S_1261_002_S_1261_gencode_v24_pruned,002_S_1261,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium
002_S_1268_002_S_1268_gencode_v24_pruned,002_S_1268_002_S_1268_gencode_v24_pruned,002_S_1268,ADNI,Microarray,Normalized intensity,blood,Affymetrix Human Genome U219 Array,gencode_v24,unknown,,human,NCBITaxon:9606,blood,UBERON:0000178,Microarray,,,,medium



OBS Columns: ['sample_id', 'subject_id', 'dataset', 'data_type', 'expression_unit', 'tissue', 'platform', 'processing', 'sex', 'age', 'species', 'species_ontology', 'tissue_original', 'tissue_ontology', 'data_type_original', 'assay_ontology', 'age_original', 'developmental_stage_ontology', 'tissue_ontology_confidence']

VAR (head 5):


Unnamed: 0,gene_id,original_ids,gene_name,gene_type,chromosome,mapping_source,original_gene_id,ensembl_id,mapping_confidence
ENSG00000000003,ENSG00000000003,ENSG00000000003.14,TSPAN6,protein_coding,chrX,reference_mapping,ENSG00000000003,ENSG00000000003,high
ENSG00000000005,ENSG00000000005,ENSG00000000005.5,TNMD,protein_coding,chrX,reference_mapping,ENSG00000000005,ENSG00000000005,high
ENSG00000000419,ENSG00000000419,ENSG00000000419.12,DPM1,protein_coding,chr20,reference_mapping,ENSG00000000419,ENSG00000000419,high
ENSG00000000457,ENSG00000000457,ENSG00000000457.13,SCYL3,protein_coding,chr1,reference_mapping,ENSG00000000457,ENSG00000000457,high
ENSG00000000938,ENSG00000000938,ENSG00000000938.12,FGR,protein_coding,chr1,reference_mapping,ENSG00000000938,ENSG00000000938,high



VAR Columns: ['gene_id', 'original_ids', 'gene_name', 'gene_type', 'chromosome', 'mapping_source', 'original_gene_id', 'ensembl_id', 'mapping_confidence']

UNS Keys: ['dataset_info', 'gencode_version', 'gene_mapping_stats', 'harmonized_gencode_version', 'harmonized_reference_genome', 'metadata_validation', 'ontology_mappings', 'original_gencode_version', 'original_reference_genome', 'processing_date', 'reference_genome', 'subject_demographics']

UNS Key: 'dataset_info'
  Type: <class 'dict'>
  Value (stringified, max 500 chars): {'assay_ontology': 'EFO:0002695', 'data_type': 'microarray', 'description': "Alzheimer's Disease Neuroimaging Initiative microarray data with demographic information", 'expression_unit': 'quantile normalized microarray counts', 'gencode_version': '24', 'genes': np.int64(17991), 'platform': 'Affymetrix Human Genome U219 Array', 'reference': 'https://pmc.ncbi.nlm.nih.gov/articles/PMC7541709/', 'reference_genome': 'hg38', 'samples': np.int64(650), 'source': 'ADNI

In [4]:
# Cell 3: ENCODE - load the preprocessed file, then print its summary
adata_encode = load_h5ad_safe(
    os.path.join(PREPROCESSED_DIR, "encode_standardized_preprocessed.h5ad")
)
inspect_adata(adata_encode, name="ENCODE Preprocessed")
# Expectation: many 'unmapped' entries in var['mapping_source'] and largely
# empty gene info columns. Also look at `.uns['gene_mapping_stats']`.


--- Loading: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/preprocessed_data/run_20250501_193803/encode_standardized_preprocessed.h5ad ---
Successfully loaded. Shape: (7, 65586)

--- Inspecting ENCODE Preprocessed ---
Shape: (7, 65586)

OBS (head 5):


Unnamed: 0_level_0,original_sample_id,cell_line,description,tissue,disease,cell_type,sex,organism,age,ethnicity,subject_id,geo_id,nucleic_acid_type,depletion,size_range,strand_specificity,cell_type_info,extraction_method,data_type,expression_unit,assay,dataset,sample_id,species,species_ontology,tissue_original,tissue_ontology,data_type_original,assay_ontology,age_original,developmental_stage_ontology,tissue_ontology_confidence
_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
ENCFF244DNJ,ENCFF244DNJ,A549,Human lung carcinoma cell line,lung,adenocarcinoma,epithelial,male,human,58,European,ENCDO451RUA,SAMN05733878,polyadenylated mRNA,rRNA,>200,Strand-specific (reverse),"{'anatomical_entity_id': 'UBERON:0002048', 'ce...",polyA_plus,RNA-seq,TPM,,ENCODE,,human,NCBITaxon:9606,lung,UBERON:0002048,RNA-seq,EFO:0009922,58,HsapDv:0000087,medium
ENCFF685WJV,ENCFF685WJV,Caki2,Human kidney clear cell carcinoma cell line,kidney,clear cell renal cell carcinoma,epithelial,male,human,69,European,ENCDO869AAI,SAMN04284635,RNA,rRNA,>200,Strand-specific (reverse),"{'anatomical_entity_id': 'UBERON:0002113', 'ce...",,RNA-seq,TPM,RNA-seq (total RNA-seq),ENCODE,,human,NCBITaxon:9606,kidney,UBERON:0002113,RNA-seq,EFO:0009922,69,HsapDv:0000087,medium
ENCFF640FPG,ENCFF640FPG,GM23248,Human skin fibroblast cell line,skin,normal,Fibroblast,male,human,53,European,ENCDO467QPX,SAMN04284514,RNA,rRNA,>200,Strand-specific (reverse),"{'anatomical_entity_id': 'UBERON:0002097', 'ce...",,RNA-seq,TPM,RNA-seq (total RNA-seq),ENCODE,,human,NCBITaxon:9606,skin,UBERON:0002097,RNA-seq,EFO:0009922,53,HsapDv:0000087,medium
ENCFF863QWG,ENCFF863QWG,HepG2,Human liver hepatocellular carcinoma cell line,liver,hepatocellular carcinoma,epithelial,male,human,15,European,ENCDO886MPB,SAMN04284581,RNA,rRNA,>200,Strand-specific (reverse),"{'anatomical_entity_id': 'UBERON:0002107', 'ce...",,RNA-seq,TPM,RNA-seq (total RNA-seq),ENCODE,,human,NCBITaxon:9606,liver,UBERON:0002107,RNA-seq,EFO:0009922,15,HsapDv:0000086,medium
ENCFF171FQU,ENCFF171FQU,K562,Human chronic myelogenous leukemia cell line,bone marrow,chronic myelogenous leukemia,lymphoblast,female,human,53,unknown,ENCDO000AAL,SAMN04284550,RNA,rRNA,>200,Strand-specific (reverse),"{'anatomical_entity_id': 'UBERON:0002371', 'ce...",,RNA-seq,TPM,RNA-seq (total RNA-seq),ENCODE,,human,NCBITaxon:9606,bone marrow,UBERON:0002371,RNA-seq,EFO:0009922,53,HsapDv:0000087,medium



OBS Columns: ['original_sample_id', 'cell_line', 'description', 'tissue', 'disease', 'cell_type', 'sex', 'organism', 'age', 'ethnicity', 'subject_id', 'geo_id', 'nucleic_acid_type', 'depletion', 'size_range', 'strand_specificity', 'cell_type_info', 'extraction_method', 'data_type', 'expression_unit', 'assay', 'dataset', 'sample_id', 'species', 'species_ontology', 'tissue_original', 'tissue_ontology', 'data_type_original', 'assay_ontology', 'age_original', 'developmental_stage_ontology', 'tissue_ontology_confidence']

VAR (head 5):


Unnamed: 0,gene_id,original_gene_id,ensembl_id,gene_name,gene_type,chromosome,mapping_source,mapping_confidence
0,ENSG00000161203,ENST00000476434.1|ENSG00000161203.13|OTTHUMG00...,ENSG00000161203,AP2M1,protein_coding,chr3,encode_mapping,high
1,ENSG00000124126,ENST00000496915.1|ENSG00000124126.13|OTTHUMG00...,ENSG00000124126,PREX1,protein_coding,chr20,encode_mapping,high
2,ENSG00000185972,ENST00000335119.3|ENSG00000185972.5|OTTHUMG000...,ENSG00000185972,CCIN,protein_coding,chr9,encode_mapping,high
3,ENSG00000167695,ENST00000308278.12|ENSG00000167695.14|OTTHUMG0...,ENSG00000167695,FAM57A,protein_coding,chr17,encode_mapping,high
4,ENSG00000003402,ENST00000470178.6|ENSG00000003402.19|OTTHUMG00...,ENSG00000003402,CFLAR,protein_coding,chr2,encode_mapping,high



VAR Columns: ['gene_id', 'original_gene_id', 'ensembl_id', 'gene_name', 'gene_type', 'chromosome', 'mapping_source', 'mapping_confidence']

UNS Keys: ['assay_ontology', 'cell_type_info', 'data_type', 'dataset_info', 'gencode_version', 'gene_mapping_stats', 'genome_info', 'harmonized_gencode_version', 'harmonized_reference_genome', 'metadata_sources']

UNS Key: 'metadata_sources'
  Type: <class 'numpy.ndarray'>
  Value (stringified, max 500 chars): ['' '']

UNS Key: 'dataset_info'
  Type: <class 'dict'>
  Value (stringified, max 500 chars): {'cell_lines': {'A549': {'age': '58', 'cell_type': 'epithelial', 'cell_type_info': {'anatomical_entity_id': 'UBERON:0002048', 'cell_type_id': 'CL:0000066'}, 'depletion': 'rRNA', 'description': 'Human lung carcinoma cell line', 'disease': 'adenocarcinoma', 'ethnicity': 'European', 'geo_id': 'SAMN05733878', 'nucleic_acid_type': 'polyadenylated mRNA', 'organism': 'human', 'sex': 'male', 'size_range': '>200', 'strand_specificity': 'Strand-specific (reve

In [5]:
# Cell 4: GTEx - load the preprocessed file, then print its summary
adata_gtex = load_h5ad_safe(
    os.path.join(PREPROCESSED_DIR, "gtex_standardized_preprocessed.h5ad")
)
inspect_adata(adata_gtex, name="GTEx Preprocessed")
# Expectation: loads. `.uns['metadata_sources']` may be missing or look odd if
# the save failed; also review the unmapped tissues/assays.


--- Loading: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/preprocessed_data/run_20250501_193803/gtex_standardized_preprocessed.h5ad ---
Successfully loaded. Shape: (19616, 58988)

--- Inspecting GTEx Preprocessed ---
Shape: (19616, 58988)

OBS (head 5):


Unnamed: 0_level_0,SMATSSCR,SMCENTER,SMPTHNTS,rna_integrity_number,broad_tissue,tissue,SMUBRID,ischemic_time,SMTSPAX,array_batch,SMNABTCHT,SMNABTCHD,batch,SMGEBTCHD,SMGEBTCHT,ANALYTE_TYPE,SMAFRZE,SMGTC,SMRDTTL,SMALTTL,SMALTALG,SMSUPALG,SMRDLGTH,SMVQCFL,SMLMAPQ,...,SMSMRTHQ,SMPRERDHQ,SMPRERTHQ,SMSMGNDT,SMPREGNDT,SMRDLNMN,SMRDLNMD,SMRDLNSD,subject_id,sex,age,DTHHRDY,dataset,original_sample_id,data_type,expression_unit,species,species_ontology,tissue_original,tissue_ontology,data_type_original,assay_ontology,age_original,developmental_stage_ontology,tissue_ontology_confidence
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
GTEX-1117F-0005-SM-HL9SH,,B1,,8.3,Blood,whole blood,UBERON:0013756,1188.0,,BP-38553,RNA isolation PAXgene Blood RNA (Manual),5/3/13,LCSET-13764,,TruSeq.v1,RNA:Total RNA,RNASEQ,,134604994.0,367633937.0,232956008.0,72935.0,76.0,16433925.0,304143172.0,...,,,,,,,,,GTEX-1117F,unknown,60-69,4.0,GTEx,,RNA-seq,,human,NCBITaxon:9606,Whole Blood,UBERON:0000178,,EFO:0009922,60-69,,medium
GTEX-1117F-0011-R10b-SM-GI4VE,,"B1, A1",,7.2,Brain,brain - frontal cortex (ba9),UBERON:0009834,1193.0,,BP-42319,RNA Isolation via QIAGEN Spin Column,8/14/13,LCSET-12010,,TruSeq.v1,RNA:Total RNA,RNASEQ,,67350184.0,75528103.0,8050459.0,127460.0,76.0,3574302.0,12367264.0,...,,,,,,,,,GTEX-1117F,unknown,60-69,4.0,GTEx,,RNA-seq,,human,NCBITaxon:9606,Brain - Frontal Cortex (BA9),UBERON:0013529,,EFO:0009922,60-69,,medium
GTEX-1117F-0011-R11b-SM-GIN8R,,"B1, A1",,6.0,Brain,brain - cerebellar hemisphere,UBERON:0002037,1193.0,,BP-42319,RNA Isolation via QIAGEN Spin Column,8/14/13,LCSET-12011,,TruSeq.v1,RNA:Total RNA,RNASEQ,,99021352.0,122532321.0,23266593.0,244376.0,76.0,4879614.0,33578345.0,...,,,,,,,,,GTEX-1117F,unknown,60-69,4.0,GTEx,,RNA-seq,,human,NCBITaxon:9606,Brain - Cerebellar Hemisphere,UBERON:0002245,,EFO:0009922,60-69,,medium
GTEX-1117F-0011-R2b-SM-GI4VL,,"B1, A1",,5.7,Brain,brain - substantia nigra,UBERON:0002038,1193.0,,BP-42208,RNA Isolation via QIAGEN Spin Column,8/12/13,LCSET-12010,,TruSeq.v1,RNA:Total RNA,RNASEQ,,102541796.0,126416358.0,23734034.0,140528.0,76.0,5688018.0,35903889.0,...,,,,,,,,,GTEX-1117F,unknown,60-69,4.0,GTEx,,RNA-seq,,human,NCBITaxon:9606,Brain - Substantia nigra,UBERON:0002038,,EFO:0009922,60-69,,medium
GTEX-1117F-0011-R3a-SM-GJ3PJ,,"B1, A1",,7.1,Brain,brain - anterior cingulate cortex (ba24),UBERON:0009835,1193.0,,BP-42208,RNA Isolation via QIAGEN Spin Column,8/12/13,LCSET-12012,,TruSeq.v1,RNA:Total RNA,RNASEQ,,87919278.0,104564329.0,16487319.0,157732.0,76.0,3586415.0,24566291.0,...,,,,,,,,,GTEX-1117F,unknown,60-69,4.0,GTEx,,RNA-seq,,human,NCBITaxon:9606,Brain - Anterior cingulate cortex (BA24),UBERON:0009835,,EFO:0009922,60-69,,medium



OBS Columns: ['SMATSSCR', 'SMCENTER', 'SMPTHNTS', 'rna_integrity_number', 'broad_tissue', 'tissue', 'SMUBRID', 'ischemic_time', 'SMTSPAX', 'array_batch', 'SMNABTCHT', 'SMNABTCHD', 'batch', 'SMGEBTCHD', 'SMGEBTCHT', 'ANALYTE_TYPE', 'SMAFRZE', 'SMGTC', 'SMRDTTL', 'SMALTTL', 'SMALTALG', 'SMSUPALG', 'SMRDLGTH', 'SMVQCFL', 'SMLMAPQ', 'SMUMPRD', 'SMUNPDRD', 'SMMPPD', 'SMMAPRT', 'SMMPPDUN', 'SMUNMPRT', 'SMMPDP', 'SMDPMPRT', 'SMMPPDXG', 'SMMPDPXG', 'SMDPRTXG', 'SMCHMRD', 'SMCHMRT', 'SMMPPDPR', 'SMMPHQRD', 'SMMPHQRT', 'SMMPLQRD', 'SMSPLTRT', 'SME1MPRD', 'SME2MPRD', 'SME1MPRT', 'SME2MPRT', 'SME1MMB', 'SME2MMB', 'SME1TTLB', 'SME2TTLB', 'SME1MMRT', 'SME2MMRT', 'SMTTLMM', 'SMTTLB', 'SMBSMMRT', 'SMESTLBS', 'SMEXNCRD', 'SMEXNCRT', 'SMEXPEFF', 'SMNTRNRD', 'SMNTRNRT', 'SMNTRARD', 'SMNTRART', 'SMNTERRD', 'SMNTERRT', 'SMAMBRD', 'SMAMBRT', 'SMNTEXC', 'SMDSCRT', 'SMEXNCRTHQ', 'SMNTRNRTHQ', 'SMNTRARTHQ', 'SMNTERRTHQ', 'SMAMBRTHQ', 'SME1SNSE', 'SME2SNSE', 'SME1ANTI', 'SME2ANTI', 'SME1PCTS', 'SME2PCTS', 'SMG

Unnamed: 0,gene_id,original_ids,gene_name,gene_type,chromosome,mapping_source,original_gene_id,ensembl_id,mapping_confidence
ENSG00000000003,ENSG00000000003,ENSG00000000003.15,TSPAN6,protein_coding,chrX,reference_mapping,ENSG00000000003,ENSG00000000003,high
ENSG00000000005,ENSG00000000005,ENSG00000000005.6,TNMD,protein_coding,chrX,reference_mapping,ENSG00000000005,ENSG00000000005,high
ENSG00000000419,ENSG00000000419,ENSG00000000419.14,DPM1,protein_coding,chr20,reference_mapping,ENSG00000000419,ENSG00000000419,high
ENSG00000000457,ENSG00000000457,ENSG00000000457.14,SCYL3,protein_coding,chr1,reference_mapping,ENSG00000000457,ENSG00000000457,high
ENSG00000000460,ENSG00000000460,ENSG00000000460.17,C1orf112,protein_coding,chr1,reference_mapping,ENSG00000000460,ENSG00000000460,high



VAR Columns: ['gene_id', 'original_ids', 'gene_name', 'gene_type', 'chromosome', 'mapping_source', 'original_gene_id', 'ensembl_id', 'mapping_confidence']

UNS Keys: ['dataset_info', 'dataset_version', 'gencode_version', 'gene_mapping_stats', 'harmonized_gencode_version', 'harmonized_reference_genome', 'metadata_sources']

UNS Key: 'metadata_sources'
  Type: <class 'numpy.ndarray'>
  Value (stringified, max 500 chars): ['' '']

UNS Key: 'dataset_info'
  Type: <class 'dict'>
  Value (stringified, max 500 chars): {'data_type': 'RNA-seq', 'expression_unit': 'TPM', 'gencode_version': np.int64(24), 'genes': np.int64(58988), 'samples': np.int64(19616), 'source': 'GTEx', 'subject_count': np.int64(946), 'tissue_count': np.int64(54), 'version': 'v10'}
    Sub-key 'data_type' type: <class 'str'>
    Sub-key 'expression_unit' type: <class 'str'>
    Sub-key 'gencode_version' type: <class 'numpy.int64'>
    Sub-key 'genes' type: <class 'numpy.int64'>
    Sub-key 'samples' type: <class 'numpy.int6

In [6]:
# Cell 5: MAGE - load the preprocessed file, then print its summary
adata_mage = load_h5ad_safe(
    os.path.join(PREPROCESSED_DIR, "mage_standardized_preprocessed.h5ad")
)
inspect_adata(adata_mage, name="MAGE Preprocessed")
# Expectation: loads. Review `.uns['metadata_sources']` and
# `.uns['dataset_info']['data_repository']`, plus the unmapped tissues ('lymphoblast').


--- Loading: /mnt/czi-sci-ai/intrinsic-variation-gene-ex/rnaseq/preprocessed_data/run_20250501_193803/mage_standardized_preprocessed.h5ad ---
Successfully loaded. Shape: (731, 19428)

--- Inspecting MAGE Preprocessed ---
Shape: (731, 19428)

OBS (head 5):


Unnamed: 0_level_0,sample_id,donor_id,subject_id,tissue,dataset,data_type,expression_unit,is_gencode,is_ensembl,sex,age,species,species_ontology,tissue_original,tissue_ontology,data_type_original,assay_ontology,age_original,developmental_stage_ontology,tissue_ontology_confidence
_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
NA06985_NA06985,NA06985_NA06985,NA06985,NA06985,lymphoblast,MAGE,rna-seq,TPM,False,True,unknown,,human,NCBITaxon:9606,lymphoblast,,RNA-seq,EFO:0009922,,,none
NA07000_NA07000,NA07000_NA07000,NA07000,NA07000,lymphoblast,MAGE,rna-seq,TPM,False,True,unknown,,human,NCBITaxon:9606,lymphoblast,,RNA-seq,EFO:0009922,,,none
NA11919_NA11919,NA11919_NA11919,NA11919,NA11919,lymphoblast,MAGE,rna-seq,TPM,False,True,unknown,,human,NCBITaxon:9606,lymphoblast,,RNA-seq,EFO:0009922,,,none
NA11930_NA11930,NA11930_NA11930,NA11930,NA11930,lymphoblast,MAGE,rna-seq,TPM,False,True,unknown,,human,NCBITaxon:9606,lymphoblast,,RNA-seq,EFO:0009922,,,none
NA11932_NA11932,NA11932_NA11932,NA11932,NA11932,lymphoblast,MAGE,rna-seq,TPM,False,True,unknown,,human,NCBITaxon:9606,lymphoblast,,RNA-seq,EFO:0009922,,,none



OBS Columns: ['sample_id', 'donor_id', 'subject_id', 'tissue', 'dataset', 'data_type', 'expression_unit', 'is_gencode', 'is_ensembl', 'sex', 'age', 'species', 'species_ontology', 'tissue_original', 'tissue_ontology', 'data_type_original', 'assay_ontology', 'age_original', 'developmental_stage_ontology', 'tissue_ontology_confidence']

VAR (head 5):


Unnamed: 0,gene_id,original_ids,gene_name,gene_type,chromosome,mapping_source,original_gene_id,ensembl_id,mapping_confidence
ENSG00000000003,ENSG00000000003,ENSG00000000003.14,TSPAN6,protein_coding,chrX,reference_mapping,ENSG00000000003,ENSG00000000003,high
ENSG00000000419,ENSG00000000419,ENSG00000000419.12,DPM1,protein_coding,chr20,reference_mapping,ENSG00000000419,ENSG00000000419,high
ENSG00000000457,ENSG00000000457,ENSG00000000457.13,SCYL3,protein_coding,chr1,reference_mapping,ENSG00000000457,ENSG00000000457,high
ENSG00000000460,ENSG00000000460,ENSG00000000460.16,C1orf112,protein_coding,chr1,reference_mapping,ENSG00000000460,ENSG00000000460,high
ENSG00000000938,ENSG00000000938,ENSG00000000938.12,FGR,protein_coding,chr1,reference_mapping,ENSG00000000938,ENSG00000000938,high



VAR Columns: ['gene_id', 'original_ids', 'gene_name', 'gene_type', 'chromosome', 'mapping_source', 'original_gene_id', 'ensembl_id', 'mapping_confidence']

UNS Keys: ['assay_ontology', 'cell_type_info', 'data_type', 'dataset_info', 'gencode_version', 'gene_mapping_stats', 'genome_info', 'harmonized_gencode_version', 'harmonized_reference_genome', 'metadata_sources']

UNS Key: 'metadata_sources'
  Type: <class 'numpy.ndarray'>
  Value (stringified, max 500 chars): ['' '']

UNS Key: 'dataset_info'
  Type: <class 'dict'>
  Value (stringified, max 500 chars): {'accession': 'PRJNA851328', 'cell_type': 'lymphoblastoid cell lines', 'continental_groups': np.int64(5), 'data_repository': '[{"name": "Sequence Read Archive (SRA)", "accession": "PRJNA851328"}, {"name": "Zenodo", "doi": "10.5281/zenodo.10535719"}]', 'data_type': 'RNA-seq', 'donor_count': np.int64(731), 'expression_unit': 'TPM', 'gencode_version': np.int64(24), 'genes': np.int64(19428), 'library_count': np.int64(779), 'populations':