AnnData Object Build

## Construct AnnData objects from matrix txt files

In [38]:
import pandas as pd
import anndata as ad
import scanpy as sc 
import os
import GEOparse

In [39]:
AM_KP_neg_txt = 'data/purified/GSM5583415_PN0162_0001_counts_purified_downsampled.txt'
AM_KP_pos_txt = 'data/purified/GSM5583416_PN0162_0002_counts_purified_downsampled.txt'
IM_KP_neg_txt = 'data/purified/GSM5583417_PN0162_0003_counts_purified_downsampled.txt'
IM_KP_pos_txt = 'data/purified/GSM5583418_PN0162_0004_counts_purified_downsampled.txt'
AM_Control_txt = 'data/purified/GSM5583419_PN0162_0005_counts_purified_downsampled.txt'
IM_Control_txt = 'data/purified/GSM5583420_PN0162_0006_counts_purified_downsampled.txt'

In [29]:
AM_KP_neg_df = pd.read_csv(AM_KP_neg_txt, sep='\t', index_col=0).T # flip rows x col
AM_KP_pos_df = pd.read_csv(AM_KP_pos_txt, sep='\t', index_col=0).T
IM_KP_neg_df = pd.read_csv(IM_KP_neg_txt, sep='\t', index_col=0).T
IM_KP_pos_df = pd.read_csv(IM_KP_pos_txt, sep='\t', index_col=0).T
AM_Control_df = pd.read_csv(AM_Control_txt, sep='\t', index_col=0).T
IM_Control_df = pd.read_csv(IM_Control_txt, sep='\t', index_col=0).T






In [30]:
gsm_file_map = {
    'GSM5583415': AM_KP_neg_txt,
    'GSM5583416': AM_KP_pos_txt,
    'GSM5583417': IM_KP_neg_txt,
    'GSM5583418': IM_KP_pos_txt,
    'GSM5583419': AM_Control_txt,
    'GSM5583420': IM_Control_txt,
}

In [31]:
list_of_dfs_to_concat = []
for gsm_id, file_path in gsm_file_map.items():
    # Load each file: genes as index, cells as columns
    df_raw = pd.read_csv(file_path, sep='\t', index_col=0)
    # Transpose to cells x genes
    df_transposed = df_raw.T
    # Add a column to store the original sample (GSM) ID for each cell
    df_transposed['original_sample_id'] = gsm_id
    list_of_dfs_to_concat.append(df_transposed)
    print(f"Loaded {gsm_id}: shape {df_transposed.shape}")

Loaded GSM5583415: shape (422, 15548)
Loaded GSM5583416: shape (422, 15548)
Loaded GSM5583417: shape (422, 15548)
Loaded GSM5583418: shape (422, 15548)
Loaded GSM5583419: shape (422, 15548)
Loaded GSM5583420: shape (422, 15548)


In [34]:
# Concatenate all DataFrames into a single DataFrame
# `axis=0` stacks them row-wise (adding more cells)
combined_counts_df = pd.concat(list_of_dfs_to_concat, axis=0)
# The 'original_sample_id' column will be used for metadata.
# Remove it from the main matrix before creating AnnData
original_sample_id_series = combined_counts_df['original_sample_id']
combined_counts_df = combined_counts_df.drop(columns=['original_sample_id'])
print("\nCombined counts_df shape (total cells x total genes):", combined_counts_df.shape)
print("Combined counts_df head:")
print(combined_counts_df.head())


Combined counts_df shape (total cells x total genes): (2532, 15547)
Combined counts_df head:
                    Mrpl15  Lypla1  Gm37988  Tcea1  Atp6v1h  Rb1cc1  \
TGCTTCGTCTGTACAG_1       1       2        0      8        2       1   
AGTAGCTAGTAACCTC_1       1       1        0      0        2       0   
GTGGAGAAGAGTACCG_1       0       0        0      0        0       0   
GTAATGCTCTCATTAC_1       2       2        0      0        0       1   
CGCATAAAGTCGAAAT_1       3       0        0      3        4       1   

                    4732440D04Rik  Pcmtd1  Gm26901  Rrs1  ...  Pacrg  Scube3  \
TGCTTCGTCTGTACAG_1              0       1        0     0  ...      0       0   
AGTAGCTAGTAACCTC_1              0       3        0     1  ...      0       0   
GTGGAGAAGAGTACCG_1              0       0        0     0  ...      0       0   
GTAATGCTCTCATTAC_1              0       0        0     0  ...      0       0   
CGCATAAAGTCGAAAT_1              0       2        0     2  ...      0       0   

In [40]:
# --- 2. Get the Phenotypic Data using GEOparse ---
gse_id = "GSE184290"
download_dir = './GEO_downloads'
os.makedirs(download_dir, exist_ok=True)
gse = GEOparse.get_GEO(geo=gse_id, destdir=download_dir)

all_sample_metadata = []
for gsm_id, gsm_obj in gse.gsms.items():
    sample_metadata = {key: value[0] if value else None for key, value in gsm_obj.metadata.items()}
    sample_metadata['sample_id'] = gsm_id
    all_sample_metadata.append(sample_metadata)

pheno_data_df = pd.DataFrame(all_sample_metadata)
pheno_data_df = pheno_data_df.set_index('sample_id')

print("\nPhenotypic Data DataFrame shape (samples x features):", pheno_data_df.shape)
print("Phenotypic Data DataFrame head:")
print(pheno_data_df.head())

16-Jun-2025 18:35:09 INFO GEOparse - Downloading ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE184nnn/GSE184290/soft/GSE184290_family.soft.gz to ./GEO_downloads/GSE184290_family.soft.gz
16-Jun-2025 18:35:09 INFO utils - Downloading ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE184nnn/GSE184290/soft/GSE184290_family.soft.gz to ./GEO_downloads/GSE184290_family.soft.gz





16-Jun-2025 18:35:11 INFO GEOparse - Parsing ./GEO_downloads/GSE184290_family.soft.gz: 
16-Jun-2025 18:35:11 DEBUG GEOparse - DATABASE: GeoMiame
16-Jun-2025 18:35:11 DEBUG GEOparse - SERIES: GSE184290
16-Jun-2025 18:35:11 DEBUG GEOparse - PLATFORM: GPL24247
16-Jun-2025 18:35:11 DEBUG GEOparse - SAMPLE: GSM5583415
16-Jun-2025 18:35:11 DEBUG GEOparse - SAMPLE: GSM5583416
16-Jun-2025 18:35:11 DEBUG GEOparse - SAMPLE: GSM5583417
16-Jun-2025 18:35:11 DEBUG GEOparse - SAMPLE: GSM5583418
16-Jun-2025 18:35:11 DEBUG GEOparse - SAMPLE: GSM5583419
16-Jun-2025 18:35:11 DEBUG GEOparse - SAMPLE: GSM5583420




Phenotypic Data DataFrame shape (samples x features): (6, 35)
Phenotypic Data DataFrame head:
                 title geo_accession                 status submission_date  \
sample_id                                                                     
GSM5583415      AM_KP-    GSM5583415  Public on Sep 16 2022     Sep 16 2021   
GSM5583416      AM_KP+    GSM5583416  Public on Sep 16 2022     Sep 16 2021   
GSM5583417      IM_KP-    GSM5583417  Public on Sep 16 2022     Sep 16 2021   
GSM5583418      IM_KP+    GSM5583418  Public on Sep 16 2022     Sep 16 2021   
GSM5583419  AM_Control    GSM5583419  Public on Sep 16 2022     Sep 16 2021   

           last_update_date type channel_count source_name_ch1  organism_ch1  \
sample_id                                                                      
GSM5583415      Sep 16 2022  SRA             1            Lung  Mus musculus   
GSM5583416      Sep 16 2022  SRA             1            Lung  Mus musculus   
GSM5583417      Sep 16 2022  S

In [18]:
from GEOparse import get_GEO


In [41]:
# --- 3. Create the AnnData object ---
# The combined_counts_df is already cells x genes, so no further transpose needed for ad.AnnData
adata = ad.AnnData(combined_counts_df)

print("\nAnnData object created from combined counts. Shape (cells x genes):", adata.shape)



AnnData object created from combined counts. Shape (cells x genes): (2532, 15547)


  adata = ad.AnnData(combined_counts_df)


In [44]:
# --- 4. Populate .obs (observation/cell metadata) - SIMPLER AND MORE ROBUST ---

# First, add the 'original_sample_id' column to adata.obs
# Ensure the series index aligns with adata.obs_names
adata.obs['original_sample_id'] = original_sample_id_series.loc[adata.obs_names]

# Now, perform the merge directly on adata.obs.
# The 'original_sample_id' column in adata.obs will be matched with the index of pheno_data_df.
# Use .copy() to ensure you're working on a new DataFrame slice and not modifying inplace unexpectedly.
adata.obs = adata.obs.merge(
    pheno_data_df,
    left_on='original_sample_id', # Column in adata.obs to use for merging
    right_index=True,             # Use the index of pheno_data_df for merging
    how='left'
).set_index(adata.obs_names) # IMPORTANT: Re-set the index back to original cell barcodes


In [45]:
# --- 5. Populate .var (variable/gene metadata) ---
adata.var_names_make_unique()


print("\nFinal AnnData object:")
print(adata)
print("\nadata.obs head (with merged metadata):")
print(adata.obs.head())
print("\nColumns in adata.obs:")
print(adata.obs.columns)


Final AnnData object:
AnnData object with n_obs × n_vars = 2532 × 15547
    obs: 'original_sample_id', 'title_x', 'geo_accession_x', 'status_x', 'submission_date_x', 'last_update_date_x', 'type_x', 'channel_count_x', 'source_name_ch1_x', 'organism_ch1_x', 'taxid_ch1_x', 'characteristics_ch1_x', 'molecule_ch1_x', 'extract_protocol_ch1_x', 'description_x', 'data_processing_x', 'platform_id_x', 'contact_name_x', 'contact_email_x', 'contact_laboratory_x', 'contact_department_x', 'contact_institute_x', 'contact_address_x', 'contact_city_x', 'contact_state_x', 'contact_zip/postal_code_x', 'contact_country_x', 'instrument_model_x', 'library_selection_x', 'library_source_x', 'library_strategy_x', 'relation_x', 'supplementary_file_1_x', 'supplementary_file_2_x', 'series_id_x', 'data_row_count_x', 'title_y', 'geo_accession_y', 'status_y', 'submission_date_y', 'last_update_date_y', 'type_y', 'channel_count_y', 'source_name_ch1_y', 'organism_ch1_y', 'taxid_ch1_y', 'characteristics_ch1_y', 'molecu

In [19]:
gse = get_GEO(geo="GSE184290", destdir="./")

16-Jun-2025 18:10:42 INFO GEOparse - Downloading ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE184nnn/GSE184290/soft/GSE184290_family.soft.gz to ./GSE184290_family.soft.gz
16-Jun-2025 18:10:42 INFO utils - Downloading ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE184nnn/GSE184290/soft/GSE184290_family.soft.gz to ./GSE184290_family.soft.gz





16-Jun-2025 18:10:43 INFO GEOparse - Parsing ./GSE184290_family.soft.gz: 
16-Jun-2025 18:10:43 DEBUG GEOparse - DATABASE: GeoMiame
16-Jun-2025 18:10:43 DEBUG GEOparse - SERIES: GSE184290
16-Jun-2025 18:10:43 DEBUG GEOparse - PLATFORM: GPL24247
16-Jun-2025 18:10:43 DEBUG GEOparse - SAMPLE: GSM5583415
16-Jun-2025 18:10:43 DEBUG GEOparse - SAMPLE: GSM5583416
16-Jun-2025 18:10:43 DEBUG GEOparse - SAMPLE: GSM5583417
16-Jun-2025 18:10:43 DEBUG GEOparse - SAMPLE: GSM5583418
16-Jun-2025 18:10:43 DEBUG GEOparse - SAMPLE: GSM5583419
16-Jun-2025 18:10:43 DEBUG GEOparse - SAMPLE: GSM5583420





In [46]:
# Define the file path where you want to save it
output_file = "my_single_cell_data.h5ad"

# Save the AnnData object
adata.write(output_file)

In [21]:
# Initialize an empty list to store metadata for each sample
all_sample_metadata = []

# Iterate through each GSM (sample) in the GSE object
for gsm_id, gsm_obj in gse.gsms.items():
    # The metadata for each sample is in gsm_obj.metadata
    # It's a dictionary of lists (because some attributes can have multiple values)
    sample_metadata = {key: value[0] if value else None for key, value in gsm_obj.metadata.items()}
    sample_metadata['sample_id'] = gsm_id # Add the GSM ID itself
    all_sample_metadata.append(sample_metadata)

In [23]:
pheno_data_df = pd.DataFrame(all_sample_metadata)
pheno_data_df = pheno_data_df.set_index('sample_id')

print("Phenotypic Data DataFrame:")
print(pheno_data_df.head())

Phenotypic Data DataFrame:
                 title geo_accession                 status submission_date  \
sample_id                                                                     
GSM5583415      AM_KP-    GSM5583415  Public on Sep 16 2022     Sep 16 2021   
GSM5583416      AM_KP+    GSM5583416  Public on Sep 16 2022     Sep 16 2021   
GSM5583417      IM_KP-    GSM5583417  Public on Sep 16 2022     Sep 16 2021   
GSM5583418      IM_KP+    GSM5583418  Public on Sep 16 2022     Sep 16 2021   
GSM5583419  AM_Control    GSM5583419  Public on Sep 16 2022     Sep 16 2021   

           last_update_date type channel_count source_name_ch1  organism_ch1  \
sample_id                                                                      
GSM5583415      Sep 16 2022  SRA             1            Lung  Mus musculus   
GSM5583416      Sep 16 2022  SRA             1            Lung  Mus musculus   
GSM5583417      Sep 16 2022  SRA             1            Lung  Mus musculus   
GSM5583418      Sep