In [20]:
# Create a data directory (exist_ok=True makes this safe to re-run)
import os
os.makedirs("data", exist_ok=True)

# Download the GEO raw tar file: the supplementary RAW archive of series
# GSE297365 from the NCBI FTP server. -O fixes the local filename, so
# re-running this cell fetches the archive again and overwrites it.
!wget -O data/GSE297365_RAW.tar \
ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE297nnn/GSE297365/suppl/GSE297365_RAW.tar

--2025-12-21 06:27:27--  ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE297nnn/GSE297365/suppl/GSE297365_RAW.tar
           => ‘data/GSE297365_RAW.tar’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.31, 130.14.250.7, 130.14.250.10, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.31|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /geo/series/GSE297nnn/GSE297365/suppl ... done.
==> SIZE GSE297365_RAW.tar ... 357468160
==> PASV ... done.    ==> RETR GSE297365_RAW.tar ... done.
Length: 357468160 (341M) (unauthoritative)


2025-12-21 06:27:34 (61.9 MB/s) - ‘data/GSE297365_RAW.tar’ saved [357468160]



In [21]:
# Extract the tar archive into data/
# (-x extract, -v list each member, -f archive path, -C target directory)
!tar -xvf data/GSE297365_RAW.tar -C data/

GSM8989497_105_2g_ABT_263_raw_expression.h5
GSM8989498_105_2g_DMSO_raw_expression.h5
GSM8989499_HD_7HA_ABT_263_raw_expression.h5
GSM8989500_HD_7HA_DMSO_raw_expression.h5


In [22]:
# Confirm the extracted .h5 files sit next to the downloaded archive
!ls data/

GSE297365_RAW.tar
GSM8989497_105_2g_ABT_263_raw_expression.h5
GSM8989498_105_2g_DMSO_raw_expression.h5
GSM8989499_HD_7HA_ABT_263_raw_expression.h5
GSM8989500_HD_7HA_DMSO_raw_expression.h5


In [23]:
# Walk data/ (at most 3 levels deep) to check whether the archive unpacked
# into nested sub-directories
!find data -maxdepth 3

data
data/GSE297365_RAW.tar
data/GSM8989497_105_2g_ABT_263_raw_expression.h5
data/GSM8989500_HD_7HA_DMSO_raw_expression.h5
data/GSM8989498_105_2g_DMSO_raw_expression.h5
data/GSM8989499_HD_7HA_ABT_263_raw_expression.h5


In [24]:
!pip install scanpy anndata h5py



In [25]:
import h5py

# Peek at the top-level HDF5 groups of one sample before parsing it
file = "data/GSM8989497_105_2g_ABT_263_raw_expression.h5"

with h5py.File(file, mode="r") as h5_handle:
    top_level_groups = [group_name for group_name in h5_handle.keys()]
    print(top_level_groups)

['matrix']


In [26]:
import scanpy as sc

# Sample 1 of 4 in the archive: 105_2g / ABT_263
file = "data/GSM8989497_105_2g_ABT_263_raw_expression.h5"

# Parse the 10x HDF5 matrix into an AnnData object
# (rows/obs = barcodes, columns/var = genes)
adata = sc.read_10x_h5(file)
# Bare last expression: display the object's summary
adata

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 1851517 × 60623
    var: 'gene_ids', 'feature_types', 'genome'

In [27]:
# Per-barcode table: right after loading it carries only the barcode index
adata.obs.head()


AAACCCAAGAAACCCA-1
AAACCCAAGAAAGCGA-1
AAACCCAAGAAATCCA-1
AAACCCAAGAAATTGC-1
AAACCCAAGAACAGGA-1


In [28]:
# Per-gene table: Ensembl ID, feature type and reference genome, indexed by gene symbol
adata.var.head(10)

Unnamed: 0,gene_ids,feature_types,genome
DDX11L1,ENSG00000223972,Gene Expression,refdata-cellranger-GRC38-3.1.0
WASH7P,ENSG00000227232,Gene Expression,refdata-cellranger-GRC38-3.1.0
MIR6859-1,ENSG00000278267,Gene Expression,refdata-cellranger-GRC38-3.1.0
MIR1302-2HG,ENSG00000243485,Gene Expression,refdata-cellranger-GRC38-3.1.0
MIR1302-2,ENSG00000284332,Gene Expression,refdata-cellranger-GRC38-3.1.0
FAM138A,ENSG00000237613,Gene Expression,refdata-cellranger-GRC38-3.1.0
OR4G4P,ENSG00000268020,Gene Expression,refdata-cellranger-GRC38-3.1.0
OR4G11P,ENSG00000240361,Gene Expression,refdata-cellranger-GRC38-3.1.0
OR4F5,ENSG00000186092,Gene Expression,refdata-cellranger-GRC38-3.1.0
AL627309.1,ENSG00000238009,Gene Expression,refdata-cellranger-GRC38-3.1.0


In [29]:
# var_names (the index of adata.var) holds gene symbols, not Ensembl IDs
adata.var_names[:10]

Index(['DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2HG', 'MIR1302-2', 'FAM138A',
       'OR4G4P', 'OR4G11P', 'OR4F5', 'AL627309.1'],
      dtype='object')

In [30]:
# Number of var_names entries that repeat an earlier gene symbol
adata.var_names.duplicated().sum()

np.int64(1211)

In [31]:
# Gene symbols are not unique, so de-duplicate var_names in place before
# they are used to look genes up by name
adata.var_names_make_unique()

  adata.var_names_make_unique()


In [32]:
# QC metrics, computed per barcode (one row of X each)
import numpy as np

# np.asarray(...).ravel() flattens the row-wise reduction to 1-D whether the
# sparse sum comes back as an np.matrix of shape (n_obs, 1) or as a plain
# ndarray; the `.A1` attribute is only available on np.matrix.
# n_counts: total counts in the barcode.
adata.obs["n_counts"] = np.asarray(adata.X.sum(axis=1)).ravel()
# n_genes: number of genes with a non-zero count in the barcode.
adata.obs["n_genes"] = np.asarray((adata.X > 0).sum(axis=1)).ravel()

# Summary statistics over all barcodes, to guide the filter thresholds below
adata.obs[["n_counts", "n_genes"]].describe()

Unnamed: 0,n_counts,n_genes
count,1851517.0,1851517.0
mean,72.55589,29.19349
std,1019.647,285.8996
min,0.0,0.0
25%,0.0,0.0
50%,1.0,1.0
75%,1.0,1.0
max,72445.0,10202.0


In [33]:
# Keep barcodes with 500-8000 detected genes (both bounds inclusive)
# and at least 1000 total counts
genes_per_barcode = adata.obs["n_genes"]
counts_per_barcode = adata.obs["n_counts"]
barcode_mask = genes_per_barcode.between(500, 8000) & counts_per_barcode.ge(1000)

# .copy() gives an independent object rather than a slice tied to `adata`
adata_filt = adata[barcode_mask].copy()

adata_filt

AnnData object with n_obs × n_vars = 8987 × 60623
    obs: 'n_counts', 'n_genes'
    var: 'gene_ids', 'feature_types', 'genome'

In [34]:
# Sample-level labels, assigned as constant columns on every retained barcode
sample_annotations = {
    "disease": "PMS",
    "treatment": "ABT_263",
    "donor": "105_2g",
    "sample": "PMS_ABT",
}
for obs_column, obs_label in sample_annotations.items():
    adata_filt.obs[obs_column] = obs_label

In [35]:

# Persist the filtered, annotated barcodes so the raw matrix need not be re-parsed
adata_filt.write("data/PMS_ABT_filtered.h5ad")

In [36]:
# Check that the .h5ad file was written, and how large it is
!ls -lh data/

total 922M
-rw-r--r-- 1 root root 341M Dec 21 06:27 GSE297365_RAW.tar
-rw-r--r-- 1 root root  97M Jan 17  2025 GSM8989497_105_2g_ABT_263_raw_expression.h5
-rw-r--r-- 1 root root  54M Jan 17  2025 GSM8989498_105_2g_DMSO_raw_expression.h5
-rw-r--r-- 1 root root 106M Jan 17  2025 GSM8989499_HD_7HA_ABT_263_raw_expression.h5
-rw-r--r-- 1 root root  86M Jan 17  2025 GSM8989500_HD_7HA_DMSO_raw_expression.h5
-rw-r--r-- 1 root root 240M Dec 21 06:29 PMS_ABT_filtered.h5ad


In [37]:
import scanpy as sc

# Round-trip check: reload the saved file and confirm that the shape and the
# obs/var columns survived the write
adata_check = sc.read_h5ad("data/PMS_ABT_filtered.h5ad")
adata_check

AnnData object with n_obs × n_vars = 8987 × 60623
    obs: 'n_counts', 'n_genes', 'disease', 'treatment', 'donor', 'sample'
    var: 'gene_ids', 'feature_types', 'genome'

In [38]:
# What metadata do we have? Expect the two QC columns (n_counts, n_genes)
# plus the four sample annotations (disease, treatment, donor, sample).
adata_check.obs.head()

Unnamed: 0,n_counts,n_genes,disease,treatment,donor,sample
AAACCCAAGTGTAGTA-1,22374.0,6087,PMS,ABT_263,105_2g,PMS_ABT
AAACCCACAGTTACCA-1,13908.0,5216,PMS,ABT_263,105_2g,PMS_ABT
AAACCCACATTGAGCT-1,15994.0,4569,PMS,ABT_263,105_2g,PMS_ABT
AAACCCAGTGACACAG-1,11020.0,4117,PMS,ABT_263,105_2g,PMS_ABT
AAACCCAGTTCGGTAT-1,25389.0,6495,PMS,ABT_263,105_2g,PMS_ABT


In [39]:

# What gene annotations exist? Expect gene_ids, feature_types and genome,
# as in the freshly loaded object.
adata_check.var.head()

Unnamed: 0,gene_ids,feature_types,genome
DDX11L1,ENSG00000223972,Gene Expression,refdata-cellranger-GRC38-3.1.0
WASH7P,ENSG00000227232,Gene Expression,refdata-cellranger-GRC38-3.1.0
MIR6859-1,ENSG00000278267,Gene Expression,refdata-cellranger-GRC38-3.1.0
MIR1302-2HG,ENSG00000243485,Gene Expression,refdata-cellranger-GRC38-3.1.0
MIR1302-2,ENSG00000284332,Gene Expression,refdata-cellranger-GRC38-3.1.0


In [40]:
# Dimensions as (n_obs, n_vars); sparsity is not computed here
adata_check.shape

(8987, 60623)

In [41]:
# Top expressed genes (raw counts)
import numpy as np

# Column sums = total counts per gene, flattened to a 1-D array
gene_sums = np.asarray(adata_check.X.sum(axis=0)).ravel()

# argsort is ascending, so the last ten positions are the ten largest totals,
# listed from the 10th-highest up to the highest
top10_positions = np.argsort(gene_sums)[-10:]
top_genes = adata_check.var_names[top10_positions]
top_genes

Index(['RPL13', 'RPS18', 'EEF1A1', 'RPS8', 'RPL37A', 'RPLP1', 'RPL41', 'RPS19',
       'MT-RNR2', 'MALAT1'],
      dtype='object')

In [42]:


# Free everything that belongs to the first sample before the next one is
# loaded: the reloaded copy, the filtered subset and the raw matrix. Deleting
# only `adata_check` would leave the raw matrix alive until `adata` is rebound,
# i.e. while the next raw file is being read.
del adata_check
del adata_filt
del adata

import gc
gc.collect()

42

In [43]:
import scanpy as sc

# Sample 2 of 4 in the archive: 105_2g / DMSO
file = "data/GSM8989498_105_2g_DMSO_raw_expression.h5"
# Parse the 10x HDF5 matrix into an AnnData object (obs = barcodes, var = genes)
adata = sc.read_10x_h5(file)
adata

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 1764145 × 60623
    var: 'gene_ids', 'feature_types', 'genome'

In [44]:
# De-duplicate repeated gene symbols in place (same step as for the first sample)
adata.var_names_make_unique()

  adata.var_names_make_unique()


In [45]:
# QC metrics, computed per barcode (one row of X each)
import numpy as np

# np.asarray(...).ravel() flattens the row-wise reduction to 1-D whether the
# sparse sum comes back as an np.matrix of shape (n_obs, 1) or as a plain
# ndarray; the `.A1` attribute is only available on np.matrix.
# n_counts: total counts in the barcode.
adata.obs["n_counts"] = np.asarray(adata.X.sum(axis=1)).ravel()
# n_genes: number of genes with a non-zero count in the barcode.
adata.obs["n_genes"] = np.asarray((adata.X > 0).sum(axis=1)).ravel()

# Summary statistics over all barcodes, to guide the filter thresholds below
adata.obs[["n_counts", "n_genes"]].describe()

Unnamed: 0,n_counts,n_genes
count,1764145.0,1764145.0
mean,48.627,15.69357
std,1201.703,235.9746
min,0.0,0.0
25%,0.0,0.0
50%,1.0,1.0
75%,2.0,2.0
max,143758.0,13572.0


In [46]:
# Keep barcodes with 500-8000 detected genes (both bounds inclusive)
# and at least 1000 total counts
genes_per_barcode = adata.obs["n_genes"]
counts_per_barcode = adata.obs["n_counts"]
barcode_mask = genes_per_barcode.between(500, 8000) & counts_per_barcode.ge(1000)

# .copy() gives an independent object rather than a slice tied to `adata`
adata_filt = adata[barcode_mask].copy()

adata_filt

AnnData object with n_obs × n_vars = 3281 × 60623
    obs: 'n_counts', 'n_genes'
    var: 'gene_ids', 'feature_types', 'genome'

In [47]:
# Sample-level labels, assigned as constant columns on every retained barcode
sample_annotations = {
    "disease": "PMS",
    "treatment": "DMSO",
    "donor": "105_2g",
    "sample": "PMS_DMSO",
}
for obs_column, obs_label in sample_annotations.items():
    adata_filt.obs[obs_column] = obs_label

In [48]:
# Persist the filtered, annotated barcodes of this sample
adata_filt.write("data/PMS_DMSO_filtered.h5ad")

In [49]:
import gc

# Drop the raw and the filtered matrix of this sample, then force a
# collection so the memory is released before the next sample is read
del adata, adata_filt
gc.collect()

31

In [50]:
import scanpy as sc

# Sample 3 of 4 in the archive: HD_7HA / ABT_263
file = "data/GSM8989499_HD_7HA_ABT_263_raw_expression.h5"
# Parse the 10x HDF5 matrix into an AnnData object (obs = barcodes, var = genes)
adata = sc.read_10x_h5(file)
adata

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 1948528 × 60623
    var: 'gene_ids', 'feature_types', 'genome'

In [51]:
# De-duplicate repeated gene symbols in place (same step as for the earlier samples)
adata.var_names_make_unique()

  adata.var_names_make_unique()


In [52]:
# QC metrics, computed per barcode (one row of X each)
import numpy as np

# np.asarray(...).ravel() flattens the row-wise reduction to 1-D whether the
# sparse sum comes back as an np.matrix of shape (n_obs, 1) or as a plain
# ndarray; the `.A1` attribute is only available on np.matrix.
# n_counts: total counts in the barcode.
adata.obs["n_counts"] = np.asarray(adata.X.sum(axis=1)).ravel()
# n_genes: number of genes with a non-zero count in the barcode.
adata.obs["n_genes"] = np.asarray((adata.X > 0).sum(axis=1)).ravel()

# Summary statistics over all barcodes, to guide the filter thresholds below
adata.obs[["n_counts", "n_genes"]].describe()

Unnamed: 0,n_counts,n_genes
count,1948528.0,1948528.0
mean,67.88657,29.78015
std,843.4061,251.4934
min,0.0,0.0
25%,0.0,0.0
50%,1.0,1.0
75%,1.0,1.0
max,68837.0,9346.0


In [53]:


# Keep barcodes with 500-8000 detected genes (both bounds inclusive)
# and at least 1000 total counts
genes_per_barcode = adata.obs["n_genes"]
counts_per_barcode = adata.obs["n_counts"]
barcode_mask = genes_per_barcode.between(500, 8000) & counts_per_barcode.ge(1000)

# .copy() gives an independent object rather than a slice tied to `adata`
adata_filt = adata[barcode_mask].copy()

adata_filt

AnnData object with n_obs × n_vars = 11424 × 60623
    obs: 'n_counts', 'n_genes'
    var: 'gene_ids', 'feature_types', 'genome'

In [54]:
# Sample-level labels, assigned as constant columns on every retained barcode
sample_annotations = {
    "disease": "Healthy",
    "treatment": "ABT_263",
    "donor": "7HA",
    "sample": "HD_ABT",
}
for obs_column, obs_label in sample_annotations.items():
    adata_filt.obs[obs_column] = obs_label

In [55]:
# Persist the filtered, annotated barcodes of this sample
adata_filt.write("data/HD_ABT_filtered.h5ad")

In [56]:
import gc

# Drop the raw and the filtered matrix of this sample, then force a
# collection so the memory is released before the next sample is read
del adata, adata_filt
gc.collect()

31

In [57]:
import scanpy as sc

# Sample 4 of 4 in the archive: HD_7HA / DMSO
file = "data/GSM8989500_HD_7HA_DMSO_raw_expression.h5"
# Parse the 10x HDF5 matrix into an AnnData object (obs = barcodes, var = genes)
adata = sc.read_10x_h5(file)
adata

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 1791147 × 60623
    var: 'gene_ids', 'feature_types', 'genome'

In [58]:
# De-duplicate repeated gene symbols in place (same step as for the earlier samples)
adata.var_names_make_unique()

  adata.var_names_make_unique()


In [59]:
# QC metrics, computed per barcode (one row of X each)
import numpy as np

# np.asarray(...).ravel() flattens the row-wise reduction to 1-D whether the
# sparse sum comes back as an np.matrix of shape (n_obs, 1) or as a plain
# ndarray; the `.A1` attribute is only available on np.matrix.
# n_counts: total counts in the barcode.
adata.obs["n_counts"] = np.asarray(adata.X.sum(axis=1)).ravel()
# n_genes: number of genes with a non-zero count in the barcode.
adata.obs["n_genes"] = np.asarray((adata.X > 0).sum(axis=1)).ravel()

# Summary statistics over all barcodes, to guide the filter thresholds below
adata.obs[["n_counts", "n_genes"]].describe()

Unnamed: 0,n_counts,n_genes
count,1791147.0,1791147.0
mean,65.31795,26.43504
std,945.8843,279.0752
min,0.0,0.0
25%,0.0,0.0
50%,1.0,1.0
75%,1.0,1.0
max,64995.0,9889.0


In [60]:
# Keep barcodes with 500-8000 detected genes (both bounds inclusive)
# and at least 1000 total counts
genes_per_barcode = adata.obs["n_genes"]
counts_per_barcode = adata.obs["n_counts"]
barcode_mask = genes_per_barcode.between(500, 8000) & counts_per_barcode.ge(1000)

# .copy() gives an independent object rather than a slice tied to `adata`
adata_filt = adata[barcode_mask].copy()

adata_filt

AnnData object with n_obs × n_vars = 9331 × 60623
    obs: 'n_counts', 'n_genes'
    var: 'gene_ids', 'feature_types', 'genome'

In [61]:
# Sample-level labels, assigned as constant columns on every retained barcode
sample_annotations = {
    "disease": "Healthy",
    "treatment": "DMSO",
    "donor": "7HA",
    "sample": "HD_DMSO",
}
for obs_column, obs_label in sample_annotations.items():
    adata_filt.obs[obs_column] = obs_label

In [62]:
# Persist the filtered, annotated barcodes of this sample
adata_filt.write("data/HD_DMSO_filtered.h5ad")


In [63]:
import gc

# Drop the raw and the filtered matrix of the last sample, then force a
# collection to release the memory
del adata, adata_filt
gc.collect()

31