In [1]:
import os
import io
import pandas as pd
import scanpy as sc
import pyarrow.dataset as ds
import gcsfs

In [10]:
fs = gcsfs.GCSFileSystem()

In [21]:
gcp_base_path = "gs://arc-ctc-tahoe100/2025-02-25/"


In [22]:
infile = os.path.join(gcp_base_path, 'metadata', 'sample_metadata.parquet')

In [23]:
sample_metadata = ds.dataset(infile, filesystem=fs, format="parquet").head(3).to_pandas()
sample_metadata

Unnamed: 0,sample,plate,mean_gene_count,mean_tscp_count,mean_mread_count,mean_pcnt_mito,drug,drugname_drugconc
0,smp_1495,plate1,1354,2027,2444,0,Infigratinib,"[('Infigratinib', 0.05, 'uM')]"
1,smp_1496,plate1,1404,2226,2691,0,Erdafitinib,"[('Erdafitinib ', 0.05, 'uM')]"
2,smp_1497,plate1,1205,1859,2246,0,Everolimus,"[('Everolimus', 0.05, 'uM')]"


In [24]:
# select certain columns and row filtering
columns_to_read = ['sample', 'plate', 'mean_gene_count']  # Specify the columns you need
dataset = ds.dataset(infile, filesystem=fs, format="parquet")
sample_metadata = dataset.to_table(filter=(ds.field('mean_gene_count') > 2000), columns=columns_to_read).to_pandas()
sample_metadata 

Unnamed: 0,sample,plate,mean_gene_count
0,smp_1598,plate2,2196
1,smp_1604,plate2,2039
2,smp_1605,plate2,2043
3,smp_2044,plate6,2156
4,smp_2046,plate6,2073
5,smp_2054,plate6,2036
6,smp_2056,plate6,2059
7,smp_2060,plate6,2095
8,smp_2066,plate6,2230
9,smp_2067,plate6,2193


In [25]:
# get the number of samples
columns_to_read = ["sample"]  # Specify the columns you need
dataset = ds.dataset(infile, filesystem=fs, format="parquet")
sample_count = dataset.to_table(columns=columns_to_read).to_pandas()["sample"].nunique()
print(f"Number of samples: {sample_count}")

Number of samples: 1344


In [26]:
# set the path to the obs_metadata file
infile = os.path.join(gcp_base_path, 'metadata', 'obs_metadata.parquet')

In [27]:
# get samples per plate
columns_to_read = ["plate", "sample"]  # Specify the columns you need
dataset = ds.dataset(infile, filesystem=fs, format="parquet")
samples_per_plate = dataset.to_table(columns=columns_to_read).to_pandas().groupby("plate").size()
samples_per_plate

plate
plate1      5481420
plate10     8044908
plate11     7435869
plate12    10487057
plate13     8501658
plate14     6518806
plate2      8064658
plate3      4705402
plate4      7004356
plate5      6419498
plate6      7545393
plate7      5692117
plate8      8880979
plate9      5866669
dtype: int64

In [10]:
# read a subset of the metadata
obs_metadata = ds.dataset(infile, filesystem=fs, format="parquet").head(100000).to_pandas()
obs_metadata

Unnamed: 0,plate,BARCODE_SUB_LIB_ID,sample,gene_count,tscp_count,mread_count,drugname_drugconc,drug,cell_line,sublibrary,BARCODE,pcnt_mito,S_score,G2M_score,phase,pass_filter,cell_name
0,plate10,01_001_001-lib_1681,smp_2359,1379,2172,2559,"[('Bestatin (hydrochloride)', 0.05, 'uM')]",Bestatin (hydrochloride),CVCL_1478,lib_1681,01_001_001,0.029926,-0.229665,-0.190110,G1,full,NCI-H1573
1,plate10,01_002_149-lib_1681,smp_2359,975,1256,1470,"[('Bestatin (hydrochloride)', 0.05, 'uM')]",Bestatin (hydrochloride),CVCL_0459,lib_1681,01_002_149,0.026274,-0.167578,-0.132784,G1,full,NCI-H460
2,plate10,01_003_052-lib_1681,smp_2359,865,1239,1446,"[('Bestatin (hydrochloride)', 0.05, 'uM')]",Bestatin (hydrochloride),CVCL_C466,lib_1681,01_003_052,0.033898,-0.200957,-0.161538,G1,full,hTERT-HPNE
3,plate10,01_003_090-lib_1681,smp_2359,393,484,559,"[('Bestatin (hydrochloride)', 0.05, 'uM')]",Bestatin (hydrochloride),CVCL_1724,lib_1681,01_003_090,0.037190,-0.052746,-0.076190,G1,minimal,SW48
4,plate10,01_003_093-lib_1681,smp_2359,2657,5325,6269,"[('Bestatin (hydrochloride)', 0.05, 'uM')]",Bestatin (hydrochloride),CVCL_1285,lib_1681,01_003_093,0.017465,-0.636364,-0.614103,G1,full,HOP62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,plate10,72_141_112-lib_1682,smp_2430,1481,2171,2559,"[('γ-Oryzanol', 0.05, 'uM')]",γ-Oryzanol,CVCL_0366,lib_1682,72_141_112,0.042837,0.000000,-0.100386,S,full,SNU-423
99996,plate10,72_141_131-lib_1682,smp_2430,1430,2119,2496,"[('γ-Oryzanol', 0.05, 'uM')]",γ-Oryzanol,CVCL_0371,lib_1682,72_141_131,0.074563,-0.009524,-0.028340,G1,full,KATO III
99997,plate10,72_141_184-lib_1682,smp_2430,827,1044,1201,"[('γ-Oryzanol', 0.05, 'uM')]",γ-Oryzanol,CVCL_1693,lib_1682,72_141_184,0.045019,-0.028571,-0.057324,G1,full,SHP-77
99998,plate10,72_142_157-lib_1682,smp_2430,875,1153,1333,"[('γ-Oryzanol', 0.05, 'uM')]",γ-Oryzanol,CVCL_0320,lib_1682,72_142_157,0.071119,-0.061905,-0.042970,G1,full,HT-29


In [28]:
# sample count
obs_metadata["sample"].nunique()

96

In [29]:
# gene count distribution
pd.options.display.float_format = '{:.0f}'.format
obs_metadata["gene_count"].describe()

count   100000
mean      1382
std        735
min        268
25%        896
50%       1209
75%       1661
max       9395
Name: gene_count, dtype: float64

In [30]:
# tscp (UMI) count distribution
pd.options.display.float_format = '{:.0f}'.format
obs_metadata["tscp_count"].describe()

count   100000
mean      2214
std       1833
min        392
25%       1230
50%       1748
75%       2583
max      54006
Name: tscp_count, dtype: float64

In [2]:
# set the path to the plate metadata file
infile = "gs://arc-ctc-tahoe100/2025-02-25/tutorial/plate3_2k-obs.h5ad"
infile = "gs://arc-ctc-tahoe100/2025-02-25/h5ad/plate3_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad"
infile = 'plate14_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad'
#infile = 'plate3_CVCL_0023.h5ad'
infile

'plate14_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad'

In [3]:
import timeit
from datetime import timedelta

# read in the h5ad file
t_0 = timeit.default_timer()
#with fs.open(infile, 'rb') as f:
#with open(infile, 'rb') as f:
#adata = sc.read_h5ad(f,backed='r+')
adata = sc.read_h5ad(infile)
#adata = sc.read_h5ad(infile,backed='r')
#adata = sc.read_h5ad(infile)
t_1 = timeit.default_timer()
elapsed_time = round((t_1 - t_0), 3)
print(f"Elapsed time: {elapsed_time} s")
adata

Elapsed time: 316.571 s


AnnData object with n_obs × n_vars = 6518806 × 62710
    obs: 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'drug', 'cell_line', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', 'plate'

In [4]:
cell_lines = list(set(adata.obs['cell_line']))
print(cell_lines)
plate = list(set(adata.obs['plate']))[0]
print(plate)

['CVCL_0320', 'CVCL_0023', 'CVCL_1666', 'CVCL_1635', 'CVCL_1550', 'CVCL_0399', 'CVCL_0480', 'CVCL_0099', 'CVCL_1285', 'CVCL_0028', 'CVCL_1716', 'CVCL_1478', 'CVCL_0152', 'CVCL_1731', 'CVCL_0131', 'CVCL_1577', 'CVCL_1531', 'CVCL_1239', 'CVCL_0371', 'CVCL_1571', 'CVCL_0332', 'CVCL_1724', 'CVCL_1125', 'CVCL_0428', 'CVCL_0218', 'CVCL_1056', 'CVCL_1098', 'CVCL_0366', 'CVCL_0179', 'CVCL_0459', 'CVCL_0397', 'CVCL_C466', 'CVCL_1055', 'CVCL_1547', 'CVCL_0504', 'CVCL_0293', 'CVCL_0334', 'CVCL_1097', 'CVCL_0359', 'CVCL_1119', 'CVCL_1717', 'CVCL_0292', 'CVCL_0069', 'CVCL_1381', 'CVCL_1693', 'CVCL_0546', 'CVCL_1517', 'CVCL_1715', 'CVCL_1495', 'CVCL_1094']
plate14


In [5]:
for i,cell_line in enumerate(cell_lines):
    adata1 = adata[adata.obs['cell_line']==cell_line,:]
    print(i,'/',len(cell_lines),cell_line,adata1)
    #sc.pl.scatter(adata1,x="S_score",y="G2M_score",color="drug")
    adata1.write(plate+'_'+cell_line+'.h5ad',compression='gzip')

0 / 50 CVCL_0320 View of AnnData object with n_obs × n_vars = 162610 × 62710
    obs: 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'drug', 'cell_line', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', 'plate'
1 / 50 CVCL_0023 View of AnnData object with n_obs × n_vars = 182446 × 62710
    obs: 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'drug', 'cell_line', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', 'plate'
2 / 50 CVCL_1666 View of AnnData object with n_obs × n_vars = 100623 × 62710
    obs: 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'drug', 'cell_line', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', 'plate'
3 / 50 CVCL_1635 View of AnnData object with n_obs × n_vars = 113097 × 62710
    obs: 'sample', 'gene_count', 'tscp_count', 'mread_count

In [17]:
with fs.open("gs://tahoe100m-plate3-cvcl0023/"+plate+'_'+cell_line+'.h5ad', 'wb') as f:
    adata1.write(f,compression='gzip')

OSError: Forbidden: https://storage.googleapis.com/upload/storage/v1/b/tahoe100m-plate3-cvcl0023/o?name=plate3_CVCL_0023.h5ad
Access denied.

In [21]:
##Downloading files
You can use gsutil to download any of the files in the bucket and work with them locally.

Please be considerate to the cost of egress when download the data from Google Cloud Storage.

For example:

gsutil cp gs://arc-ctc-tahoe100/2025-02-25/tutorial/plate3_2k-obs.h5ad .
For large data transfers, it is better to use gsutil rsync:

gsutil rsync gs://arc-ctc-tahoe100/2025-02-25/tutorial/ .

SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (2853223811.py, line 8)