# Goals

* Create tahoe-100 dataset on GCP 
* Description of the [metadata](https://docs.google.com/spreadsheets/d/18lu5agCNbgWsjZ9K05Mf65z1RseoFqsi/edit?usp=sharing&ouid=103963277960448548043&rtpof=true&sd=true)

# Variables

In [1]:
# Local directory that holds the per-plate *.h5ad.gz input files
work_dir = "/processed_datasets/scRecount/tahoe"
# GCS prefix that the metadata tables are written under (trailing slash kept)
gcp_dir = "gs://arc-ctc-tahoe100/2025-02-25/"

# Init

In [2]:
import os
import sys
import gcsfs
from glob import glob
import pandas as pd
import scanpy as sc
import pyarrow.dataset as ds
from google.cloud import storage

In [None]:
# Set up the GCSFS filesystem client used for every read/write under gcp_dir.
# No arguments are passed, so credentials/project come from gcsfs defaults
# (presumably the ambient gcloud / service-account credentials — confirm).
fs = gcsfs.GCSFileSystem()

# List input h5ad files

In [22]:
# Collect every *.h5ad.gz file in the work directory.
# Ordering is plain lexicographic, so "plate10_..." sorts before "plate1_...".
h5ad_pattern = os.path.join(work_dir, '*.h5ad.gz')
h5ad_files = glob(h5ad_pattern)
h5ad_files.sort()
h5ad_files

['/processed_datasets/scRecount/tahoe/plate10_filtered.h5ad.gz',
 '/processed_datasets/scRecount/tahoe/plate11_filtered.h5ad.gz',
 '/processed_datasets/scRecount/tahoe/plate12_filtered.h5ad.gz',
 '/processed_datasets/scRecount/tahoe/plate13_filtered.h5ad.gz',
 '/processed_datasets/scRecount/tahoe/plate14_filtered.h5ad.gz',
 '/processed_datasets/scRecount/tahoe/plate1_filtered.h5ad.gz',
 '/processed_datasets/scRecount/tahoe/plate2_filtered.h5ad.gz',
 '/processed_datasets/scRecount/tahoe/plate3_filtered.h5ad.gz',
 '/processed_datasets/scRecount/tahoe/plate4_filtered.h5ad.gz',
 '/processed_datasets/scRecount/tahoe/plate5_filtered.h5ad.gz',
 '/processed_datasets/scRecount/tahoe/plate6_filtered.h5ad.gz',
 '/processed_datasets/scRecount/tahoe/plate7_filtered.h5ad.gz',
 '/processed_datasets/scRecount/tahoe/plate8_filtered.h5ad.gz',
 '/processed_datasets/scRecount/tahoe/plate9_filtered.h5ad.gz']

# Build per-cell and per-sample metadata tables and upload to GCP

In [None]:
# Per-cell (obs) columns carried over from each plate's h5ad file
to_keep = [
    "sample", "gene_count", "tscp_count", "mread_count", "drugname_drugconc", "drug", 
    "cell_line", "sublibrary", "BARCODE", "pcnt_mito", "S_score", "G2M_score", "phase", 
    "pass_filter", "cell_name"
]

# Read the obs table of every plate file and collect one frame per plate.
data = []
for infile in h5ad_files:
    print(f"Reading {infile}...")
    # backed='r' opens the file read-only without loading the expression matrix
    adata = sc.read_h5ad(infile, backed='r')
    try:
        # .copy() detaches the column subset from adata.obs so the inserts below
        # act on an independent frame (no SettingWithCopyWarning)
        obs = adata.obs[to_keep].copy()
    finally:
        # backed mode keeps a file handle open; release it before the next plate
        adata.file.close()
    # Prepend the two derived columns so the final order is:
    #   plate, BARCODE_SUB_LIB_ID, <to_keep...>
    # The original obs index is kept as BARCODE_SUB_LIB_ID because the index
    # itself is dropped when the plates are combined below.
    obs.insert(0, 'BARCODE_SUB_LIB_ID', obs.index)
    # plate ID is the filename prefix, e.g. "plate10" from "plate10_filtered.h5ad.gz"
    obs.insert(0, 'plate', os.path.basename(infile).split('_')[0])
    data.append(obs)

# Combine all plates into one table with a fresh 0..N-1 index
df = pd.concat(data, axis=0).reset_index(drop=True)
df

In [20]:
# Upload the combined per-cell table to GCS as a gzip-compressed parquet object
obs_parquet_name = 'obs_metadata.parquet.gz'
outfile = os.path.join(gcp_dir, 'metadata', obs_parquet_name)
print(f"Writing {outfile}...")
with fs.open(outfile, mode='wb') as gcs_out:
    df.to_parquet(gcs_out, compression='gzip', index=True)

Writing gs://arc-ctc-tahoe100/2025-02-25/metadata/obs_metadata.parquet.gz...


In [30]:
# Collapse the per-cell table to one row per (sample, plate):
# numeric QC columns are averaged, drug annotations take the first value in the group.
df_s = (
    df.groupby(['sample', 'plate'])
    .agg(
        mean_gene_count=('gene_count', 'mean'),
        mean_tscp_count=('tscp_count', 'mean'),
        mean_mread_count=('mread_count', 'mean'),
        mean_pcnt_mito=('pcnt_mito', 'mean'),
        drug=('drug', 'first'),
        drugname_drugconc=('drugname_drugconc', 'first'),
    )
    .reset_index()
)

In [31]:
# Upload the per-sample summary table to GCS as a gzip-compressed parquet object
sample_parquet_name = 'sample_metadata.parquet.gz'
outfile = os.path.join(gcp_dir, 'metadata', sample_parquet_name)
print(f"Writing {outfile}...")
with fs.open(outfile, mode='wb') as gcs_out:
    df_s.to_parquet(gcs_out, compression='gzip')

Writing gs://arc-ctc-tahoe100/2025-02-25/metadata/sample_metadata.parquet.gz...


# Validate

In [32]:
# Read back just the first 3 rows of the uploaded per-sample table.
# The preview gets its own name so it does not overwrite `df`, the full
# per-cell table that the write and group-by cells read from.
infile = os.path.join(gcp_dir, 'metadata', 'sample_metadata.parquet.gz')
sample_preview = ds.dataset(infile, filesystem=fs, format="parquet").head(3).to_pandas()

# Fail loudly if the upload is empty or lost one of the columns written above
expected_sample_cols = {
    'sample', 'plate', 'mean_gene_count', 'mean_tscp_count',
    'mean_mread_count', 'mean_pcnt_mito', 'drug', 'drugname_drugconc',
}
missing_sample_cols = expected_sample_cols - set(sample_preview.columns)
assert not sample_preview.empty, f"{infile} contains no rows"
assert not missing_sample_cols, f"{infile} is missing columns: {sorted(missing_sample_cols)}"
sample_preview

Reading gs://arc-ctc-tahoe100/2025-02-25/metadata/sample_metadata.parquet.gz...


Unnamed: 0,sample,plate,mean_gene_count,mean_tscp_count,mean_mread_count,mean_pcnt_mito,drug,drugname_drugconc
0,smp_1495,plate1,1354.169768,2027.115940,2444.032416,0.033956,Infigratinib,"[('Infigratinib', 0.05, 'uM')]"
1,smp_1496,plate1,1404.454157,2226.282791,2690.685970,0.071723,Erdafitinib,"[('Erdafitinib ', 0.05, 'uM')]"
2,smp_1497,plate1,1205.267794,1859.375821,2246.200127,0.084853,Everolimus,"[('Everolimus', 0.05, 'uM')]"
3,smp_1498,plate1,1225.510822,1906.494566,2298.907623,0.088262,Pemigatinib,"[('Pemigatinib', 0.05, 'uM')]"
4,smp_1499,plate1,1231.372881,1861.305085,2245.372881,0.050802,Abemaciclib,"[('Abemaciclib', 0.05, 'uM')]"
...,...,...,...,...,...,...,...,...
1339,smp_2834,plate14,1713.499983,3003.519174,3507.489998,0.086085,Anethole trithione,"[('Anethole trithione', 5.0, 'uM')]"
1340,smp_2835,plate14,1711.157835,2996.269656,3488.079633,0.079115,Clonidine (hydrochloride),"[('Clonidine (hydrochloride)', 5.0, 'uM')]"
1341,smp_2836,plate14,1376.900732,2251.751955,2627.752929,0.093560,Adagrasib,"[('Adagrasib', 0.05, 'uM')]"
1342,smp_2837,plate14,1452.439731,2428.145334,2827.406351,0.086081,DMSO_TF,"[('DMSO_TF', 0.0, 'uM')]"


In [21]:
# Read back just the first 3 rows of the uploaded per-cell table.
# The preview gets its own name so it does not overwrite `df`, the full
# per-cell table that the write and group-by cells read from.
infile = os.path.join(gcp_dir, 'metadata', 'obs_metadata.parquet.gz')
obs_preview = ds.dataset(infile, filesystem=fs, format="parquet").head(3).to_pandas()

# Fail loudly if the upload is empty or lost the two derived columns
missing_obs_cols = {'plate', 'BARCODE_SUB_LIB_ID'} - set(obs_preview.columns)
assert not obs_preview.empty, f"{infile} contains no rows"
assert not missing_obs_cols, f"{infile} is missing columns: {sorted(missing_obs_cols)}"
obs_preview

Unnamed: 0,plate,BARCODE_SUB_LIB_ID,sample,gene_count,tscp_count,mread_count,drugname_drugconc,drug,cell_line,sublibrary,BARCODE,pcnt_mito,S_score,G2M_score,phase,pass_filter,cell_name
0,plate10,01_001_001-lib_1681,smp_2359,1379,2172,2559,"[('Bestatin (hydrochloride)', 0.05, 'uM')]",Bestatin (hydrochloride),CVCL_1478,lib_1681,01_001_001,0.029926,-0.229665,-0.19011,G1,full,NCI-H1573
1,plate10,01_002_149-lib_1681,smp_2359,975,1256,1470,"[('Bestatin (hydrochloride)', 0.05, 'uM')]",Bestatin (hydrochloride),CVCL_0459,lib_1681,01_002_149,0.026274,-0.167578,-0.132784,G1,full,NCI-H460
2,plate10,01_003_052-lib_1681,smp_2359,865,1239,1446,"[('Bestatin (hydrochloride)', 0.05, 'uM')]",Bestatin (hydrochloride),CVCL_C466,lib_1681,01_003_052,0.033898,-0.200957,-0.161538,G1,full,hTERT-HPNE


# sessionInfo

In [22]:
!pip list

Package                     Version
--------------------------- -----------
aiobotocore                 2.19.0
aiohappyeyeballs            2.4.4
aiohttp                     3.11.11
aioitertools                0.12.0
aiosignal                   1.3.2
anndata                     0.11.3
array_api_compat            1.10.0
asttokens                   3.0.0
attrs                       25.1.0
beautifulsoup4              4.13.3
botocore                    1.36.3
Brotli                      1.1.0
cached-property             1.5.2
cachetools                  5.5.1
cellxgene-census            1.16.2
certifi                     2024.12.14
cffi                        1.17.1
charset-normalizer          3.4.1
colorama                    0.4.6
comm                        0.2.2
contourpy                   1.3.1
cryptography                44.0.0
cycler                      0.12.1
debugpy                     1.8.12
decorator                   5.1.1
exceptiongroup              1.2.2
executing            