
### runs on stenv3

In [1]:
import os
import sys
import glob
import scanpy as sc
import numpy as np

### Reads in all file names under `normalized_data/` on GCP (stored in the `fastqs` variables)

In [4]:
%%capture cap_out --no-stderr
!gsutil ls gs://fc-17b85933-14fb-4c4d-937e-e928f8ea53b1/normalized_data/ | cut -d '/' -f 5 
fastqs = [i for i in cap_out.stdout.split("\r\n")]
fastqs_samples = [j[0]+"_"+j[-1] for j in [i.split("_")[0:2] for i in fastqs]]

In [5]:
# Number of derived keys, not counting the final element of the list.
len(fastqs_samples)-1

395

In [6]:
# Display the keys derived from the listed file names.
fastqs_samples

['10005CN23_C1.h5ad',
 '10005CN23_C2.h5ad',
 '10005CN23_D1.h5ad',
 '10005CN23_D2.h5ad',
 '10005CN23_E1.h5ad',
 '10005CN23_E2.h5ad',
 '10005CN24_C1.h5ad',
 '10005CN24_C2.h5ad',
 '10005CN24_D1.h5ad',
 '10005CN24_D2.h5ad',
 '10005CN24_E1.h5ad',
 '10005CN24_E2.h5ad',
 '10005CN27_C1.h5ad',
 '10005CN27_C2.h5ad',
 '10005CN27_D1.h5ad',
 '10005CN27_D2.h5ad',
 '10005CN27_E1.h5ad',
 '10005CN27_E2.h5ad',
 '10005CN29_C1.h5ad',
 '10005CN29_C2.h5ad',
 '10005CN29_D1.h5ad',
 '10005CN29_D2.h5ad',
 '10005CN29_E1.h5ad',
 '10005CN29_E2.h5ad',
 '10005CN35_C1.h5ad',
 '10005CN35_C2.h5ad',
 '10005CN35_D1.h5ad',
 '10005CN35_E1.h5ad',
 '10005CN35_E2.h5ad',
 '10005CN36_C1.h5ad',
 '10005CN36_C2.h5ad',
 '10005CN36_D1.h5ad',
 '10005CN36_D2.h5ad',
 '10005CN36_E1.h5ad',
 '10005CN36_E2.h5ad',
 '10005CN37_C1.h5ad',
 '10005CN37_C2.h5ad',
 '10005CN37_D1.h5ad',
 '10005CN37_D2.h5ad',
 '10005CN37_E1.h5ad',
 '10005CN37_E2.h5ad',
 '10005CN38_C1.h5ad',
 '10005CN38_C2.h5ad',
 '10005CN38_D1.h5ad',
 '10005CN38_D2.h5ad',
 '10005CN3

### Reads in expression file

In [7]:
# Directory that holds the expression data on the local machine
path = '/home/sanjavickovic/data/st_data'

# Build the full path to the .h5ad file and read it with scanpy
filename = os.path.join(
    path,
    'anndata_colons_norm_metagenes_submodules_celltypes_kegg_degenes_pheno.h5ad',
)
st_splotch_pd = sc.read_h5ad(filename)

In [8]:
# Sorted unique values of the 'sample' column of the expression object's .obs.
np.unique(list(st_splotch_pd.obs['sample']))

array(['10005CN23_C1', '10005CN23_C2', '10005CN23_D1', '10005CN23_D2',
       '10005CN23_E1', '10005CN23_E2', '10005CN24_C1', '10005CN24_C2',
       '10005CN24_D1', '10005CN24_D2', '10005CN24_E1', '10005CN24_E2',
       '10005CN27_C1', '10005CN27_C2', '10005CN27_D1', '10005CN27_D2',
       '10005CN27_E1', '10005CN27_E2', '10005CN29_C1', '10005CN29_C2',
       '10005CN29_D1', '10005CN29_D2', '10005CN29_E1', '10005CN29_E2',
       '10005CN35_C1', '10005CN35_C2', '10005CN35_D1', '10005CN35_E1',
       '10005CN35_E2', '10005CN36_C1', '10005CN36_C2', '10005CN36_D1',
       '10005CN36_D2', '10005CN36_E1', '10005CN36_E2', '10005CN37_C1',
       '10005CN37_C2', '10005CN37_D1', '10005CN37_D2', '10005CN37_E1',
       '10005CN37_E2', '10005CN38_C1', '10005CN38_C2', '10005CN38_D1',
       '10005CN38_D2', '10005CN38_E1', '10005CN39_C1', '10005CN39_C2',
       '10005CN39_D1', '10005CN39_D2', '10005CN39_E1', '10005CN39_E2',
       '10005CN40_C1', '10005CN40_C2', '10005CN40_D1', '10005CN40_D2',
      

### Checks whether any extra files were deposited on GCP and deletes them

In [10]:
for sam in np.unique(fastqs_samples[0:len(fastqs_samples)-1]):
    if not sam.split(".")[0] in np.unique(list(st_splotch_pd.obs['sample'])):
        #print("not ok")
        #print(sam)
        dels = [x for x in fastqs if sam in x]
        fname_r1 = 'gs://fc-17b85933-14fb-4c4d-937e-e928f8ea53b1/normalized_data/'+dels[0]
        print(fname_r1)
        !gsutil rm $fname_r1

In [11]:
%%capture cap_out --no-stderr
!gsutil ls gs://fc-17b85933-14fb-4c4d-937e-e928f8ea53b1/raw_data/ | cut -d '/' -f 5 
fastqs = [i for i in cap_out.stdout.split("\r\n")]
fastqs_samples = [j[0]+"_"+j[-1] for j in [i.split("_")[0:2] for i in fastqs]]

### Checks if any files are missing on GCP

In [12]:
# Keys of the files currently listed on GCP (the final list entry is skipped),
# built once so the loop does not re-run np.unique for every sample.
listed_keys = set(fastqs_samples[0:len(fastqs_samples)-1])

# Print each expected "<sample>.h5ad" name that has no matching listed key.
for sam in np.unique(list(st_splotch_pd.obs['sample'])):
    expected_name = sam+'.h5ad'
    if expected_name not in listed_keys:
        print(expected_name)

# copy any files output here to gcp for storage

In [13]:
%%capture cap_out --no-stderr
!gsutil ls gs://fc-17b85933-14fb-4c4d-937e-e928f8ea53b1/raw_data/ | cut -d '/' -f 5 
fastqs = [i for i in cap_out.stdout.split("\r\n")]
fastqs_samples = [j[0]+"_"+j[-1] for j in [i.split("_")[0:2] for i in fastqs]]

In [15]:
# Each listed entry must map to its own key: the number of distinct keys has
# to equal the number of listed entries.
n_unique_keys = len(np.unique(fastqs_samples))
n_listed = len(fastqs)
assert n_unique_keys - 1 == n_listed - 1