# Set-up

In [13]:
import os
import glob
import subprocess
import pandas as pd
from tqdm.autonotebook import tqdm
from pysradb.sraweb import SRAweb

In [3]:
# Choose the current dataset we are working with
dataset_name = "Zhu2023_sc-islet_snATAC-seq"
srp_id = "SRP374215"

In [4]:
# Set-up directories
base_dir = "/cellar/users/aklie/data/igvf/beta_cell_networks"
cwd = os.path.join(base_dir, "download", dataset_name)
fastq_dir = os.path.join(base_dir, "fastq", dataset_name)
base_dir, cwd, fastq_dir

('/cellar/users/aklie/data/igvf/beta_cell_networks',
 '/cellar/users/aklie/data/igvf/beta_cell_networks/download/Zhu2023_sc-islet_snATAC-seq',
 '/cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Zhu2023_sc-islet_snATAC-seq')

In [5]:
# Load in the metadata
metadata = pd.read_csv(f"{cwd}/{srp_id}_metadata.tsv", sep="\t")

In [14]:
# For each fastq.gz in the fastq dir, check how many reads it has
spots_dict = {}
for fastq_file in tqdm(glob.glob(os.path.join(fastq_dir, srp_id, "*", "*.fastq.gz"))):
    cmd = f"zcat {fastq_file} | wc -l"
    num_reads = int(subprocess.check_output(cmd, shell=True).decode("utf-8").strip())/4
    spots_dict[os.path.basename(fastq_file)] = num_reads

 27%|██▋       | 4/15 [32:36<2:00:09, 655.41s/it]

In [15]:
spots_dict

{'SRR19140211_2.fastq.gz': 126271591.0,
 'SRR19140211_1.fastq.gz': 126271591.0,
 'SRR19140211_3.fastq.gz': 126271591.0,
 'SRR19140214_1.fastq.gz': 624062139.0,
 'SRR19140214_2.fastq.gz': 624062139.0,
 'SRR19140214_3.fastq.gz': 624062139.0,
 'SRR19140210_2.fastq.gz': 173358378.0,
 'SRR19140210_1.fastq.gz': 173358378.0,
 'SRR19140210_3.fastq.gz': 173358378.0,
 'SRR19140213_3.fastq.gz': 434496051.0,
 'SRR19140213_1.fastq.gz': 434496051.0,
 'SRR19140213_2.fastq.gz': 434496051.0,
 'SRR19140212_2.fastq.gz': 231878793.0,
 'SRR19140212_3.fastq.gz': 231878793.0,
 'SRR19140212_1.fastq.gz': 231878793.0}

In [6]:
# Check number of reads against metadata

Unnamed: 0,run_accession,study_accession,study_title,experiment_accession,experiment_title,experiment_desc,organism_taxid,organism_name,library_name,library_strategy,...,gcp_access_type,experiment_alias,source_name,day,ena_fastq_http,ena_fastq_http_1,ena_fastq_http_2,ena_fastq_ftp,ena_fastq_ftp_1,ena_fastq_ftp_2
0,SRR19140210,SRP374215,Improving stem cell-derived pancreatic islets ...,SRX15207301,GSM6123273: MM290_D14; Homo sapiens; ATAC-seq,GSM6123273: MM290_D14; Homo sapiens; ATAC-seq,9606,Homo sapiens,GSM6123273,ATAC-seq,...,gcp identity,,Islets,D14,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/010...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/010...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...
1,SRR19140211,SRP374215,Improving stem cell-derived pancreatic islets ...,SRX15207300,GSM6123272: MM168_D39; Homo sapiens; ATAC-seq,GSM6123272: MM168_D39; Homo sapiens; ATAC-seq,9606,Homo sapiens,GSM6123272,ATAC-seq,...,gcp identity,,Islets,D39,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/011...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/011...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...
2,SRR19140212,SRP374215,Improving stem cell-derived pancreatic islets ...,SRX15207299,GSM6123271: MM166_D32; Homo sapiens; ATAC-seq,GSM6123271: MM166_D32; Homo sapiens; ATAC-seq,9606,Homo sapiens,GSM6123271,ATAC-seq,...,gcp identity,,Islets,D32,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/012...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/012...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...
3,SRR19140213,SRP374215,Improving stem cell-derived pancreatic islets ...,SRX15207298,GSM6123270: MM157_D21; Homo sapiens; ATAC-seq,GSM6123270: MM157_D21; Homo sapiens; ATAC-seq,9606,Homo sapiens,GSM6123270,ATAC-seq,...,gcp identity,,Islets,D21,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/013...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/013...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...
4,SRR19140214,SRP374215,Improving stem cell-derived pancreatic islets ...,SRX15207297,GSM6123269: MM129_D11; Homo sapiens; ATAC-seq,GSM6123269: MM129_D11; Homo sapiens; ATAC-seq,9606,Homo sapiens,GSM6123269,ATAC-seq,...,gcp identity,,Islets,D11,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/014...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/014...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...
