# Set-up

In [7]:
import os
import sys
from tqdm.autonotebook import tqdm
from pysradb.sraweb import SRAweb

# Add conda environment to path
sys.path.append('/cellar/users/aklie/opt/miniconda3/envs/get_data/bin/')
os.environ['PATH'] = '/cellar/users/aklie/opt/miniconda3/envs/get_data/bin/' + ':' + os.environ['PATH']

In [8]:
# Choose the current dataset we are working with
dataset_name = "Wang2023_islet_snATAC-seq"
srp_id = "SRP311849"

In [9]:
# Set-up directories
base_dir = "/cellar/users/aklie/data/igvf/beta_cell_networks"
cwd = os.path.join(base_dir, "download", dataset_name)
fastq_dir = os.path.join(base_dir, "fastq", dataset_name)
base_dir, cwd, fastq_dir

('/cellar/users/aklie/data/igvf/beta_cell_networks',
 '/cellar/users/aklie/data/igvf/beta_cell_networks/download/Wang2023_islet_snATAC-seq',
 '/cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq')

In [4]:
# Connect to SRA
db = SRAweb()

# Get metadata

In [6]:
# Grab the metadata for the SRP
metadata = db.sra_metadata(srp_id, detailed=True)

In [7]:
# Save the metadata and the list of srr ids
metadata.to_csv(os.path.join(cwd, f"{srp_id}_metadata.tsv"), index=False, sep="\t")
metadata["run_accession"].to_csv(os.path.join(cwd, f"{srp_id}_srr_ids.txt"), index=False, header=False)

In [8]:
# Check metadata of interest
metadata["diagnosis"].value_counts()

T2D             14
Non-diabetic    11
Pre-T2D          9
Name: diagnosis, dtype: int64

In [9]:
# Subset to conditions of interest
t2d_metadata = metadata[metadata["diagnosis"] == "T2D"]
nd_metadata = metadata[metadata["diagnosis"] == "Non-diabetic"]
pre_t2d_metadata = metadata[metadata["diagnosis"] == "Pre-T2D"]
len(t2d_metadata), len(nd_metadata), len(pre_t2d_metadata)

(14, 11, 9)

In [10]:
# Save each subset along with the list of srr ids
t2d_metadata.to_csv(os.path.join(cwd, f"{srp_id}_t2d_metadata.tsv"), index=False, sep="\t")
t2d_metadata["run_accession"].to_csv(os.path.join(cwd, f"{srp_id}_t2d_srr_ids.txt"), index=False, header=False)

nd_metadata.to_csv(os.path.join(cwd, f"{srp_id}_nd_metadata.tsv"), index=False, sep="\t")
nd_metadata["run_accession"].to_csv(os.path.join(cwd, f"{srp_id}_nd_srr_ids.txt"), index=False, header=False)

pre_t2d_metadata.to_csv(os.path.join(cwd, f"{srp_id}_pre_t2d_metadata.tsv"), index=False, sep="\t")
pre_t2d_metadata["run_accession"].to_csv(os.path.join(cwd, f"{srp_id}_pre_t2d_srr_ids.txt"), index=False, header=False)

# Download non-diabetic samples (`sra` files)

In [11]:
db.download(df=nd_metadata, out_dir=fastq_dir)

Checking download URLs
The following files will be downloaded: 

run_accession study_accession experiment_accession public_url                                                                                                  download_url                                                                                            out_dir                                                                          filesize
SRR14048753   SRP311849       SRX10424702           https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos4/sra-pub-zq-1/SRR014/14048/SRR14048753/SRR14048753.lite.1 ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR140/SRR14048753/SRR14048753.sra /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq 17.9 GB 
SRR14048754   SRP311849       SRX10424703                                                 https://sra-pub-run-odp.s3.amazonaws.com/sra/SRR14048754/SRR14048754 ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/S

  9%|▉         | 1/11 [08:07<1:21:14, 487.45s/it]

In [None]:
# Validate the downloaded files
#for i in *; do cd $i; vdb-validate SRR* 2>>../vdb_validate_all.out; cd ..; done

# Convert to `fastq` files

In [11]:
import glob
import subprocess

In [12]:
tmp_dir = "/cellar/users/aklie/tmp/fastq-dump"
gzip = True
split_files = True
threads = 4

In [13]:
# Loop through and print out each SRA download file within the subdirectories of the fastq_dir
# Run the following command parallel-fastq-dump --threads 4 --outdir . --split-files --tmpdir $tmp_dir --gzip -s SRR14048750.sra
for sra_file in glob.glob(os.path.join(fastq_dir, srp_id, "*", "*.sra")):
    sra_dir = os.path.dirname(sra_file)
    if gzip:
        cmd = f"parallel-fastq-dump --threads {threads} --outdir {sra_dir} --split-files --tmpdir {tmp_dir} --gzip -s {sra_file}"
    else:
        cmd = f"parallel-fastq-dump --threads {threads} --outdir {sra_dir} --split-files --tmpdir {tmp_dir} -s {sra_file}"
    print(cmd)
    if len(glob.glob(os.path.join(sra_dir, "*.fastq*"))) > 0:
        print(f"Files already downloaded for {sra_dir}")
    else:
        subprocess.run(cmd, shell=True)

parallel-fastq-dump --threads 4 --outdir /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424717 --split-files --tmpdir /cellar/users/aklie/tmp/fastq-dump --gzip -s /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424717/SRR14048768.sra


2023-07-04 11:24:50,861 - SRR ids: ['/cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424717/SRR14048768.sra']
2023-07-04 11:24:50,861 - extra args: ['--split-files', '--gzip']
2023-07-04 11:24:50,864 - tempdir: /cellar/users/aklie/tmp/fastq-dump/pfd_vvx_4yfv
2023-07-04 11:24:50,865 - CMD: sra-stat --meta --quick /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424717/SRR14048768.sra
2023-07-04 11:24:51,090 - /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424717/SRR14048768.sra spots: 283613070
2023-07-04 11:24:51,090 - blocks: [[1, 70903267], [70903268, 141806534], [141806535, 212709801], [212709802, 283613070]]
2023-07-04 11:24:51,091 - CMD: fastq-dump -N 1 -X 70903267 -O /cellar/users/aklie/tmp/fastq-dump/pfd_vvx_4yfv/0 --split-files --gzip /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424717/

KeyboardInterrupt: 

In [65]:
# Loop through and print out each SRA download file within the subdirectories of the fastq_dir
for file in tqdm(os.listdir(fastq_dir)):
    if file.endswith(".sra"):
        file_path = os.path.join(fastq_dir, file)
        print(file_path)

100%|██████████| 1/1 [00:00<00:00, 13662.23it/s]


# DONE!

---