# Set-up

In [43]:
import os
import sys
import glob
import pickle
from tqdm.autonotebook import tqdm
from pysradb.sraweb import SRAweb

# Add conda environment to path
sys.path.append('/cellar/users/aklie/opt/miniconda3/envs/get_data/bin/')
os.environ['PATH'] = '/cellar/users/aklie/opt/miniconda3/envs/get_data/bin/' + ':' + os.environ['PATH']

In [2]:
# Choose the current dataset we are working with
dataset_name = "Hangauer2017_BT474_RNA-seq"
srp_id = "SRP079968"

In [3]:
# Set-up directories
base_dir = "/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq"
cwd = os.path.join(base_dir, "bin/get_data")
fastq_dir = os.path.join(base_dir, "fastq")
base_dir, cwd, fastq_dir

('/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq',
 '/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/bin/get_data',
 '/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq')

In [4]:
# Connect to SRA
db = SRAweb()

# Get metadata

In [5]:
# Grab the metadata for the SRP
metadata = db.sra_metadata(srp_id, detailed=True)
len(metadata)

25

In [6]:
# Save the metadata and the list of srr ids
metadata.to_csv(os.path.join(cwd, f"{srp_id}_metadata.tsv"), index=False, sep="\t")
metadata["run_accession"].to_csv(os.path.join(cwd, f"{srp_id}_srr_ids.txt"), index=False, header=False)

In [9]:
metadata.columns

Index(['run_accession', 'study_accession', 'study_title',
       'experiment_accession', 'experiment_title', 'experiment_desc',
       'organism_taxid', 'organism_name', 'library_name', 'library_strategy',
       'library_source', 'library_selection', 'library_layout',
       'sample_accession', 'sample_title', 'instrument', 'instrument_model',
       'instrument_model_desc', 'total_spots', 'total_size', 'run_total_spots',
       'run_total_bases', 'run_alias', 'public_filename', 'public_size',
       'public_date', 'public_md5', 'public_version', 'public_semantic_name',
       'public_supertype', 'public_sratoolkit', 'aws_url', 'aws_free_egress',
       'aws_access_type', 'public_url', 'ncbi_url', 'ncbi_free_egress',
       'ncbi_access_type', 'gcp_url', 'gcp_free_egress', 'gcp_access_type',
       'experiment_alias', 'source_name', 'cell line', 'tissue of origin',
       'number of cells', 'ena_fastq_http', 'ena_fastq_http_1',
       'ena_fastq_http_2', 'ena_fastq_ftp', 'ena_fastq_ft

In [22]:
metadata["sample_name"] = metadata['experiment_desc'].str.split(":").str[1].str.split(";").str[0].str.strip().str.replace(" ", "_")
metadata["sample_name"].value_counts()

BT474_Parental_rep_1     5
BT474_Parental_rep_2     5
BT474_Persister_rep_1    5
BT474_Persister_rep_2    5
BT474_Persister_rep_3    5
Name: sample_name, dtype: int64

In [23]:
# for each duplicate sample, make a lane column in numerically increasing order (1, 2, 3, ...)
metadata["lane"] = metadata.groupby("sample_name").cumcount() + 1
metadata["lane"]

0     1
1     2
2     3
3     4
4     5
5     1
6     2
7     3
8     4
9     5
10    1
11    2
12    3
13    4
14    5
15    1
16    2
17    3
18    4
19    5
20    1
21    2
22    3
23    4
24    5
Name: lane, dtype: int64

In [25]:
# Get a mapping of experiment accessions to sample ids, useful for chromap and CellRanger
expacc_to_sample = metadata.set_index("experiment_accession")["sample_name"].to_dict()
expacc_to_sample

{'SRX1979973': 'BT474_Parental_rep_1',
 'SRX1979974': 'BT474_Parental_rep_2',
 'SRX1979975': 'BT474_Persister_rep_1',
 'SRX1979976': 'BT474_Persister_rep_2',
 'SRX1979977': 'BT474_Persister_rep_3'}

In [31]:
# Grab datasets dirs
datasets = glob.glob("/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/*")
len(datasets), datasets

(5,
 ['/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974',
  '/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979973',
  '/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979977',
  '/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979975',
  '/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979976'])

In [66]:
# Rename for CellRanger
fastqs_per_sample = {}
for dataset in datasets:
    fastq_files = sorted(glob.glob(os.path.join(dataset, "*.fastq.gz")))
    file_mapping = {}
    for i, fastq_file in enumerate(fastq_files):
        if "SRR" not in fastq_file:
            print(f"Already renamed, skipping {fastq_file}")
            continue
        read_type = fastq_file.split("_")[-1].split(".")[0]
        file_path = os.path.dirname(fastq_file)
        exp_acc = os.path.basename(file_path).split("_")[0]
        if exp_acc not in expacc_to_sample:
            print(f"Skipping {exp_acc}")
            continue
        sample_id = expacc_to_sample[exp_acc]
        new_file = f"{file_path}/{sample_id}_L00{i+1}_R1.fastq.gz"
        file_mapping[fastq_file] = new_file
        cmd = f"mv {fastq_file} {new_file}"
        print(cmd)
        #os.system(cmd)
        if sample_id not in fastqs_per_sample:
            fastqs_per_sample[sample_id] = []
        fastqs_per_sample[sample_id].append(new_file)
    with open(os.path.join(file_path, "file_mapping.pickle"), "wb") as f:
        pickle.dump(file_mapping, f)


mv /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/SRR3955806_1.fastq.gz /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/BT474_Parental_rep_2_L001_R1.fastq.gz
mv /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/SRR3955807_1.fastq.gz /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/BT474_Parental_rep_2_L002_R1.fastq.gz
mv /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/SRR3955808_1.fastq.gz /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/BT474_Parental_rep_2_L003_R1.fastq.gz
mv /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/SRR3955809_1.fastq.gz /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/BT474_Parental_rep_2_L004_R1.fastq.gz
mv /cellar/users/aklie/data/datasets/Hangaue

In [68]:
# For each sample, concatenate the fastq files using cat
merged_files = []
for sample_id, fastq_files in fastqs_per_sample.items():
    if len(fastq_files) == 1:
        continue
    cat_cmd = f"cat {' '.join(fastq_files)} > {fastq_files[0].replace('_L001_', '_')}"
    merged_files.append(fastq_files[0].replace('_L001_', '_'))
    print(cat_cmd)
    #os.system(cat_cmd)

cat /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/BT474_Parental_rep_2_L001_R1.fastq.gz /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/BT474_Parental_rep_2_L002_R1.fastq.gz /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/BT474_Parental_rep_2_L003_R1.fastq.gz /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/BT474_Parental_rep_2_L004_R1.fastq.gz /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/BT474_Parental_rep_2_L005_R1.fastq.gz > /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/BT474_Parental_rep_2_R1.fastq.gz
cat /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979973/BT474_Parental_rep_1_L001_R1.fastq.gz /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979973/BT474_Parental_rep_1_L002

In [69]:
merged_files

['/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/BT474_Parental_rep_2_R1.fastq.gz',
 '/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979973/BT474_Parental_rep_1_R1.fastq.gz',
 '/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979977/BT474_Persister_rep_3_R1.fastq.gz',
 '/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979975/BT474_Persister_rep_1_R1.fastq.gz',
 '/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979976/BT474_Persister_rep_2_R1.fastq.gz']

In [70]:
# for each sample, run fastqc
fastqc_dir = "/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/fastqc"
for merged_file in merged_files:
    fastqc_cmd = f"fastqc -o {fastqc_dir} {merged_file}"
    print(fastqc_cmd)
    #os.system(fastqc_cmd)

fastqc -o /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/fastqc /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979974/BT474_Parental_rep_2_R1.fastq.gz
fastqc -o /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/fastqc /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979973/BT474_Parental_rep_1_R1.fastq.gz
fastqc -o /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/fastqc /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979977/BT474_Persister_rep_3_R1.fastq.gz
fastqc -o /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/fastqc /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/SRX1979975/BT474_Persister_rep_1_R1.fastq.gz
fastqc -o /cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/fastqc /cellar/users/aklie/data/datasets/Hangauer20

# DONE!

---

In [49]:
import pandas as pd

In [63]:
seqkit_df = pd.read_csv("/cellar/users/aklie/data/datasets/Hangauer2017_BT474_RNA-seq/fastq/SRP079968/fastq_stats.tsv", sep="\t")

In [64]:
seqkit_df["run_accession"] = seqkit_df["file"].str.split("/").str[-1].str.split("_").str[0]

In [65]:
seqkit_df.merge(metadata[["run_accession", 'run_total_spots']], on="run_accession")

Unnamed: 0,file,format,type,num_seqs,sum_len,min_len,avg_len,max_len,run_accession,run_total_spots
0,/cellar/users/aklie/data/datasets/Hangauer2017...,FASTQ,DNA,31849965,1592498250,50,50.0,50,SRR3955801,31849965
1,/cellar/users/aklie/data/datasets/Hangauer2017...,FASTQ,DNA,31942393,1597119650,50,50.0,50,SRR3955802,31942393
2,/cellar/users/aklie/data/datasets/Hangauer2017...,FASTQ,DNA,30582105,1529105250,50,50.0,50,SRR3955803,30582105
3,/cellar/users/aklie/data/datasets/Hangauer2017...,FASTQ,DNA,31802569,1590128450,50,50.0,50,SRR3955804,31802569
4,/cellar/users/aklie/data/datasets/Hangauer2017...,FASTQ,DNA,31876916,1625722716,51,51.0,51,SRR3955805,31876916
5,/cellar/users/aklie/data/datasets/Hangauer2017...,FASTQ,DNA,31848904,1592445200,50,50.0,50,SRR3955806,31848904
6,/cellar/users/aklie/data/datasets/Hangauer2017...,FASTQ,DNA,31895843,1594792150,50,50.0,50,SRR3955807,31895843
7,/cellar/users/aklie/data/datasets/Hangauer2017...,FASTQ,DNA,30669115,1533455750,50,50.0,50,SRR3955808,30669115
8,/cellar/users/aklie/data/datasets/Hangauer2017...,FASTQ,DNA,31785722,1589286100,50,50.0,50,SRR3955809,31785722
9,/cellar/users/aklie/data/datasets/Hangauer2017...,FASTQ,DNA,31402101,1601507151,51,51.0,51,SRR3955810,31402101


sample_accession
SRS1587381    31849965
SRS1587381    31942393
SRS1587381    30582105
SRS1587381    31802569
SRS1587381    31876916
SRS1587382    31848904
SRS1587382    31895843
SRS1587382    30669115
SRS1587382    31785722
SRS1587382    31402101
SRS1587383    30596834
SRS1587383    30668664
SRS1587383    29532831
SRS1587383    30441051
SRS1587383    30764606
SRS1587384    33673012
SRS1587384    33818692
SRS1587384    32514812
SRS1587384    33667024
SRS1587384    32822514
SRS1587385    30754611
SRS1587385    30899133
SRS1587385    29706249
SRS1587385    30718811
SRS1587385    30128347
Name: run_total_spots, dtype: object