In [1]:
import pathlib as Path
import pandas as pd
from os import path, makedirs
from pysradb import SRAweb

  from tqdm.autonotebook import tqdm


In [2]:
def downloadDataFromSRA(subject_runs, index = 0, outdir = './', metadata = None):
    #make the outdir if it doesn't exist
    if not path.isdir(outdir):
        makedirs(outdir)
        
    #instantiate the DB
    db = SRAweb()
    #all data for this patient
    if metadata is None:
        metadata = db.sra_metadata(subject_runs.loc[0,'SRA Study'],detailed = True)
    #download only one if you pass an int
    if type(index)==int:
        print('Downloading run accession %s from database.'%metadata.loc[index,'run_accession'])
        data = db.download(df = metadata.loc[:index,:],skip_confirmation = True, use_ascp = False, out_dir = outdir)
    #download the entire list--recursion here
    elif type(index)==list:
        data = pd.concat([downloadDataFromSRA(subject_runs, index=x, outdir = outdir, metadata = metadata) for x in index])
        print('Done downloading list! Successfully downloaded %d files.'%len(data.index.values))
    #don't download anything!
    else:
        raise ValueError('Index type not supported! Please pass either an int or a list of indices.')
        
    return data
        

In [3]:
#parameters
subject = 'p8808'
samples_to_download = [0,1,2]

In [4]:
datafolder = Path.PurePath('../data/')
run_table = pd.read_csv(datafolder / 'SraRunTable.txt').set_index(['Subject'])
subject_runs = run_table.loc[subject].reset_index(drop=True)

In [5]:
#testing out the first 3!
data = downloadDataFromSRA(subject_runs, index=samples_to_download, outdir = '../data/')

Downloading run accession SRR5651391 from database.
The supplied url column "None" cannot be found.

Using recommended_url instead.

Checking download URLs
The following files will be downloaded: 

run_accession study_accession experiment_accession recommended_url                                                                          download_url                                                                                          out_dir  filesize
SRR5651391    SRP108708       SRX2888364           https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos1/sra-pub-run-12/SRR5651391/SRR5651391.1 ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR565/SRR5651391/SRR5651391.sra ../data/ 371.4 MB


Total size: 371.4 MB




ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [77]:
import os
import subprocess
from pysradb import SRAweb

def genFastqs(sra_numbers, outdir = './'):
    #Download Fastq files into fastq folder
    for sra_id in sra_numbers:
        print ("Generating fastq for: " + sra_id)
        fasterq_dump = sra_toolkit + "/fasterq-dump " + sra_id + " -O " + outdir + "fastq2"
        subprocess.check_call(fasterq_dump, shell=True)
        print ("The command used was: " + fasterq_dump)

In [78]:
#parameters
sra_index = 'Ruminococcus_gnavus'
sra_numbers = ['SRR5651391']
sra_toolkit = '~/Downloads/sratoolkit.2.10.9-mac64/bin'
outdir = '~/Documents/Methods7712/Module1/Hackathon1/Repo/data/'

In [None]:
genFastqs(sra_numbers, outdir)

In [84]:
def mapReads(sra_numbers, sra_index, outdir = './'):
    # stream output from bowtie2
    for sra_id in sra_numbers:
        print ("Generating SAM for: " + sra_id)
        fastq1 = outdir + "fastq2/" + sra_id + "_1.fastq"
        fastq2 = outdir + "fastq2/" + sra_id + "_2.fastq"
        subprocess.check_call("cd " + outdir + sra_index, shell=True)
        bowtie2 = "bowtie2 -x " + sra_index + " -1 " + fastq1 + " -2 " + fastq2 + " -S " + outdir + "fastq2/" + sra_id + ".sam" 
        subprocess.check_call(bowtie2, shell=True)
        print ("The command used was: " + bowtie2)

In [85]:
mapReads(sra_numbers,sra_index, outdir)

Generating SAM for: SRR5651391


CalledProcessError: Command 'bowtie2 -x Ruminococcus_gnavus -1 ~/Documents/Methods7712/Module1/Hackathon1/Repo/data/fastq2/SRR5651391_1.fastq -2 ~/Documents/Methods7712/Module1/Hackathon1/Repo/data/fastq2/SRR5651391_2.fastq -S ~/Documents/Methods7712/Module1/Hackathon1/Repo/data/fastq2/SRR5651391.sam' returned non-zero exit status 127.

In [None]:
import pysam 

def readCounts(sra_numbers, outdir):
   
    # Get read count
    for sra_id in sra_numbers:
        samfile = pysam.AlignmentFile(outdir + sra_id + ".sam", "rb")
        print reduce(lambda x, y: x + y, [ int(l.rstrip('\n').split('\t')[2]) for l in pysam.idxstats(samfile) ])



In [None]:
readCounts(sra_numbers,outdir)

In [None]:
import pysam 
from functools import reduce

def readCounts(sra_numbers, outdir):
   
    # Get read count
    for sra_id in sra_numbers:
        #pysam.sort("-o", outdir + sra_id + "_sorted.sam", outdir + sra_id + ".sam")
        samfile = pysam.AlignmentFile(outdir + sra_id + "_sorted.sam", "rb")
        #print(pysam.idxstats(outdir + sra_id + "_sorted.sam"))
        #print(reduce(lambda x, y: x + y, [ int(l.rstrip('\n').split('\t')[2]) for l in pysam.idxstats(outdir + sra_id + "_sorted.sam") ]))
        pysam.AlignmentFile.count(outdir + sra_id + "_sorted.sam", "rb")

def main():

    readCounts(['SRR5651391'],'/Users/brooksantangelo/Documents/Methods7712/Module1/Hackathon1/Repo/data/Fastq/')

if __name__ == '__main__':
    main()