# Imports

In [1]:
import subprocess
import os
import sys
import pandas
import requests
import json
import time
import re
import gzip
import shutil
import tarfile
import stat
try:
  import wget
except:
  pass


# Functions

In [2]:
def fetch_project_df(profID, timeout=60):
  '''Fetch the ENA file report of a project as a pandas DataFrame.

  Sends one GET request to the ENA portal ``filereport`` endpoint asking for
  the ``read_run`` result of the given accession in JSON format and
  normalizes the returned records into a flat table.

  Parameters
  ----------
  profID : str
      Accession sent as the ``accession`` query parameter (e.g. a project
      accession such as ``'PRJEB35615'``).
  timeout : float, optional
      Seconds to wait for the server before giving up, so a stalled
      connection cannot hang the notebook cell forever.

  Returns
  -------
  pandas.DataFrame
      One row per record of the report; an empty DataFrame when the server
      answers with an empty body.

  Raises
  ------
  requests.HTTPError
      If the server answers with a 4xx/5xx status code.
  '''
  import requests
  import json
  import pandas

  url = 'https://www.ebi.ac.uk/ena/portal/api/filereport'
  params = {'accession':profID,
            'format':'json',
            'download':'false',
            'result':'read_run'}
  headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"+
             " AppleWebKit/537.36 (KHTML, like Gecko)"+
             " Chrome/70.0.3538.77 Safari/537.36"}

  print("Accessing: ", url)
  r = requests.get(url,
                   params = params,
                   headers = headers,
                   timeout = timeout)
  print("\tMaking Request To: ", r.url)

  # Fail loudly on HTTP errors instead of handing an error page to the
  # JSON parser, where the real cause would be hidden.
  r.raise_for_status()

  # An empty body carries no records and would make json.loads raise.
  if not r.text.strip():
    return pandas.DataFrame()

  return pandas.json_normalize(json.loads(r.text))


In [3]:
# https://stackoverflow.com/questions/55040442/how-to-register-gz-format-in-shutil-register-archive-format-to-use-same-format

def gunzip_something(gzipped_file_name, work_dir):
  """Gunzip ``gzipped_file_name`` into the directory ``work_dir``.

  The output file keeps the archive's base name with a trailing ``.gz``
  (any letter case) removed. The ``(filename, extract_dir)`` signature is
  the one ``shutil.register_unpack_format`` expects from an unpacker.

  Parameters
  ----------
  gzipped_file_name : str
      Path of the gzip-compressed file to read.
  work_dir : str
      Directory that receives the decompressed file; created if missing.

  Raises
  ------
  ValueError
      If the decompressed file would be written on top of the archive
      itself (name without ``.gz`` unpacked into its own directory).
  """
  import os
  import re
  import gzip
  import shutil

  # Only the base name is kept: the output always lands directly in work_dir.
  filename = os.path.split(gzipped_file_name)[-1]
  filename = re.sub(r"\.gz$", "", filename, flags=re.IGNORECASE)
  target = os.path.join(work_dir, filename)

  # Opening the target for writing would truncate the archive before it is
  # read, silently destroying the data.
  if os.path.abspath(target) == os.path.abspath(gzipped_file_name):
    raise ValueError(
        "Refusing to unpack {!r} onto itself".format(gzipped_file_name))

  if work_dir:
    os.makedirs(work_dir, exist_ok=True)

  with gzip.open(gzipped_file_name, 'rb') as f_in:  # <<========== extraction happens here
      with open(target, 'wb') as f_out:
          shutil.copyfileobj(f_in, f_out)


# Make shutil.unpack_archive(path, dest, "gz") use the helper above.
# A previous registration (from an earlier run of this cell) is dropped first
# so that the format always points at the current function definition.
try:
  shutil.unregister_unpack_format('gz')
except KeyError:
  pass  # nothing registered under 'gz' yet
try:
  shutil.register_unpack_format('gz',
                              ['.gz', ],
                              gunzip_something)
except shutil.RegistryError as err:
  # '.gz' is already claimed by a differently named format; keep going but
  # say so instead of hiding it.
  print("Could not register 'gz' unpack format: {}".format(err))

In [4]:
def fetch_selected_runs(df, selected_runs, datapath=None, remove_gz=True):
  '''Download the FASTQ archives of ``selected_runs`` with wget and gunzip them.

  For every run accession the ``fastq_ftp`` field of ``df`` is split on ";"
  into one URL per archive. Each archive is downloaded by ``wget`` into the
  current working directory and, if the download succeeded, unpacked into
  ``datapath`` through ``shutil.unpack_archive(archive, datapath, "gz")``.

  Parameters
  ----------
  df : pandas.DataFrame
      Table with at least the columns ``run_accession`` and ``fastq_ftp``.
  selected_runs : list of str
      Run accessions to download.
  datapath : str, optional
      Directory that receives the unpacked files; created if missing.
      Defaults to the current working directory.
  remove_gz : bool, optional
      Delete each downloaded archive after it has been unpacked.

  Returns
  -------
  bool
      True if at least one of the runs listed exactly one archive.

  Raises
  ------
  IndexError
      If a run accession does not occur in ``df``.

  Notes
  -----
  Needs the ``wget`` executable and a ``"gz"`` unpack format registered
  with ``shutil.register_unpack_format``.
  '''
  import os
  import shutil
  import subprocess
  import time

  if not datapath:
    datapath = os.getcwd()
  os.makedirs(datapath, exist_ok=True)

  single_read = False
  archives = []  # names of successfully downloaded archives (in the CWD)

  for run in selected_runs:
    ftp_field = df.loc[df['run_accession'] == run, 'fastq_ftp']
    if len(ftp_field) == 0:
      raise IndexError("Run accession {!r} not found in df".format(run))

    raw = ftp_field.values[0]
    urls = [u for u in raw.split(";") if u] if isinstance(raw, str) else []
    if not urls:
      print('No FASTQ archives listed for {}, skipping'.format(run))
      continue
    if len(urls) == 1:
      single_read = True

    for url in urls:
      if not url.startswith("https://"):
        url = "https://" + url
      print('Fetching: {}'.format(url))

      # Argument list instead of a shell string: the URL comes from a remote
      # API and must never be interpreted by a shell.
      # NOTE(review): --no-check-certificate disables TLS verification; it is
      # kept from the original command - confirm it is still required.
      # --progress=dot:mega keeps the log short now that wget writes to a
      # pipe rather than a terminal.
      cmd = ["wget", url, "--no-check-certificate", "-U", "Mozilla/5.0",
             "--progress=dot:mega"]
      print("Running Command: ", " ".join(cmd))
      with subprocess.Popen(cmd,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            text=True) as process:
        # Relay wget's log line by line so progress shows up in the cell.
        for line in process.stdout:
          print(line, end="")

      if process.returncode == 0:
        archives.append(url.split("/")[-1])
      else:
        # A failed download may leave a truncated file; do not unpack it.
        print('wget exited with status {} for {}; archive will not be unpacked'
              .format(process.returncode, url))
      time.sleep(2)  # short pause between consecutive requests

  # Unpack exactly the files that were downloaded. Working from the real
  # file names handles single-read, paired and mixed selections alike.
  for archive in archives:
    if not os.path.exists(archive):
      print('Archive not found, skipping: ' + archive)
      continue
    print('Unpacking: ' + archive)
    shutil.unpack_archive(archive, datapath, "gz")
    if remove_gz:
      os.remove(archive)

  return single_read

# Main

## Setup

In [5]:
# Where downloaded data is stored. `datapath` is the data directory itself;
# `my_local_path` is the same directory with a trailing slash so that
# sub-paths can be appended by plain string concatenation.
datapath = "/gpfs/gpfs0/project/ds6011-sp22-wiki-data/ds6011-sp22-dod/data"
my_local_path = datapath + "/"

In [6]:
# ENA project accession to download. It is passed to fetch_project_df() as
# the 'accession' query parameter and also names the output sub-directory.
dataset = 'PRJEB35615'

In [7]:
# When True, already unpacked read files are deleted before the download step
# so they are fetched again; when False, existing files are kept and skipped.
overwrite = False

## Call to ENA API

In [8]:
# One request to the ENA file-report endpoint for the chosen project; the
# cells below read the 'run_accession' and 'fastq_ftp' columns of the result.
df = fetch_project_df(dataset)

Accessing:  https://www.ebi.ac.uk/ena/portal/api/filereport
	Making Request To:  https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJEB35615&format=json&download=false&result=read_run


## Select Runs to Download

In [13]:
# Default selection: every run accession of the project, sorted.
selected_runs = sorted(df['run_accession'].to_list()) # Get all Runs

# Manual override: restrict the download to a single run. Remove or comment
# out this line to keep the full selection made above.
selected_runs = [ 'ERR3698081'] 

In [14]:
# Echo the final selection so the saved output records which runs were used.
print("Selected Runs: ",selected_runs)

Selected Runs:  ['ERR3698081']


In [15]:
# Per-project output directory: <my_local_path>/<dataset>/.
# The empty last component makes os.path.join end the path with a separator,
# which later cells rely on when they append file names by concatenation.
# Unlike plain concatenation this also works if my_local_path has no trailing
# slash.
mydatapath = os.path.join(my_local_path, dataset, "")
print("Generating Path {}".format(mydatapath))
# exist_ok avoids the check-then-create race of a separate os.path.exists().
os.makedirs(mydatapath, exist_ok=True)

Generating Path /gpfs/gpfs0/project/ds6011-sp22-wiki-data/ds6011-sp22-dod/data/PRJEB35615/


# Start Download

In [16]:
# NOTE(review): `mp` is not used in this cell; the import is kept in case a
# later cell relies on it.
import multiprocessing as mp
for run in selected_runs:
    # Remote archive paths of this run, one per ";"-separated entry. Empty
    # entries are dropped: an empty name would make the target path equal to
    # `mydatapath` itself, and the overwrite step below would then delete the
    # whole project directory.
    subset = [x for x in df[df['run_accession']==run].fastq_ftp.values[0].split(";") if x]
    if not subset:
        print("No FASTQ files listed for {}, skipping".format(run))
        continue

    # Names of the unpacked files: archive base name without ".gz".
    reads = [x.split("/")[-1].replace(".gz","") for x in subset]
    targets = [''.join([mydatapath, read]) for read in reads]

    if overwrite:
        # Clear every output of the run up front. Doing this per file inside
        # the download loop would delete files that the previous iteration
        # had just fetched and trigger a second download of the whole run.
        for dataread in targets:
            if os.path.isdir(dataread):
                shutil.rmtree(dataread)
            elif os.path.exists(dataread):
                os.remove(dataread)

    missing = []
    for remote, read, dataread in zip(subset, reads, targets):
        # Prevent download if file exists
        if os.path.exists(dataread):
            print("{} Exists, Will Not Download".format(read))
            print("\tPath: {}".format(dataread))
        else:
            missing.append(remote)

    if missing:
        print("\n")
        for remote in missing:
            print("Fetching: ", remote)
        # fetch_selected_runs downloads every archive of the run, so a single
        # call per run is enough even when several files are missing.
        singleread = fetch_selected_runs(df, [run], datapath = mydatapath, remove_gz = True)
        



Fetching:  ftp.sra.ebi.ac.uk/vol1/fastq/ERR369/001/ERR3698081/ERR3698081_1.fastq.gz
Fetching: https://ftp.sra.ebi.ac.uk/vol1/fastq/ERR369/001/ERR3698081/ERR3698081_1.fastq.gz
Running Command:  wget https://ftp.sra.ebi.ac.uk/vol1/fastq/ERR369/001/ERR3698081/ERR3698081_1.fastq.gz --no-check-certificate -U Mozilla/5.0
--2022-07-01 08:30:29--  https://ftp.sra.ebi.ac.uk/vol1/fastq/ERR369/001/ERR3698081/ERR3698081_1.fastq.gz
Resolving ftp.sra.ebi.ac.uk (ftp.sra.ebi.ac.uk)... 193.62.193.138
Connecting to ftp.sra.ebi.ac.uk (ftp.sra.ebi.ac.uk)|193.62.193.138|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1577298754 (1.5G) [application/octet-stream]
Saving to: ‘ERR3698081_1.fastq.gz’


2022-07-01 08:39:40 (1.07 MB/s) - Connection closed at byte 615823908. Retrying.

--2022-07-01 08:39:41--  (try: 2)  https://ftp.sra.ebi.ac.uk/vol1/fastq/ERR369/001/ERR3698081/ERR3698081_1.fastq.gz
Connecting to ftp.sra.ebi.ac.uk (ftp.sra.ebi.ac.uk)|193.62.193.138|:443... connected.
HT