# Imports

In [1]:
import time
import shutil
import os
import tarfile
import stat
import multiprocessing as mp
import subprocess
import sys

# Functions

In [2]:
def fetch_project_df(profID):
  '''Wrapper to API at www.ebi.ac.uk/ena'''
  import requests
  import json
  import pandas

  url = 'https://www.ebi.ac.uk/ena/portal/api/filereport'
  params = {'accession':profID,
            'format':'json',
            'download':'false',
            'result':'read_run'}
  headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"+
             " AppleWebKit/537.36 (KHTML, like Gecko)"+
             " Chrome/70.0.3538.77 Safari/537.36"}

  print("Accessing: ", url)
  r = requests.get(url,
                  params = params,
                  headers = headers
                  ) 
  print("\tMaking Request To: ", r.url) 
  my_json = json.loads(r.text)
  df = pandas.json_normalize(json.loads(r.text))
  return df

In [3]:
def build_hisat2(histast_url):
  import time
  import shutil
  import os
  import tarfile
  import stat

  installed_wget = True
  try:
    import wget
  except ModuleNotFoundError:
    installed_wget = False
    import subprocess

  
  print("Downloading HISAT2 at: ", histast_url)
  if installed_wget: 
    wget.download(histast_url)
  else:
    subprocess.run(["wget", histast_url])

  print("Inflating HISAT2 at: ", os.getcwd())
  shutil.unpack_archive(histast_url.split("/")[-1], os.getcwd(),"zip")
  
  #for root, dirs, files in os.walk(os.path.join(os.getcwd(),
  #          histast_url.split("/")[-1].replace("-Linux_x86_64.zip",""))):
  #  for d in dirs:
  #    os.chmod(os.path.join(root, d), stat.S_IEXEC) #owner execute
  #  for f in files:
  #    os.chmod(os.path.join(root, f), stat.S_IEXEC) #owner execute

  hisatpath = os.path.join(os.getcwd(),
            histast_url.split("/")[-1].replace("-Linux_x86_64.zip",""),
             "hisat2")
  return hisatpath

def download_indexes(index_url):
  import time
  import shutil
  import os
  import tarfile
  import stat

  installed_wget = True
  try:
    import wget
  except ModuleNotFoundError:
    installed_wget = False
    import subprocess

  print("Downloading INDEXES at: ", index_url)
  if installed_wget: 
    wget.download(index_url)
  else:
    subprocess.run(["wget", index_url])

  print("Inflating Indexes at: ", os.getcwd())
  file = tarfile.open(index_url.split("/")[-1])
  file.extractall('{}/{}'.format(os.getcwd(),
                                 index_url.split("/")[-1].replace(".tar.gz","")))

  indexpath = os.path.join(os.getcwd(),
            index_url.split("/")[-1].replace(".tar.gz",""))
  return indexpath

# Install and Build hisat and index

In [4]:
my_local_path = "/gpfs/gpfs0/project/ds6011-sp22-wiki-data/ds6011-sp22-dod/"
datapath = "/gpfs/gpfs0/project/ds6011-sp22-wiki-data/ds6011-sp22-dod/data"

# Main

## Setup

In [5]:
# Path to HISAT2 executable
HISAT2_path = ''.join([my_local_path,"hisat2-2.2.1/hisat2"])

# Path to reference genomes
index_path = ''.join([my_local_path,"grch38_genome"])

print("HISAT2 Path: {}".format(HISAT2_path))
print("Indexes Path: {}".format(index_path))

HISAT2 Path: /gpfs/gpfs0/project/ds6011-sp22-wiki-data/ds6011-sp22-dod/hisat2-2.2.1/hisat2
Indexes Path: /gpfs/gpfs0/project/ds6011-sp22-wiki-data/ds6011-sp22-dod/grch38_genome


In [6]:
if not os.path.exists(HISAT2_path):
    hisat2_url= "https://cloud.biohpc.swmed.edu/index.php/s/oTtGWbWjaxsQ2Ho/download/hisat2-2.2.1-Linux_x86_64.zip"
    HISAT2_path = build_hisat2(hisat2_url)
    
if not os.path.exists(index_path):
    index_url = "https://genome-idx.s3.amazonaws.com/hisat/grch38_genome.tar.gz"
    index_path = download_indexes(index_url)

In [7]:
!ls /gpfs/gpfs0/project/ds6011-sp22-wiki-data/ds6011-sp22-dod/data

bam_files  PRJEB35615	PRJNA281708  PRJNA634526
csv_files  PRJNA277616	PRJNA41223   sam_files


In [8]:
directories = os.listdir(datapath)
#directories = ["PRJNA475246","PRJEB35615"] # Process Single Project
for directory in directories:
    if not directory.startswith("PRJ"):
        continue
    print("Processing Project: {}".format(directory))
    # Fetch Metadata for Project from ENA API
    df = fetch_project_df(directory)
    project = '/'.join([datapath, directory])
    print("\tProject Path: ",project)
    
    runs = [x.replace(".fastq","").replace("_1","").replace("_2","") for x in sorted(os.listdir(project))]
    runs = list(set(runs))
    single_read = False
    for i in range(len(runs)):
        print()
        run = runs[i]
        #print("\tRun: ", run)
        outfile = "{}.sam".format(run)
        subset = df[df['run_accession']==run]
        if subset.empty: continue
        if ';' in subset['fastq_ftp'].values[0]:
            urls = subset['fastq_ftp'].values[0].split(";")
        else:
            urls = [subset['fastq_ftp'].values[0]]
            single_read = True
        print("\tRun: ",run)
        print("\tSingle Read? :", single_read)
        
        
        # Dont Reprocesses if the .sam file exists 
        if os.path.exists('{}/sam_files/{}/{}'.format(datapath, directory,outfile)):
            print("{} already is processsed. {} Exists. Continuing.".format(run,
                                    '{}/sam_files/{}/{}'.format(datapath, directory,outfile)))
            continue
        if single_read:
            read1 = os.path.join(project, "{}.fastq".format(run))
            if not os.path.exists(read1):
                read1 = os.path.join(project, "{}_1.fastq".format(run))   
            cmd = [HISAT2_path,
           "-p{}".format(mp.cpu_count()),
           "-q",
           "-x",
           "$HISAT2_INDEXES/genome",
           "-U",
           read1,
           "-S",
           outfile,
           "--time"]
        else:
            read1 = os.path.join(project, "{}_1.fastq".format(run))
            read2 = os.path.join(project, "{}_2.fastq".format(run))

            cmd = [HISAT2_path,
           "-p{}".format(mp.cpu_count()),
           "-q",
           "-x",
           "$HISAT2_INDEXES/genome",
           "-1",
           read1,
           "-2",
           read2,
           "-S",
           outfile,
           "--time"]
        print("\tRunning Command: ")
        print(*cmd, sep =" ")
        my_env = {**os.environ,
          'HISAT2_INDEXES': '{}/grch38/'.format(index_path)}

        process = subprocess.Popen(cmd, 
                            stdout=subprocess.PIPE, 
                            stderr=subprocess.STDOUT,
                            env = my_env)
        for line in process.stdout:
            sys.stdout.write(line)
        print()
        print("Moving {} to {}".format(outfile, '{}/sam_files/{}/{}'.format(datapath, directory,outfile)))
        
        if not os.path.exists('{}/sam_files/{}'.format(datapath, directory)):
            os.makedirs('{}/sam_files/{}'.format(datapath, directory))
        shutil.move(outfile, '{}/sam_files/{}/{}'.format(datapath, directory,outfile))
    

Processing Project: PRJNA281708
Accessing:  https://www.ebi.ac.uk/ena/portal/api/filereport
	Making Request To:  https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA281708&format=json&download=false&result=read_run
	Project Path:  /gpfs/gpfs0/project/ds6011-sp22-wiki-data/ds6011-sp22-dod/data/PRJNA281708

	Run:  SRR1982814
	Single Read? : True
SRR1982814 already is processsed. /gpfs/gpfs0/project/ds6011-sp22-wiki-data/ds6011-sp22-dod/data/sam_files/PRJNA281708/SRR1982814.sam Exists. Continuing.

	Run:  SRR1982612
	Single Read? : True
SRR1982612 already is processsed. /gpfs/gpfs0/project/ds6011-sp22-wiki-data/ds6011-sp22-dod/data/sam_files/PRJNA281708/SRR1982612.sam Exists. Continuing.


	Run:  SRR1982783
	Single Read? : True
SRR1982783 already is processsed. /gpfs/gpfs0/project/ds6011-sp22-wiki-data/ds6011-sp22-dod/data/sam_files/PRJNA281708/SRR1982783.sam Exists. Continuing.

	Run:  SRR1982660
	Single Read? : True
SRR1982660 already is processsed. /gpfs/gpfs0/project/ds6011