In [1]:
import argparse
import json
import os
import glob
import subprocess
import shutil
import boto3
import botocore
import logging
import datetime

In [79]:
cellranger_json = '/projects/ps-yeolab3/bay001/codebase/metadata/results_dir/yan_YS16/CELLRANGER_yan_YS16_INPUT.yaml'


def copy_file_from_aws(bucket, key, dest, logger):
    """
    Copies a file from AWS to local

    :param bucket: basestring
        bucket name (minus the s3:// prefix)
    :param src: basestring
        output directory where the rawdata should go.
    :return:
    """
    s3 = boto3.client('s3')

    try:
        logger.info("Downloading {} from {} to {}".format(key, bucket, dest))
        s3.download_file(
            bucket, key, dest
        )
        logger.info("Done downloading {} from aws ({})".format(key, dest))
        time.sleep(1)
    except Exception as e:
        logger.error(e)
        raise

        
def copy_dir_from_aws(bucket, dest, logger):
    """
    Copies a directory from local to AWS

    :param src: basestring
        local source filename
    :param bucket: basestring
        aws s3 bucket name
    :param logger: logging.Logger()
        Logger object
    :return:
    """
    if not bucket.startswith('s3://'):
        bucket = 's3://' + bucket

    if not dest.endswith('/'):
        dest = dest + '/'

    upload_dir = os.path.dirname(dest).split('/')[-1] + "/"

    cmd = 'aws s3 cp {} {} --recursive'.format(
        os.path.join(bucket, upload_dir),
        dest
    )
    try:
        logger.info("Downloading from aws: {}".format(cmd))
        ret = subprocess.check_call(cmd, shell=True)
        logger.info("Done downloading {} from aws ({}) with a return code of: {}".format(dest, bucket, ret))
        logger.info("Sleeping for 15s.")
        time.sleep(15)
    except Exception as e:
        logger.error(e)
        
def download_raw_files_from_cellranger_json(fn, output_dir, bucket, logger=None):
    s3 = boto3.client('s3')
    
    with open(fn) as f:
        f.readline()  # skips the /usr/bin/env line
        try:
            data = json.load(f)
            for sample in data['samples']:
                sample_prefix = sample['sample_id']
                src_basedir = sample['fastq_dir']['path']
                dest_dir = os.path.join(output_dir, src_basedir)
                
                # if no sample id specified, download the whole directory
                if sample_prefix == "":
                    logger.info("Sample prefix not found, copying entire directory {}.".format(dest_dir))
                    copy_dir_from_aws(bucket, dest_dir, logger)
                # search and download the files based on sample id prefix
                else:
                    # create local directory if not exists
                    if not os.path.exists(dest_dir):
                        logger.info("Directory ({}) not found in {}, creating it.".format(src_basedir, output_dir))
                        os.makedirs(dest_dir)
                    else:
                        logger.info("Directory ({}) found in {}".format(src_basedir, output_dir))
                    # download data based on sample prefix
                    for obj in s3.list_objects_v2(Bucket=bucket)['Contents']:
                        if obj['Key'].startswith(os.path.join(src_basedir, sample_prefix)):
                            dest_obj = os.path.join(output_dir, obj['Key'])
                            if not os.path.exists(dest_obj):
                                copy_file_from_aws(bucket, obj['Key'], dest_obj)
                            else:
                                logger.info("{} exists, will not re-download.".format(dest_obj))
        except ValueError:
            pass
            logger.error("Couldn't load {}".format(fn))
    return 0

In [81]:
download_raw_files_from_cellranger_json(cellranger_json, '/home/bay001/', 'metadata-pipelines')

Directory (yan) found in /home/bay001/
/home/bay001/yan/YS-16_S5_L006_R1_001.fastq.gz exists, will not re-download.
/home/bay001/yan/YS-16_S5_L006_R2_001.fastq.gz exists, will not re-download.
Downloading yan/YS-16_S6_L006_R1_001.fastq.gz from metadata-pipelines to /home/bay001/yan/YS-16_S6_L006_R1_001.fastq.gz
Done downloading yan/YS-16_S6_L006_R1_001.fastq.gz from aws (/home/bay001/yan/YS-16_S6_L006_R1_001.fastq.gz)


NameError: global name 'time' is not defined