In [1]:
import argparse
import json
import os
import glob
import subprocess
import shutil
import boto3
import botocore
import logging
import datetime
import time
import hashlib

# Line-ending sequences for Unix-style and Windows-style text.
EOL_UNIX = '\n'
EOL_WINDOWS = '\r\n'
# Name of the AWS credentials profile. It is passed to boto3.Session(profile_name=...)
# and to the `--profile` flag of the generated `aws s3 sync` command.
PROFILE = 'ncrcrg.u19@gmail.com'

In [2]:
# Directory holding the JSON descriptor(s) read by this notebook.
input_dir = '/projects/ps-yeolab4/NCRCRG/home/bay001/json_dir/'
# NOTE(review): not read by any cell visible here; the download call passes its
# own output_dir literal. Confirm whether this value is still needed.
output_dir = '/home/bay001/scratch/NCRCRG/'

# Directory and user name used to build the status/error log file names.
log_dir = '/projects/ps-yeolab4/NCRCRG/bin/test_data/'
user = 'bay001'

In [3]:
# Full path of the cellxgene JSON descriptor inside input_dir.
cellxgene_json = os.path.join(input_dir, 'cellxgene.json')

In [4]:
def load_data_from_json_or_jsonlike(fn, logger):
    """
    Loads the contents of a JSON (or JSON-like) file.

    Some legacy files carry a shebang-style first line in front of the JSON
    payload. If parsing the whole file fails, the first line is skipped and
    the remainder is parsed instead.

    :param fn: basestring
        path of the file to read.
    :param logger: logging.Logger()
        Logger object, used to report a file that cannot be parsed.
    :return data:
        the decoded JSON value. If neither attempt parses, an error is
        logged and ``fn`` itself is returned, so callers should check the
        type of the result before indexing into it.
    """
    try:
        with open(fn, 'r') as f:
            return json.load(f)
    except ValueError:
        # JSON decoding errors are ValueError subclasses. Only a parse
        # failure should trigger the legacy fallback; I/O errors (missing or
        # unreadable file) propagate unchanged instead of being retried.
        pass

    # The retry runs outside the handler so a second failure is not reported
    # as an exception raised while handling the first one.
    with open(fn, 'r') as f:
        f.readline()  # skips the /usr/bin/env line
        try:
            return json.load(f)
        except ValueError:
            logger.error("Couldn't load {}".format(fn))
            return fn


def download_raw_files_from_cellxgene_json(fn, output_dir, bucket, logger):
    """
    Resolves the results directory named in a cellxgene JSON file and hands
    it to copy_dir_from_aws().

    The JSON must provide the results-relative path under
    ``data['path']['path']``. The S3 source is assembled as
    ``<bucket>/<prefix>/results_dir/<path>``, where ``<prefix>`` comes from
    ``get_parent_directory_and_file(fn, level=2, include_file=False)``
    (e.g. ``home/bay001`` for ``.../home/bay001/json_dir/cellxgene.json``).
    The local destination is ``<output_dir>/<path>`` and is created when it
    does not exist yet.

    :param fn: basestring
        path of the cellxgene JSON file.
    :param output_dir: basestring
        local directory under which the results path is recreated.
    :param bucket: basestring
        name of the S3 bucket that holds the user directories.
    :param logger: logging.Logger()
        Logger object
    :raises TypeError:
        if the JSON file could not be parsed into a dictionary.
    :return:
    """
    data = load_data_from_json_or_jsonlike(fn, logger)
    if not isinstance(data, dict):
        # The loader hands back the file name when parsing fails; indexing
        # that string would raise an unhelpful "string indices" TypeError.
        raise TypeError(
            "Could not read a JSON object from {}".format(fn)
        )

    results_path = data['path']['path']
    dest_dir = os.path.join(output_dir, results_path)
    if not os.path.exists(dest_dir):
        logger.info("Downloading {}".format(results_path))
        os.makedirs(dest_dir)

    # Get the path relative to the json file location
    src_full_dir = os.path.join(
        bucket,
        get_parent_directory_and_file(
            fn,
            level=2,
            include_file=False
        ),
        "results_dir",
        results_path
    )
    copy_dir_from_aws(src_full_dir, dest_dir, logger=logger)

In [5]:
# copy logger code

def append_date(fn):
    """
    Inserts the current date between a file name's stem and its extension.

    :param fn: basestring
        file name (may include a directory part).
    :return dated_fn: basestring
        ``<stem>_<YYYYMMDD><ext>``; the UTC-offset field of the format is
        empty because the timestamp comes from a naive datetime.now().
    """
    stem, suffix = os.path.splitext(fn)
    today = datetime.datetime.now()
    return '{}_{:%Y%m%d%z}{}'.format(stem, today, suffix)

def get_user(fn):
    """
    Extracts the user name from a properly formatted file path.

    Example of a properly formatted path, for which the user is ``bay001``:

    /projects/ps-yeolab4/NCRCRG/home/bay001/work_dir/MIT_S1_L001_R1_001.fastq.gz

    :param fn: basestring
        full path in which the USER directory sits one level above the
        directory that contains the file.
    :return user: basestring
        the user
    """
    user_dir = get_parent_directory_and_file(fn, level=2, include_file=False)
    return os.path.basename(user_dir)

def get_parent_directory_and_file(fn, level=1, include_file=True):
    """
    Returns up to ``level`` ancestor directory names of a file, optionally
    followed by the file's base name.

    The directory that directly contains the file is skipped; the names
    collected are those of the ``level`` directories above it, joined from
    outermost to innermost. If the filesystem root is reached before
    ``level`` names were collected, the result is anchored with a leading
    '/'.

    :param fn: string
        full file path (ie: /home/bay001/projects/some/path/to.fastq.gz)
    :param level: int
        number of ancestor directories (above the file's own directory) to
        include.
    :param include_file: bool
        append the base file name to the result when True.
    :return parent_and_file : string
        (ie. some/to.fastq.gz for the path above with level=1)
    """
    leaf = os.path.basename(fn)
    cursor = os.path.dirname(fn)

    # Walk upwards, recording each ancestor's name (nearest ancestor first).
    ancestors = []
    for _ in range(level):
        cursor = os.path.dirname(cursor)
        if cursor != '/':  # nothing to record once the root is reached
            ancestors.append(os.path.basename(cursor))

    # Re-assemble outermost-first.
    joined = os.path.join("", *reversed(ancestors))

    ran_into_root = len(ancestors) < level
    if ran_into_root:
        if include_file:
            return os.path.join('/', joined, leaf)
        return os.path.join('/', joined)

    if include_file:
        return os.path.join(joined, leaf)
    return joined
    

# Process logs
logger = logging.getLogger('status')
logger.setLevel(logging.INFO)

# logging.getLogger() returns the same object for the same name, so re-running
# this cell would otherwise stack additional file handlers and write every
# message several times. Drop whatever is attached before adding fresh ones.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Status log: receives every record at INFO level and above.
ih = logging.FileHandler(os.path.join(log_dir, append_date('{}-status.txt'.format(user))))
ih.setLevel(logging.INFO)
ih.setFormatter(formatter)
logger.addHandler(ih)

# Error log: receives only records at ERROR level and above.
eh = logging.FileHandler(os.path.join(log_dir, append_date('{}-status.err'.format(user))))
eh.setLevel(logging.ERROR)
eh.setFormatter(formatter)
logger.addHandler(eh)

logger.info("Starting download program")

In [6]:
def copy_dir_from_aws(src, dest, logger):
    """
    Builds the ``aws s3 sync`` command that copies a directory from AWS to
    local and prints it. The command is only printed, not executed.

    :param src: basestring
        aws s3 source path directory; 's3://' is prepended when missing.
    :param dest: basestring
        local destination directory; a trailing '/' is appended when missing.
    :param logger: logging.Logger()
        Logger object (not used by the current implementation).
    :return:
    """
    if not src.startswith('s3://'):
        src = 's3://' + src

    if not dest.endswith('/'):
        dest = dest + '/'

    cmd = 'aws s3 sync {} {} --profile {}'.format(
        src,
        dest,
        PROFILE
    )
    print(cmd)

In [7]:
# Run the download step for cellxgene.json against the 'u19' bucket; the
# resulting `aws s3 sync` command is printed below.
# NOTE(review): the destination is a literal here rather than the `output_dir`
# variable set in the config cell. Confirm which location is intended.
download_raw_files_from_cellxgene_json(
    fn=cellxgene_json,
    output_dir='/projects/ps-yeolab4/NCRCRG/home/bay001/work_dir',
    bucket='u19',
    logger=logger,
)

aws s3 sync s3://u19/home/bay001/results_dir/6mo_cortical_organoids_wt_cellranger3_feab5103e16ff600793fea28bb618f63558e6392/results/WT/outs/filtered_feature_bc_matrix/ /projects/ps-yeolab4/NCRCRG/home/bay001/work_dir/6mo_cortical_organoids_wt_cellranger3_feab5103e16ff600793fea28bb618f63558e6392/results/WT/outs/filtered_feature_bc_matrix/ --profile ncrcrg.u19@gmail.com


In [8]:
# NOTE(review): `bucket` exists only as a parameter of
# download_raw_files_from_cellxgene_json, never as a notebook-level variable,
# so evaluating it here raises NameError. Leftover debugging cell; remove it
# so the notebook survives Restart & Run All.
bucket

NameError: name 'bucket' is not defined