In [1]:
import argparse
import os
import glob
import shutil
import errno
import subprocess
from qtools import Submitter
import tarfile
import logging
import time
import sys
import re

In [225]:
def remove(to_remove, logger=None):
    """
    Removes a directory or file using shutil or os.

    Real directories are deleted recursively with ``shutil.rmtree``; anything
    else (regular files, and symlinks -- including symlinks that point at a
    directory) is deleted with ``os.remove``.

    :param to_remove: basestring
        the directory/file to remove
    :param logger: logging.Logger or None
        if given, the removal is logged at INFO level before it happens
    :return:
    :raises OSError:
        if the path does not exist or cannot be removed
    """
    # Decide up front which call applies instead of "try rmtree, fall back to
    # os.remove": with the fallback, a genuine rmtree failure on a directory
    # (e.g. permission denied somewhere inside the tree) was replaced by an
    # unrelated error from os.remove being called on a directory.
    is_real_directory = (
        os.path.isdir(to_remove) and not os.path.islink(to_remove)
    )
    if logger is not None:
        logger.info("Removing {}".format(to_remove))
    if is_real_directory:
        shutil.rmtree(to_remove)
    else:
        os.remove(to_remove)


def remove_unnecessary_intermediates(res, pipeline, logger=None):
    """
    Remove intermediate files as this dramatically reduces upload of res
    folder.

    The tree under ``res`` is walked once. At every level:

    * a file is removed when its name, matched from the start, consists of one
      or more word characters followed by one of the pipeline's patterns;
    * a directory is removed (recursively) when one of the pipeline's patterns
      is found anywhere in its name. Removed directories are not descended
      into.

    :param res: basestring
        results directory
    :param pipeline: basestring
        Because different pipelines produce differently formatted results,
        this param provides a switch among files to remove for each pipeline.
    :param logger: logging.Logger or None
        Logger object, handed through to ``remove``.
    :return:
    """
    patterns_to_remove = get_patterns_to_remove(pipeline)
    if not patterns_to_remove:
        # Unknown pipeline: nothing is considered an intermediate.
        return

    # Compile once up front rather than once per visited directory.
    file_regexes = [
        re.compile(r'[\w\d]+' + pattern) for pattern in patterns_to_remove
    ]
    directory_regexes = [re.compile(pattern) for pattern in patterns_to_remove]

    for root, directories, filenames in os.walk(res):
        for f in filenames:
            if any(regex.match(f) for regex in file_regexes):
                # Join with `root`, the directory currently being visited.
                # Joining with `res` pointed nested matches at a path that
                # usually does not exist (or at an unrelated top-level file).
                remove(os.path.join(root, f), logger)

        surviving_directories = []
        for d in directories:
            if any(regex.search(d) for regex in directory_regexes):
                remove(os.path.join(root, d), logger)
            else:
                surviving_directories.append(d)
        # Prune in place so os.walk does not try to descend into directories
        # that were just deleted.
        directories[:] = surviving_directories
                    
def get_patterns_to_remove(pipeline):
    """
    Return the regular-expression patterns naming the intermediates that
    can be deleted for a given pipeline.

    Every pattern is anchored at the end of the name with ``$``. The patterns
    are written as raw strings with literal dots escaped, so that e.g. the
    ``.sam`` pattern only matches a real ``.sam`` extension.

    :param pipeline: basestring
        name of the pipeline runner that produced the results
    :return: list
        regex pattern strings; empty when the pipeline is not recognised
    """
    dropseq_pipelines = ('dropseq-runner', 'dropseq-1.13-runner')
    cellranger_pipelines = (
        'cellranger-runner',
        'cellranger-2.1.1-runner',
        'cellranger-3.0.1-runner',
        'cellranger-3.0.2-runner',
    )

    if pipeline in dropseq_pipelines:
        patterns_to_remove = [
            r"\.sam$",
            r"\.tagged([\d]+-[\d]+)\.bam$",
            r'\.tagged([\d]+-[\d]+)\.tagged([\d]+-[\d]+)\.bam$',
            r'\.tagged([\d]+-[\d]+)\.tagged([\d]+-[\d]+)\.filtered\.trimmed_smart\.bam$',
            r'\.tagged([\d]+-[\d]+)\.tagged([\d]+-[\d]+)\.filtered\.trimmed_smart\.polyA_filtered\.bam$',
            r'\.tagged([\d]+-[\d]+)\.tagged([\d]+-[\d]+)\.filtered\.trimmed_smart\.polyA_filtered\.STARAligned\.out\.bam$',
            r'\.tagged([\d]+-[\d]+)\.tagged([\d]+-[\d]+)\.filtered\.trimmed_smart\.polyA_filtered\.STARUnmapped\.out\.mate1$',
            r'\.tagged([\d]+-[\d]+)\.tagged([\d]+-[\d]+)\.filtered\.trimmed_smart\.polyA_filtered\.STARAligned\.out\.namesorted\.bam$',
            r'\.tagged([\d]+-[\d]+)\.tagged([\d]+-[\d]+)\.filtered\.trimmed_smart\.polyA_filtered\.STARAligned\.out\.namesorted\.merged\.bam$',
            r'\.tagged([\d]+-[\d]+)\.tagged([\d]+-[\d]+)\.filtered\.trimmed_smart\.polyA_filtered\.STARAligned\.out\.namesorted\.merged\.TaggedGeneExon\.bam$',
        ]
    elif pipeline in cellranger_pipelines:
        patterns_to_remove = [
            "SC_RNA_COUNTER_CS$",
            "SC_RNA_AGGREGATOR_CS$",
        ]  # TODO: cellranger
    else:
        patterns_to_remove = []

    return patterns_to_remove
                


In [208]:
# Results directory that the next cell cleans up. This is a hardcoded absolute
# path; adjust it before re-running elsewhere.
work_dir = '/home/bay001/projects/codebase/NCRCRG/home/bay001/results_dir/cached/MIT_vs_IH_pipeline_test_54e6c4247f6072336ee4afccbe0a057fa1c3c320/results2/'

In [203]:
# Destructive: deletes the dropseq intermediates found under work_dir.
# `logger` has to be supplied explicitly (None = no logger), matching the
# cellranger call later in this notebook.
remove_unnecessary_intermediates(work_dir, "dropseq-runner", logger=None)

/home/bay001/projects/codebase/NCRCRG/home/bay001/results_dir/cached/MIT_vs_IH_pipeline_test_54e6c4247f6072336ee4afccbe0a057fa1c3c320/results/MIT_vs_IH_MIT_SSS.tagged1-12.tagged13-20.filtered.trimmed_smart.polyA_filtered.STARAligned.out.bam
/home/bay001/projects/codebase/NCRCRG/home/bay001/results_dir/cached/MIT_vs_IH_pipeline_test_54e6c4247f6072336ee4afccbe0a057fa1c3c320/results/MIT_vs_IH_IH_Ori.tagged1-12.tagged13-20.filtered.trimmed_smart.polyA_filtered.STARAligned.out.bam
/home/bay001/projects/codebase/NCRCRG/home/bay001/results_dir/cached/MIT_vs_IH_pipeline_test_54e6c4247f6072336ee4afccbe0a057fa1c3c320/results/MIT_vs_IH_MIT_Ori.tagged1-12.tagged13-20.filtered.trimmed_smart.polyA_filtered.STARAligned.out.bam
/home/bay001/projects/codebase/NCRCRG/home/bay001/results_dir/cached/MIT_vs_IH_pipeline_test_54e6c4247f6072336ee4afccbe0a057fa1c3c320/results/MIT_vs_IH_IH_SSS.tagged1-12.tagged13-20.filtered.trimmed_smart.polyA_filtered.STARAligned.out.bam
/home/bay001/projects/codebase/NCRCRG/

In [226]:
# Re-point work_dir at a different cached results directory, used by the
# cells below. Hardcoded absolute path; adjust it before re-running elsewhere.
work_dir = '/home/bay001/projects/codebase/NCRCRG/home/bay001/results_dir/cached/1k_brain_mouse_c7b5df4417704b25c81f8df4622a45763b1733f9/results2/'

In [227]:
# Destructive: deletes whatever under work_dir matches the cellranger
# patterns. No logger is passed.
remove_unnecessary_intermediates(work_dir, "cellranger-runner", logger=None)

In [40]:
# NOTE(review): recursive_remove is not defined in any cell visible here --
# presumably it came from a cell that has since been deleted. TODO confirm or
# restore its definition before re-running this notebook.
paths = recursive_remove(work_dir, "SC_RNA_COUNTER_CS")
# Show only the first five entries to keep the output short.
paths[:5]

['_filelist', '_sitecheck', '_perf', '_finalstate', '_invocation']

In [35]:
# Print only the collected entries whose text contains the marker.
for path in paths:
    if "SC_RNA_COUNTER_CS" not in path:
        continue
    print(path)

In [64]:
# Walk work_dir top-down, collecting every visited directory in `paths` and
# every file's full path in `files`. Full paths that mention the marker are
# printed; `counter` ends up as the number of files seen.
regex = "SC_RNA_COUNTER_CS"
paths, files = [], []
counter = 0
for root, directories, filenames in os.walk(work_dir, topdown=True):
    paths.append(root)
    for filename in filenames:
        fullpath = os.path.join(root, filename)
        files.append(fullpath)
        # Sanity check: every file reported by the walk should exist on disk.
        assert os.path.exists(fullpath)
        if re.search(regex, fullpath):
            print(fullpath)
        counter += 1

In [65]:
# Number of entries collected in `paths`.
len(paths)

1734

In [66]:
# Number of entries collected in `files`.
len(files)

111

In [67]:
# Peek at the first three entries of `paths`.
paths[:3]

['/home/bay001/projects/codebase/NCRCRG/home/bay001/results_dir/cached/1k_brain_mouse_c7b5df4417704b25c81f8df4622a45763b1733f9/results2/',
 '/home/bay001/projects/codebase/NCRCRG/home/bay001/results_dir/cached/1k_brain_mouse_c7b5df4417704b25c81f8df4622a45763b1733f9/results2/neurons_900',
 '/home/bay001/projects/codebase/NCRCRG/home/bay001/results_dir/cached/1k_brain_mouse_c7b5df4417704b25c81f8df4622a45763b1733f9/results2/neurons_900/outs']