# large_files_handler.ipynb


In [1]:
import subprocess
import os
import glob

In [3]:
def get_large_files_list():
    """ 
    Returns a list of filenames for files greater than 50 Mb 
    Note that recursive search starts in parent folder.
    """
    result = subprocess.run('find .. -type f -size +50M', stdout=subprocess.PIPE, shell=True)
    return [i.decode('utf8') for i in result.stdout.split()]

In [4]:
def confirm_large_files_are_listed_in_gitignore(large_files_list):

    # if ../.gitignore does not exist, create it
    gitignore_path = '../.gitignore'
    if not os.path.exists(gitignore_path):
        os.system(f'touch {gitignore_path}')

    # read ../.gitignore as a string
    with open(gitignore_path, 'r') as f:
        s = f.read()          

    # make sure that a filename for each file greater than 50 Mb is included in the string
    string_has_been_modified = False
    for filename in large_files_list:
        found = any(filename in x for x in s)
        if not found:
            s = s + filename + '\n'
            string_has_been_modified = True

    # if the string has been modified, replace ../.gitignore
    if string_has_been_modified:
        with open(gitignore_path, 'w') as f:
            f.write(s)    

In [32]:
def assemble_large_files_from_parts():
    parts_files = sorted(glob.glob('../**/*.part_*', recursive=True))
    for file in list(set([s[:s.find('.part_')] for s in parts_files])):
        command = f'cat {file}.part_?? > {file}'
        print(command)
        subprocess.run(command, shell=True)

In [6]:
def split_large_files(large_file_list):
    for filename in large_file_list:
        command = f'split -b 40MB {filename} {filename}.part_'
        print(command)
        subprocess.run(command, shell=True)

In [7]:
# MAIN

large_files_list = get_large_files_list()
if len(large_files_list) == 0:   # this is the state immediately after cloning from github
    assemble_large_files_from_parts()
else:
    split_large_files(large_files_list)
    
large_files_list = get_large_files_list()
confirm_large_files_are_listed_in_gitignore(large_files_list)
print('FINISHED')

split -b 40MB ../code/object-detectors/inference_data/frozen_inference_graph_5classes.pb ../code/object-detectors/inference_data/frozen_inference_graph_5classes.pb.part_
split -b 40MB ../code/object-detectors/inference_data/frozen_inference_graph_3classes.pb ../code/object-detectors/inference_data/frozen_inference_graph_3classes.pb.part_
split -b 40MB ../code/object-detectors/inference_data/mask_rcnn_cvat_0160.h5 ../code/object-detectors/inference_data/mask_rcnn_cvat_0160.h5.part_
