In [1]:
from filesplit.split import Split
from filesplit.merge import Merge
import os
import glob
import pathlib

In [2]:
# Collect every regular file below the working directory that exceeds 50 MB.
# glob yields directories as well as files, so non-files are filtered out
# before their size is inspected.
bigfiles = [
    candidate
    for candidate in glob.glob('**', recursive=True)
    if os.path.isfile(candidate) and os.path.getsize(candidate) > 50e6
]
print(bigfiles)

['FLY275.DAT', 'testmerge/FLY275.DAT']


Now that we have found files > 50 MB we need to:

* add filenames to .gitignore if not already there
* create splits directories if not already there

In [3]:
# Ensure .gitignore exists
path = pathlib.Path('.gitignore')
path.touch(exist_ok=True)

# Read .gitignore text as a string
with open(path, "r") as f:
    string = f.read()

# If the last existing line has no trailing newline, add one so the first
# appended entry is not glued onto it.
if string and not string.endswith('\n'):
    string += '\n'

# Compare against whole lines, not substrings: 'FLY275.DAT' is a substring of
# 'testmerge/FLY275.DAT', so a substring test can wrongly report an entry as
# already present.
existing_entries = {line.strip() for line in string.splitlines()}

# Add paths for big files to the string
for f in bigfiles:
    # .gitignore patterns use forward slashes regardless of platform
    entry = pathlib.PurePath(f).as_posix()
    if entry not in existing_entries:
        string += f'{entry}\n'
        existing_entries.add(entry)

# Overwrite .gitignore with modified string
with open(path, "w") as f:
    f.write(string)

In [6]:
for bigfile in bigfiles:
    # Ensure outputdir exists: each big file gets a sibling '<name>_splits' directory
    outputdir = pathlib.Path(bigfile + '_splits')
    outputdir.mkdir(parents=True, exist_ok=True)

    # Create splits of at most 50 MB each. The size is passed as an integer
    # byte count (the original passed the float 50e6).
    split = Split(bigfile, outputdir)
    split.bysize(int(50e6))

In [11]:
def merge_splits(inputdir, outputdir, outputfilename):
    """
    Reassemble a file from splits created by the filesplit module.

    inputdir: directory containing splits created by the filesplit module
    outputdir: directory into which the reassembled file will be placed
               (created, including parents, if it does not exist)
    outputfilename: filename for the reassembled file
    """
    # Ensure outputdir exists
    pathlib.Path(outputdir).mkdir(parents=True, exist_ok=True)

    # Reassemble splits using the caller's arguments (previously these were
    # hard-coded to the FLY275.DAT example, so every call merged the same file)
    merge = Merge(inputdir=inputdir, outputdir=outputdir, outputfilename=outputfilename)
    merge.merge()

# merge_splits(inputdir='FLY275.DAT_splits', outputdir='testmerge', outputfilename='FLY275.DAT')

In [12]:
for bigfile in bigfiles:
    # bigfile may contain subdirectories (e.g. 'testmerge/FLY275.DAT').
    # Passing it whole as outputfilename would target 'merged/testmerge/',
    # which merge_splits does not create, so the directory part goes into
    # outputdir and only the bare file name into outputfilename.
    # The final destination path is unchanged.
    relative_path = pathlib.PurePath(bigfile)
    merge_splits(
        inputdir=bigfile + '_splits',
        outputdir=str(pathlib.PurePath('merged') / relative_path.parent),
        outputfilename=relative_path.name,
    )