# Development of Archive-Upload Script

In [104]:
import bz2
import pickle
import shutil
import time
from pathlib import Path
from pprint import pprint

import yaml

## Notes on Design

* When a finished file is found, compressed, and archived, immediately try
  to upload.  If the upload fails, add the file to the "database" of files
  that need to be uploaded.  That database of pending uploads should hold
  the local file path and the destination S3 bucket.  It could just be a list
  of two-tuples that is pickled to disk in the ~/.archive-upload directory.
* For the newly finished file, may just want to append it to the "upload pending"
  list, and then after that start trying to work through the list, uploading
  the oldest file first.
* The AWS sync command may not be the best because if someone cleans out the S3
  bucket, the script will try to re-upload all of the local files in the archive
  directory.  The approach above solves that problem by only uploading a file once
  to S3.  It also has the advantage of only requiring Python boto3 methods (not
  needing an AWS command line utility).
* Archiving and uploading application log files may be tricky.  A rotating file handler will
  keep producing files of the same names: archive-upload.log, archive-upload.log.1, etc.  One idea
  might be to take the last X lines of the error file at the end of the day and copy
  those lines into a day-specific log file: 2019-07-23_archive-upload.log.  Then this utility
  could archive and upload those files.

## Sample Configuration File

In [101]:
%%writefile archive-upload-example.config
# Example configuration for the archive-upload script.
# Each entry under "directories" describes one source directory to scan.
directories:
  - directory: /home/tabb99/arch-test/abc
    # Glob patterns selecting files inside "directory".  A matching file is
    # treated as finished once its last modification is more than
    # "finished-secs" seconds in the past.
    file-patterns: 
      - pattern: "*.csv"
        finished-secs: 2600
      - pattern: "*.log"
        finished-secs: 3000
    # Intended upload destination (S3 bucket, per the design notes).
    bucket: orca.acep.org/abc
    # Directory that receives the bz2-compressed copies of finished files.
    archive-dir: /home/tabb99/arch-test/archive/abc
    # NOTE(review): presumably optional, since the next entry omits it -- TODO confirm
    # once archive clean-up is implemented.
    delete-after: 365          # days
  - directory: /home/tabb99/arch-test/xyz
    file-patterns:
      - pattern: "*.txt"
        finished-secs: 500
    bucket: orca.acep.org/xyz
    archive-dir: /home/tabb99/arch-test/archive/xyz

Overwriting archive-upload-example.config


In [102]:
# Read in the configuration file that controls execution of the script.
cfg_fn = 'archive-upload-example.config'

# Open the file in a context manager so the handle is closed as soon as the
# YAML has been parsed, instead of staying open until garbage collection.
# The file is read as UTF-8, the encoding %%writefile uses when creating it.
with open(cfg_fn, 'r', encoding='utf-8') as cfg_file:
    config = yaml.safe_load(cfg_file)

pprint(config)

{'directories': [{'archive-dir': '/home/tabb99/arch-test/archive/abc',
                  'bucket': 'orca.acep.org/abc',
                  'delete-after': 365,
                  'directory': '/home/tabb99/arch-test/abc',
                  'file-patterns': [{'finished-secs': 2600, 'pattern': '*.csv'},
                                    {'finished-secs': 3000,
                                     'pattern': '*.log'}]},
                 {'archive-dir': '/home/tabb99/arch-test/archive/xyz',
                  'bucket': 'orca.acep.org/xyz',
                  'directory': '/home/tabb99/arch-test/xyz',
                  'file-patterns': [{'finished-secs': 500,
                                     'pattern': '*.txt'}]}]}


## Sample Files to Upload

In [75]:
%%bash
# Create empty sample files for the archive test.
# Abort on the first error: without this, a failed "cd" would let the touch
# commands run in whatever directory the kernel happens to be in.
set -euo pipefail

# Make sure the two source directories exist so the cell also works on a
# fresh checkout.
mkdir -p /home/tabb99/arch-test/abc /home/tabb99/arch-test/xyz
cd /home/tabb99/arch-test

# Files matching the configured patterns, plus one non-matching "junk" file
# in each directory.
touch abc/hello01.csv
touch abc/hello02.csv
touch abc/another01.log
touch abc/another02.log
touch abc/junk
touch xyz/another01.txt
touch xyz/another02.txt
touch xyz/junk

## Initialization

In [72]:
# Locate the per-user working directory of the application, creating it on
# first use, and the pickle file inside it that stores the pending uploads.
p_app = Path.home() / '.archive-upload'
p_app.mkdir(exist_ok=True)
p_up_pending = p_app / 'upload_pending.pkl'

# Restore the pending-upload list saved by an earlier run; when no such file
# has been written yet, begin with an empty list.
if not p_up_pending.exists():
    upload_pending = []
else:
    upload_pending = pickle.loads(p_up_pending.read_bytes())

In [112]:
# Loop through the list of directories, looking for completed files.
# A file counts as complete once its modification time lies more than the
# pattern's 'finished-secs' in the past.  Complete files are bz2-compressed
# into the directory's archive directory; the source file is left in place.
for dr in config['directories']:
    # Path to directory holding data files
    p_dr = Path(dr['directory'])

    # Path to directory where finished, compressed files will be archived
    p_archive = Path(dr['archive-dir'])

    # Make the archive directory if it does not exist
    p_archive.mkdir(parents=True, exist_ok=True)

    # Loop through file patterns
    for pat in dr['file-patterns']:
        for p_f in p_dr.glob(pat['pattern']):
            # p_f is a Path matching the pattern.  A glob can also match
            # directories, which cannot be read as a byte stream, so skip
            # anything that is not a regular file.
            if not p_f.is_file():
                continue

            # Test to see if it is a completed file
            file_age = time.time() - p_f.stat().st_mtime
            if file_age > pat['finished-secs']:
                print(f'complete {p_f}, {file_age:.0f} secs old')
                p_arch_fn = p_archive / (p_f.name + '.bz2')
                # Stream the data in chunks so a large file never has to be
                # held in memory all at once.
                with p_f.open('rb') as fin, bz2.open(p_arch_fn, 'wb') as fout:
                    shutil.copyfileobj(fin, fout)

    # Check for files to delete in the archive directory
    # but only delete if the file is not in the upload pending list.
    

complete /home/tabb99/arch-test/abc/hello01.csv, 3548 secs old
complete /home/tabb99/arch-test/abc/hello02.csv, 3548 secs old
complete /home/tabb99/arch-test/abc/another01.log, 3548 secs old
complete /home/tabb99/arch-test/abc/another02.log, 3548 secs old
complete /home/tabb99/arch-test/xyz/another01.txt, 3548 secs old
complete /home/tabb99/arch-test/xyz/another02.txt, 3548 secs old


In [None]:
# Upload files

In [74]:
# Save the upload pending list.
# Write to a temporary file next to the real one and then rename it into
# place.  Opening the real file with 'wb' truncates it immediately, so an
# interruption mid-write would leave a corrupt pickle that the start-up
# load could not read.
p_up_tmp = p_up_pending.with_name(p_up_pending.name + '.tmp')
with p_up_tmp.open('wb') as fout:
    pickle.dump(upload_pending, fout)
p_up_tmp.replace(p_up_pending)
