This notebook dumps the last-updated timestamp of every object uploaded under a particular S3 path and then visualizes the frequency of uploads.

Approach:
    * Use the boto3 list-objects API to pull info for all the files under the S3 path (this can take a long time)
    * Generate an interactive chart of the frequency of uploads "bucketed" in 5-minute intervals (i.e. how many files were uploaded every 5 minutes)
    
How to use the notebook:
    * Fill in parameters in the first cell
    * Select and run each cell in order
    * To "reset" the Jupyter notebook, revert the changes to the *.ipynb file via Git

In [None]:
class CONFIG:
    # Notebook parameters; edit the values below before running the other cells.
    class ARGS:
        # S3 URL to scan. Must be a plain string: a trailing comma here would
        # turn the value into a one-element tuple and break URL parsing.
        s3url = 'TODO: put in S3 path s3://<bucket>/s3/path/to/scan'
        # NOTE(review): not used by the cells visible here; presumably a
        # label for the generated chart - confirm against the later cells.
        label = None

In [None]:
import json
import boto3
import argparse
from tqdm.notebook import tqdm
from urllib.parse import urlparse
from datetime import date, datetime

In [None]:
"""
Script routines
"""
def scan_s3_path(from_s3_url):
    '''Fetch the last-modified timestamp of every object under an S3 URL.

    Walks the AWS S3 ``list_objects_v2`` paginator page by page, showing a
    tqdm progress bar over the response pages.

    Args:
        from_s3_url: URL whose network location is the bucket name and whose
            path (minus the leading slash) is the key prefix,
            e.g. ``s3://<bucket>/some/prefix``.

    Returns:
        dict mapping each object key to the ``LastModified`` value the API
        reported for it. Empty when no object matches the prefix.

    Raises:
        KeyError: if a listed record lacks ``Key`` or ``LastModified``.
    '''
    client = boto3.client('s3')
    print(f'Scanning {from_s3_url}')
    s3_url = urlparse(from_s3_url)
    s3_bucket = s3_url.netloc
    # Drop the leading '/' so the URL path becomes a key prefix.
    s3_prefix = s3_url.path[1:]
    paginator = client.get_paginator('list_objects_v2')
    listing = {}
    pbar_iterator = tqdm(paginator.paginate(Bucket=s3_bucket, Prefix=s3_prefix))
    pbar_iterator.set_description_str('Iterating over S3 API response pages')
    for page in pbar_iterator:
        # A page that lists no objects carries no 'Contents' entry. Only that
        # absence is tolerated; a malformed record is no longer silently
        # swallowed together with the rest of its page.
        for rec in page.get('Contents', ()):
            listing[rec['Key']] = rec['LastModified']
    return listing



In [None]:
# Run the scan for the configured S3 URL and keep the resulting listing
# in s3_stats.
s3_stats = scan_s3_path(CONFIG.ARGS.s3url)