In [None]:
import logging
import urllib.request
import re
import os
import boto3
from pathlib import Path
from datetime import datetime

import pandas as pd
import numpy as np
import s3fs

import data_utils
import aws_utils

# You may want to do it this way if you are debugging / editing the support files.
# Modules imported via %aimport will be reloaded before you execute any cell.
# %load_ext autoreload
# %autoreload 1
# %aimport data_utils
# %aimport aws_utils

## First we'll establish some basic logging

In [None]:
# Configure the notebook-wide 'main' logger: DEBUG-level output goes both to
# the console and to a per-day log file under logs/.
logger = logging.getLogger('main')
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same logger object every time this cell runs,
# so detach and close any handlers left over from a previous run. Without this
# each re-run adds another pair of handlers and every message is emitted
# multiple times, and the old FileHandler keeps the log file open.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

s_tdy = datetime.today().strftime('%m-%d-%Y')
main_log = Path(f'logs/chi-town-scrub-data_{s_tdy}.log')
main_log.parent.mkdir(parents=True, exist_ok=True)

# create console and file handlers and set level to debug
ch = logging.StreamHandler()
# mode='w' truncates any existing log for today, so each run of this cell
# starts with a fresh file (no separate delete step needed).
ch_main_log = logging.FileHandler(main_log, mode='w')
ch.setLevel(logging.DEBUG)
ch_main_log.setLevel(logging.DEBUG)

# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(funcName)s - %(message)s')

# add formatter to both handlers
ch.setFormatter(formatter)
ch_main_log.setFormatter(formatter)

logger.addHandler(ch)
logger.addHandler(ch_main_log)

## Download our dataset

### We'll be using Chicago crime statistics from 2001 - present (approx 1.8 GB) as an example.  To find out more about the dataset you can check out [data.gov](https://catalog.data.gov/dataset/crimes-2001-to-present-398a4)

In [None]:
# Local directory for the raw dataset and the URL it is fetched from.
DOWNLOAD_DIR = Path('data')
DATA_URL = 'https://data.cityofchicago.org/api/views/ijzp-q8t2/rows.csv?accessType=DOWNLOAD'

DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
tgt_file = DOWNLOAD_DIR/'chi-town-crime.csv'

if tgt_file.exists():
    # Skip the download when a previous run already fetched the file.
    logger.info(f'{tgt_file.name} already exists')
else:
    logger.info('Attempting to download dataset')
    # Download under a temporary name and rename only on success. Otherwise an
    # interrupted or failed download would leave a truncated CSV behind that
    # the exists() check above would treat as complete on the next run.
    partial_file = tgt_file.with_name(tgt_file.name + '.part')
    try:
        urllib.request.urlretrieve(DATA_URL, partial_file)
        partial_file.replace(tgt_file)
    finally:
        # After a successful rename the partial file is gone; this only
        # cleans up after a failure or interruption.
        if partial_file.exists():
            partial_file.unlink()

## Now let's split our data up into manageable chunks

### Our example data could be easily handled in memory, but imagine we're working with 100 gigs or a TB, in that case we can divide and conquer

By default the helper function data_utils.split_file will split things into files of 1,000,000 rows each.  This is useful if you want to look at the data in something like Excel.  Some considerations for processing files in Lambda: a Lambda function can have a max of approx 3 gigs of memory, so you would want to keep file sizes < 1 gig to leave room for overhead, depending on how you process it.  Also, Lambda functions can run up to a max of 15 minutes.  By default we can run a thousand instances of a particular Lambda function at once (you can request limit increases).  Lambda functions also get more processing power when you give them more memory, but the exact formula for this isn't public knowledge AFAIK, so play around with it and try to find a balance.

In [None]:
# Split the downloaded CSV into smaller files and log the name of each piece.
tgt_file = DOWNLOAD_DIR/'chi-town-crime.csv'

# Always bind split_files so a missing CSV neither raises NameError below nor
# silently reuses the result of an earlier run of this cell.
split_files = []

if not tgt_file.exists():
    logger.warning(f"Couldn't find {tgt_file.name}, try downloading it again above")
else:
    split_files = data_utils.split_file(tgt_file, has_headers=True, include_headers=True)

    logger.info('Split files')
    for chunk_path in split_files:
        logger.info(f'{chunk_path.name}')

## Now let's upload our chunks of data to the S3 bucket we created in the CloudFormation script

<font color=red>Change S3_BUCKET below to the name of your bucket</font>

Here I am using the s3fs module for the uploads.  I've had timeout issues when transferring larger files (> 10 gigs or so), and SageMaker notebooks also periodically ask you to log in again, so if you're transferring a large number of files, you may want to opt for the command line instead.
    
To do that, open up the terminal (click the + button on the top left to get to the launcher, then select terminal at the bottom)

Do the following:<br>
    
cd Sagemaker/data<br>
aws s3 cp . s3://YOUR_BUCKET --recursive --exclude '*crime.csv'<br>

you may want to run it this way first<br>
aws s3 cp . s3://YOUR_BUCKET --recursive --dryrun --exclude '*crime.csv'<br>
    
the parameter --dryrun will show you what it's going to do without actually doing it, so you can make sure you got the command right

You can see more about [the aws s3 cp (copy) command here](https://docs.aws.amazon.com/cli/latest/reference/s3/cp.html)

In [None]:
# Upload every split CSV in DOWNLOAD_DIR to the S3 bucket named below.
S3_BUCKET = 'chi-town-scrub-data'
s3 = s3fs.S3FileSystem()

if not s3.exists(S3_BUCKET):
    logger.warning(f"Couldn't find S3 bucket {S3_BUCKET}, make sure the bucket above exists in your account")
    logger.warning('It should be the one you created in the CloudFormation script')
else:
    # The pattern only matches .csv names containing three consecutive digits,
    # so the original chi-town-crime.csv is skipped; presumably these are the
    # numbered chunks written by data_utils.split_file (confirm its naming).
    chunk_files = sorted(DOWNLOAD_DIR.glob('*[0-9][0-9][0-9]*.csv'))
    if not chunk_files:
        # Say so explicitly instead of finishing silently with nothing uploaded.
        logger.warning(f'No split files found in {DOWNLOAD_DIR}, run the split step above first')
    for file in chunk_files:
        s3_bucket_key = f'{S3_BUCKET}/{file.name}'
        aws_utils.upload_s3_file(s3, file, s3_bucket_key)