## Preprocessing (for the final time!)

In [None]:
import sklearn
import boto3
import jsonm

In [None]:
%%writefile HelloBlazePreprocess.py

import json
import zipfile

# Function below unzips the archive to the local directory. 

def unzip_data(input_data_path):
    """Extract every member of a zip archive into the current working directory.

    Args:
        input_data_path: Path to the zip archive to extract.

    Returns:
        The name of the first member listed in the archive.
    """
    archive = zipfile.ZipFile(input_data_path, 'r')
    try:
        archive.extractall('.')
        member_names = archive.namelist()
        return member_names[0]
    finally:
        # Always release the archive handle, whether or not extraction succeeded.
        archive.close()

# Input data is a file with a single JSON object per line with the following format: 
# {
#  "reviewerID": <string>,
#  "asin": <string>,
#  "reviewerName": <string>,
#  "helpful": [
#    <int>, (indicating number of "helpful votes")
#    <int>  (indicating total number of votes)
#  ],
#  "reviewText": "<string>",
#  "overall": <int>,
#  "summary": "<string>",
#  "unixReviewTime": <int>,
#  "reviewTime": "<string>"
# }
# 
# We are specifically interested in the fields "helpful" and "reviewText"
#

def label_data(input_data):
    """Label each review as helpful or unhelpful from its vote ratio.

    Reads ``input_data`` line by line, parses every non-blank line as a JSON
    object, and uses its "helpful" ([helpful votes, total votes]) and
    "reviewText" fields.

    Args:
        input_data: Path to a text file holding one JSON object per line.

    Returns:
        A list of strings of the form "<label> <reviewText>". Reviews with no
        votes, or with exactly half of their votes helpful, are left out.
    """
    HELPFUL_LABEL = "__label__1"
    UNHELPFUL_LABEL = "__label__2"

    labeled_data = []
    # The context manager closes the file even if a line fails to parse.
    with open(input_data, 'r') as input_file:
        for line in input_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            review = json.loads(line)
            helpful_votes = float(review['helpful'][0])
            total_votes = review['helpful'][1]
            review_text = review['reviewText']
            if total_votes == 0:
                # No votes means no helpfulness signal; also avoids dividing by zero.
                continue
            helpful_ratio = helpful_votes / total_votes
            if helpful_ratio > .5:
                labeled_data.append(" ".join([HELPFUL_LABEL, review_text]))
            elif helpful_ratio < .5:
                labeled_data.append(" ".join([UNHELPFUL_LABEL, review_text]))
    return labeled_data

# Labeled data is a list of sentences, starting with the label defined in label_data. 

def split_sentences(labeled_data):
    """Split each labeled review into one labeled entry per sentence.

    Args:
        labeled_data: Iterable of strings, each starting with a label token
            followed by the review text.

    Returns:
        A list of "<label> <sentence>" strings, one for every non-blank,
        period-delimited fragment of each review.
    """
    new_split_sentences = []
    for entry in labeled_data:
        tokens = entry.split()
        if not tokens:
            # Blank entry: there is no label to carry over, so skip it.
            continue
        label = tokens[0]
        # Re-joining the tokens collapses whitespace runs (including newlines)
        # into single spaces before splitting on periods.
        review_text = " ".join(tokens[1:])
        for sentence in review_text.split("."):
            # Skip empty fragments (common with "...") and whitespace-only
            # ones (e.g. from ". ."), which would otherwise become label-only rows.
            if sentence.strip():
                new_split_sentences.append(" ".join([label, sentence]))
    return new_split_sentences

def write_data(data, train_path, test_path, proportion):
    """Write the leading share of ``data`` to a training file and the rest to a test file.

    Args:
        data: Sequence of strings; each one is written on its own line.
        train_path: Path of the training file to create (overwritten if present).
        test_path: Path of the test file to create (overwritten if present).
        proportion: Fraction of ``data``, counted from the start, that goes to
            the training file.
    """
    border_index = int(proportion * len(data))
    # Context managers flush and close both files, even if a write fails.
    with open(train_path, 'w') as train_f, open(test_path, 'w') as test_f:
        for index, d in enumerate(data):
            target_f = train_f if index < border_index else test_f
            target_f.write(d + '\n')

if __name__ == "__main__":
    # Fixed locations used by this script for its input archive and its two outputs.
    archive_path = '/opt/ml/processing/input/Toys_and_Games_5.json.zip'
    train_path = '/opt/ml/processing/output/train/hello_blaze_train_scikit'
    test_path = '/opt/ml/processing/output/test/hello_blaze_test_scikit'
    train_proportion = .9

    # Pipeline: unzip -> label reviews -> split into sentences -> write train/test files.
    reviews_path = unzip_data(archive_path)
    labeled_reviews = label_data(reviews_path)
    sentence_rows = split_sentences(labeled_reviews)
    write_data(sentence_rows, train_path, test_path, train_proportion)

## Exercise: Upload unprocessed data - Solution. 

In [6]:
import os 
import boto3

# Destination S3 bucket for the raw, unprocessed dataset.
BUCKET = "udacity-sagemaker-solutiondata2021"
# Key prefix placed in front of the uploaded object's name.
s3_prefix = "l2e4"
# Local file name of the zipped dataset; also used as the object's name in S3.
item_name = "Toys_and_Games_5.json.zip"

def upload_file_to_s3(file_name):
    """Upload a local file to ``BUCKET`` under the ``s3_prefix`` key prefix.

    Args:
        file_name: Path of the local file; it is also used as the object name
            that follows the prefix.

    Returns:
        True if the upload succeeded, False if S3 reported an error (the
        error is logged).
    """
    # Imported locally so the error path cannot hit a NameError: these names
    # are not imported by the surrounding cell.
    import logging
    from boto3.exceptions import S3UploadFailedError
    from botocore.exceptions import ClientError

    # S3 keys always use "/" as the separator, independent of the local OS.
    object_name = "/".join([s3_prefix, file_name])
    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(file_name, BUCKET, object_name)
    except (ClientError, S3UploadFailedError) as e:
        # NOTE(review): boto3's managed transfer re-raises client errors as
        # S3UploadFailedError, so both types are handled here.
        logging.error(e)
        return False
    return True

# Push the zipped dataset to S3; the return value is not inspected here.
upload_file_to_s3(item_name)

# S3 URI of the uploaded object, built from the bucket, prefix and file name.
source_path = f"s3://{BUCKET}/{s3_prefix}/{item_name}"
print(source_path)

s3://udacity-sagemaker-solutiondata2021/l2e4/Toys_and_Games_5.json.zip


## Exercise: Launch a processing job through the SciKitLearn interface - Solution

In [9]:
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# IAM role returned by get_execution_role(), passed to the processor below.
role = get_execution_role()

# Processor configuration: one ml.m5.large instance, scikit-learn framework 0.20.0.
sklearn_processor = SKLearnProcessor(
    framework_version='0.20.0',
    instance_count=1,
    instance_type='ml.m5.large',
    role=role,
)

# Run the preprocessing script: the archive at source_path is mapped to
# /opt/ml/processing/input, and the train/test directories are declared as outputs.
sklearn_processor.run(
    code='HelloBlazePreprocess.py',
    inputs=[
        ProcessingInput(
            source=source_path,
            destination='/opt/ml/processing/input',
        ),
    ],
    outputs=[
        ProcessingOutput(source='/opt/ml/processing/output/train'),
        ProcessingOutput(source='/opt/ml/processing/output/test'),
    ],
)


Job Name:  sagemaker-scikit-learn-2021-08-12-05-31-06-758
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://udacity-sagemaker-solutiondata2021/l2e4/Toys_and_Games_5.json.zip', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-565094796913/sagemaker-scikit-learn-2021-08-12-05-31-06-758/input/code/HelloBlazePreprocess.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-565094796913/sagemaker-scikit-learn-2021-08-12-05-31-06-758/output/output-1', 'LocalPath': '/opt/ml/processing/output/train', 'S3UploadMode': 'EndOfJob'}}, {'O

## Exercise: Sanity Check

Use the method below to find the job's input and output paths.

In [10]:
# Describe the last job in the processor's job list; the returned dict includes
# the job's inputs and outputs with their S3 URIs and local paths.
sklearn_processor.jobs[-1].describe()

{'ProcessingInputs': [{'InputName': 'input-1',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://udacity-sagemaker-solutiondata2021/l2e4/Toys_and_Games_5.json.zip',
    'LocalPath': '/opt/ml/processing/input',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'code',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-565094796913/sagemaker-scikit-learn-2021-08-12-05-31-06-758/input/code/HelloBlazePreprocess.py',
    'LocalPath': '/opt/ml/processing/input/code',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}}],
 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'output-1',
    'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-565094796913/sagemaker-scikit-learn-2021-08-12-05-31-06-758/output/output-1',
     'LocalPath': '/opt/ml/processing/output/train',
     'S3