In [3]:
import argparse
import sagemaker
import boto3
from sagemaker import get_execution_role
import os
import pandas as pd
import numpy as np
import sys


from sagemaker.processing import ProcessingInput, ProcessingOutput
sys.path.extend(['../src/preprocess', '../config'])
import helpers.instance as ins
import helpers.s3 as s3_helper
import helpers.utils as ut
import helpers.athena as at
# Build one boto3 session and derive both the region and the low-level SageMaker
# client from it, instead of constructing a second, independent session.
boto_session = boto3.Session()
region = boto_session.region_name
session = sagemaker.Session()
sm = boto_session.client(service_name='sagemaker', region_name=region)

# Load the YAML configuration files. Later cells read the bucket, prefix and script
# paths from `dataset_cfg` and the tags (plus a bucket/prefix pair) from `config_cfg`.
dataset_cfg = ins.read_config('../config/datasets.yaml')
config_cfg = ins.read_config('../config/config.yaml')

In [4]:
# Inspect the tag list stored under tags -> processing in config.yaml
# (presumably the tags applied to SageMaker processing jobs — TODO confirm).
processing_tags = config_cfg['tags']['processing']
processing_tags

[{'Key': 'MlOpsStep', 'Value': 'processing'},
 {'Key': 'sagemaker:project-name', 'Value': 'unitsalesrisk-intern'},
 {'Key': 'b_module', 'Value': 'Unit_Sales_Risk_Interns'},
 {'Key': 'b_costcenter', 'Value': '080410'},
 {'Key': 'b_env', 'Value': 'staging'},
 {'Key': 'b_program', 'Value': 'internship'},
 {'Key': 'b_role', 'Value': 'advanalytics'},
 {'Key': 'b_service', 'Value': 'Sagemaker'}]

In [5]:
# Look up the IAM execution role associated with the current SageMaker session.
# NOTE(review): the displayed ARN contains the AWS account ID; clear this cell's
# output before sharing or committing the notebook.
role = get_execution_role(sagemaker_session=session)
role

'arn:aws:iam::976432587531:role/Sgmkr-advanalytics'

In [7]:
# Gather the values stored under the `script` section of datasets.yaml
# (presumably flattening the nested `helpers` mapping too — TODO confirm).
script_list = ins.get_all_dictionary_values(dataset_cfg['script'])

# A bare expression in the middle of a cell is never rendered, so print the list
# explicitly to make it visible before the upload starts.
print(script_list)

# NOTE(review): the upload targets config_cfg's bucket/prefix, while the processing
# cells read the scripts back from dataset_cfg's bucket/prefix — confirm both
# configs point at the same S3 location.
# NOTE(review): the last recorded run raised FileNotFoundError for
# '../src/helpers/utils.py'. The helpers are imported via sys.path entries
# '../src/preprocess' and '../config', so the helper paths in datasets.yaml may be
# stale — verify against the repository layout.
s3_helper.upload_files_to_s3(script_list, config_cfg['bucket'], config_cfg['prefix'])

FileNotFoundError: [Errno 2] No such file or directory: '../src/helpers/utils.py'

In [4]:
# S3 URI of the unit preprocessing script.
# The URI is assembled with explicit '/' joins rather than os.path.join, which uses
# the host OS separator and discards earlier components whenever a later one starts
# with '/'. Empty components are skipped so no double slashes appear.
uri_parts = (dataset_cfg['bucket'], dataset_cfg['prefix'], dataset_cfg['script']['unit'])
code = 's3://' + '/'.join(part.strip('/') for part in uri_parts if part.strip('/'))
code

's3://pske-stg-advanalytics/projects/unitsalesrisk-interns/src/preprocess/dataCollectionMay31.py'

In [5]:
# Create a processor through the project helper; the first argument appears to be
# used as the base job name (the later 'UnitSalesRiskInterns' processor produced a
# job named 'UnitSalesRiskInterns-<timestamp>').
# NOTE(review): this object is not used by any later cell visible here, and the
# 'fault-rules-*' name looks carried over from another project — confirm whether
# this cell is still needed.
unitsalesrisk_sklearn_processor = ins.get_sklearn_processor(
    'fault-rules-unit-preprocess',
    config_cfg,
)

In [6]:
# Run the unit preprocessing script as a SageMaker Processing job.
# NOTE(review): the last recorded run of this cell ended with AlgorithmError; the
# root cause is only visible in the job's logs.
unit_sklearn_processor = ins.get_sklearn_processor('UnitSalesRiskInterns', config_cfg)

# Common "s3://<bucket>/<prefix>" root shared by the script and its helper modules.
# Explicit '/' joins are used instead of os.path.join, which is OS-dependent and
# drops earlier components when a later one starts with '/'.
script_cfg = dataset_cfg['script']
script_root = 's3://' + '/'.join(
    part.strip('/')
    for part in (dataset_cfg['bucket'], dataset_cfg['prefix'])
    if part.strip('/')
)

# One ProcessingInput per helper module, each copied into
# /opt/ml/processing/input/<helper name> inside the processing container.
unit_helper_inputs = [
    ProcessingInput(
        source=script_root + '/' + script_cfg['helpers'][helper_name].lstrip('/'),
        destination='/opt/ml/processing/input/' + helper_name,
    )
    for helper_name in ('utils', 's3', 'instance', 'athena')
]

# Value passed to the script as --s3_input_path (by its location, presumably a text
# file holding the Athena query — TODO confirm). A sibling file 'postsaleissue.txt'
# in the same S3 folder was previously used as an alternative input.
unit_query_uri = 's3://pske-stg-advanalytics/Projects/Unit_Sale_Risk_Interns/Data/Athena_query/base_query.txt'

unit_sklearn_processor.run(
    code=script_root + '/' + script_cfg['unit'].lstrip('/'),
    inputs=unit_helper_inputs,
    outputs=[
        ProcessingOutput(
            output_name='processed_units',
            source='/opt/ml/processing/output/processed_units',
        )
    ],
    arguments=['--s3_input_path', unit_query_uri],
)


Job Name:  UnitSalesRiskInterns-2022-06-09-15-12-59-434
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://pske-stg-advanalytics/projects/unitsalesrisk-interns/src/helpers/utils.py', 'LocalPath': '/opt/ml/processing/input/utils', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://pske-stg-advanalytics/projects/unitsalesrisk-interns/src/helpers/s3.py', 'LocalPath': '/opt/ml/processing/input/s3', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-3', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://pske-stg-advanalytics/projects/unitsalesrisk-interns/src/helpers/instance.py', 'LocalPath': '/opt/ml/processing/input/instance', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', '

UnexpectedStatusException: Error for Processing job UnitSalesRiskInterns-2022-06-09-15-12-59-434: Failed. Reason: AlgorithmError: See job logs for more information

In [None]:
# Run the roadcall preprocessing script as a SageMaker Processing job.
# NOTE(review): this cell has no execution count (it was never run in the saved
# session) and its 'fault-rules-*' / 'roadcall' names look carried over from another
# project — confirm that datasets.yaml actually defines the keys used below.
roadcall_sklearn_processor = ins.get_sklearn_processor('fault-rules-roadcall-preprocess', config_cfg)

# Common "s3://<bucket>/<prefix>" root shared by the script and its helper modules.
# Explicit '/' joins are used instead of os.path.join, which is OS-dependent and
# drops earlier components when a later one starts with '/'.
script_cfg = dataset_cfg['script']
script_root = 's3://' + '/'.join(
    part.strip('/')
    for part in (dataset_cfg['bucket'], dataset_cfg['prefix'])
    if part.strip('/')
)

# One ProcessingInput per helper module, each copied into
# /opt/ml/processing/input/<helper name> inside the processing container.
roadcall_helper_inputs = [
    ProcessingInput(
        source=script_root + '/' + script_cfg['helpers'][helper_name].lstrip('/'),
        destination='/opt/ml/processing/input/' + helper_name,
    )
    for helper_name in ('utils', 's3')
]

roadcall_sklearn_processor.run(
    code=script_root + '/' + script_cfg['roadcall'].lstrip('/'),
    inputs=roadcall_helper_inputs,
    outputs=[
        ProcessingOutput(
            output_name='processed_roadcalls',
            source='/opt/ml/processing/output/processed_roadcalls',
        )
    ],
    arguments=[
        '--s3_input_path', dataset_cfg['raw']['roadcall']['input_path'],
        '--s3_target_data_output_path', dataset_cfg['interim']['roadcall'],
    ],
)