# ADS-508-01-SP23 Team 8: Final Project

# Train model

Much of the code is modified from `Fregly, C., & Barth, A. (2021). Data science on AWS: Implementing end-to-end, continuous AI and machine learning pipelines. O’Reilly.`

## Install missing dependencies

[PyAthena](https://pypi.org/project/PyAthena/) is a Python DB API 2.0 (PEP 249) compliant client for Amazon Athena.

In [1]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0
!pip install --disable-pip-version-check -q sagemaker-experiments==0.1.26
!pip install missingno

[0m

## Globally import libraries

In [11]:
import boto3
from botocore.client import ClientError
import pandas as pd
import numpy as np
from pyathena import connect
from IPython.core.display import display, HTML
import missingno as msno
from skopt import BayesSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error
import datetime as dt
import time
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
import joblib
import os
from io import BytesIO

## Instantiate AWS SageMaker and S3 sessions

In [3]:
session = boto3.session.Session()
role = sagemaker.get_execution_role()
region = session.region_name
sagemaker_session = sagemaker.Session()
def_bucket = sagemaker_session.default_bucket()
bucket = 'sagemaker-us-east-ads508-sp23-t8'

s3 = boto3.Session().client(service_name="s3",
                            region_name=region)

sm = boto3.Session().client(service_name="sagemaker",
                            region_name=region)

In [4]:
setup_s3_bucket_passed = False
ingest_create_athena_db_passed = False
ingest_create_athena_table_tsv_passed = False

In [5]:
print(f"Default bucket: {def_bucket}")
print(f"Public T8 bucket: {bucket}")

Default bucket: sagemaker-us-east-1-657724983756
Public T8 bucket: sagemaker-us-east-ads508-sp23-t8


## Verify S3 Bucket Creation

In [6]:
%%bash

aws s3 ls s3://${bucket}/

2023-03-16 17:05:02 aws-athena-query-results-657724983756-us-east-1
2023-03-02 16:56:48 sagemaker-studio-657724983756-5nh7ydsouq7
2023-03-02 17:25:41 sagemaker-studio-657724983756-7yc8bp8xk0b
2023-03-02 17:01:51 sagemaker-us-east-1-657724983756
2023-03-17 05:19:31 sagemaker-us-east-ads508-sp23-t8


In [7]:
response = None

try:
    response = s3.head_bucket(Bucket=bucket)
    print(response)
    setup_s3_bucket_passed = True
except ClientError as e:
    print(f"[ERROR] Cannot find bucket {bucket} in {response} due to {e}.")

{'ResponseMetadata': {'RequestId': 'QMD6AFGEE4VF6S1Y', 'HostId': 'ESaRyq/DNTe/0WHmji/kplwxmDHgEzDh5sdOdo7GU+9BMstH1vkUoAAitW3AVjEs7yvaeYmEQ1o=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'ESaRyq/DNTe/0WHmji/kplwxmDHgEzDh5sdOdo7GU+9BMstH1vkUoAAitW3AVjEs7yvaeYmEQ1o=', 'x-amz-request-id': 'QMD6AFGEE4VF6S1Y', 'date': 'Sat, 01 Apr 2023 18:26:23 GMT', 'x-amz-bucket-region': 'us-east-1', 'x-amz-access-point-alias': 'false', 'content-type': 'application/xml', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}


In [8]:
%store setup_s3_bucket_passed

Stored 'setup_s3_bucket_passed' (bool)


## Pass in train and test X from CSV

In [9]:
s3_train_x01_csv_path = f"s3://{def_bucket}/team_8_data/modeling_data/train_x01.csv"
train_x01 = pd.read_csv(s3_train_x01_csv_path)
s3_test_x01_csv_path = f"s3://{def_bucket}/team_8_data/modeling_data/test_x01.csv"
test_x01 = pd.read_csv(s3_test_x01_csv_path)

print(f'{train_x01.shape}')
print(f'\n{test_x01.shape}')

(25284, 48)

(6321, 48)


## Pass in train and test y from np array

In [12]:
# Define the S3 object key
train_y01_s3_key = 'team_8_data/modeling_data/train_y01.npy'

# Load the numpy array from S3
with BytesIO() as data:
    s3.download_fileobj(def_bucket, train_y01_s3_key, data)
    data.seek(0)
    train_y01 = np.load(data)

# Define the S3 object key
test_y01_s3_key = 'team_8_data/modeling_data/test_y01.npy'

# Load the numpy array from S3
with BytesIO() as data:
    s3.download_fileobj(def_bucket, test_y01_s3_key, data)
    data.seek(0)
    test_y01 = np.load(data)

# Confirm that the numpy array was loaded from S3
print(f'{train_y01.shape}')
print(f'{test_y01.shape}')

(25284, 1)
(6321, 1)


## Model Training using Grid search with 5-fold cross-validation

### Lasso

In [15]:
# Start timer script
start_time = dt.datetime.today()

# Set the script path
script_path = '06b_Modeling02_amc_v1.py'

# Set up S3 bucket and prefix for saving output
s3_prefix = 'team_8_data/models'

# Set the hyperparameters
hyperparameters = {'alpha': [.01, .05, .1, .5, 1, 2],
                   'selection': ['cyclic', 'random']
                  }

# Set up SKLearnProcessor
sklearn_sm_inst01 = SKLearn(entry_point=script_path,
                            source_dir='./',
                            role=role,
                            framework_version='0.23-1',
                            instance_count=1,
                            instance_type='ml.m5.large'
                           )




# Set the inputs
train_input = f's3://{def_bucket}/team_8_data/modeling_data/test_x01.csv'
test_input = f's3://{def_bucket}/team_8_data/modeling_data/test_y01.npy'

# Set the output
output = f's3://{def_bucket}/{s3_prefix}'

# Fit the estimator on your training data
sklearn_sm_inst01.fit({'train': train_input, 'test': test_input})

# End timer script
end_time = dt.datetime.today()
time_elapse = end_time - start_time
print(f'End Time = {end_time}')
print(f'Script Time = {time_elapse}')

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2023-04-01-18-27-36-114


2023-04-01 18:27:48 Starting - Starting the training job...
2023-04-01 18:28:03 Starting - Preparing the instances for training...
2023-04-01 18:28:50 Downloading - Downloading input data...
2023-04-01 18:29:20 Training - Downloading the training image...
2023-04-01 18:29:50 Training - Training image download completed. Training in progress..[34m2023-04-01 18:29:55,563 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-04-01 18:29:55,566 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-04-01 18:29:55,609 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-04-01 18:29:56,883 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-04-01 18:29:56,896 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-04-01 18:29:56,907 sagemaker-training-toolkit INFO     No GPUs detect

UnexpectedStatusException: Error for Training job sagemaker-scikit-learn-2023-04-01-18-27-36-114: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_containers/_trainer.py", line 84, in train
    entrypoint()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 39, in main
    train(environment.Environment())
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 35, in train
    runner_type=runner.ProcessRunnerType)
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/entry_point.py", line 100, in run
    wait, capture_error
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 291, in run
    cwd=environment.code_dir,
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 208, in check_error
    info=extra_info,
sagemaker_training.errors.ExecuteUserScriptError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage ""
Command "/miniconda3/bin/python 06b_Modeling02_amc_v1.py"

Execu

## Release Resources

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}