In [3]:
import boto3
import sagemaker
import pandas as pd
from sklearn.model_selection import train_test_split

In [18]:

# Name of the S3 bucket that holds the train/test CSVs for this notebook.
bucket = 'diabetessagemakerbucket'

# Low-level SageMaker client (used later for job lookups and endpoint deletion)
# and the high-level SDK session (used for uploads).
sm_boto3 = boto3.client('sagemaker')
sess = sagemaker.Session()

# Region comes from the boto session wrapped by the SDK session.
region = sess.boto_session.region_name

print(region)
print('using this bucket: ', bucket)

us-east-1
using this bucket:  diabetessagemakerbucket


In [7]:
# Load the dataset from diabetes.csv (relative to the working directory) into a DataFrame.
df = pd.read_csv('diabetes.csv')

In [11]:
# Start from every column name; the printed list therefore still includes the target.
features = df.columns.tolist()
print(features)

# The last column is the target: remove it from the feature list and keep its name.
labels = features.pop()
print(labels)

['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
Outcome


In [12]:
# Feature matrix (all non-target columns) and target column, selected by name.
x, y = df[features], df[labels]

In [13]:
# Sanity check: report the shape of the feature matrix, then of the target.
for selected in (x, y):
    print(selected.shape)

(768, 8)
(768,)


In [14]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Re-attach the target column to each feature split so the exported CSVs hold
# the features followed by the label as the last column.
#
# DataFrame.assign returns a new frame, so x_train / x_test are left untouched.
# The previous pd.DataFrame(x_train) wrapper does not copy its input by default,
# so adding a column to it could raise SettingWithCopyWarning or leak into the
# split outputs, depending on the pandas version.
trainx = x_train.assign(**{labels: y_train})

testx = x_test.assign(**{labels: y_test})

In [16]:
# Sanity check: shapes of the combined train frame, then the combined test frame.
for combined in (trainx, testx):
    print(combined.shape)

(614, 9)
(154, 9)


In [20]:
# Write both frames to the working directory without the index column,
# train first and then test.
for frame, csv_name in ((trainx, 'train.csv'), (testx, 'test.csv')):
    frame.to_csv(csv_name, index=False)

In [22]:
# Key prefix under which both CSVs are stored in the bucket.
sk_prefix = 'sagemaker/diabetes/sklearn'

# Upload the train file first, then the test file; each call returns the S3 URI
# that is later passed to the estimator as an input channel.
trainpath = sess.upload_data(
    path='train.csv',
    bucket=bucket,
    key_prefix=sk_prefix,
)
testpath = sess.upload_data(
    path='test.csv',
    bucket=bucket,
    key_prefix=sk_prefix,
)

for uploaded_uri in (trainpath, testpath):
    print(uploaded_uri)

s3://diabetessagemakerbucket/sagemaker/diabetes/sklearn/train.csv
s3://diabetessagemakerbucket/sagemaker/diabetes/sklearn/test.csv


In [23]:
%%writefile script.py

import os
import joblib
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pathlib
from io import StringIO
import boto3
import sklearn
import argparse

print('Importing Libraries')
def model_fn(model_dir):
    """Load and return the classifier stored as ``model.joblib`` inside ``model_dir``.

    The file name matches the one the training section of this script writes
    with ``joblib.dump``.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)

print('Model Function has been defined')
if __name__ == '__main__':
    # ---- Command-line interface ------------------------------------------
    print('[INFO] Extracting Arguments')
    cli = argparse.ArgumentParser()

    # Hyperparameters (both integers).
    for flag, fallback in (('--n_estimators', 100), ('--random_state', 0)):
        cli.add_argument(flag, type=int, default=fallback)

    # Model output directory and data channel directories; each defaults to
    # the value of the corresponding SM_* environment variable (None if unset).
    for flag, env_var in (
        ('--model-dir', 'SM_MODEL_DIR'),
        ('--train', 'SM_CHANNEL_TRAIN'),
        ('--test', 'SM_CHANNEL_TEST'),
    ):
        cli.add_argument(flag, type=str, default=os.environ.get(env_var))

    # CSV file names looked up inside the train / test directories.
    cli.add_argument('--train-file', type=str, default='train.csv')
    cli.add_argument('--test-file', type=str, default='test.csv')

    print('Arguments have been defined')

    # parse_known_args tolerates unrecognised arguments; they are discarded.
    opts, _unparsed = cli.parse_known_args()
    print("sklearn version: ",  sklearn.__version__)
    print('joblib version: ', joblib.__version__)

    # ---- Data loading -----------------------------------------------------
    print('[INFO] Reading Data')
    print()

    train_csv = os.path.join(opts.train, opts.train_file)
    test_csv = os.path.join(opts.test, opts.test_file)
    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)

    # The last column of the training file is the label; all others are features.
    features = train_df.columns.tolist()
    label = features.pop()

    print("Building training and testing datasets")
    print()

    x_train, y_train = train_df[features], train_df[label]
    x_test, y_test = test_df[features], test_df[label]

    print('Data has been splitted')

    print('column order: ')
    print(features)
    print()

    print('label column is : ',label)
    print(label)
    print()

    print('data shape: ')

    print("------train data shape: ")
    print('x_train shape: ', x_train.shape)
    print('y_train shape: ', y_train.shape)

    print("------test data shape: ")
    print('x_test shape: ', x_test.shape)
    print('y_test shape: ', y_test.shape)
    print()

    # ---- Training ---------------------------------------------------------
    print('Training Model.......')
    print()
    forest = RandomForestClassifier(
        n_estimators=opts.n_estimators,
        random_state=opts.random_state,
        verbose=1,
    )
    forest.fit(x_train, y_train)
    print('Model has been trained')

    # Persist the fitted model under the name that model_fn loads.
    joblib.dump(forest, os.path.join(opts.model_dir, "model.joblib"))
    print('Model has been saved')
    print()

    # ---- Evaluation on the held-out test file -----------------------------
    print('Evaluating Model')
    predictions = forest.predict(x_test)
    test_acc = accuracy_score(y_test, predictions)
    test_rep = classification_report(y_test, predictions)

    print()
    print('Model has been evaluated')
    print()
    print('Test Accuracy: ', test_acc)
    print('Classification Report: ')
    print(test_rep)
    print()

Writing script.py


In [27]:
from sagemaker.sklearn.estimator import SKLearn

# Framework version for the scikit-learn estimator; the SKLearnModel cell further
# down reuses this constant.
FRAMEWORK_VERSION = "0.23-1"

# Values for the --n_estimators / --random_state arguments that script.py parses.
training_hyperparameters = {
    'n_estimators': 100,
    'random_state': 0,
}

# Training job definition: one ml.m5.large instance running script.py,
# with spot instances enabled.
sklearn_estimator = SKLearn(
    entry_point='script.py',
    framework_version=FRAMEWORK_VERSION,
    hyperparameters=training_hyperparameters,
    role='arn:aws:iam::975050189259:role/awsendtoendrole',
    base_job_name='diabetes-sklearn',
    instance_type='ml.m5.large',
    instance_count=1,
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
)

In [28]:
# Launch the training job. wait=True makes this call block until the job
# finishes, so it is a synchronous call.
input_channels = {'train': trainpath, 'test': testpath}
sklearn_estimator.fit(input_channels, wait=True)

INFO:sagemaker:Creating training-job with name: diabetes-sklearn-2024-04-16-06-28-43-385


2024-04-16 06:28:49 Starting - Starting the training job...
2024-04-16 06:29:04 Starting - Preparing the instances for training...
2024-04-16 06:29:45 Downloading - Downloading input data...
2024-04-16 06:30:16 Downloading - Downloading the training image...
2024-04-16 06:31:02 Training - Training image download completed. Training in progress.
2024-04-16 06:31:02 Uploading - Uploading generated training model.[34m2024-04-16 06:30:55,713 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-04-16 06:30:55,717 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-04-16 06:30:55,774 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-04-16 06:30:55,967 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-04-16 06:30:55,980 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024

In [29]:
# Wait on the most recent training job (without streaming its logs), then ask
# the SageMaker API where the resulting model artifact was stored.
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs='None')

job_description = sm_boto3.describe_training_job(TrainingJobName=latest_job.name)
artifact = job_description['ModelArtifacts']['S3ModelArtifacts']

print('Model artifact saved at: ', artifact)


2024-04-16 06:31:17 Starting - Preparing the instances for training
2024-04-16 06:31:17 Downloading - Downloading the training image
2024-04-16 06:31:17 Training - Training image download completed. Training in progress.
2024-04-16 06:31:17 Uploading - Uploading generated training model
2024-04-16 06:31:17 Completed - Training job completed
Model artifact saved at:  s3://sagemaker-us-east-1-975050189259/diabetes-sklearn-2024-04-16-06-28-43-385/output/model.tar.gz


In [30]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# A UTC timestamp suffix gives each model a distinct name.
model_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = 'diabetes-sklearn-model' + model_timestamp

# Wrap the trained artifact for serving; script.py supplies model_fn for loading it.
sklearn_model = SKLearnModel(
    model_data=artifact,
    entry_point='script.py',
    framework_version=FRAMEWORK_VERSION,
    role='arn:aws:iam::975050189259:role/awsendtoendrole',
    name=model_name,
)

In [34]:
# Print the model object's repr, then show the generated model name as the cell output.
print(sklearn_model)
model_name

<sagemaker.sklearn.model.SKLearnModel object at 0x0000017D93D444F0>


'diabetes-sklearn-model2024-04-16-07-07-40'

In [35]:
# A UTC timestamp suffix gives each endpoint a distinct name.
endpoint_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = 'diabetes-sklearn-endpoint' + endpoint_timestamp

# Deploy the model to a single ml.t2.medium instance; the returned predictor
# is used below to send requests to the endpoint.
predictor = sklearn_model.deploy(
    endpoint_name=endpoint_name,
    initial_instance_count=1,
    instance_type='ml.t2.medium',
)

INFO:sagemaker:Creating model with name: diabetes-sklearn-model2024-04-16-07-07-40
INFO:sagemaker:Creating endpoint-config with name diabetes-sklearn-endpoint2024-04-16-07-12-36
INFO:sagemaker:Creating endpoint with name diabetes-sklearn-endpoint2024-04-16-07-12-36


-------!

In [37]:
# Print the endpoint name, then show the predictor object as the cell output.
print(endpoint_name)
predictor

diabetes-sklearn-endpoint2024-04-16-07-12-36


<sagemaker.sklearn.model.SKLearnPredictor at 0x17d955c1250>

In [72]:
# Take rows 152 and 153 (by position) of the test frame, feature columns only,
# as a plain list of lists to send to the endpoint.
sample_features = testx[features].iloc[152:154]
a = sample_features.to_numpy().tolist()
print(a)


In [73]:
# Send the sample rows to the deployed endpoint; the predictions are shown as the cell output.
predictor.predict(a)

array([0, 0], dtype=int64)

In [74]:
# Tear down everything the deployment created, not just the endpoint.
# Deleting only the endpoint leaves its endpoint config and the model behind.
delete_endpoint_response = sm_boto3.delete_endpoint(EndpointName=endpoint_name)

# The deploy step created the endpoint config under the same name as the endpoint
# (see the "Creating endpoint-config with name ..." log line of that cell).
sm_boto3.delete_endpoint_config(EndpointConfigName=endpoint_name)

# The model was registered under model_name
# (see the "Creating model with name ..." log line of the deploy cell).
sm_boto3.delete_model(ModelName=model_name)

# Keep the delete_endpoint response as the cell output.
delete_endpoint_response

{'ResponseMetadata': {'RequestId': 'c18869ff-e71d-4fab-b8f4-bb05a0e3e3bf',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c18869ff-e71d-4fab-b8f4-bb05a0e3e3bf',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Tue, 16 Apr 2024 08:00:46 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}