In [1]:
import boto3
import pandas as pd
import sagemaker
from sklearn.model_selection import train_test_split


# S3 bucket that receives the train/test CSVs uploaded further down.
bucket = 'bucketsagemaker0504'

# High-level SDK session (uploads, region lookup) and the low-level boto3
# client that is used later to describe the training job.
sess = sagemaker.Session()
sm_boto3 = boto3.client(service_name='sagemaker')
region = sess.boto_region_name

print("Using bucket " + bucket)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/yuewang/Library/Application Support/sagemaker/config.yaml
Using bucket bucketsagemaker0504


In [2]:
# Read the raw dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [7]:
# Dataset dimensions as (rows, columns).
df.shape

(2000, 21)

In [8]:
# Column names; the last column is split off as the label in the next step.
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [3]:
# Every column except the last one is a feature; the last column is the label.
*features, label = df.columns
x, y = df[features], df[label]

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(1600, 20) (400, 20) (1600,) (400,)


In [5]:
# Re-attach the label as the last column of each split so that every CSV
# written below carries both the features and the target.
#
# `assign` returns a new frame and leaves X_train / X_test untouched. Wrapping a
# frame in `pd.DataFrame(...)` does not copy it, so adding a column through such
# a wrapper can (depending on the pandas version) also add the label to the
# feature frame itself or trigger a SettingWithCopyWarning.
trainX = X_train.assign(**{label: y_train})
testX = X_test.assign(**{label: y_test})
print(trainX.shape, testX.shape)

(1600, 21) (400, 21)


In [7]:
# Persist both splits as local CSV files (index column omitted) for the upload step.
for split_frame, csv_name in ((trainX, 'train-V1.csv'), (testX, 'test-V1.csv')):
    split_frame.to_csv(csv_name, index=False)

In [8]:
# Upload both CSVs to S3 under one shared key prefix; the training job is later
# pointed at the returned S3 URIs as its input channels.
sk_prefix = 'sagemaker/mobile_price_classification/sklearncontainer'
trainpath, testpath = (
    sess.upload_data(path=csv_name, bucket=bucket, key_prefix=sk_prefix)
    for csv_name in ('train-V1.csv', 'test-V1.csv')
)
print(trainpath, testpath)

s3://bucketsagemaker0504/sagemaker/mobile_price_classification/sklearncontainer/train-V1.csv s3://bucketsagemaker0504/sagemaker/mobile_price_classification/sklearncontainer/test-V1.csv


In [25]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
import sklearn
import boto3
import pathlib
from io import StringIO
import argparse

# Load model
def model_fn(model_dir):
    """Return the classifier stored as ``model.joblib`` inside ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file written
            by the training section of this script.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)

# Train model
# Entry point when the file is executed as a script: parse arguments, read the
# train/test CSVs, fit a random forest, save it as model.joblib and print
# evaluation metrics for the test split.
if __name__ == '__main__':
    # Parse arguments
    print("[INFO] Parsing arguments")
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--random_state', type=int, default=0)

    # Directories default to the SM_* environment variables; the file names
    # default to the CSVs produced by the notebook.
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train-file', type=str, default='train-V1.csv')
    parser.add_argument('--test-file', type=str, default='test-V1.csv')

    # parse_known_args ignores any extra arguments the caller passes along.
    args, _ = parser.parse_known_args()

    # Fail early with a readable message instead of a TypeError from
    # os.path.join(None, ...) when a directory is neither passed nor in the env.
    for option in ('model_dir', 'train', 'test'):
        if getattr(args, option) is None:
            parser.error(
                "--{} was not provided and the matching SM_* environment variable is not set".format(option)
            )

    print("Sklearn version: " + sklearn.__version__)
    print("Joblib version: " + joblib.__version__)

    print('[INFO] Reading data')
    print()
    # Load data
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last column of the training CSV is the label; all others are features.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("Building training and testing datasets")
    print()
    # Split data
    X_train = train_df[features]
    y_train = train_df[label]
    X_test = test_df[features]
    y_test = test_df[label]

    print("Column Order:")
    print(features)
    print()
    print("label column:")
    print(label)
    print()
    print("---shape of training and testing datasets---")
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    print()

    print("Training Random Forest model ...")
    print()
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
    model.fit(X_train, y_train)
    print()

    # Save under the exact file name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model saved at: {}".format(model_path))
    print()

    # Evaluate on the held-out test split.
    y_pred_test = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    test_report = classification_report(y_test, y_pred_test)

    print()
    print("---Metrics Results for Testing Data---")
    print()
    print("Total Rows are:", X_test.shape[0])
    print("[TESTING] Accuracy: ", test_accuracy)
    print("[TESTING] Classification Report: ")
    print(test_report)
    

Overwriting script.py


In [26]:
from sagemaker.sklearn.estimator import SKLearn

# Version tag of the scikit-learn container; the model cell reuses this constant.
FRAMEWORK_VERSION = "0.23-1"

# Training job definition that runs script.py on a single instance.
sklearn_estimator = SKLearn(
    entry_point='script.py',
    framework_version=FRAMEWORK_VERSION,
    base_job_name='rf-scikit',
    # ARN of the IAM role passed to the training job.
    role="arn:aws:iam::905418413938:role/SageMakerRole",
    instance_type='ml.m5.large',
    instance_count=1,
    # Keys match the --n_estimators / --random_state arguments parsed in script.py.
    hyperparameters=dict(n_estimators=100, random_state=42),
    # Spot training with time limits given in seconds.
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
)

In [27]:
# Map each input channel name to the S3 URI uploaded earlier and block until the job ends.
training_channels = {'train': trainpath, 'test': testpath}
sklearn_estimator.fit(inputs=training_channels, wait=True)

INFO:sagemaker:Creating training-job with name: rf-scikit-2024-05-04-19-52-19-343


2024-05-04 19:52:20 Starting - Starting the training job...
2024-05-04 19:52:35 Starting - Preparing the instances for training...
2024-05-04 19:53:03 Downloading - Downloading input data...
2024-05-04 19:53:28 Downloading - Downloading the training image...
2024-05-04 19:54:19 Training - Training image download completed. Training in progress.
2024-05-04 19:54:19 Uploading - Uploading generated training model2024-05-04 19:54:12,819 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-05-04 19:54:12,823 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-05-04 19:54:12,865 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-05-04 19:54:13,007 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-05-04 19:54:13,019 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-05-04 19:54:13,031 sagemaker-training-toolkit INFO

In [28]:
# Wait for the most recent training job (without streaming its logs), then
# read the S3 location of the model artifact from the job description.
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs='None')

job_description = sm_boto3.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description['ModelArtifacts']['S3ModelArtifacts']
print('Model artifact saved at:', artifact)


2024-05-04 19:54:34 Starting - Preparing the instances for training
2024-05-04 19:54:34 Downloading - Downloading the training image
2024-05-04 19:54:34 Training - Training image download completed. Training in progress.
2024-05-04 19:54:34 Uploading - Uploading generated training model
2024-05-04 19:54:34 Completed - Training job completed
Model artifact saved at: s3://sagemaker-us-east-2-905418413938/rf-scikit-2024-05-04-19-52-19-343/output/model.tar.gz


In [29]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Timestamped (UTC) name so every run gets its own model name.
model_name = 'Custom-Sklearn-RF-Model-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Wrap the trained artifact together with the inference entry point (script.py
# provides model_fn). `name` is passed explicitly; without it the SDK generates
# its own model name and `model_name` would never be used.
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role="arn:aws:iam::905418413938:role/SageMakerRole",
    entry_point='script.py',
    framework_version=FRAMEWORK_VERSION,
)

In [30]:
# Display the timestamped model name string generated in the previous cell.
model_name

'Custom-Sklearn-RF-Model-2024-05-04-20-07-39'

In [31]:
# Deploy the model to an endpoint
endpoint_name = 'Custom-Sklearn-RF-Model-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
# `.format` must be called on the string; passing format(...) as a second
# print argument would print the literal "{}" placeholder.
print("EndpointName ={}".format(endpoint_name))

# NOTE: the variable name `Pretdictor` (sic) is kept because the prediction
# cell further down references it.
Pretdictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    endpoint_name=endpoint_name
)

EndpointName ={} Custom-Sklearn-RF-Model-2024-05-04-20-10-54


INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2024-05-04-20-10-56-105
INFO:sagemaker:Creating endpoint-config with name Custom-Sklearn-RF-Model-2024-05-04-20-10-54
INFO:sagemaker:Creating endpoint with name Custom-Sklearn-RF-Model-2024-05-04-20-10-54


-----!

endpoint_name

In [32]:

# First two test rows (feature columns only) as nested Python lists.
testX.loc[:, features].head(2).to_numpy().tolist()

[[1646.0,
  0.0,
  2.5,
  0.0,
  3.0,
  1.0,
  25.0,
  0.6,
  200.0,
  2.0,
  5.0,
  211.0,
  1608.0,
  686.0,
  8.0,
  6.0,
  11.0,
  1.0,
  1.0,
  0.0],
 [1182.0,
  0.0,
  0.5,
  0.0,
  7.0,
  1.0,
  8.0,
  0.5,
  138.0,
  8.0,
  16.0,
  275.0,
  986.0,
  2563.0,
  19.0,
  17.0,
  19.0,
  1.0,
  0.0,
  0.0]]

In [33]:
# Send the first two test rows (features only) to the endpoint and print the predictions.
sample_rows = testX[features][0:2].values.tolist()
sample_predictions = Pretdictor.predict(sample_rows)
print(sample_predictions)

[0 2]


In [None]:
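# Clean-up: uncomment and run the line below once you are finished with the
# endpoint; it deletes the endpoint created by the deploy cell.
# sm_boto3.delete_endpoint(EndpointName=endpoint_name)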