In [1]:
!pip install sagemaker

import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd 

# Pin the AWS region explicitly; the boto3 session, the SageMaker SDK session
# and the low-level SageMaker client below are all created for this region.
region = 'us-east-1'
boto_session = boto3.Session(region_name=region)

# SageMaker SDK session built on top of the region-pinned boto3 session.
session = sagemaker.Session(boto_session=boto_session)
# Low-level SageMaker API client for the same region.
boto3_sm = boto3.client("sagemaker", region_name=region)

# Name of the S3 bucket used by this notebook.
bucket = 'mobbucketsagemaker213'
# A separating space was missing here, which printed "Using bucket<name>".
print("Using bucket " + bucket)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/codespace/.config/sagemaker/config.yaml
Using bucketmobbucketsagemaker213


In [2]:
!pip install opendatasets



In [3]:
import opendatasets as od
import os

In [4]:
# Load the raw training table from a path relative to the notebook's working directory.
# NOTE(review): no visible cell creates "mobile-price/train.csv" (opendatasets is
# imported but never called) — confirm how the file is obtained on a fresh kernel.
csv_path = "mobile-price/train.csv"
df = pd.read_csv(csv_path)

In [5]:
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [6]:
df.shape

(2000, 21)

In [7]:
# Class balance check: number of rows per value of the target column.
df.loc[:, "price_range"].value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [8]:
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [9]:
# Percentage of missing values in each column (mean of the null mask, times 100).
df.isna().mean().mul(100)

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [10]:
# All column names of the frame, in order, as a plain Python list.
features = df.columns.tolist()
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [11]:
# The last column of the frame is the target; every other column is a feature.
# Both names are derived from `df.columns` instead of popping from `features`
# in place: with `features.pop(-1)` each re-run of this cell removed one more
# real feature and silently changed `label`. This version is idempotent.
label = df.columns[-1]
features = [column for column in df.columns if column != label]
label

'price_range'

In [12]:
# Feature matrix (all feature columns) and target vector (the label column).
x, y = df[features], df[label]

In [13]:
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [14]:
x.shape

(2000, 20)

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [16]:
# Report the shape of each piece of the train/test split.
for caption, part in (
    ("X_train Shape:", X_train),
    ("X_test Shape:", X_test),
    ("y_train Shape:", y_train),
    ("y_test Shape:", y_test),
):
    print(caption, part.shape)

X_train Shape: (1600, 20)
X_test Shape: (400, 20)
y_train Shape: (1600,)
y_test Shape: (400,)


In [17]:
# Build the frames that get written to CSV: the feature columns plus a trailing
# 'label' column holding the target (aligned on the row index).
# `.assign` returns a new frame, so X_train / X_test are left untouched.
# `pd.DataFrame(X_train)` followed by item assignment does not guarantee an
# independent copy and can write into the split frames (or raise
# SettingWithCopyWarning), depending on the pandas version.
trainX = X_train.assign(label=y_train)

testX = X_test.assign(label=y_test)

In [18]:
# Shapes of the combined (features + label) train and test frames.
for combined in (trainX, testX):
    print(combined.shape)

(1600, 21)
(400, 21)


In [19]:
# Write both frames to local CSV files without the row index.
for frame, csv_name in ((trainX, 'train_v1.csv'), (testX, 'test_v1.csv')):
    frame.to_csv(csv_name, index=False)

In [22]:
# Send both CSV files to S3 under a shared key prefix; upload_data returns the
# S3 URI of each uploaded object (train first, then test).
sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"
trainpath, testpath = (
    session.upload_data(path=csv_name, bucket=bucket, key_prefix=sk_prefix)
    for csv_name in ("train_v1.csv", "test_v1.csv")
)

In [23]:
# Show the S3 URIs returned by the uploads.
for s3_uri in (trainpath, testpath):
    print(s3_uri)

s3://mobbucketsagemaker213/sagemaker/mobile_price_classification/sklearncontainer/train_v1.csv
s3://mobbucketsagemaker213/sagemaker/mobile_price_classification/sklearncontainer/test_v1.csv


In [24]:
%%writefile script.py

import argparse
import os
import json
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
import joblib
import pathlib
from io import StringIO
import boto3
import pandas as pd
import numpy as np

def model_fn(model_dir):
    """Load and return the model stored as ``model.joblib`` inside ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file written
            by the training section of this script.

    Returns:
        The object deserialized by ``joblib.load``.

    NOTE(review): presumably this is the model-loading hook the SageMaker
    scikit-learn serving container calls at endpoint start-up — confirm
    against the container documentation.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)

if __name__ == '__main__':
    # Training entry point: parse arguments, read the train/test CSVs, fit a
    # RandomForestClassifier, persist it as model.joblib and print test metrics.

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--random_state', type=int, default=0)

    # Input data and model directories. Defaults are taken from the SM_* environment
    # variables; .get() is used so that a missing variable does not raise KeyError
    # while the parser is still being built (which made the explicit flags useless
    # outside a SageMaker container).
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train_file', type=str, default='train_v1.csv')
    parser.add_argument('--test_file', type=str, default='test_v1.csv')

    # Unknown arguments are ignored rather than treated as errors.
    args, _ = parser.parse_known_args()

    # Fail early with a readable message when a location came from neither the
    # command line nor the environment.
    required_locations = (
        ('--model-dir / SM_MODEL_DIR', args.model_dir),
        ('--train / SM_CHANNEL_TRAIN', args.train),
        ('--test / SM_CHANNEL_TEST', args.test),
    )
    missing = [name for name, value in required_locations if not value]
    if missing:
        parser.error("missing required location(s): " + ", ".join(missing))

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last column of the training CSV is the label; all others are features.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("Building training and testing datasets")
    print()
    # Select test columns by the training column names so both sets share one order.
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()

    print("Label column is: ",label)
    print()

    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (80%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (20%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training RandomForest Model.....")
    print()
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state, verbose=3, n_jobs=None)
    model.fit(X_train, y_train)
    print()

    # Persist the fitted model under the file name that model_fn() loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)
    print()

    # Evaluate on the held-out test set.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test,y_pred_test)
    test_rep = classification_report(y_test,y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

Overwriting script.py


In [25]:
from sagemaker.sklearn.estimator import SKLearn

# Version tag of the SageMaker scikit-learn framework container.
FRAMEWORK_VERSION = "0.23-1"

# Estimator that runs script.py as a training job.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="arn:aws:iam::590183923859:role/SageMakerExecutionRole",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    # Forwarded to script.py as --n_estimators / --random_state.
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    # Spot training; max_run and max_wait are limits in seconds.
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600,
    # Reuse the region-pinned session created at the top of the notebook. Without
    # it the estimator builds its own default session, which may resolve to a
    # different region than the one the training data was uploaded to.
    sagemaker_session=session,
)

In [26]:
# Launch the training job with one input channel per uploaded CSV and block
# until it finishes. The channel names match the --train / --test locations in script.py.
channels = {"train": trainpath, "test": testpath}
sklearn_estimator.fit(inputs=channels, wait=True)

INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-05-29-20-13-16-796


2024-05-29 20:13:18 Starting - Starting the training job...
2024-05-29 20:13:33 Starting - Preparing the instances for training...
2024-05-29 20:14:05 Downloading - Downloading input data...
2024-05-29 20:14:31 Downloading - Downloading the training image...
2024-05-29 20:15:22 Training - Training image download completed. Training in progress.
2024-05-29 20:15:22 Uploading - Uploading generated training model.2024-05-29 20:15:15,471 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-05-29 20:15:15,475 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-05-29 20:15:15,519 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-05-29 20:15:15,683 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-05-29 20:15:15,696 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-05-29 20:15:15,708 sagemaker-training-toolkit INF

In [27]:
# Wait for the most recent training job (without streaming its logs), then ask
# the SageMaker API where the model artifact was stored.
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs="None")
job_description = boto3_sm.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact present at: ", artifact)


2024-05-29 20:15:34 Starting - Preparing the instances for training
2024-05-29 20:15:34 Downloading - Downloading the training image
2024-05-29 20:15:34 Training - Training image download completed. Training in progress.
2024-05-29 20:15:34 Uploading - Uploading generated training model
2024-05-29 20:15:34 Completed - Training job completed
Model artifact present at:  s3://sagemaker-us-east-1-590183923859/RF-custom-sklearn-2024-05-29-20-13-16-796/output/model.tar.gz


In [28]:
artifact

's3://sagemaker-us-east-1-590183923859/RF-custom-sklearn-2024-05-29-20-13-16-796/output/model.tar.gz'

In [31]:
# Making a copy of the built model for deployment
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Wrap the trained artifact in a deployable model; the name carries a UTC
# timestamp so repeated runs do not collide.
model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role="arn:aws:iam::590183923859:role/SageMakerExecutionRole",
    # script.py provides model_fn(), which loads model.joblib.
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    # Reuse the region-pinned session created at the top of the notebook. Without
    # it the model builds its own default session, which may resolve to a
    # different region than the one the artifact lives in.
    sagemaker_session=session,
)

In [32]:
model

<sagemaker.sklearn.model.SKLearnModel at 0x7493dd72c3d0>

In [33]:
# Endpoint deployment: the endpoint name gets its own UTC timestamp suffix.
deploy_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = "Custom-sklearn-model-" + deploy_stamp
print("EndpointName={}".format(endpoint_name))

# One ml.m4.xlarge instance behind the endpoint; deploy() returns a predictor.
deploy_kwargs = dict(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)
predictor = model.deploy(**deploy_kwargs)

EndpointName=Custom-sklearn-model-2024-05-29-20-38-58


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-05-29-20-37-41
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2024-05-29-20-38-58
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2024-05-29-20-38-58


-------!

In [34]:
predictor

<sagemaker.sklearn.model.SKLearnPredictor at 0x7493de4379a0>

In [35]:
endpoint_name

'Custom-sklearn-model-2024-05-29-20-38-58'

In [37]:
# Feature values of the first five test rows, as nested Python lists — the same
# payload that is sent to the endpoint for prediction.
testX[features].iloc[:5].to_numpy().tolist()

[[1454.0,
  1.0,
  0.5,
  1.0,
  1.0,
  0.0,
  34.0,
  0.7,
  83.0,
  4.0,
  3.0,
  250.0,
  1033.0,
  3419.0,
  7.0,
  5.0,
  5.0,
  1.0,
  1.0,
  0.0],
 [1092.0,
  1.0,
  0.5,
  1.0,
  10.0,
  0.0,
  11.0,
  0.5,
  167.0,
  3.0,
  14.0,
  468.0,
  571.0,
  737.0,
  14.0,
  4.0,
  11.0,
  0.0,
  1.0,
  0.0],
 [1524.0,
  1.0,
  1.8,
  1.0,
  0.0,
  0.0,
  10.0,
  0.6,
  174.0,
  4.0,
  1.0,
  154.0,
  550.0,
  2678.0,
  16.0,
  5.0,
  13.0,
  1.0,
  0.0,
  1.0],
 [1807.0,
  1.0,
  2.1,
  0.0,
  2.0,
  0.0,
  49.0,
  0.8,
  125.0,
  1.0,
  10.0,
  337.0,
  1384.0,
  1906.0,
  17.0,
  13.0,
  13.0,
  0.0,
  1.0,
  1.0],
 [1086.0,
  1.0,
  1.7,
  1.0,
  0.0,
  1.0,
  43.0,
  0.2,
  111.0,
  6.0,
  1.0,
  56.0,
  1150.0,
  3285.0,
  11.0,
  5.0,
  17.0,
  1.0,
  1.0,
  0.0]]

In [38]:
# Send the first five test rows (features only) to the endpoint and print the predictions.
sample_rows = testX[features].iloc[:5].to_numpy().tolist()
print(predictor.predict(sample_rows))

[3 0 2 1 3]


In [39]:
# Tear the endpoint down so it stops incurring costs.
# NOTE(review): only the endpoint is removed here; the endpoint config and model
# created during deploy are not deleted by this call — confirm whether that is intended.
delete_request = {"EndpointName": endpoint_name}
boto3_sm.delete_endpoint(**delete_request)

{'ResponseMetadata': {'RequestId': 'fa2324cc-cf8f-40b6-b640-41ef63a3df6e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'fa2324cc-cf8f-40b6-b640-41ef63a3df6e',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Wed, 29 May 2024 20:50:52 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}