## Mobile Price Classification

In [1]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

# Resolve the SageMaker session, S3 bucket, execution role and region once,
# so every later cell talks to the same account and region.
try:
    # Inside a SageMaker notebook everything is discoverable from the environment.
    sess = sagemaker.Session()
    bucket = sess.default_bucket()
    role = sagemaker.get_execution_role()
    region = sess.boto_session.region_name
except ValueError:
    # Outside SageMaker get_execution_role() raises ValueError: configure manually.
    print("Setting role and SageMaker session manually...")
    bucket = "sagemakerbucketedurekademo"
    region = "ap-south-1"

    # Pin the default boto3 session to `region` BEFORE creating any client;
    # clients created earlier would silently keep the profile's own region.
    boto3.setup_default_session(region_name=region, profile_name="default")

    iam = boto3.client("iam")
    sagemaker_client = boto3.client("sagemaker")

    sagemaker_execution_role_name = "aws_sagemaker_ececution"  # Change this to your role name
    role = iam.get_role(RoleName=sagemaker_execution_role_name)["Role"]["Arn"]
    sess = sagemaker.Session(sagemaker_client=sagemaker_client, default_bucket=bucket)

# Low-level SageMaker client in the SAME region as the session; a hard-coded
# region here makes later describe/delete calls miss resources created elsewhere.
sm_boto3 = boto3.client("sagemaker", region_name=region)

print("Using bucket " + bucket)
print(region)
print(role)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/asishdash/Library/Application Support/sagemaker/config.yaml


Couldn't call 'get_role' to get Role ARN from role name demollm to get Role path.


Setting role and SageMaker session manually...
Using bucket sagemakerbucketedurekademo
ap-south-1
arn:aws:iam::996431963446:role/aws_sagemaker_ececution


In [2]:
# Load the training CSV from the notebook's working directory.
df = pd.read_csv("mob_price_classification_train.csv")

In [3]:

# Class balance: share of rows per price_range value.
df['price_range'].value_counts(normalize=True)

price_range
1    0.25
2    0.25
3    0.25
0    0.25
Name: proportion, dtype: float64

In [4]:
# Missing values: percentage of null entries in each column.
df.isnull().mean() * 100

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [5]:
# All column names, in file order (the label column is still included here).
features = df.columns.tolist()
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [6]:
# The label is the last column of the dataset; every other column is a feature.
# Both names are derived from `df` rather than popped from `features`, so
# re-running this cell cannot silently drop a real feature.
label = df.columns[-1]
features = [column for column in df.columns if column != label]
label

'price_range'

In [7]:
# Separate the predictor columns from the target column.
x, y = df[features], df[label]

In [8]:
# Absolute number of rows in each target class.
y.value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1010,
)

In [11]:
# Re-attach the label as the last column of each split so the training script
# can read features and target from a single CSV.
# `assign` returns a new frame, so X_train / X_test are never modified
# (pd.DataFrame(X_train) followed by column assignment can alias the original
# frame on older pandas versions).
trainX = X_train.assign(**{label: y_train})

testX = X_test.assign(**{label: y_test})

In [12]:
# Sanity check: (rows, columns) of the train and test frames, one per line.
print(trainX.shape, testX.shape, sep="\n")

(1400, 21)
(600, 21)


In [13]:
# Persist both splits locally (without the index) so they can be uploaded to S3.
for split_frame, csv_name in ((trainX, "train_V1.csv"), (testX, "test_V1.csv")):
    split_frame.to_csv(csv_name, index=False)

In [14]:
# Display the S3 bucket name currently configured for this notebook.
bucket

'sagemakerbucketedurekademo'

In [15]:
# Stage both CSV splits in S3 under one key prefix; SageMaker training jobs
# read their input channels from S3.
sk_prefix = "sagemaker/edureka/mobile_price_classification/sklearncontainer"
trainpath, testpath = (
    sess.upload_data(path=local_csv, bucket=bucket, key_prefix=sk_prefix)
    for local_csv in ("train_V1.csv", "test_V1.csv")
)
print(trainpath, testpath, sep="\n")

s3://sagemakerbucketedurekademo/sagemaker/edureka/mobile_price_classification/sklearncontainer/train_V1.csv
s3://sagemakerbucketedurekademo/sagemaker/edureka/mobile_price_classification/sklearncontainer/test_V1.csv


In [16]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    """Load and return the model stored as ``model.joblib`` inside ``model_dir``."""
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)
    
if __name__ == "__main__":
    # Training entry point: parse arguments, read the train/test CSVs, fit a
    # random forest, persist it, then report metrics on the test split.

    print("[INFO] Extracting arguments")
    cli = argparse.ArgumentParser()

    # Hyperparameters arrive as command-line arguments.
    cli.add_argument("--n_estimators", type=int, default=150)
    cli.add_argument("--random_state", type=int, default=1010)

    # Model directory and data channels default to the SM_* environment variables.
    env = os.environ
    cli.add_argument("--model-dir", type=str, default=env.get("SM_MODEL_DIR"))
    cli.add_argument("--train", type=str, default=env.get("SM_CHANNEL_TRAIN"))
    cli.add_argument("--test", type=str, default=env.get("SM_CHANNEL_TEST"))
    cli.add_argument("--train-file", type=str, default="train_V1.csv")
    cli.add_argument("--test-file", type=str, default="test_V1.csv")

    # Unknown extra arguments are tolerated and discarded.
    args, _unknown = cli.parse_known_args()

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data", end="\n\n")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last column of the training file is the label; the rest are features.
    features = train_df.columns.tolist()
    label = features.pop()

    print("Building training and testing datasets", end="\n\n")
    X_train, y_train = train_df[features], train_df[label]
    X_test, y_test = test_df[features], test_df[label]

    print('Column order: ')
    print(features, end="\n\n")

    print("Label column is: ", label)
    print()

    print("Data Shape: ", end="\n\n")
    split_report = (
        ("---- TRAINING DATA (70%) ----", X_train, y_train),
        ("---- TESTING DATA (30%) ----", X_test, y_test),
    )
    for banner, X_part, y_part in split_report:
        print(banner)
        print(X_part.shape)
        print(y_part.shape)
        print()

    print("Training RandomForest Model.....", end="\n\n")
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=3,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)
    print()

    # Persist the fitted model under the file name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path, end="\n\n")

    # Evaluate on the test split.
    predictions = model.predict(X_test)
    test_acc = accuracy_score(y_test, predictions)
    test_rep = classification_report(y_test, predictions)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----", end="\n\n")
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

Writing script.py


In [18]:
from sagemaker.sklearn.estimator import SKLearn

# `role` and `sess` were resolved once in the setup cell; reuse them instead of
# repeating the IAM lookup with a second hard-coded role name.
print(role)

FRAMEWORK_VERSION = "0.23-1"

# Configure the scikit-learn training job.
# The session is passed explicitly so the job uses the same region and bucket
# as the uploaded data (`region=` is not an estimator parameter and had no effect).
sklearn_estimator = SKLearn(
    entry_point="script.py",
    sagemaker_session=sess,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-edureka-sklearn",
    # Forwarded to script.py as --n_estimators / --random_state.
    hyperparameters={
        "n_estimators": 150,
        "random_state": 1010,
    },
    # Spot training: max_wait (seconds, includes waiting for capacity) must be
    # at least max_run (seconds of actual training).
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600,
)

Couldn't call 'get_role' to get Role ARN from role name demollm to get Role path.


arn:aws:iam::996431963446:role/aws_sagemaker_ececution


In [20]:
# Launch the training job on the uploaded train/test channels.
# wait=True makes this call block until the job finishes (it is not asynchronous).
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: RF-edureka-sklearn-2024-06-29-17-05-12-011


ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: Resource limits for this account have been exceeded. Please contact Customer Support for assistance.

In [84]:
# Block until the latest training job is done, without streaming its logs.
sklearn_estimator.latest_training_job.wait(logs="None")

# Query the job through the estimator's own SageMaker client so the lookup
# always happens in the region where the job was created.
training_client = sklearn_estimator.sagemaker_session.sagemaker_client
artifact = training_client.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2024-06-28 14:55:31 Starting - Preparing the instances for training
2024-06-28 14:55:31 Downloading - Downloading the training image
2024-06-28 14:55:31 Training - Training image download completed. Training in progress.
2024-06-28 14:55:31 Uploading - Uploading generated training model
2024-06-28 14:55:31 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-434911885902/RF-edureka-sklearn-2024-06-28-14-53-02-101/output/model.tar.gz


In [85]:
# Display the S3 URI of the trained model artifact.
artifact

's3://sagemaker-us-east-1-434911885902/RF-edureka-sklearn-2024-06-28-14-53-02-101/output/model.tar.gz'

In [89]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Wrap the trained artifact in a deployable model; the UTC timestamp keeps the name unique.
model_name = strftime("RF-edureka-model-%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    model_data=artifact,
    role=role,
    entry_point="script.py",  # supplies model_fn for loading the model at inference time
    framework_version=FRAMEWORK_VERSION,
    name=model_name,
)

In [90]:
# Display the generated model name.
model_name

'RF-edureka-model-2024-06-28-15-00-46'

In [91]:
## Endpoint deployment
# The endpoint gets its own UTC-timestamped name, generated at deploy time.
endpoint_name = strftime("RF-edureka-model-%Y-%m-%d-%H-%M-%S", gmtime())
print(f"EndpointName={endpoint_name}")

# Deploy the model behind a single-instance real-time endpoint.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

EndpointName=RF-edureka-model-2024-06-28-15-01-04


INFO:sagemaker:Creating model with name: RF-edureka-model-2024-06-28-15-00-46
INFO:sagemaker:Creating endpoint-config with name RF-edureka-model-2024-06-28-15-01-04
INFO:sagemaker:Creating endpoint with name RF-edureka-model-2024-06-28-15-01-04


-----

In [None]:
# Display the name of the deployed endpoint.
endpoint_name

In [None]:
# First two test rows (feature columns only) as plain nested lists.
testX[features].head(2).to_numpy().tolist()

In [None]:
# Send the first two test rows to the endpoint and print the predictions.
sample_rows = testX[features].head(2).to_numpy().tolist()
print(predictor.predict(sample_rows))

In [None]:
# Tear down the endpoint so it stops incurring cost.
# Use the predictor's own SageMaker client: it targets the region the endpoint
# was created in, whereas a separately configured client may point elsewhere
# and leave the endpoint running.
predictor.sagemaker_session.sagemaker_client.delete_endpoint(EndpointName=endpoint_name)