In [5]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

# Create a Boto3 session pinned to the target region and wrap it in a SageMaker session
boto_session = boto3.Session(region_name="eu-central-1")
sess = sagemaker.Session(boto_session=boto_session)

# The region is read back from the session; the bucket name is fixed for this notebook
region = sess.boto_session.region_name
bucket = 'mobile-sagemaker1'

# Validate bucket connectivity. The failure is reported and then re-raised so that
# "Run All" stops here instead of failing later in the upload/training cells with a
# less obvious error.
s3_client = boto_session.client('s3')
try:
    s3_client.head_bucket(Bucket=bucket)
    print(f"Successfully connected to bucket: {bucket}")
except Exception as e:
    print(f"Error connecting to bucket {bucket}: {e}")
    raise

print("Using Bucket:", bucket)
print("Bucket Region:", region)

Successfully connected to bucket: mobile-sagemaker1
Using Bucket: mobile-sagemaker1
Bucket Region: eu-central-1


In [6]:
# Read the dataset from the notebook's working directory
df = pd.read_csv(filepath_or_buffer="train.csv")

# Preview the first five rows
df.head(5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [3]:
# PipelineSession is not imported anywhere else in this notebook; import it here so
# the cell does not raise NameError on a fresh kernel.
from sagemaker.workflow.pipeline_context import PipelineSession

# Initialize AWS and SageMaker session details
# NOTE(review): this rebinds `region` to the default boto3 session's region, which
# may differ from the region pinned in the first cell — confirm this is intended.
region = boto3.Session().region_name
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()

# Create a pipeline session
pipeline_session = PipelineSession()

In [7]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(2000, 21)

In [8]:
# Column labels of the loaded frame
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [9]:
# Collect every column label into a plain Python list and display it
features = df.columns.tolist()
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [10]:
# The target is the last column of the frame; every other column is a feature.
# Both names are derived from `df` instead of popping from `features`, so re-running
# this cell cannot strip an additional column off the feature list.
label = df.columns[-1]
features = [column for column in df.columns if column != label]
label

'price_range'

In [11]:
# Separate the feature columns from the target column
x = df.loc[:, features]
y = df.loc[:, label]

In [12]:
# Shapes of the feature frame and the target series
x.shape,y.shape

((2000, 20), (2000,))

In [13]:
# Preview the first rows of the feature frame
x.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [14]:
# Preview the first values of the target series
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [15]:
# Split features and target into train/test partitions (15% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=0,
)


In [16]:
# Report the shape and container type of each partition, in the order
# train features, train target, test features, test target
for partition in (X_train, y_train, X_test, y_test):
    print(partition.shape, type(partition))

(1700, 20) <class 'pandas.core.frame.DataFrame'>
(1700,) <class 'pandas.core.series.Series'>
(300, 20) <class 'pandas.core.frame.DataFrame'>
(300,) <class 'pandas.core.series.Series'>


In [17]:
# Attach the target column to each feature partition so the exported CSVs carry the
# label as their last column. DataFrame.assign returns a new frame, leaving
# X_train / X_test untouched; wrapping them in pd.DataFrame(...) and assigning a
# column does not copy and may alias the split frames (or emit a
# SettingWithCopyWarning) depending on the pandas version.
trainX = X_train.assign(**{label: y_train})

testX = X_test.assign(**{label: y_test})


In [18]:
# Persist both partitions as CSV files without the index column
for frame, csv_name in ((trainX, "train-V-1.csv"), (testX, "test-V-1.csv")):
    frame.to_csv(csv_name, index=False)

In [19]:
# Push both CSV files to the same bucket and key prefix
sk_prefix = "sagemaker/sklearncontainer"
upload_target = {"bucket": bucket, "key_prefix": sk_prefix}

trainpath = sess.upload_data(path="train-V-1.csv", **upload_target)
testpath = sess.upload_data(path="test-V-1.csv", **upload_target)

# upload_data returns the resulting S3 URI of each file
print("Training data uploaded to S3:", trainpath)
print("Test data uploaded to S3:", testpath)

Training data uploaded to S3: s3://mobile-sagemaker1/sagemaker/sklearncontainer/train-V-1.csv
Test data uploaded to S3: s3://mobile-sagemaker1/sagemaker/sklearncontainer/test-V-1.csv


In [20]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import argparse
import os
import pandas as pd


def model_fn(model_dir):
    """Deserialize the estimator stored as ``model.joblib`` inside ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file.

    Returns:
        The object returned by ``joblib.load`` for that file.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)


if __name__ == "__main__":
    # Imported locally so the version banner below reports scikit-learn itself
    # (the banner previously printed joblib's version under the "SKLearn" label).
    import sklearn

    print("[INFO] Extracting arguments...")

    # Parse arguments. Directory arguments default to the SM_MODEL_DIR,
    # SM_CHANNEL_TRAIN and SM_CHANNEL_TEST environment variables; the defaults are
    # None when those variables are not set.
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")
    # parse_known_args tolerates extra command-line arguments instead of erroring
    args, _ = parser.parse_known_args()

    print(f"SKLearn Version: {sklearn.__version__}")
    print("[INFO] Reading data...")

    # Load datasets
    train_path = os.path.join(args.train, args.train_file)
    test_path = os.path.join(args.test, args.test_file)
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Extract features and labels: the last column of the training file is the
    # label, every preceding column is a feature. The test file is indexed with
    # the same column names.
    features = train_df.columns[:-1]
    label = train_df.columns[-1]
    X_train, y_train = train_df[features], train_df[label]
    X_test, y_test = test_df[features], test_df[label]

    print(f"Training data shape: {X_train.shape}, {y_train.shape}")
    print(f"Testing data shape: {X_test.shape}, {y_test.shape}")
    print("Training RandomForest model...")

    # Train model
    model = RandomForestClassifier(
        n_estimators=args.n_estimators, random_state=args.random_state, verbose=1
    )
    model.fit(X_train, y_train)

    # Save model under the file name that model_fn loads
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print(f"Model saved at: {model_path}")

    # Evaluate model on the held-out test file
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_report = classification_report(y_test, y_pred_test)

    print(f"Test Accuracy: {test_acc}")
    print("Classification Report:\n", test_report)


Writing script.py


In [21]:
import boto3
from sagemaker import Session
from sagemaker.sklearn.estimator import SKLearn


In [None]:
# Training/deployment configuration: AWS region, S3 bucket, and the IAM role
# returned by sagemaker.get_execution_role()
import sagemaker

region_name, bucket_name = "eu-central-1", "mobile-sagemaker1"
role_arn = sagemaker.get_execution_role()

print(region_name, bucket_name, role_arn)

eu-central-1 mobile-sagemaker1 arn:aws:iam::011528279330:role/service-role/AmazonSageMaker-ExecutionRole-20241211T170582


In [28]:
# Build a region-pinned Boto3 session and a SageMaker session that uses
# bucket_name as its default bucket
boto_session = boto3.Session(region_name=region_name)
sagemaker_session = Session(
    boto_session=boto_session,
    default_bucket=bucket_name,
)

# Echo the configured region and bucket (no remote call is made here)
for config_line in (f"Region: {region_name}", f"Bucket: {bucket_name}"):
    print(config_line)


Region: eu-central-1
Bucket: mobile-sagemaker1


In [29]:
# Version string handed to the SKLearn estimator and model as framework_version
FRAMEWORK_VERSION = "0.23-1"

# Train and test files live under the same key prefix in the bucket
train_prefix = test_prefix = "sagemaker/sklearncontainer"

# Full S3 URIs of the two CSV files
train_path = f"s3://{bucket_name}/{train_prefix}/train-V-1.csv"
test_path = f"s3://{bucket_name}/{test_prefix}/test-V-1.csv"

# Show the URIs so they can be checked against the upload cell's output
print(f"Train Path: {train_path}")
print(f"Test Path: {test_path}")



Train Path: s3://mobile-sagemaker1/sagemaker/sklearncontainer/train-V-1.csv
Test Path: s3://mobile-sagemaker1/sagemaker/sklearncontainer/test-V-1.csv


In [30]:
# Hyperparameters matching the --n_estimators / --random_state options that
# script.py's argument parser defines
rf_hyperparameters = {"n_estimators": 100, "random_state": 0}

# Configure the training job: script.py on a single ml.m5.large spot instance,
# with max_run=3600 and max_wait=7200
sklearn_estimator = SKLearn(
    entry_point="script.py",  # must exist in the current directory
    framework_version=FRAMEWORK_VERSION,
    role=role_arn,
    sagemaker_session=sagemaker_session,
    base_job_name="RF-custom-sklearn",
    instance_type="ml.m5.large",
    instance_count=1,
    hyperparameters=rf_hyperparameters,
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
)

# Show the estimator object to confirm it was constructed
print(sklearn_estimator)


<sagemaker.sklearn.estimator.SKLearn object at 0x7fc5881528d0>


In [31]:
# Map each input channel name to the S3 URI of its CSV file
input_channels = {"train": train_path, "test": test_path}

# Start the training job and block until it finishes (wait=True)
print("[INFO] Launching SageMaker training job...")
sklearn_estimator.fit(input_channels, wait=True)
print("[INFO] Training job complete.")


INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-12-11-16-44-40-630


[INFO] Launching SageMaker training job...
2024-12-11 16:44:41 Starting - Starting the training job...
2024-12-11 16:44:55 Starting - Preparing the instances for training...
2024-12-11 16:45:22 Downloading - Downloading input data...
2024-12-11 16:45:47 Downloading - Downloading the training image...
2024-12-11 16:46:38 Training - Training image download completed. Training in progress.
2024-12-11 16:46:38 Uploading - Uploading generated training model[34m2024-12-11 16:46:31,749 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-12-11 16:46:31,751 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-12-11 16:46:31,791 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-12-11 16:46:31,940 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-12-11 16:46:31,952 sagemaker-training-toolkit INFO     No GPUs detected (

In [32]:
# Wait for the training job to complete (logs="None": do not stream job logs)
sklearn_estimator.latest_training_job.wait(logs="None")

# Retrieve the S3 path for the trained model artifact. The lookup goes through the
# same region-pinned `sagemaker_session` the estimator was created with, rather than
# a fresh default Session whose region may differ from the one the job ran in.
training_job_name = sklearn_estimator.latest_training_job.name
artifact = sagemaker_session.describe_training_job(training_job_name)[
    "ModelArtifacts"
]["S3ModelArtifacts"]

# Print the artifact location
print(f"Model artifact persisted at: {artifact}")



2024-12-11 16:46:51 Starting - Preparing the instances for training
2024-12-11 16:46:51 Downloading - Downloading the training image
2024-12-11 16:46:51 Training - Training image download completed. Training in progress.
2024-12-11 16:46:51 Uploading - Uploading generated training model
2024-12-11 16:46:51 Completed - Training job completed
Model artifact persisted at: s3://mobile-sagemaker1/RF-custom-sklearn-2024-12-11-16-44-40-630/output/model.tar.gz


In [None]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Generate a unique, timestamped model name
model_name = f"custom-sklearn-model-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# Create the SKLearnModel from the training artifact. The region-pinned
# `sagemaker_session` is passed explicitly so that model creation and deployment use
# the same region and default bucket as the training job instead of falling back to
# a default session.
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role=sagemaker.get_execution_role(),  # Dynamically retrieve the execution role
    entry_point="script.py",  # Inference entry point; provides model_fn
    framework_version=FRAMEWORK_VERSION,
    sagemaker_session=sagemaker_session,
)

print(f"[INFO] Model created with name: {model_name}")


[INFO] Model created with name: custom-sklearn-model-2024-12-11-17-41-43


In [34]:
from time import gmtime, strftime

# Timestamped endpoint name, generated at the moment this cell runs
endpoint_name = f"custom-sklearn-model-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"
print(f"[INFO] Endpoint Name: {endpoint_name}")

# Host the model on a single ml.m4.xlarge instance behind that endpoint
deploy_options = {
    "initial_instance_count": 1,
    "instance_type": "ml.m4.xlarge",
    "endpoint_name": endpoint_name,
}
predictor = model.deploy(**deploy_options)

print(f"[INFO] Model deployed to endpoint: {endpoint_name}")


[INFO] Endpoint Name: custom-sklearn-model-2024-12-11-17-45-03


INFO:sagemaker:Creating model with name: custom-sklearn-model-2024-12-11-17-41-43
INFO:sagemaker:Creating endpoint-config with name custom-sklearn-model-2024-12-11-17-45-03
INFO:sagemaker:Creating endpoint with name custom-sklearn-model-2024-12-11-17-45-03


-----![INFO] Model deployed to endpoint: custom-sklearn-model-2024-12-11-17-45-03


In [35]:
# Display the predictor object returned by model.deploy
predictor

<sagemaker.sklearn.model.SKLearnPredictor at 0x7fc582a795d0>

In [36]:
# First two test rows, feature columns only, as nested Python lists
testX[features].iloc[0:2].to_numpy().tolist()

[[1454.0,
  1.0,
  0.5,
  1.0,
  1.0,
  0.0,
  34.0,
  0.7,
  83.0,
  4.0,
  3.0,
  250.0,
  1033.0,
  3419.0,
  7.0,
  5.0,
  5.0,
  1.0,
  1.0,
  0.0],
 [1092.0,
  1.0,
  0.5,
  1.0,
  10.0,
  0.0,
  11.0,
  0.5,
  167.0,
  3.0,
  14.0,
  468.0,
  571.0,
  737.0,
  14.0,
  4.0,
  11.0,
  0.0,
  1.0,
  0.0]]

In [38]:
# Send the first two test rows (feature columns only) to the endpoint
sample_rows = testX[features][0:2].values.tolist()
sample_predictions = predictor.predict(sample_rows)
print(sample_predictions)

[3 0]


In [40]:
# Initialize the SageMaker client from the region-pinned boto session so the delete
# call targets the region the endpoint was created in, rather than whatever region
# the default boto3 configuration resolves to.
sm_boto3 = boto_session.client("sagemaker")

# Delete the endpoint; a failure is reported instead of raised so the cell still
# completes (cleanup is best-effort).
try:
    sm_boto3.delete_endpoint(EndpointName=endpoint_name)
    print(f"[INFO] Endpoint {endpoint_name} deleted successfully.")
except Exception as e:
    print(f"[ERROR] Failed to delete endpoint {endpoint_name}: {e}")


[INFO] Endpoint custom-sklearn-model-2024-12-11-17-45-03 deleted successfully.
