In [1]:
import pandas as pd
import numpy as np
import sagemaker
import boto3
from sklearn.model_selection import train_test_split

print("Libraries imported successfully!")

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/abiha07/Library/Application Support/sagemaker/config.yaml
Libraries imported successfully!


In [None]:
# Create a SageMaker session
sess = sagemaker.Session()

# Get your region (should be us-east-1)
region = sess.boto_region_name
print(f"Region: {region}")

bucket_name = "portfolio-mobile-price-zuhair-31" 

# Create the S3 bucket
s3 = boto3.client('s3', region_name=region)

try:
    if region == 'us-east-1':
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': region}
        )
    print(f"Bucket '{bucket_name}' created successfully.")
except Exception as e:
    print(f"Bucket info: {e}")

Region: us-east-1
Bucket 'portfolio-mobile-price-zuhair-31' created successfully.


In [4]:
# Load raw data
df = pd.read_csv('mob_price_classification_train.csv')

# 'price_range' is our target variable. 
# We split features (X) and label (y)
features = list(df.columns)
label = features.pop(-1) # Remove the last column 'price_range'

x = df[features]
y = df[label]

# Split: 85% Train, 15% Test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=0)

# Re-merge so we can save as CSV
trainX = pd.DataFrame(x_train)
trainX[label] = y_train

testX = pd.DataFrame(x_test)
testX[label] = y_test

# Organize columns: Target must be the last column for this specific sklearn script approach
print("Train Shape:", trainX.shape)
print("Test Shape:", testX.shape)

Train Shape: (1700, 21)
Test Shape: (300, 21)


In [14]:
import os

# Create local folders
os.makedirs('data/train', exist_ok=True)
os.makedirs('data/test', exist_ok=True)

# Save locally (headers=True, index=False is standard for custom scripts)
trainX.to_csv('data/train/train.csv', index=False, header=False)
testX.to_csv('data/test/test.csv', index=False, header=False)

# Upload to S3
prefix = 'mobile_price_classification'

train_path = sess.upload_data(
    path='data/train/train.csv', bucket=bucket_name, key_prefix=f'{prefix}/train'
)

test_path = sess.upload_data(
    path='data/test/test.csv', bucket=bucket_name, key_prefix=f'{prefix}/test'
)

print(f"Train data uploaded to: {train_path}")
print(f"Test data uploaded to: {test_path}")

Train data uploaded to: s3://portfolio-mobile-price-zuhair-31/mobile_price_classification/train/train.csv
Test data uploaded to: s3://portfolio-mobile-price-zuhair-31/mobile_price_classification/test/test.csv


In [7]:
from sagemaker.sklearn.estimator import SKLearn

# === PASTE YOUR IAM ROLE ARN HERE ===
role = "arn:aws:iam::065173207553:role/SageMaker-Execution-Role-Portfolio"

print(f"Using Role: {role}")

Using Role: arn:aws:iam::065173207553:role/SageMaker-Execution-Role-Portfolio


In [15]:
sklearn_estimator = SKLearn(
    entry_point='script.py',          # The script we just wrote
    role=role,                        # The IAM role
    instance_count=1,                 # Number of machines
    instance_type='ml.m4.xlarge',      # Type of machine
    framework_version='1.2-1',        # Scikit-learn version
    base_job_name='rf-custom-sklearn',# Name of the job on dashboard
    hyperparameters={
        'n_estimators': 100,
        'random_state': 0
    }
)

In [16]:
# Launch training job (this is asynchronous, logs will print here)
sklearn_estimator.fit({'train': train_path, 'test': test_path})

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: rf-custom-sklearn-2025-11-26-07-25-39-152


2025-11-26 07:25:44 Starting - Starting the training job...
2025-11-26 07:26:07 Starting - Preparing the instances for training...
2025-11-26 07:26:46 Downloading - Downloading the training image......
  import pkg_resources
2025-11-26 07:28:07,404 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2025-11-26 07:28:07,408 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-11-26 07:28:07,411 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2025-11-26 07:28:07,425 sagemaker_sklearn_container.training INFO     Invoking user training script.
2025-11-26 07:28:07,666 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-11-26 07:28:07,670 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2025-11-26 07:28:07,689 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-11-26 07:28:

In [17]:
# Deploy the model to an endpoint
# We use ml.m4.xlarge (same as training) to minimize quota issues
predictor = sklearn_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge'
)

print("Endpoint deployed!")

INFO:sagemaker:Creating model with name: rf-custom-sklearn-2025-11-26-07-33-30-139
INFO:sagemaker:Creating endpoint-config with name rf-custom-sklearn-2025-11-26-07-33-30-139
INFO:sagemaker:Creating endpoint with name rf-custom-sklearn-2025-11-26-07-33-30-139


-------!Endpoint deployed!


In [18]:
# Prepare some test data (Taking the first 2 rows of our test set)
# We drop the first column because that corresponds to the label (price_range)
test_features = testX.iloc[0:2, 1:].values

print("Sending data to endpoint...")
prediction = predictor.predict(test_features)

print(f"Prediction: {prediction}")
# Expected output: Something like [3 0] or [1 2] (Predicted Price Ranges)

Sending data to endpoint...
Prediction: [1766 1189]


In [19]:
# DELETE THE ENDPOINT to stop billing
predictor.delete_endpoint()
print("Endpoint deleted successfully.")

INFO:sagemaker:Deleting endpoint configuration with name: rf-custom-sklearn-2025-11-26-07-33-30-139
INFO:sagemaker:Deleting endpoint with name: rf-custom-sklearn-2025-11-26-07-33-30-139


Endpoint deleted successfully.
