In [21]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris # getting docket image, which contains the algorithm 
from sagemaker.inputs import TrainingInput

import io
import boto3
from datetime import datetime

# One SageMaker session is shared by the SDK calls in this notebook
sagemaker_session = sagemaker.Session()

# IAM role that training jobs and endpoints will assume. It is resolved once,
# against the session above (the original resolved the same role twice).
role = get_execution_role(sagemaker_session=sagemaker_session)

# Low-level AWS handles: the notebook's region, a SageMaker client and an S3 client
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name="sagemaker", region_name=region)
s3 = boto3.client('s3')


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [35]:
# Environment setup: S3 locations, training configuration and the job name
bucket = 'project508data'
train_data_prefix = 'train'
val_data_prefix = 'validation'
train_data_uri = f's3://{bucket}/{train_data_prefix}/'

# S3 prefixes for the four input channels: images plus their ground-truth listings
train_data, val_data, train_truth, val_truth = (
    "s3://{}/{}/".format(bucket, channel_prefix)
    for channel_prefix in (
        train_data_prefix,
        val_data_prefix,
        "train_groundtruth",
        "validation_groundtruth",
    )
)

# Training configuration
n_classes = 3
n_samples = 2700
n_epochs = 10
batch_size = 100  # images processed per weight update
train_instance_type = 'ml.p2.xlarge'
job_name_prefix = "train-mod5-"
# NOTE(review): model output is written beneath the training-data prefix — confirm that is intended
s3_output_path = "s3://{}/{}/{}".format(bucket, train_data_prefix, "model_output")

# Job name = prefix + current local time formatted as YYYY-MM-DD-HH-MM-SS
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
job_name = job_name_prefix + timestamp
print(job_name)

train-mod5-2024-04-04-21-36-16


In [23]:
# Look up the container image of SageMaker's built-in image-classification
# algorithm for this region; training later runs inside that image.
image_lookup = {
    "framework": "image-classification",
    "region": region,
    "image_scope": "training",
    "version": "latest",
}
train_image_uri = sagemaker.image_uris.retrieve(**image_lookup)
print(train_image_uri)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


811284229777.dkr.ecr.us-east-1.amazonaws.com/image-classification:1


In [24]:
# Estimator that runs the built-in image-classification container.
# NOTE(review): despite the variable name, nothing in this cell uses SageMaker Autopilot.
autopilot_estimator = sagemaker.estimator.Estimator(
    # what to run, and with which identity/session
    image_uri=train_image_uri,
    role=role,
    sagemaker_session=sagemaker_session,
    # compute resources
    instance_type=train_instance_type,
    instance_count=1,
    volume_size=50,
    # upper bound on job run time (the SDK documents this in seconds: 360000 s = 100 h)
    max_run=360000,
    # how data is delivered and where artifacts are written
    input_mode="File",
    output_path=s3_output_path,
)



**`num_layers`** — the number of layers for the network. For data with large image size (for example, 224x224 - like ImageNet), we suggest selecting the number of layers from the set [18, 34, 50, 101, 152, 200]. For data with small image size (for example, 28x28 - like CIFAR), we suggest selecting the number of layers from the set [20, 32, 44, 56, 110]. The number of layers in each set is based on the ResNet paper. For transfer learning, the number of layers defines the architecture of the base network and hence can only be selected from the set [18, 34, 50, 101, 152, 200].

In [25]:
# Hyperparameters for the image-classification training job
autopilot_estimator.set_hyperparameters(
    # dataset description (sizes come from the environment-setup cell)
    num_classes=n_classes,
    num_training_samples=n_samples,
    image_shape="3,224,224",
    resize=256,
    multi_label=0,
    # network: 18 layers, starting from a pretrained model
    num_layers=18,
    use_pretrained_model=1,
    # optimisation
    epochs=n_epochs,
    mini_batch_size=batch_size,
    learning_rate=0.001,
    use_weighted_loss=1,
    precision_dtype="float32",
    # training-time image augmentation
    augmentation_type="crop_color_transform",
)

In [26]:
def _s3_channel(s3_uri, content_type):
    """Build a fully replicated, prefix-based training channel for ``s3_uri``."""
    return sagemaker.inputs.TrainingInput(
        s3_uri,
        distribution="FullyReplicated",
        content_type=content_type,
        s3_data_type="S3Prefix",
    )


# NOTE(review): the algorithm's documentation uses "application/x-image" for
# image-format channels — confirm the content types below are accepted as intended.

# Image channels
train_imgs = _s3_channel(train_data, "application/jpeg")
val_imgs = _s3_channel(val_data, "application/jpeg")

# Ground-truth listing channels
train_tru = _s3_channel(train_truth, "text/plain")
val_tru = _s3_channel(val_truth, "text/plain")

# Channel name -> input, as handed to the estimator's fit()
data_channels = {
    "train": train_imgs,
    "validation": val_imgs,
    "train_lst": train_tru,
    "validation_lst": val_tru,
}
    

In [27]:
# Rebuild the job name from the current local time (YYYY-MM-DD-HH-MM-SS)
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
job_name = f"{job_name_prefix}{timestamp}"
print(job_name)

train-mod2-2024-04-04-20-36-18


In [28]:
# Echo the ground-truth training channel object as a quick check that it exists
train_tru

<sagemaker.inputs.TrainingInput at 0x7f35ff81d3c0>

In [36]:
# Start the training job on the configured channels under the generated job
# name, with log output enabled
autopilot_estimator.fit(
    inputs=data_channels,
    job_name=job_name,
    logs=True,
)

INFO:sagemaker:Creating training-job with name: train-mod5-2024-04-04-21-36-16


2024-04-04 21:36:20 Starting - Starting the training job...
2024-04-04 21:36:47 Starting - Preparing the instances for training......
2024-04-04 21:37:23 Downloading - Downloading input data...
2024-04-04 21:38:03 Downloading - Downloading the training image........................
2024-04-04 21:41:59 Training - Training image download completed. Training in progress.....[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34mNvidia gpu devices, drivers and cuda toolkit versions (only available on hosts with GPU):[0m
[34mThu Apr  4 21:42:48 2024       [0m
[34m+-----------------------------------------------------------------------------+[0m
[34m| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |[0m
[34m|-------------------------------+----------------------+----------------------+[0m
[34m| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |[0m
[34m| Fan  Te

## Deploying final model so we can run inference

In [39]:
import boto3
import sagemaker
from sagemaker.model import Model
from sagemaker.predictor import Predictor

# Session and execution role used for the deployment
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Training job whose artifacts get deployed
training_job_name = 'train-mod5-2024-04-04-21-36-16'

# Ask SageMaker where that job stored its model artifacts
training_job_description = sagemaker_session.describe_training_job(training_job_name)
model_s3_path = training_job_description['ModelArtifacts']['S3ModelArtifacts']

# Model = trained artifacts + the image-classification container image
model = Model(
    image_uri='811284229777.dkr.ecr.us-east-1.amazonaws.com/image-classification:1',
    model_data=model_s3_path,
    role=role,
    sagemaker_session=sagemaker_session,
    predictor_cls=Predictor,
)

# Host the model on a single instance behind a named endpoint
predictor = model.deploy(
    endpoint_name='Final-model-for-test-data',
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker:Creating model with name: image-classification-2024-04-04-22-03-42-351
INFO:sagemaker:Creating endpoint-config with name Final-model-for-test-data
INFO:sagemaker:Creating endpoint with name Final-model-for-test-data


--------------!

## Running model on test data

In [50]:
import boto3
import sagemaker

# Name of the endpoint created in the deployment step
endpoint_name = 'Final-model-for-test-data'

# S3 location of one sample test image
data_uri = 's3://project508data/test/x_1.jpeg'

# SageMaker session for this inference section
session = sagemaker.Session()

# Client-side handle used to send requests to the endpoint
predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [51]:
# Score the sample image named by `data_uri` so that `result` is defined when the
# notebook runs top to bottom; no earlier cell assigns it, so the bare print
# raised NameError on a fresh kernel.
sample_bucket, sample_key = data_uri[len("s3://"):].split("/", 1)
sample_bytes = boto3.client('s3').get_object(Bucket=sample_bucket, Key=sample_key)['Body'].read()

# Same request form as the batch loop later in the notebook: raw JPEG bytes, explicit content type
result = predictor.predict(sample_bytes, initial_args={'ContentType': 'image/jpeg'})
print(result)

b'[0.5426226258277893, 0.12491381913423538, 0.3324635326862335]'


In [49]:
import boto3
import json

# S3 client used to fetch the sample image
s3 = boto3.client('s3')

# Where the sample image lives: bucket, folder prefix and file name
bucket_name = 'project508data'
folder_path = 'test/'
image_file_name = 'x_1.jpeg'

# Fetch the object and read its whole body into memory
s3_object = s3.get_object(
    Bucket=bucket_name,
    Key=f"{folder_path}{image_file_name}",
)
image_bytes = s3_object['Body'].read()



In [None]:
import boto3
import json
import pandas as pd

# S3 client and location of the test images
s3 = boto3.client('s3')
bucket_name = 'project508data'
folder_path = 'test/'

# One record per image: {'Image': <S3 key>, 'Result': <raw endpoint response>}
results = []

# Walk the listing page by page so that every key under the prefix is visited
paginator = s3.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=folder_path)

for page in page_iterator:
    for obj in page.get('Contents', []):
        image_key = obj['Key']

        # Skip folder placeholder objects (keys ending in '/'). The original
        # sliced off the first object of *every* page instead, which also drops
        # a real image on any page that does not start with a placeholder.
        # NOTE(review): assumes the object previously skipped was the 'test/'
        # placeholder — confirm against the bucket contents.
        if image_key.endswith('/'):
            continue

        # Announce the key first so a failing image is identifiable from the log
        print(f"Processing image: {image_key}")

        # Download the image bytes
        s3_object = s3.get_object(Bucket=bucket_name, Key=image_key)
        image_bytes = s3_object['Body'].read()

        # Send the raw JPEG bytes to the endpoint
        prediction = predictor.predict(image_bytes, initial_args={'ContentType': 'image/jpeg'})

        results.append({'Image': image_key, 'Result': prediction})

# Explicit columns keep the frame well-formed even when nothing was listed
results_df = pd.DataFrame(results, columns=['Image', 'Result'])

print(results_df)

In [78]:
# Raw endpoint responses, one per image. Each is the bytes of a JSON array of
# class scores, e.g. b'[0.54, 0.12, 0.33]'.
predictions_lst = results_df['Result']

# Parse each response as JSON rather than splitting on ", " and stripping
# brackets by hand: this does not depend on the exact separator and avoids
# DataFrame.map, which only exists in pandas >= 2.1.
predictions_df = pd.DataFrame(
    [json.loads(raw_prediction) for raw_prediction in predictions_lst],
    columns=["Prediction_1", "Prediction_2", "Prediction_3"],
).astype(float)

# Display the DataFrame
print(predictions_df)

     Prediction_1  Prediction_2  Prediction_3
0        0.542623      0.124914      0.332464
1        0.097182      0.070166      0.832652
2        0.522068      0.021652      0.456280
3        0.828095      0.000998      0.170907
4        0.460717      0.013178      0.526105
..            ...           ...           ...
145      0.048304      0.513761      0.437935
146      0.090151      0.576187      0.333663
147      0.085937      0.529144      0.384919
148      0.073800      0.754168      0.172032
149      0.061862      0.754469      0.183669

[150 rows x 3 columns]


In [79]:
# Name of the column holding the highest score in each row. The search is
# limited to the score columns so this cell can be re-run after 'Max_Column'
# (a string column) has been added to predictions_df.
probability_columns = ["Prediction_1", "Prediction_2", "Prediction_3"]
predictions_df['Max_Column'] = predictions_df[probability_columns].idxmax(axis=1)

# Display the DataFrame
print(predictions_df)

     Prediction_1  Prediction_2  Prediction_3    Max_Column
0        0.542623      0.124914      0.332464  Prediction_1
1        0.097182      0.070166      0.832652  Prediction_3
2        0.522068      0.021652      0.456280  Prediction_1
3        0.828095      0.000998      0.170907  Prediction_1
4        0.460717      0.013178      0.526105  Prediction_3
..            ...           ...           ...           ...
145      0.048304      0.513761      0.437935  Prediction_2
146      0.090151      0.576187      0.333663  Prediction_2
147      0.085937      0.529144      0.384919  Prediction_2
148      0.073800      0.754168      0.172032  Prediction_2
149      0.061862      0.754469      0.183669  Prediction_2

[150 rows x 4 columns]


In [80]:
# Translate a score column name into its zero-based class index
def map_column_to_integer(column_name):
    """Return 0, 1 or 2 for 'Prediction_1', 'Prediction_2', 'Prediction_3'; None otherwise."""
    ordered_columns = ('Prediction_1', 'Prediction_2', 'Prediction_3')
    for class_index, known_name in enumerate(ordered_columns):
        if column_name == known_name:
            return class_index
    # Anything that is not one of the three score columns has no class index
    return None

# Convert each winning column name into its integer class label
predictions_df['Predicted_labels'] = predictions_df['Max_Column'].apply(map_column_to_integer)

# Show the frame with the new label column
print(predictions_df)

     Prediction_1  Prediction_2  Prediction_3    Max_Column  Predicted_labels
0        0.542623      0.124914      0.332464  Prediction_1                 0
1        0.097182      0.070166      0.832652  Prediction_3                 2
2        0.522068      0.021652      0.456280  Prediction_1                 0
3        0.828095      0.000998      0.170907  Prediction_1                 0
4        0.460717      0.013178      0.526105  Prediction_3                 2
..            ...           ...           ...           ...               ...
145      0.048304      0.513761      0.437935  Prediction_2                 1
146      0.090151      0.576187      0.333663  Prediction_2                 1
147      0.085937      0.529144      0.384919  Prediction_2                 1
148      0.073800      0.754168      0.172032  Prediction_2                 1
149      0.061862      0.754469      0.183669  Prediction_2                 1

[150 rows x 5 columns]


In [83]:
# Restore `test_df` from IPython's %store storage. No cell shown here creates
# it, so it must have been saved with `%store test_df` beforehand.
%store -r test_df
print(test_df)

     Label   Filename
0        0   x_1.jpeg
1        0  x_10.jpeg
2        0  x_11.jpeg
3        0  x_12.jpeg
4        0  x_13.jpeg
..     ...        ...
145      1  z_51.jpeg
146      1   z_6.jpeg
147      1   z_7.jpeg
148      1   z_8.jpeg
149      1   z_9.jpeg

[150 rows x 2 columns]


In [86]:
# Observed labels per file, and the predicted labels in inference order
col_1 = test_df[['Filename', 'Label']]
col_2 = predictions_df['Predicted_labels']

# Attach to every prediction the file it was computed for. results_df['Image']
# holds the full S3 key (e.g. 'test/x_1.jpeg'); only the base name is kept.
predicted_by_file = pd.DataFrame({
    'Filename': results_df['Image'].str.rsplit('/', n=1).str[-1],
    'Predicted_labels': col_2,
})

# Join on the file name instead of gluing the frames together by row position:
# a positional concat silently pairs labels with the wrong image whenever the
# S3 listing order differs from test_df's row order.
final_df = (
    col_1
    .merge(predicted_by_file, on='Filename', how='left', validate='one_to_one')
    .rename(columns={'Label': 'Observed_labels'})
)

# Every labelled test image must have received a prediction
unmatched = final_df['Predicted_labels'].isna()
assert not unmatched.any(), f"no prediction for: {final_df.loc[unmatched, 'Filename'].tolist()}"

# Display the new DataFrame
print(final_df)

      Filename  Observed_labels  Predicted_labels
0     x_1.jpeg                0                 0
1    x_10.jpeg                0                 2
2    x_11.jpeg                0                 0
3    x_12.jpeg                0                 0
4    x_13.jpeg                0                 2
..         ...              ...               ...
145  z_51.jpeg                1                 1
146   z_6.jpeg                1                 1
147   z_7.jpeg                1                 1
148   z_8.jpeg                1                 1
149   z_9.jpeg                1                 1

[150 rows x 3 columns]


In [89]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Ground truth and model output, taken from the combined results frame
observed_labels = final_df['Observed_labels']
predicted_labels = final_df['Predicted_labels']

# Fraction of images whose predicted label equals the observed label
accuracy = accuracy_score(y_true=observed_labels, y_pred=predicted_labels)
# F1 using the 'weighted' averaging mode across classes
f1 = f1_score(y_true=observed_labels, y_pred=predicted_labels, average='weighted')

# Report both metrics on one line
print("F1 Score:", f1, "Accuracy:", accuracy)

F1 Score: 0.7215796237527518 Accuracy: 0.7133333333333334
