# Data exploration and preprocessing:

In [2]:
! pip install --upgrade sagemaker

Collecting sagemaker
  Downloading sagemaker-2.167.0.tar.gz (843 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m843.4/843.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting importlib-metadata<5.0,>=1.4.0 (from sagemaker)
  Using cached importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Collecting urllib3<1.27,>=1.25.4 (from botocore<1.30.0,>=1.29.154->boto3<2.0,>=1.26.131->sagemaker)
  Downloading urllib3-1.26.16-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.1/143.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-2.167.0-py2.py3-none-any.whl size=1149213 sha256=df8a574703c573b5ccdf67917dab5c00d8d73916978137e5fafc8b3c71c2b735
  Stored in directory: /root/.cache/pip/w

# Prerequisites:
- Download `train.csv` and the `train` image folder.
- Save both in the same folder as this notebook.

In [35]:
import pandas as pd

In [36]:
# Load the metadata table from train.csv, read relative to the working directory.
data = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')

In [46]:
# Non-null value count per column.
# NOTE(review): the saved output lists img_code/cat_code, which this notebook only
# adds in a later cell - the output appears to come from an out-of-order run.
data.count()

ImgId          46229
title          46228
description    45187
categories     46229
img_code       46229
cat_code       46229
dtype: int64

In [37]:
# Preview the first four rows.
data.head(4)

Unnamed: 0,ImgId,title,description,categories
0,B000HYL1V6,TUNGSTEN SOLDER PICK WITH HANDLE,Solder Pick for picking up molten solder when ...,"Arts, Crafts & Sewing"
1,B00006HXWY,Write Right 98167 Screen Protector for Sony T615C,We all screen. And we all need to protect thos...,Cell Phones & Accessories
2,B000GAWSBS,Casio Mens DBC310-1 Databank 300 Digital Watch...,"Bringing you precision at a glance, the Casio ...","Clothing, Shoes & Jewelry"
3,B000040JOL,Factory-Reconditioned DEWALT DW260KR Heavy-Dut...,Factory-Reconditioned DEWALT DW260KR Heavy-Dut...,Tools & Home Improvement


In [38]:
# Non-null counts of every other column per category, with `categories`
# turned back into a regular column.
data.groupby('categories').count().reset_index()

Unnamed: 0,categories,ImgId,title,description
0,All Beauty,2200,2200,2190
1,All Electronics,2200,2199,2176
2,Appliances,2200,2200,2181
3,"Arts, Crafts & Sewing",2225,2225,2175
4,Automotive,2200,2200,1999
5,Baby,2200,2200,2177
6,Baby Products,2200,2200,2131
7,Beauty,2202,2202,2149
8,Cell Phones & Accessories,2200,2200,2188
9,"Clothing, Shoes & Jewelry",2200,2200,2182


In [6]:
# Reduce the table to the image id and its category, then derive the image
# file name and its path inside the local 'train' folder from the id.
cols = ['ImgId', 'categories']
data = data.loc[:, cols]
data = data.assign(
    img_path='train/' + data['ImgId'] + '.jpg',
    img_name=data['ImgId'] + '.jpg',
)
data.head()

Unnamed: 0,ImgId,categories,img_path,img_name
0,B000HYL1V6,"Arts, Crafts & Sewing",train/B000HYL1V6.jpg,B000HYL1V6.jpg
1,B00006HXWY,Cell Phones & Accessories,train/B00006HXWY.jpg,B00006HXWY.jpg
2,B000GAWSBS,"Clothing, Shoes & Jewelry",train/B000GAWSBS.jpg,B000GAWSBS.jpg
3,B000040JOL,Tools & Home Improvement,train/B000040JOL.jpg,B000040JOL.jpg
4,B00006IB78,Health & Personal Care,train/B00006IB78.jpg,B00006IB78.jpg


In [7]:
# Number of distinct values in each column.
data.nunique()

ImgId         46229
categories       21
img_path      46229
img_name      46229
dtype: int64

In [8]:
# List of available images
import os

# Names of all entries in the local 'train' directory.
img_list = os.listdir('train')
# How many entries were found.
print(len(img_list))


25926


Since the number of image files (25,926) is smaller than the number of distinct `ImgId` values (46,229), not every row has an image available. I will therefore remove the rows whose image file is missing:

In [9]:
# Keep only the rows whose image file name appears in img_list,
# then recount the distinct values per column.
has_image = data['img_name'].isin(img_list)
data = data[has_image]
data.nunique()

ImgId         25926
categories       21
img_path      25926
img_name      25926
dtype: int64

In [10]:
# Distinct values per column within each category.
data.groupby('categories').nunique()

Unnamed: 0_level_0,ImgId,img_path,img_name
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
All Beauty,1209,1209,1209
All Electronics,1263,1263,1263
Appliances,1225,1225,1225
"Arts, Crafts & Sewing",1235,1235,1235
Automotive,1238,1238,1238
Baby,1217,1217,1217
Baby Products,1234,1234,1234
Beauty,1244,1244,1244
Cell Phones & Accessories,1237,1237,1237
"Clothing, Shoes & Jewelry",1232,1232,1232


In [43]:
# Category name -> two-character, zero-padded class code ("00" .. "17").
# Entries are listed in ascending code order; some categories share a code
# (e.g. "Baby" and "Baby Products"), so 21 names map onto 18 codes.
cat_map = {
    "All Electronics": "00",
    "Electronics": "00",
    "Arts, Crafts & Sewing": "01",
    "Appliances": "02",
    "Health & Personal Care": "03",
    "Beauty": "04",
    "All Beauty": "04",
    "Automotive": "05",
    "Clothing, Shoes & Jewelry": "06",
    "Baby Products": "07",
    "Baby": "07",
    "Office Products": "08",
    "Cell Phones & Accessories": "09",
    "Pet Supplies": "10",
    "Industrial & Scientific": "11",
    "Tools & Home Improvement": "12",
    "Grocery & Gourmet Food": "13",
    "Patio, Lawn & Garden": "14",
    "Toys & Games": "15",
    "Musical Instruments": "16",
    "Sports & Outdoors": "17",
}


In [45]:
# Create dummy IDs for images and categories.

# Look the class code up first so that an incomplete cat_map is detected
# before `data` is modified.
cat_codes = data['categories'].map(cat_map)
unmapped = data.loc[cat_codes.isna(), 'categories'].unique()
if len(unmapped) > 0:
    raise KeyError("categories missing from cat_map: {}".format(list(unmapped)))

# Build the new columns with .assign() instead of item assignment: `data` is the
# result of a boolean filter, and writing into such a frame in place is what
# triggers pandas' SettingWithCopyWarning.
# NOTE(review): cat_map values are zero-padded strings ('04'), whereas the saved
# outputs show unpadded numbers (4) - confirm which form the .lst files should carry.
data = data.assign(
    img_code=pd.factorize(data['ImgId'])[0],  # integer code per distinct ImgId
    cat_code=cat_codes,
)

# One row per (category, code) pair, ordered by code, as a visual check of the mapping.
data[['categories','cat_code']].drop_duplicates().sort_values('cat_code').reset_index()

Unnamed: 0,index,categories,cat_code
0,14,Electronics,0
1,15,All Electronics,0
2,0,"Arts, Crafts & Sewing",1
3,35,Appliances,2
4,4,Health & Personal Care,3
5,11,Beauty,4
6,24,All Beauty,4
7,16,Automotive,5
8,2,"Clothing, Shoes & Jewelry",6
9,5,Baby Products,7


In [13]:
# Column names of the prepared table.
data.columns

Index(['ImgId', 'categories', 'img_path', 'img_name', 'img_code', 'cat_code'], dtype='object')

# Stratified splitting into train and test set:

In [14]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed, stratified on the category column,
# followed by a CSV export of both parts.
split_kwargs = dict(test_size=0.2, random_state=3407, stratify=data[['categories']])
train, test = train_test_split(data, **split_kwargs)

for frame, csv_name in ((train, 'train_split.csv'), (test, 'test_split.csv')):
    frame.to_csv(csv_name, index = False)

train.head(4)

Unnamed: 0,ImgId,categories,img_path,img_name,img_code,cat_code
12229,B0048I3B9C,All Beauty,train/B0048I3B9C.jpg,B0048I3B9C.jpg,7553,4
8844,B0000223Y5,Tools & Home Improvement,train/B0000223Y5.jpg,B0000223Y5.jpg,5467,12
32343,B0002E2TW6,Musical Instruments,train/B0002E2TW6.jpg,B0002E2TW6.jpg,19962,16
7573,B000FZ1VS0,"Clothing, Shoes & Jewelry",train/B000FZ1VS0.jpg,B000FZ1VS0.jpg,4686,6


# Reformat data to Upload to S3:


In [15]:

def write_lst(frame, lst_path):
    """Write `frame` to `lst_path` as a tab-separated .lst file.

    One line per row: img_code <TAB> cat_code <TAB> img_path.
    The fields are read by column name, so the frame may carry additional
    columns in any order.
    """
    with open(lst_path, "w") as f:
        for row in frame.itertuples(index=False):
            f.write('\t'.join([str(row.img_code), str(row.cat_code), row.img_path]) + '\n')


# save data to lst file: img_id, category_id, image_path
write_lst(train, "train.lst")
write_lst(test, "test.lst")

In [16]:
# Show the first three lines of train.lst to check the written format.
!head -n 3 train.lst

7553	4	train/B0048I3B9C.jpg
5467	12	train/B0000223Y5.jpg
19962	16	train/B0002E2TW6.jpg


# Pipeline:

## 0. Preparation and setting parameters:

Pre-requisite:
- Download train.csv and "train" folder from Kaggle
- Upload them to two separate S3 URIs. In my case:
    - train.csv: "s3://sagemaker-us-east-1-547381887603/input/train.csv"
    - "train" folder: "s3://sagemaker-us-east-1-547381887603/image-classification/train"

- Remember where you saved them in S3; these locations are used as the preprocessing inputs.


In [17]:
import os
import urllib.request


import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris

role = get_execution_role()

region = boto3.Session().region_name

s3_client = boto3.client("s3")
sm_client = boto3.Session().client(service_name="sagemaker", region_name=region)

sess = sagemaker.Session()

bucket = sagemaker.session.Session().default_bucket()

boto_session = boto3.Session(region_name=region)

training_image = image_uris.retrieve(
    region=boto3.Session().region_name, framework="image-classification"
)


base_job_prefix = 'project-2-image-classification'

# Note: Only need to upload data once!

# Four channels: train, validation, train_lst, and validation_lst
# s3train = "s3://{}/image-classification/train/".format(bucket)
# s3validation = "s3://{}/image-classification/validation/".format(bucket)
s3train_lst = "s3://{}/image-classification/train_lst/".format(bucket)
s3validation_lst = "s3://{}/image-classification/validation_lst/".format(bucket)

# upload the image files to train and validation channels
# !aws s3 cp train $s3train --recursive --quiet
# !aws s3 cp train $s3validation --recursive --quiet

# upload the lst files to train_lst and validation_lst channels
!aws s3 cp train.lst $s3train_lst --quiet
!aws s3 cp test.lst $s3validation_lst --quiet

In [18]:
# Show the S3 destinations of the two .lst files.
print(s3train_lst)
print(s3validation_lst)

s3://sagemaker-us-east-1-547381887603/image-classification/train_lst/
s3://sagemaker-us-east-1-547381887603/image-classification/validation_lst/


In [19]:
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat
)
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CacheConfig


# Cache Pipeline steps to reduce execution time on subsequent executions
cache_config = CacheConfig(enable_caching=True, expire_after="30d")

# Setting a SageMaker Pipeline Session is important to avoid pipeline steps from running before the pipeline is ready
pipeline_session = PipelineSession(boto_session=boto_session, sagemaker_client=sm_client, default_bucket=bucket)

model_package_group_name = "project-2-MXNET"  # Model name in model registry

# SageMaker pipeline names may only contain letters, digits and hyphens
# (pattern ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,255}), so the name must not contain spaces.
pipeline_name = "Image-Classification-Pipeline"

processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1) # AWS Lab role is limited to 1 GPU instance

training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.p2.xlarge") # GPU instance type limited by Lab role

In [20]:
# S3 locations of the four training channels, kept in one place so they do not
# have to be looked up again. They are derived from `bucket` rather than written
# out with a fixed bucket name, so they follow whichever account/region the
# notebook runs in.
s3_prefix = "s3://{}/image-classification".format(bucket)

s3train = "{}/train/".format(s3_prefix)
s3validation = "{}/validation/".format(s3_prefix)
s3train_lst = "{}/train_lst/".format(s3_prefix)
s3validation_lst = "{}/validation_lst/".format(s3_prefix)

print(s3train)
print(s3validation)
print(s3train_lst)
print(s3validation_lst)

s3://sagemaker-us-east-1-547381887603/image-classification/train/
s3://sagemaker-us-east-1-547381887603/image-classification/validation/
s3://sagemaker-us-east-1-547381887603/image-classification/train_lst/
s3://sagemaker-us-east-1-547381887603/image-classification/validation_lst/


# Train a built-in model:

In [29]:
%%time
from time import gmtime, strftime

output_bucket = bucket
output_prefix = "image-classification"
# NOTE(review): job_name, output_location and image are not referenced again in the
# visible notebook (the estimator uses training_image and s3_output_location) -
# kept in case other cells rely on them.
job_name = "img-classification-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
output_location = "s3://{}/{}/output/{}".format(bucket, output_prefix, job_name)
image = sagemaker.image_uris.retrieve(framework="image-classification", region=boto3.Session().region_name, version="1.7-1")

s3_output_location = "s3://{}/{}/output".format(output_bucket, output_prefix)

# Root prefix that holds the image folder and the .lst folders. Built from `bucket`
# instead of a fixed bucket name so the cell works in any account/region.
s3_data_root = "s3://{}/{}/".format(bucket, output_prefix)

# The number of training samples must match the training set that is actually
# used, so count the entries of the local train.lst (one line per image) rather
# than hard-coding a number that goes stale when the split changes.
with open("train.lst") as lst_file:
    num_training_samples = sum(1 for _ in lst_file)


def _image_channel(s3_uri):
    """Return a fully replicated S3-prefix input with content type application/x-image."""
    return sagemaker.inputs.TrainingInput(
        s3_uri,
        distribution="FullyReplicated",
        content_type="application/x-image",
        s3_data_type="S3Prefix",
    )


ic = sagemaker.estimator.Estimator(
    training_image,
    role,
    instance_count=1, # Learner Account limit to 1 instance
    instance_type="ml.p2.xlarge", # Use GPU
    volume_size=50,
    max_run=360000,
    input_mode="File",
    output_path=s3_output_location,
    sagemaker_session=sess,
)
ic.set_hyperparameters(
    num_layers=18,
    use_pretrained_model=0,
    image_shape="3,100,100",
    num_classes=18, # cat_map assigns the 18 codes 00..17
    num_training_samples=num_training_samples,
    mini_batch_size=500, # Run higher batch size than this risks exceeding memory
    epochs=10, #to run within 3 hours
    learning_rate=0.01,
    top_k=2,
    precision_dtype="float32",
)

# The .lst files reference images as 'train/<ImgId>.jpg', so both image channels
# point at the parent prefix of the 'train' folder (presumably so those relative
# paths resolve - confirm against the algorithm's input-format documentation).
train_data = _image_channel(s3_data_root)
validation_data = _image_channel(s3_data_root)
train_lst = _image_channel(s3_data_root + "train_lst/")
validation_lst = _image_channel(s3_data_root + "validation_lst/")

#Specify train, validation, train_lst, and validation_lst channels
data_channels = {"train": train_data, "validation": validation_data, 
                 "train_lst": train_lst, "validation_lst": validation_lst}

ic.fit(inputs=data_channels, logs=True)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: image-classification-2023-06-21-16-40-47-234


2023-06-21 16:40:47 Starting - Starting the training job...
2023-06-21 16:41:14 Starting - Preparing the instances for training.........
2023-06-21 16:42:43 Downloading - Downloading input data...................................[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34mNvidia gpu devices, drivers and cuda toolkit versions (only available on hosts with GPU):[0m
[34mWed Jun 21 16:48:27 2023       [0m
[34m+-----------------------------------------------------------------------------+[0m
[34m| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |[0m
[34m|-------------------------------+----------------------+----------------------+[0m
[34m| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |[0m
[34m| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |[0m
[34m|                               |                      |          

In [22]:
# Check model path in S3 (location of the model artifact produced by the estimator):
ic.model_data

's3://sagemaker-us-east-1-547381887603/image-classification/output/image-classification-2023-06-21-16-31-48-265/output/model.tar.gz'

In [23]:
# Keep the model artifact location under a shorter name.
model_path = ic.model_data

# Deploy to endpoint:
link: https://stackoverflow.com/questions/56255154/how-to-use-a-pretrained-model-from-s3-to-predict-some-data

In [24]:
# endpoint_name = f"jumpstart-example-imageclassification-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# ic_classifier = ic.deploy(initial_instance_count=1, instance_type="ml.m5.large", endpoint_name=endpoint_name)

In [25]:
# Making inference:
# import json
# import numpy as np


# file_name = "image_structure/Appliances/B000E77I0Y.jpg"
# with open(file_name, "rb") as f:
#     payload = f.read()
#     payload = bytearray(payload)

# prediction = ic_classifier.predict(payload, initial_args={"ContentType": "application/x-image"})
# print(prediction)

In [26]:
# import pandas as pd

# train = pd.read_csv('train_split.csv')
# categories = train[['categories', 'code']].drop_duplicates()

# categories

In [27]:
# prediction is a JSON string. Load it into a Python object. 
# probabilities = json.loads(prediction)
# print(len(probabilities))

# # find the class with maximum probability and print the class index
# predicted_category_index = np.argmax(probabilities)
# print(predicted_category_index)
# # confidence = probabilities[predicted_category_index]

In [28]:
# Delete endpoint:
# sm_client.delete_endpoint(EndpointName=endpoint_name)