# Image Classification Model via SageMaker Pipeline template

# PRE-PROCESSING:
Requires `train.csv` and the `train` image folder, both downloaded from Kaggle and placed in the same directory as this notebook.

In [4]:
! pip install --upgrade pip
! pip install "sagemaker==2.159.0"
! pip install opencv-python-headless # for im2rec.py file to convert image folders into .rec files
! pip install mxnet # For image model

[0m

## Pre-process raw data from Kaggle:

In [6]:
import pandas as pd
import os

# List the image files shipped with the Kaggle download ('train' is the image folder).
img_list = os.listdir('train')
print('Number of available images: ', len(img_list))

cols = ['ImgId', 'categories']

# Build the working frame in a single chain. Ending with .assign() yields a new,
# independent DataFrame, so later column assignments do not write into a filtered
# slice of the raw CSV (which would trigger pandas' SettingWithCopyWarning).
data = (
    pd.read_csv('train.csv')[cols]  # train.csv is the file downloaded from Kaggle
    .assign(
        img_path=lambda df: 'train/' + df['ImgId'] + '.jpg',
        img_name=lambda df: df['ImgId'] + '.jpg',
    )
    # Only keep records whose image file is actually available
    .loc[lambda df: df['img_name'].isin(img_list)]
    # Create dummy integer image id, required by the image model
    .assign(img_code=lambda df: pd.factorize(df['ImgId'])[0])
)
data.head()

Number of available images:  25926


Unnamed: 0,ImgId,categories,img_path,img_name,img_code
2,B000GAWSBS,"Clothing, Shoes & Jewelry",train/B000GAWSBS.jpg,B000GAWSBS.jpg,0
3,B000040JOL,Tools & Home Improvement,train/B000040JOL.jpg,B000040JOL.jpg,1
4,B00006IB78,Health & Personal Care,train/B00006IB78.jpg,B00006IB78.jpg,2
6,B000YOUIN6,Baby Products,train/B000YOUIN6.jpg,B000YOUIN6.jpg,3
8,B0000CAQ0S,"Patio, Lawn & Garden",train/B0000CAQ0S.jpg,B0000CAQ0S.jpg,4


In [7]:
# Category mapping - to align with Text model.
# Note that several raw category names intentionally share one code
# (e.g. 'Baby' / 'Baby Products' -> 7, 'Electronics' / 'All Electronics' -> 0).
cat_map = {
    'Clothing, Shoes & Jewelry': 6,
    'Tools & Home Improvement': 12,
    'Health & Personal Care': 3,
    'Baby Products': 7,
    'Patio, Lawn & Garden': 14,
    'Baby': 7,
    'Beauty': 4,
    'Sports & Outdoors': 17,
    'All Electronics': 0,
    'Automotive': 5,
    'All Beauty': 4,
    'Office Products': 8,
    'Electronics': 0,
    'Toys & Games': 15,
    'Appliances': 2,
    'Musical Instruments': 16,
    'Industrial & Scientific': 11,
    'Grocery & Gourmet Food': 13,
    'Cell Phones & Accessories': 9,
    'Pet Supplies': 10,
    'Arts, Crafts & Sewing': 1,
}

# Vectorised dictionary lookup; any category absent from cat_map comes back as NaN.
cat_codes = data['categories'].map(cat_map)

# Fail fast, naming the offending categories, instead of a bare KeyError on one value.
unmapped = data.loc[cat_codes.isna(), 'categories'].unique().tolist()
if unmapped:
    raise KeyError(f'Categories missing from cat_map: {unmapped}')

# assign() returns a new frame, so this never writes into a filtered slice of another frame.
data = data.assign(cat_code=cat_codes.astype('int64'))
data[['categories','cat_code']].drop_duplicates().sort_values('cat_code')

Unnamed: 0,categories,cat_code
30,Electronics,0
15,All Electronics,0
114,"Arts, Crafts & Sewing",1
35,Appliances,2
4,Health & Personal Care,3
11,Beauty,4
24,All Beauty,4
18,Automotive,5
2,"Clothing, Shoes & Jewelry",6
6,Baby Products,7


## Split images into categories:

In [8]:
from pathlib import Path

# Move the images into sub-folders corresponding to their categories
# Pre-requisite:
# 1. download train.csv and train folder (for images) from Kaggle and upload to SageMaker in the same directory as this script
# 2. Change notebook kernel to "MXNet 1.8 Python 3.7 CPU Optimized"

image_dir = 'train'
suffix = '.jpg'

# Build the ImgId -> cat_code lookup once instead of scanning the whole DataFrame
# for every image. drop_duplicates keeps the first row per ImgId, matching the
# previous `.iloc[0]` behaviour.
code_by_img_id = (
    data.drop_duplicates('ImgId')
    .set_index('ImgId')['cat_code']
    .to_dict()
)

skipped = []  # images with no label in `data`, or no longer present in image_dir
for img in img_list:
    if not img.endswith(suffix):
        continue
    img_id = img[:-len(suffix)]  # strip only the trailing extension
    source = f'{image_dir}/{img}'
    # Skip unlabeled images, and images already moved by a previous run of this
    # cell (img_list was captured before any file was moved).
    if img_id not in code_by_img_id or not os.path.exists(source):
        skipped.append(img)
        continue
    category = f'{int(code_by_img_id[img_id]):02d}'  # zero-padded two-digit folder name
    folder = 'image_structure/' + category # Parent folder = image_structure, child folders = category folders
    Path(folder).mkdir(parents=True, exist_ok=True) # Create category folder if not exist
    os.rename(source, f'{folder}/{img}') # move image from 'train' folder to category folder

if skipped:
    print(f'Skipped {len(skipped)} image(s) with no label or already moved')