### Import required libraries

In [1]:
import fnmatch
import os
from urllib.error import HTTPError
from urllib.request import urlretrieve

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image, ImageFile

## Let Pillow load partially-downloaded (truncated) image files instead of raising.
## The switch is an attribute of the ImageFile module; assigning it on the Image
## module only creates an unused attribute and leaves the loader's behaviour unchanged.
ImageFile.LOAD_TRUNCATED_IMAGES = True

### Set the base folder and required folder paths

In [2]:
## Everything is resolved relative to the directory the notebook is started from
cwd = os.getcwd()
inputFileFolder = os.path.join(cwd, *("Desktop", "Kaggle", "Input"))

## Source images live in "OriginalImages"; cropped and re-sized copies are
## written to "TrainTestImages" (both directly under the input folder)
orignalFilesLoc, targetFilesLoc = (
    os.path.join(inputFileFolder, leafName)
    for leafName in ("OriginalImages", "TrainTestImages")
)

## Report the resolved paths, one per line
print("\n".join([
    "Current working directory: %s" % cwd,
    'Input Files Base Folder: %s' % inputFileFolder,
    "Original File Location: %s" % orignalFilesLoc,
    "Target File Location: %s" % targetFilesLoc,
]))

Current working directory: /Users/kuldeepsingh/Documents/bdml/capstone/cropresize
Input Files Base Folder: /Users/kuldeepsingh/Documents/bdml/capstone/cropresize/Desktop/Kaggle/Input
Original File Location: /Users/kuldeepsingh/Documents/bdml/capstone/cropresize/Desktop/Kaggle/Input/OriginalImages
Target File Location: /Users/kuldeepsingh/Documents/bdml/capstone/cropresize/Desktop/Kaggle/Input/TrainTestImages


### Sub-Folder Location

In [3]:
## To process one specific folder only, put its name here; leave it as "" to
## process everything under the original-images location
subFolder = "Tree"

## Rebuild both locations from the base input folder instead of appending to their
## current values, so that re-running this cell cannot stack the sub-folder name
## onto the paths a second time (".../Tree/Tree")
orignalFilesLoc = os.path.join(inputFileFolder, "OriginalImages")
targetFilesLoc = os.path.join(inputFileFolder, "TrainTestImages")
if subFolder:
    orignalFilesLoc = os.path.join(orignalFilesLoc, subFolder)
    targetFilesLoc = os.path.join(targetFilesLoc, subFolder)

print("Original File Location: %s" %(orignalFilesLoc))
print("Target File Location: %s" %(targetFilesLoc))

Original File Location: /Users/kuldeepsingh/Documents/bdml/capstone/cropresize/Desktop/Kaggle/Input/OriginalImages/Tree
Target File Location: /Users/kuldeepsingh/Documents/bdml/capstone/cropresize/Desktop/Kaggle/Input/TrainTestImages/Tree


## Cropping Image

### Reading bounding boxes

In [4]:
## Load the bounding-box annotations and index them by image ID
## (an image with several boxes appears as several rows sharing one index value)
boundingBoxPath = os.path.join(inputFileFolder, "train_bounding_boxes.csv")
dfTrainBB = pd.read_csv(boundingBoxPath).set_index('ImageID')
dfTrainBB.head()

Unnamed: 0_level_0,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
ImageID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
000002b66c9c498e,xclick,/m/01g317,1,0.0125,0.195312,0.148438,0.5875,0,1,0,0,0
000002b66c9c498e,xclick,/m/01g317,1,0.025,0.276563,0.714063,0.948438,0,1,0,0,0
000002b66c9c498e,xclick,/m/01g317,1,0.151562,0.310937,0.198437,0.590625,1,0,0,0,0
000002b66c9c498e,xclick,/m/01g317,1,0.25625,0.429688,0.651563,0.925,1,0,0,0,0
000002b66c9c498e,xclick,/m/01g317,1,0.257812,0.346875,0.235938,0.385938,1,0,0,0,0


### Create Label lookup

In [5]:
labelsPath = os.path.join(inputFileFolder, "class-descriptions.csv")
dfLables = pd.read_csv(labelsPath)

## Lookup from class description (with all spaces removed) to its label code.
## When two descriptions collapse to the same key, the later row wins.
dirClassesID = {
    description.replace(" ", ""): labelCode
    for labelCode, description in zip(dfLables.label_code, dfLables.description)
}
print(len(dirClassesID))

19699


## Methods to process the image

#### This method traverses the given input path recursively and calls `processImage` for each image file it finds

In [6]:
def processImageFolder(origPath, tgtPath):
    """Recursively process every file under ``origPath`` with ``processImage``.

    Files found directly in ``origPath`` are passed to ``processImage`` with
    ``tgtPath`` as their target folder.  Each sub-folder is processed by a
    recursive call whose target is the sub-folder of the same name under
    ``tgtPath``, so the source folder layout is mirrored in the target.

    Hidden entries (names starting with ".", such as ``.DS_Store`` or
    ``.ipynb_checkpoints``) are skipped: they are not images, and handing
    them to ``processImage`` would abort the whole run.

    Parameters
    ----------
    origPath : str
        Folder holding the original images.
    tgtPath : str
        Folder that receives the processed images.

    Returns
    -------
    None
        A message is printed and nothing is processed when ``origPath``
        does not exist.
    """
    ## Check if input folder is there or not
    if not os.path.exists(origPath):
        print("Input path '%s' does not exist, provide valid path." %(origPath))
        return

    for entry in os.listdir(origPath):
        ## Ignore hidden files and folders created by the OS or by Jupyter
        if entry.startswith("."):
            continue

        subLoc = os.path.join(origPath, entry)
        if os.path.isfile(subLoc):
            ## Regular file: crop/resize it into the current target folder
            processImage(entry, origPath, tgtPath)
        else:
            ## Sub-folder: descend, mirroring the folder name on the target side
            newTgtPath = os.path.join(tgtPath, entry)
            print("Orig folder: %s" %(subLoc))
            print("Targ folder: %s" %(newTgtPath))
            processImageFolder(subLoc, newTgtPath)

#### This method crops the given image to each of its bounding boxes, resizes every crop to 256x256, and saves the results at the target location

In [7]:
def processImage(imageFile, imageLoc, targetLoc, showImage=False):
    """Crop each bounding box of one image, resize it to 256x256 and save it.

    The label is the name of the folder that holds the image; it is translated
    to a label ID through ``dirClassesID``.  Every row of ``dfTrainBB`` whose
    index equals the image ID and whose ``LabelName`` equals that label ID
    yields one crop, saved in ``targetLoc`` as ``<imageID>_<n>.<ext>`` where
    ``n`` is the 1-based position of the box.  An existing file with the same
    name is deleted first.

    Parameters
    ----------
    imageFile : str
        File name of the image.  The image ID is the text before the first
        "." and the extension is the text after the last ".".
    imageLoc : str
        Folder containing ``imageFile``; its last path component is the label.
    targetLoc : str
        Folder for the crops; created when missing.
    showImage : bool, optional
        When true, print details and plot the original image and every crop.

    Raises
    ------
    KeyError
        If the folder name is not a key of ``dirClassesID``.
    """
    imageID = imageFile.split(".")[0]
    imageExt = imageFile.split(".")[-1]
    label = os.path.basename(os.path.normpath(imageLoc))
    labelID = dirClassesID[label]

    ## Make sure the target folder is there (no error when it already exists)
    os.makedirs(targetLoc, exist_ok=True)

    ## Get bounding boxes (relative coordinates) for this image and label
    boundingBoxes = dfTrainBB.loc[(dfTrainBB.index == imageID) & (dfTrainBB['LabelName'] == labelID),
                                 ['XMin','YMin','XMax','YMax']].values

    ## Open the image in a context manager so its file handle is released even
    ## if cropping or saving fails; otherwise a long run leaks one handle per image
    with Image.open(os.path.join(imageLoc, imageFile)) as origImage:
        width, height = origImage.size   # Get dimensions

        if showImage:
            ## The full-size array is only needed for the preview
            arrImage = np.array(origImage)
            print("Label: %s" %(label))
            print("Label ID: %s" %(labelID))
            print("Image ID: %s" %(imageID))
            print("Image Loc: %s" %(imageLoc))
            print("Image size: %s" %(str(origImage.size)))
            print("Image Array Shape: %s" %(str(arrImage.shape)))
            ## New figure per image so later plots do not draw over this one
            plt.figure()
            plt.imshow(arrImage)

        ## Loop through each bounding box and create a image
        for subImagecount, box in enumerate(boundingBoxes, start=1):
            ## Scale the relative box to pixels and round to whole pixels
            left = int(round(box[0] * width))
            top = int(round(box[1] * height))
            right = int(round(box[2] * width))
            bottom = int(round(box[3] * height))

            ## A box that covers no pixels cannot produce a usable crop
            if right <= left or bottom <= top:
                print("Skipping empty bounding box %d of image %s" %(subImagecount, imageID))
                continue

            ## Crop image
            cropImage = origImage.crop((left, top, right, bottom))
            resizeImage = cropImage.resize((256,256))

            ## Save image at location
            tgtImageName = imageID + "_" + str(subImagecount) + "." + imageExt
            tgtPath = os.path.join(targetLoc, tgtImageName)

            ## If target path exist, delete the existing file
            if os.path.exists(tgtPath):
                print("Deleting: %s" %(tgtPath))
                os.remove(tgtPath)

            ## Save image
            resizeImage.save(tgtPath)

            if showImage:
                dtcrp = np.array(resizeImage)
                print(dtcrp.shape)
                plt.figure()
                plt.imshow(dtcrp)
        

### Call processImageFolder to process the images in a given folder and sub-folder

In [8]:
## Crop and resize everything under the original-images location into the target location
processImageFolder(origPath=orignalFilesLoc, tgtPath=targetFilesLoc)

  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
