# Bitmoji Parser

### Flow
**For each image**:
1. Find all of the green rectangles (HSV between (40,150,210) and (120,250,250))
2. Save each green rectangle – with an area greater than 1000 pixels, and a height and width each greater than 40 pixels – to a new image. These represent XXX category of images and are saved with the naming convention "\[original name\]_\[category\]-\[object number\].png", where the object number is zero-padded to three digits
3. Repeat for other colour rectangles too

In [1]:
from matplotlib import pyplot as plt
from glob import glob
import random as rng
import pandas as pd
import numpy as np
import cv2

# NOTE: despite the name, INPUT_FOLDER is the list of paths returned by glob
# (one entry per item inside the "Tagged" folder), not a folder path.
INPUT_FOLDER = glob("../../../../../Dropbox (Princeton)/BitmojiClassroomTwitter/Item Tagging/Kaytee/Tagged/*")
# Directory prefix that output_path() prepends to every parsed segment's file name.
OUTPUT_FOLDER = "../../../../../Dropbox (Princeton)/BitmojiClassroomTwitter/Parsed Image Segments"

In [2]:
import os

def find_bounding_boxes(hsv_img,
                        colour_dict):
    """
        Finds the regions of hsv_img whose colour lies inside an HSV range.

        hsv_img     : image in HSV colour space
        colour_dict : dict whose 'low_hsv' and 'high_hsv' entries are the
                      lower and upper bounds handed to cv2.inRange

        Returns an array of bounding boxes, one (x, y, w, h) per external contour
    """
    low,high = colour_dict['low_hsv'], colour_dict['high_hsv']
    mask = cv2.inRange(hsv_img, low, high)
    masked_img = cv2.bitwise_and(hsv_img, hsv_img, mask = mask)
    # Collapse to a single channel, which findContours needs.
    # NOTE: the pixels are HSV, so BGR2GRAY produces a weighted sum of H, S and V
    # rather than a true luminance. That is sufficient here because everything
    # outside the mask is zero.
    masked_grey = cv2.cvtColor(masked_img, cv2.COLOR_BGR2GRAY)
    # Smoothing without removing edges.
    bi_lat = cv2.bilateralFilter(masked_grey, 7, 50, 50)
    # Adding a bit of blur and then thresholding helps reduce noise
    blurred = cv2.blur(bi_lat, (3,3))
    # Apply thresholding to the image. With THRESH_OTSU the threshold is chosen
    # automatically, so the explicit value (1) is ignored.
    _, thresholded = cv2.threshold(blurred, 1, 255, cv2.THRESH_OTSU)
    # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
    # (image, contours, hierarchy) in 3.x; index -2 is the contour list in all of them.
    contours = cv2.findContours(thresholded,
                                cv2.RETR_EXTERNAL,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]
    return [cv2.boundingRect(c) for c in contours]


def write_bounding_boxes(img,
                         bounding_boxes,
                         img_prefix = "../data/output/[image_name]-[category]",
                         offset = 0,
                         thresh = lambda x, y, w, h: h * w > 1000 and w > 40 and h > 40):
    """
        Crops every bounding box that passes thresh out of img and saves each
        crop as "<img_prefix>-<object id, zero-padded to 3 digits>.png".

        img            : image array indexed as [row, column, channel]
        bounding_boxes : iterable of (x, y, w, h) boxes
        img_prefix     : path prefix for the written files
        offset         : value added to each box's position in the filtered
                         list to form its object id
        thresh         : predicate called as thresh(x, y, w, h); boxes for which
                         it is falsy are dropped. The default keeps boxes with an
                         area above 1000 pixels and a width and height above 40.

        Returns (filtered bounding boxes, their object ids)
    """
    # Filter out noise based on the threshold predicate
    kept_boxes = [bb for bb in bounding_boxes if thresh(*bb)]
    object_ids = []
    for i, (x, y, w, h) in enumerate(kept_boxes):
        section = img[y:y+h, x:x+w]
        obj_id = i + offset
        object_ids.append(obj_id)
        # The channel axis is reversed before writing: the caller in this
        # notebook passes an RGB image, while cv2.imwrite stores channels as BGR.
        # NOTE(review): cv2.imwrite reports failure through its return value
        # (e.g. a missing output folder), which is not checked here.
        cv2.imwrite(f"{img_prefix}-{obj_id:03d}.png", section[:,:,::-1])
    return kept_boxes, object_ids


def output_path(img_fp,colour_dict):
    """
        Builds the naming pieces for one image / colour-category pair.

        img_fp      : path of the input image
        colour_dict : dict with a 'category' entry

        Returns (image name without its extension,
                 "<OUTPUT_FOLDER>/<image name>_<category>" output prefix)
    """
    direc = OUTPUT_FOLDER
    # Get rid of whatever is after the last period. splitext strips only the
    # final extension, so periods elsewhere in the file name are preserved.
    img_pref = os.path.splitext(os.path.basename(img_fp))[0]
    category = colour_dict['category']
    return img_pref, f"{direc}/{img_pref}_{category}"
    

## Parse bounding boxes

In [4]:
from utils import categories

# Each entry of series becomes one row of the coordinate table:
# [image name, category, object id, x1, x2, y1, y2]
series = []
for path in INPUT_FOLDER:
    # Load and convert the image once per file; every colour category reuses
    # the same arrays.
    bgr_img = cv2.imread(path, 1)
    if bgr_img is None:
        # cv2.imread returns None for anything it cannot decode; the glob
        # pattern matches every entry in the folder, so skip non-images.
        print(f"Skipping unreadable image: {path}")
        continue
    bitmoji = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)
    hsv_bitmoji = cv2.cvtColor(bitmoji, cv2.COLOR_RGB2HSV)

    # Object ids keep counting across categories within a single image
    offset = 0
    for colour in categories:
        img_name, output_prefix = output_path(path,colour)
        bounding_boxes = find_bounding_boxes(hsv_bitmoji,colour)
        # Number of candidate boxes before the size filter is applied
        bef = len(bounding_boxes)
        bounding_boxes,object_ids = write_bounding_boxes(bitmoji,
                                              bounding_boxes,
                                              output_prefix,
                                              offset)
        offset += len(bounding_boxes)
        for [x,y,w,h],obj_id in zip(bounding_boxes,object_ids):
            row = [img_name, # image name
                   colour['category'], #category
                   obj_id,
                   x, #X1
                   x+w, #X2
                   y, #Y1
                   y+h #Y2
                  ]
            series.append(row)
        if bounding_boxes:
            print(path,output_prefix,bef,'->',len(bounding_boxes))

coordinate_df = pd.DataFrame(series,columns=["Image Name",
                                             "Object Category",
                                             "Object ID",
                                             "X1 Coordinate",
                                             "X2 Coordinate",
                                             "Y1 Coordinate",
                                             "Y2 Coordinate"]).sort_values(["Image Name","Object ID"])

print("{} rows and {} columns".format(*coordinate_df.shape))

../../../../../Dropbox (Princeton)/BitmojiClassroomTwitter/Item Tagging/Kaytee/Tagged/user26_2195.jpg ../../../../../Dropbox (Princeton)/BitmojiClassroomTwitter/Parsed Image Segments/user26_2195_02_wall_hangings 1 -> 1
../../../../../Dropbox (Princeton)/BitmojiClassroomTwitter/Item Tagging/Kaytee/Tagged/user38_4029.jpg ../../../../../Dropbox (Princeton)/BitmojiClassroomTwitter/Parsed Image Segments/user38_4029_01_books 11 -> 1
../../../../../Dropbox (Princeton)/BitmojiClassroomTwitter/Item Tagging/Kaytee/Tagged/user38_4029.jpg ../../../../../Dropbox (Princeton)/BitmojiClassroomTwitter/Parsed Image Segments/user38_4029_02_wall_hangings 140 -> 4
../../../../../Dropbox (Princeton)/BitmojiClassroomTwitter/Item Tagging/Kaytee/Tagged/user42_4681 copy.jpg ../../../../../Dropbox (Princeton)/BitmojiClassroomTwitter/Parsed Image Segments/user42_4681 copy_01_books 35 -> 4
../../../../../Dropbox (Princeton)/BitmojiClassroomTwitter/Item Tagging/Kaytee/Tagged/user58_998 copy.jpg ../../../../../Dropb

In [4]:
# Save the bounding-box coordinate table as CSV, leaving out the DataFrame index.
coordinate_df.to_csv("../data/coordinates.csv",index=False)