# Data Pre-processing and Exploration
## Project - Dishwasher-safe or Not
This notebook pre-processes images of utensils and cookware and builds a dataset for training a classifier that predicts whether an object is dishwasher-safe or not.

### 1. Environment setup

In [None]:
# Upgrade pip itself before installing anything else.
!pip3 install --upgrade pip

# Pin tensorflow-estimator to the 2.1 release.
# NOTE(review): the unpinned tensorflow install below may replace this version - confirm the pin is still needed.
!pip3 install tensorflow-estimator==2.1

# Install TensorFlow (no version pinned).
!pip3 install tensorflow

# Show the metadata of the TensorFlow package that ended up installed.
!python3 -m pip show tensorflow

### 2. Image Preprocessing

The following code iterates through all images in the given directory and converts each raw image into a pre-processed image by performing the following steps:
- crop image to square 
- resize image to 256x256

Pre-requisites:
Make sure the object is at the center of the image; otherwise parts of it might be cut off when the image is cropped to a square.

In [None]:
import os
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np

# Target (width, height), in pixels, of every pre-processed image.
SIZE = (256, 256)
# Folder holding the raw, unprocessed photos.
raw_data_dir = "/Users/anushree/Desktop/Dishwasher-safe Or Not/Data/raw_data"
# Folder receiving the cropped and resized images (ends with a path separator).
processed_data_dir = "/Users/anushree/Desktop/Dishwasher-safe Or Not/Data/processed_data/"

In [None]:
def crop_resize_image(filepath):
    """Center-crop an image to a square, resize it to SIZE and save it.

    The result is written into ``processed_data_dir`` under the same file
    name as the input, so the output format follows the input's extension.

    Args:
        filepath: Path of the raw image to process.
    """
    # Context manager so the underlying file handle is always released
    with Image.open(filepath) as im:
        width, height = im.size

        # Side of the largest centered square that fits inside the image
        side = min(width, height)

        # Integer crop box: avoids half-pixel coordinates when the difference
        # between width and height is odd, and guarantees a side x side result
        left = (width - side) // 2
        top = (height - side) // 2
        im_cropped = im.crop((left, top, left + side, top + side))

        # resize() is used instead of thumbnail() because thumbnail() only
        # ever shrinks: images smaller than SIZE would stay undersized
        im_resized = im_cropped.resize(SIZE, Image.Resampling.LANCZOS)

    # Make sure the destination exists, then build the path portably
    os.makedirs(processed_data_dir, exist_ok=True)
    im_resized.save(os.path.join(processed_data_dir, os.path.basename(filepath)))

ext = ['JPG', 'jpeg', 'png', 'jpg', 'gif', 'webp']
# Lower-cased once, with the leading dot, so the check below is
# case-insensitive (e.g. 'photo.PNG' and 'photo.JPEG' are no longer skipped)
image_suffixes = {'.' + e.lower() for e in ext}

# Process every image file found directly inside raw_data_dir
for filename in os.listdir(raw_data_dir):
    f = os.path.join(raw_data_dir, filename)
    # splitext yields '' for names without a dot, so a file that is merely
    # *named* 'jpg' is not mistaken for an image
    suffix = os.path.splitext(filename)[1].lower()
    if os.path.isfile(f) and suffix in image_suffixes:
        print(f)
        crop_resize_image(f)


# crop_resize_image('/Users/anushree/Desktop/Dishwasher-safe Or Not/Data/raw_data/teaspoonraw29.JPG')

### 3. Data Exploration
The following code helps to visualize clusters of similar images using the K-Means algorithm.
The features of each image are extracted using VGG16 and reduced using PCA before being passed to K-Means.

In [None]:
# Reference: https://towardsdatascience.com/how-to-cluster-images-based-on-visual-similarity-cd6e7209fe34
# for loading/processing the images
from keras.preprocessing.image import load_img
from tensorflow.keras.utils import img_to_array
from keras.applications.vgg16 import preprocess_input

# models
from keras.applications.vgg16 import VGG16
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle


In [None]:
# Directory that is walked recursively for the images to cluster.
data_path = '/Users/anushree/Desktop/Dishwasher-safe Or Not/Data/final_data/not-dishwasher-safe/'
# Pickle file in which the extracted feature vectors are stored.
p = '/Users/anushree/Desktop/Dishwasher-safe Or Not/Data/data_features.pkl'

### 3.1 Extract features using VGG16

In [None]:
def extract_features(file, model):
    """Return the feature vector that ``model`` produces for one image.

    Args:
        file: Path of the image to load.
        model: Keras model that accepts a (1, 224, 224, 3) batch.

    Returns:
        The array returned by ``model.predict`` for the single-image batch.
    """
    # load the image resized to 224x224
    img = load_img(file, target_size=(224, 224))
    # convert from 'PIL.Image.Image' to numpy array
    img = np.array(img)
    # add the leading batch axis: reshape(num_of_samples, dim 1, dim 2, channels)
    reshaped_img = img.reshape(1, 224, 224, 3)
    # prepare image for model
    imgx = preprocess_input(reshaped_img)
    # `use_multiprocessing` is deliberately not passed: it only ever applied
    # to generator / Sequence inputs, and Keras 3 no longer accepts it
    # (passing it raises TypeError there)
    features = model.predict(imgx)
    return features

def extract_and_save_features():
    """Extract VGG16 features for every image under ``data_path`` and pickle them.

    Walks ``data_path`` recursively, feeds each image through VGG16 cut off at
    its second-to-last layer, and writes a ``{image path: features}``
    dictionary to the pickle file ``p``. Images that fail are reported and
    skipped.
    """
    # change the working directory to the path where the images are located
    os.chdir(data_path)

    image_suffixes = ('JPG', 'jpeg', 'png', 'jpg', 'gif', 'webp')

    # full paths of all image files found below data_path
    utensils = []
    for path, _subdirs, files in os.walk(data_path):
        utensils.extend(
            os.path.join(path, name)
            for name in files
            if name.endswith(image_suffixes)
        )
    print(utensils[:10])

    # drop the final prediction layer so the model outputs feature vectors
    model = VGG16()
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

    data = {}

    for utensil in utensils:
        try:
            data[utensil] = extract_features(utensil, model)
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt
        # still stops the loop; the cause is printed instead of being hidden
        except Exception as err:
            print("Error:", utensil, err)
            # checkpoint what has been extracted so far
            with open(p, 'wb') as file:
                pickle.dump(data, file)

    with open(p, 'wb') as file:
        pickle.dump(data, file)
        

# Run the extraction; this writes the feature pickle at `p` that the clustering cell loads.
extract_and_save_features()

### 3.2 Create clusters using K-Means

In [None]:
# function that lets you view a cluster (based on identifier)
# function that lets you view a cluster (based on identifier)
def view_cluster(cluster, groups):
    """Plot the images that belong to one cluster.

    Args:
        cluster: Key of ``groups`` identifying the cluster to display.
        groups: Mapping of cluster id to the list of image paths in it.
    """
    # only allow up to this many images to be shown at a time
    max_images = 30

    plt.figure(figsize=(25, 25))
    # gets the list of filenames for a cluster
    files = groups[cluster]
    if len(files) > max_images:
        print(f"Clipping cluster size from {len(files)} to {max_images}")
        # the slice end is exclusive, so this keeps exactly max_images files
        files = files[:max_images]
    # plot each image in the cluster on a 10x10 grid
    for index, file in enumerate(files):
        plt.subplot(10, 10, index + 1)
        plt.imshow(np.array(load_img(file)))
        plt.axis('off')
    plt.show()

def create_clusters(data, n_components=500, n_clusters=36):
    """Cluster images by their feature vectors and display every cluster.

    The features are reduced with PCA, grouped with K-Means and each
    resulting cluster is shown with ``view_cluster``.

    Args:
        data: Mapping of image path to its extracted feature array
            (4096 values per image).
        n_components: Upper bound on the number of PCA components kept.
        n_clusters: Upper bound on the number of K-Means clusters.

    Raises:
        ValueError: If ``data`` is empty.
    """
    if not data:
        raise ValueError("create_clusters needs at least one feature vector")

    # get a list of the filenames
    filenames = np.array(list(data.keys()))
    # get a list of just the features
    feat = np.array(list(data.values()))
    print(feat.shape)
    # flatten to one row of 4096 features per image
    feat = feat.reshape(-1, 4096)

    # PCA cannot keep more components than min(n_samples, n_features) and
    # raises otherwise, so clamp the request to what the data supports
    n_components = min(n_components, *feat.shape)

    # reduce the amount of dimensions in the feature vector
    pca = PCA(n_components=n_components)
    pca.fit(feat)
    x = pca.transform(feat)
    print(f"Components before PCA: {feat.shape[1]}")
    print(f"Components after PCA: {pca.n_components}")

    # K-Means likewise needs at least as many samples as clusters
    n_clusters = min(n_clusters, len(x))

    # cluster feature vectors
    kmeans = KMeans(n_clusters=n_clusters, random_state=22)
    kmeans.fit(x)

    # holds the cluster id and the images { id: [images] }
    groups = {}
    for file, cluster in zip(filenames, kmeans.labels_):
        groups.setdefault(cluster, []).append(file)

    print(groups[0])

    for cluster in groups:
        view_cluster(cluster, groups)
        
#     # this is just incase you want to see which value for k might be the best
#     sse = []
#     list_k = list(range(3, 70))
    
#     for k in list_k:
#         km = KMeans(n_clusters=k, random_state=22)
#         km.fit(x)
    
#         sse.append(km.inertia_)
    
#     # Plot sse against k
#     plt.figure(figsize=(6, 6))
#     plt.plot(list_k, sse)
#     plt.xlabel(r'Number of clusters *k*')
#     plt.ylabel('Sum of squared distance');
#     plt.show()

# NOTE: pickle.load can execute arbitrary code. Only load a feature file that
# was produced by extract_and_save_features, never one from an untrusted source.
with open(p, "rb") as file:
    data = pickle.load(file)

# The features are fully in memory at this point, so the clustering runs
# outside the `with` block and the file is closed before the long-running
# clustering and plotting start.
create_clusters(data)