# ENGR 418 Project - Stage 2
## Sorting lego blocks using image data

- Group #20
- Jesse Alele 82807728
- Zach Kelly 41637836

-------------------

### Import Statements

In [125]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from skimage import feature
from skimage.io import imread, imshow
from skimage.transform import resize
from scipy.ndimage import rotate
from sklearn.linear_model import LogisticRegression

### Feature Extraction

In [126]:
def feature_extractor(folder):
    """Build the feature matrix for every image file found in ``folder``.

    Each file is read as a grayscale image, resized to a square of
    ``im_size`` x ``im_size`` pixels and reduced to four numbers.

    Parameters
    ----------
    folder : str
        Directory whose entries (as returned by ``os.listdir``) are all read
        as images. Rows of the result follow the ``os.listdir`` order.

    Returns
    -------
    numpy.ndarray
        One row per file with the columns
        ``[area, min_width, max_diff_width, total_sig_edges]``.
    """
    # side length (in pixels) of the square every image is resized to
    im_size = 100
    # one feature row per image
    rows = []

    for file_name in os.listdir(folder):
        # build the full path of the current image
        path = os.path.join(folder, file_name)
        # read the image as a 2D grayscale array and resize it to im_size x im_size
        im = imread(path, as_gray=True)
        im = resize(im, (im_size, im_size))

        ## Feature 1: edge pixels in edge-dense columns, totalled over three rotations
        total_sig_edges = _count_significant_edges(im)

        ## Features 2, 3 and 4: row-width statistics of the un-rotated edge map
        edges = feature.canny(image=im, sigma=5, low_threshold=0.001, high_threshold=0.08)
        area, min_width, max_diff_width = _row_width_features(edges)

        rows.append([area, min_width, max_diff_width, total_sig_edges])

    return np.array(rows)


def _count_significant_edges(image, min_column_count=3):
    """Sum the edge pixels found in edge-dense columns over rotations of 0, 60 and 120 degrees.

    For each rotation the Canny edge map is summed column-wise; columns holding
    fewer than ``min_column_count`` edge pixels are ignored, the rest are added
    to the running total.
    """
    total = 0
    for degree in range(0, 180, 60):
        # rotate without changing the array shape (reshape=False), linear interpolation
        rotated = rotate(image, angle=degree, reshape=False, order=1)
        # boolean edge map of the rotated image
        edges = feature.canny(image=rotated, sigma=5, low_threshold=0.001, high_threshold=0.08)
        # number of edge pixels in every column
        column_sums = edges.sum(axis=0)
        # keep only the columns that reach the threshold and add them up
        total += int(column_sums[column_sums >= min_column_count].sum())
    return total


def _row_width_features(edges):
    """Scan a boolean edge map row by row and return ``(area, min_width, max_diff_width)``.

    The width of a row is the distance between its first and last edge pixel
    (0 when the row holds no edge pixel).

    * ``area`` is the sum of all row widths.
    * ``min_width`` is the width of the LAST row whose width is non-zero (and
      below the image width); despite its name it is not a true minimum. The
      behaviour is kept as is because the reported results were obtained with it.
    * ``max_diff_width`` is the largest absolute change in width between two
      successive rows (the first row is compared against a width of 0).
    """
    n_cols = edges.shape[1]
    area = 0
    min_width = 0
    max_diff_width = 0
    prev_width = 0

    for row in edges:
        # column indices of the edge pixels in this row, in increasing order
        edge_cols = np.flatnonzero(row)
        width = int(edge_cols[-1] - edge_cols[0]) if edge_cols.size else 0

        # Feature 2: accumulate one slice of area per row
        area += width

        # Feature 3: remember the most recent non-zero width
        if 0 < width < n_cols:
            min_width = width

        # Feature 4: track the largest jump in width between successive rows
        max_diff_width = max(max_diff_width, abs(width - prev_width))
        prev_width = width

    return area, min_width, max_diff_width

### Label Extraction

In [127]:
def label_extractor(folder):
    """Derive class labels from the file names in ``folder``.

    A file name containing 'cir' is labelled 0 (circle), 'rec' is labelled 1
    (rectangle) and 'squ' is labelled 2 (square). The tags are tested in that
    order and the first match wins; a name containing none of them produces
    no label at all.

    Returns
    -------
    tuple
        ``(y, cir, rec, squ)`` where ``y`` is a numpy array of labels in
        ``os.listdir`` order and the remaining items are the number of
        circles, rectangles and squares found.
    """
    # (file-name tag, class id) pairs, checked in this order
    tag_to_class = (('cir', 0), ('rec', 1), ('squ', 2))
    # number of files seen per class id
    per_class = [0, 0, 0]
    labels = []

    for name in os.listdir(folder):
        for tag, class_id in tag_to_class:
            if tag in name:
                labels.append(class_id)
                per_class[class_id] += 1
                # first matching tag decides the class
                break

    # convert the label list to a numpy array
    y = np.array(labels)
    cir, rec, squ = per_class

    return y, cir, rec, squ

### Model Training

In [128]:
# define folder path for training data
# NOTE(review): the path is an empty string in this copy of the notebook; it must
# point at the training image folder before this cell can run
train_folder = r""

# call feature_extractor() to collect feature data from training set
# (one row per file: area, min_width, max_diff_width, total_sig_edges)
x_train = feature_extractor(train_folder)
# call label_extractor() to collect label data and shape counts from training set
# (labels: 0 = circle, 1 = rectangle, 2 = square)
y_train, cir, rec, squ = label_extractor(train_folder)

# initialize and fit K Nearest Neighbours model
# n_neighbors=1: a prediction is decided by the single closest training sample
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train,y_train)


### Testing Function

In [129]:
def test_function(test_folder, model):
    """Evaluate ``model`` on the images in ``test_folder`` and print a report.

    Features and labels are extracted from the folder, the classes are
    predicted with ``model`` and the per-shape accuracy, the overall accuracy
    and the confusion matrix are printed / displayed.

    Parameters
    ----------
    test_folder : str
        Folder holding the images to evaluate.
    model : object
        Fitted classifier exposing ``predict(x)``.

    Returns
    -------
    None
    """
    # call feature_extractor() to collect feature data
    x = feature_extractor(test_folder)
    # call label_extractor() to collect label data (the shape counts are not needed here)
    y = label_extractor(test_folder)[0]

    # predict classes with the model that was passed in (not a notebook global)
    y_pred = model.predict(x)

    # assess predictions using accuracy score and confusion matrix metrics
    accuracy = accuracy_score(y, y_pred)

    # lego shapes, indexed by class id
    shapes = ['circles','rectangles','squares']
    class_ids = list(range(len(shapes)))

    # rows = true class, columns = predicted class; reindex so that every class
    # has a row and a column even when it never occurs in y or y_pred
    confusion = pd.crosstab(y, y_pred).reindex(index=class_ids, columns=class_ids, fill_value=0)

    for class_id in class_ids:
        # per-shape accuracy = correctly predicted samples / all samples of that shape
        total = confusion.loc[class_id, :].sum()
        shape_count = confusion.loc[class_id, class_id]
        # a shape with no samples has no defined accuracy
        score = shape_count / total if total else float('nan')
        # print the accuracy score of the current shape
        print(f'The accuracy score on the {shapes[class_id]} is: {score*100:.2f}%')

    # print overall accuracy score and confusion matrix
    print(f'The overall accuracy score is: {accuracy*100:.2f}%')
    print('--------------------------------------')
    print('The confusion matrix is:')
    display(confusion)

### Accuracy on Training Set

In [130]:
# define folder path for training data
# NOTE(review): empty in this copy of the notebook; set it to the training image folder
train_folder = r""

# call test_function() and pass the training set path and trained K Nearest Neighbour model
# (the model is scored on the same data it was fitted on)
test_function(train_folder,knn)

The accuracy score on the circles is: 100.00%
The accuracy score on the rectangles is: 100.00%
The accuracy score on the squares is: 100.00%
The overall accuracy score is: 100.00%
--------------------------------------
The confusion matrix is:


col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,36,0,0
1,0,36,0
2,0,0,36


### Accuracy on Test Set

In [131]:
# define folder path for testing data
# NOTE(review): empty in this copy of the notebook; set it to the test image folder
test_folder = r""

# call test_function() and pass the test set path and trained K Nearest Neighbour model
test_function(test_folder,knn)

The accuracy score on the circles is: 100.00%
The accuracy score on the rectangles is: 88.89%
The accuracy score on the squares is: 83.33%
The overall accuracy score is: 90.74%
--------------------------------------
The confusion matrix is:


col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,18,0,0
1,0,16,2
2,3,0,15
