# ENGR 418 Project - Stage 2
## Sorting lego blocks using image data

- Group #20
- Jesse Alele 82807728
- Zach Kelly 41637836

Import Python modules

In [11]:
import numpy as np 
import matplotlib.pyplot as plt
import os
from numpy import asarray
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from PIL import Image

Define function to get training and testing data from raw image files

In [23]:
# Side length (in pixels) of the square images fed to the classifier.
# int() truncates sqrt(4067) ~= 63.77 down to 63, so both dimensions are 63.
im_width = im_length = int(np.sqrt(4067))

def get_data(folder, im_width, im_length):
    """Load the labelled images in ``folder`` as flattened grayscale features.

    The class of each image is taken from its file name: a name containing
    'cir' is a circle (label 0), 'rec' a rectangle (label 1) and 'squ' a
    square (label 2), checked in that order. Files whose names contain none
    of these tokens are skipped.

    Rows are ordered by class label and then by file name, so the returned
    arrays always hold all circles first, then rectangles, then squares,
    regardless of the order in which the operating system lists the folder.

    Parameters
    ----------
    folder : str
        Path of the directory holding the image files.
    im_width, im_length : int
        Size each image is resized to; every feature row has
        ``im_width * im_length`` entries.

    Returns
    -------
    x : numpy.ndarray, shape (n_images, im_width * im_length)
        One flattened grayscale image per row.
    y : numpy.ndarray, shape (n_images,)
        Class label (0, 1 or 2) of each row.
    num_shape : int
        Number of images per class (identical for all three classes).

    Raises
    ------
    ValueError
        If the folder does not hold the same number of circles, rectangles
        and squares.
    """
    class_tokens = (('cir', 0), ('rec', 1), ('squ', 2))

    # Pair every recognised file with its label; the first matching token wins
    labelled = []
    for name in os.listdir(folder):
        for token, label in class_tokens:
            if token in name:
                labelled.append((label, name))
                break

    # os.listdir gives no ordering guarantee, so group the rows by class here
    labelled.sort()

    num_cir, num_rec, num_squ = (
        sum(1 for label, _ in labelled if label == wanted)
        for wanted in (0, 1, 2)
    )

    # A single per-class count is only meaningful when all classes are equal;
    # check before any image is opened so the problem is reported immediately
    if not (num_cir == num_rec == num_squ):
        raise ValueError('The number of circles, rectangles and squares in the specified folder are not equal')
    num_shape = num_cir

    x = np.empty((len(labelled), im_width * im_length))
    y = np.empty(len(labelled))

    for i, (label, name) in enumerate(labelled):
        y[i] = label
        # The context manager closes the underlying file once the pixels are read
        with Image.open(os.path.join(folder, name)) as im:
            pixels = asarray(im.convert('L').resize((im_length, im_width)))
        x[i, :] = pixels.reshape(-1)

    return x, y, num_shape

## Raw features
Training analysis

In [28]:
# Define training folder path
training_folder = r"C:\Users\zachk\Documents\ubco\year5\ubc_term1\engr418\project\stage2\lego_dataset_2\training"
# Load the flattened grayscale training images, their labels and the per-class image count
x_train, y_train, num_shape = get_data(training_folder,im_width,im_length) 

# Fit a logistic regression classifier on the raw pixel features
model = LogisticRegression(max_iter=1000) 
model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)

# Pick out each class by its label value (0 = circle, 1 = rectangle, 2 = square)
# instead of by row position, so the per-class scores do not depend on the
# order in which the image files were listed
cir_rows = y_train == 0
rec_rows = y_train == 1
squ_rows = y_train == 2

print(f'The confusion matrix is:\n {confusion_matrix(y_train,y_train_pred)}')
print(f'The accuracy score on the circles is: {accuracy_score(y_train[cir_rows],y_train_pred[cir_rows])*100:.2f}%')
print(f'The accuracy score on the rectangles is: {accuracy_score(y_train[rec_rows],y_train_pred[rec_rows])*100:.2f}%')
print(f'The accuracy score on the squares is: {accuracy_score(y_train[squ_rows],y_train_pred[squ_rows])*100:.2f}%')
print(f'The accuracy score on the training data set is: {accuracy_score(y_train,y_train_pred)*100:.2f}%')

The confusion matrix is:
 [[27  0  0]
 [ 0 27  0]
 [ 0  0 27]]
The accuracy score on the circles is: 100.00%
The accuracy score on the rectangles is: 100.00%
The accuracy score on the squares is: 100.00%
The accuracy score on the training data set is: 100.00%


Testing analysis

In [29]:
# Location of the held-out testing images
testing_folder = r"C:\Users\zachk\Documents\ubco\year5\ubc_term1\engr418\project\stage2\lego_dataset_2\testing"
# Load the testing features and labels; num_shape becomes the per-class count of the testing set
x_test, y_test, num_shape = get_data(
    folder=testing_folder,
    im_width=im_width,
    im_length=im_length,
)

def test_function(folder, im_width, im_length):
    """Score the already-fitted global ``model`` on the images in ``folder``.

    The images are loaded with ``get_data`` and classified by ``model``;
    ``model`` must have been fitted on features of the same
    ``im_width * im_length`` size.

    Parameters
    ----------
    folder : str
        Path of the directory holding the labelled image files.
    im_width, im_length : int
        Size each image is resized to before it is flattened.

    Returns
    -------
    test_confusion : numpy.ndarray
        Confusion matrix of true labels against predicted labels.
    test_accuracy : float
        Accuracy over every image in the folder.
    cir_accuracy, rec_accuracy, squ_accuracy : float
        Accuracy over the circle (label 0), rectangle (label 1) and
        square (label 2) images respectively.
    """
    # Reuse the shared loader rather than repeating its body here
    x, y, _ = get_data(folder, im_width, im_length)

    y_test_pred = model.predict(x)

    # Select each class from the labels loaded for *this* folder, by label
    # value, so the scores depend neither on a global array nor on row order
    cir_rows = y == 0
    rec_rows = y == 1
    squ_rows = y == 2

    test_accuracy = accuracy_score(y, y_test_pred)
    cir_accuracy = accuracy_score(y[cir_rows], y_test_pred[cir_rows])
    rec_accuracy = accuracy_score(y[rec_rows], y_test_pred[rec_rows])
    squ_accuracy = accuracy_score(y[squ_rows], y_test_pred[squ_rows])
    test_confusion = confusion_matrix(y, y_test_pred)

    return test_confusion, test_accuracy, cir_accuracy, rec_accuracy, squ_accuracy

# Evaluate the fitted model on the testing images
test_confusion, test_accuracy, cir_accuracy, rec_accuracy, squ_accuracy = test_function(testing_folder,im_width,im_length)


# Report the testing results; the final line is the overall accuracy on the
# testing images, so it is labelled as the testing dataset
print(f'The confusion matrix is:\n {test_confusion}')
print(f'The accuracy score on the circles is: {cir_accuracy*100:.2f}%')
print(f'The accuracy score on the rectangles is: {rec_accuracy*100:.2f}%')
print(f'The accuracy score on the squares is: {squ_accuracy*100:.2f}%')
print(f'The accuracy score on the entire testing dataset is: {test_accuracy*100:.2f}%')


The confusion matrix is:
 [[16  7  4]
 [ 2 23  2]
 [10  6 11]]
The accuracy score on the circles is: 59.26%
The accuracy score on the rectangles is: 85.19%
The accuracy score on the squares is: 40.74%
The accuracy score on the entire training dataset is: 61.73%


# Image sizes (in pixels) to compare; widths and lengths are paired by position
im_width_test = [20, 25, 50, 63]
im_length_test = [20, 25, 50, 63]
# Results collected in the same order as the sizes above
test_confusions = []
test_accuracies = []

for im_width, im_length in zip(im_width_test, im_length_test):

    # A fitted classifier only accepts the feature count it was trained on,
    # so refit on training images resized to the current dimensions before
    # test_function uses the global model
    x_train_sized, y_train_sized, _ = get_data(training_folder, im_width, im_length)
    model = LogisticRegression(max_iter=1000)
    model.fit(x_train_sized, y_train_sized)

    test_confusion, test_accuracy, cir_accuracy, rec_accuracy, squ_accuracy = test_function(testing_folder, im_width, im_length)

    # The result lists start empty, so grow them with append
    test_confusions.append(test_confusion)
    test_accuracies.append(test_accuracy)

### Feature Engineering

In [15]:
# 