# Image Classification Using SVM and Random Forest

In [1]:
import numpy as np
import cv2
from matplotlib import pyplot as plt
from skimage import io
import glob
import re
import pandas as pd
from sklearn.utils import Bunch
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

## Load data and return a structured dataset

In [2]:
train_csv = pd.read_csv("./data/TrainAnnotations.csv")
def load_images(path):
    files = glob.glob(path + "*.jpg")
    images = []
    hsv_data = []
    file_name = []
    annotations =[]
    
    for file in files:
        name = re.sub("./data/TrainData/", "", file)
        image = cv2.imread(file)
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

        for rows in train_csv.iterrows():
            if rows[1].file_name == name:
                annotation = rows[1].annotation
                annotations.append(annotation)
        
        file_name.append(name)
        images.append(image.flatten())
        hsv_data.append(hsv.flatten())   
    
    file_name = np.array(file_name)
    images = np.array(images)
    hsv_data = np.array(hsv_data) 
    annotations = np.array(annotations)
    
    return Bunch(file_name = file_name,
                images = images,
                hsv_data = hsv_data,
                annotations = annotations)      

In [3]:
image_dataset = load_images("./data/TrainData/")

In [5]:
image_dataset

{'file_name': array(['001516.jpg', '000608.jpg', '016718.jpg', ..., '018937.jpg',
        '013876.jpg', '013123.jpg'], dtype='<U10'),
 'images': array([[251, 255, 254, ..., 109, 197, 184],
        [144, 151, 146, ..., 175, 189, 178],
        [117, 173, 160, ...,  45,  67,  65],
        ...,
        [122, 171, 155, ...,  82, 126, 113],
        [ 74, 191, 164, ..., 192, 220, 214],
        [ 94, 179, 159, ...,  84, 111, 115]], dtype=uint8),
 'hsv_data': array([[ 38,   4, 255, ...,  34, 114, 197],
        [ 51,  12, 151, ...,  54,  19, 189],
        [ 37,  83, 173, ...,  33,  84,  67],
        ...,
        [ 40,  73, 171, ...,  39,  89, 126],
        [ 37, 156, 191, ...,  36,  32, 220],
        [ 37, 121, 179, ...,  26,  69, 115]], dtype=uint8),
 'annotations': array([1, 3, 0, ..., 0, 0, 2])}

## Split Data into Train and Test

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    image_dataset.hsv_data, image_dataset.annotations, test_size=0.3,random_state=109)

In [6]:
X_train.shape

(717, 921600)

## SVM Classifiers Using Linear, Polynomial and RBF Kernels

In [7]:
classification_reports = []
for kernel in ('linear', 'poly', 'rbf'):
    clf = SVC(kernel=kernel)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    classification_reports.append(classification_report(y_test, y_pred))

## Classification Report for Linear Kernel

In [8]:
print(classification_reports[0])

              precision    recall  f1-score   support

           0       0.83      0.97      0.89       135
           1       0.82      0.57      0.67        56
           2       0.85      0.77      0.81        44
           3       0.90      0.86      0.88        42
           4       0.97      0.97      0.97        31

    accuracy                           0.85       308
   macro avg       0.87      0.83      0.84       308
weighted avg       0.85      0.85      0.85       308



## Classification Report for Polynomial Kernel

In [9]:
print(classification_reports[1])

              precision    recall  f1-score   support

           0       0.84      0.96      0.89       135
           1       0.77      0.61      0.68        56
           2       0.87      0.77      0.82        44
           3       0.85      0.83      0.84        42
           4       0.97      0.94      0.95        31

    accuracy                           0.85       308
   macro avg       0.86      0.82      0.84       308
weighted avg       0.85      0.85      0.84       308



## Classification Report for RBF Kernel

In [10]:
print(classification_reports[2])

              precision    recall  f1-score   support

           0       0.67      0.98      0.80       135
           1       0.59      0.30      0.40        56
           2       1.00      0.30      0.46        44
           3       0.76      0.74      0.75        42
           4       1.00      0.90      0.95        31

    accuracy                           0.72       308
   macro avg       0.80      0.64      0.67       308
weighted avg       0.75      0.72      0.68       308



# Random Forest Classifier

In [12]:
classification_reports = []
for n_estimator in (100, 300, 500, 700):
    clf = RandomForestClassifier(n_estimators=n_estimator, 
                               random_state=109, 
                               max_features = 'sqrt',
                               n_jobs=-1)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    classification_reports.append(classification_report(y_test, y_pred))


## Classification Report for Random Forest Classifier with 100 Trees

In [13]:
print(classification_reports[0])

              precision    recall  f1-score   support

           0       0.66      0.97      0.79       135
           1       0.55      0.29      0.38        56
           2       0.93      0.32      0.47        44
           3       0.72      0.67      0.69        42
           4       1.00      0.90      0.95        31

    accuracy                           0.70       308
   macro avg       0.77      0.63      0.66       308
weighted avg       0.72      0.70      0.67       308



## Classification Report for Random Forest Classifier with 300 Trees

In [14]:
print(classification_reports[1])

              precision    recall  f1-score   support

           0       0.66      0.99      0.79       135
           1       0.50      0.21      0.30        56
           2       0.94      0.36      0.52        44
           3       0.76      0.67      0.71        42
           4       1.00      0.90      0.95        31

    accuracy                           0.70       308
   macro avg       0.77      0.63      0.65       308
weighted avg       0.72      0.70      0.67       308



## Classification Report for Random Forest Classifier with 500 Trees

In [15]:
print(classification_reports[2])

              precision    recall  f1-score   support

           0       0.67      0.99      0.79       135
           1       0.52      0.23      0.32        56
           2       0.94      0.36      0.52        44
           3       0.74      0.67      0.70        42
           4       1.00      0.90      0.95        31

    accuracy                           0.71       308
   macro avg       0.77      0.63      0.66       308
weighted avg       0.72      0.71      0.67       308



## Classification Report for Random Forest Classifier with 700 Trees

In [16]:
print(classification_reports[3])

              precision    recall  f1-score   support

           0       0.67      0.99      0.80       135
           1       0.54      0.23      0.33        56
           2       0.94      0.36      0.52        44
           3       0.74      0.67      0.70        42
           4       1.00      0.90      0.95        31

    accuracy                           0.71       308
   macro avg       0.78      0.63      0.66       308
weighted avg       0.73      0.71      0.67       308



# Training with SVM Linear Kernal (Best Performing Model)

In [7]:
clf_opt = SVC(kernel='linear')
clf_opt.fit(image_dataset.hsv_data, image_dataset.annotations)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

## Perform 5-Fold Cross-Validation on the Entire Dataset

In [20]:
scores = cross_val_score(clf_opt, image_dataset.hsv_data, image_dataset.annotations, cv=5)
scores

array([0.87317073, 0.85365854, 0.83414634, 0.81463415, 0.80487805])

## Load Test Data

In [8]:
testfiles = glob.glob("./data/TestData_new/*.jpg")
testfiles.sort()
test_images = [cv2.imread(f) for f in testfiles]
test_hsv = np.array([cv2.cvtColor(img, cv2.COLOR_BGR2HSV).flatten() for img in test_images])

## Predictions on Test Data

In [10]:
ypred_test = clf_opt.predict(test_hsv)

## Turn Prediction into One-Hot Encoding

In [11]:
# Taken from Homework2b
def vectorize_result(nclass, j):
    """
    Return a nclass-dimensional unit vector with 1.0 in the j-th position
    and zero elsewhere
    """
    e = np.zeros((nclass,1))
    e[j] = 1.0
    return e

## Convert Test Prediction into One-Hot Encoding and Save Predictions

In [12]:
encode = [vectorize_result(5, ypred_test[i]) for i in range(ypred_test.shape[0])]
pred_df = pd.DataFrame(np.array(encode).reshape((ypred_test.shape[0], 5)).astype(np.uint8))
# Save predictions to csv
pred_df.to_csv("predict.csv", header=False, index=False)