# Image Classification Using SVM

In [9]:
import numpy as np
import cv2
from matplotlib import pyplot as plt
from skimage import io
import glob
import re
import pandas as pd
from sklearn.utils import Bunch
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC

## Load data and return a structured dataset

In [2]:
train_csv = pd.read_csv("./data/TrainAnnotations.csv")
def load_images(path):
    files = glob.glob(path + "*.jpg")
    images = []
    hsv_data = []
    file_name = []
    annotations =[]
    
    for file in files:
        name = re.sub("./data/TrainData/", "", file)
        image = cv2.imread(file)
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)[:,:,0]

        for rows in train_csv.iterrows():
            if rows[1].file_name == name:
                annotation = rows[1].annotation
                annotations.append(annotation)
        
        file_name.append(name)
        images.append(image)
        hsv_data.append(hsv.flatten())
        
        
        
    
    file_name = np.array(file_name)
    images = np.array(images)
    hsv_data = np.array(hsv_data) 
    annotations = np.array(annotations)
    
    return Bunch(file_name = file_name,
                images = images,
                hsv_data = hsv_data,
                annotations = annotations)
        

In [3]:
image_dataset = load_images("./data/TrainData/")

In [5]:
image_dataset

{'file_name': array(['001516.jpg', '000608.jpg', '016718.jpg', ..., '018937.jpg',
        '013876.jpg', '013123.jpg'], dtype='<U10'),
 'images': array([[[[251, 255, 254],
          [255, 255, 254],
          [255, 255, 254],
          ...,
          [ 44, 108,  89],
          [ 45, 102,  81],
          [ 44,  96,  73]],
 
         [[251, 255, 254],
          [255, 255, 254],
          [255, 255, 254],
          ...,
          [ 44,  96,  86],
          [ 49,  98,  84],
          [ 52,  98,  79]],
 
         [[251, 255, 254],
          [255, 255, 254],
          [255, 255, 254],
          ...,
          [ 72, 120, 114],
          [ 83, 133, 123],
          [ 96, 148, 131]],
 
         ...,
 
         [[104, 184, 171],
          [ 76, 145, 134],
          [ 76, 135, 121],
          ...,
          [ 90, 199, 183],
          [ 91, 199, 186],
          [ 91, 202, 188]],
 
         [[ 89, 161, 141],
          [ 75, 140, 124],
          [ 76, 133, 118],
          ...,
          [ 93, 196, 181

In [44]:
# imgs =['000006.jpg', '000016.jpg', '000032.jpg', '000097.jpg', '000104.jpg', '000122.jpg', '000237.jpg', '000253.jpg',
# '000265.jpg', '000300.jpg']

In [59]:
# annotations = []
# for file in imgs:
#     for rows in train_csv.iterrows():
# #         print(rows)
#         if rows[1].file_name == file:
#             annotation = rows[1].annotation
#             annotations.append(annotation)

# imgs = np.array(imgs)
# annotations = np.array(annotations)
# x = Bunch(imgs = imgs,
#           annotations = annotations)
# print(x)


{'imgs': array(['000006.jpg', '000016.jpg', '000032.jpg', '000097.jpg',
       '000104.jpg', '000122.jpg', '000237.jpg', '000253.jpg',
       '000265.jpg', '000300.jpg'], dtype='<U10'), 'annotations': array([0, 0, 0, 4, 0, 3, 4, 0, 0, 2])}


## Split Data into Train and Test

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    image_dataset.hsv_data, image_dataset.annotations, test_size=0.2,random_state=109)

In [5]:
X_train.shape

(820, 307200)

## Train Data

In [6]:
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

## Prediction

In [7]:
y_pred = clf.predict(X_test)

## Get Accuracy

In [10]:
print(accuracy_score(y_test,y_pred))

0.8390243902439024


In [11]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.93      0.87        90
           1       0.75      0.60      0.67        35
           2       0.77      0.80      0.78        25
           3       0.96      0.77      0.86        31
           4       1.00      0.96      0.98        24

   micro avg       0.84      0.84      0.84       205
   macro avg       0.86      0.81      0.83       205
weighted avg       0.84      0.84      0.84       205



# Perform cross-validation
* Using the entire dataset

In [10]:
scores = cross_val_score(clf, image_dataset.hsv_data, image_dataset.annotations, cv=5)
scores

array([0.79710145, 0.76585366, 0.8097561 , 0.81372549, 0.76470588])

# Load test data

In [17]:
# load all test data; convert them into HSV space and collect hue channel
testfiles = glob.glob("./data/TestData/*.jpg")
testfiles.sort()
Test = [cv2.imread(f) for f in testfiles]
Test = np.array([cv2.cvtColor(img, cv2.COLOR_BGR2HSV)[:,:,0].flatten() for img in Test])

In [18]:
# making prediction on the test datab
yhat_test = clf.predict(Test)

In [20]:
# function to turn prediction into one hot coding format
def vectorize_result(nclass, j):
    """
    Return a nclass-dimensional unit vector with 1.0 in the j-th position
    and zero elsewhere
    """
    e = np.zeros((nclass,1))
    e[j] = 1.0
    return e

In [25]:
# convert test prediction into one hot-coding format
encode = [vectorize_result(5, yhat_test[i]) for i in range(yhat_test.shape[0])]
pred_df = pd.DataFrame(np.array(encode).reshape((yhat_test.shape[0], 5)).astype(np.uint8))

In [27]:
# output prediction
pred_df.to_csv("prediction.csv", header=False, index=False)