## Comparing SVM and VGG16


In [1]:

#mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#Import necessary packages and libraries

import zipfile
import os
from PIL import Image

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score


## Data Preprocessing


In [6]:
m_files = os.listdir('/content/drive/MyDrive/fashion-data/men')
w_files = os.listdir('/content/drive/MyDrive/fashion-data/women')

#merge both datasets 
all_files = m_files + w_files
print('total:',len(all_files))

total: 2512


In [7]:
#convert image files into arrays
path = '/content/drive/MyDrive/fashion-data'
def read_img(file,name,pth):
    images = np.zeros((14700))
    for image in file: 
        arr = Image.open(pth+'/'+name+'/'+image) #get img array 
        img = arr.resize((70,70)) #resize for standard sizing
        arr = np.asarray(img) #turn into array
        img.close()
        flatten = arr.flatten() #flatten
        images = np.vstack((images,flatten)) #stack 
    images = np.delete(images, 0, 0)
    return images

#read images from files 
men = read_img(m_files[:500],'men',path)
women = read_img(w_files[:500],'women',path)

#turn into dataframes and add class labels
men = pd.DataFrame(men)
men['label'] = 0
women = pd.DataFrame(women)
women['label'] = 1

#merge and shuffle
fashion = men.append(women,ignore_index=True)
fashion = shuffle(fashion)

#separate dependent and independent variables
X = fashion.loc[:, fashion.columns != 'label']
y = fashion['label']

#see dataframe
fashion.head()

#split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42)

print('done baby')

done baby


## 1. Support Vector Machines
- Train a support vector classifier using each of the following kernels:
    - Linear
    - Poly (degree = 2)
    - RBF

- If you encounter any issues with training time or memory issues, then you may use a reduced dataset, but carefully detail why and how you reduced the dataset. Unnecessarily reducing the dataset will result in reduced grades!
- Report your error rates on the testing dataset for the different kernels.


In [14]:
## LINEAR
svm_linear = svm.SVC(kernel='linear')
svm_linear.fit(X_train,y_train)
print(svm_linear.score(X_test,y_test))


0.552


In [13]:
## POLY
svm_poly = svm.SVC(kernel='poly', degree=2)
svm_poly.fit(X_train,y_train)
print(svm_poly.score(X_test,y_test))

0.638


In [15]:
## RBF
svm_rbf = svm.SVC(kernel='rbf')
svm_rbf.fit(X_train,y_train)
print(svm_rbf.score(X_test,y_test))


0.652


## 2. Deep Neural Networks
Using Keras load the VGG16 network. This is the convolutional neural network which won ImageNet 2014, and the accompanying paper is available here, if you want to read more about it. Keras code to perform this step is available here, under the heading "Extract features with VGG16."

- Perform transfer learning using VGG16.
- What loss function did you choose, and why?
- What performance do you achieve on your test set and how does this compare to the performance you were originally able to achieve with the linear methods?
- (optional) If you want, you can also perform a "fine-tuning" step. In this step we unfreeze the weights and then perform a few more iterations of gradient descent. This fine tuning can help the network specialize its performance in the particular task that it is needed for. Now, measure the new performance on your test set and compare it to the performance from the previous step.

In [58]:
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import decode_predictions

### DATA PREPROCESSING ### 

## TRAINING VGG on one image only

#preprocess one image
man = path+'/men/'+m_files[1]
image = load_img(man, target_size=(224, 224))

image = img_to_array(image)
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
image = preprocess_input(image)

#LOAD VGG16
model = VGG16()

# predict the probability across all output classes for
yhat = model.predict(image)

# convert the probabilities to class labels
label = decode_predictions(yhat)
# retrieve the most likely result, e.g. highest probability
label = label[0][0]
# print the classification
print('%s (%.2f%%)' % (label[1], label[2]*100))

shoe_shop (34.26%)


In [57]:
#MULTIPLE IMAGES 

#prepaare images in bulk for VGG model
def read_img_vgg(files,class_,path_):
  imgs = np.zeros((1, 224, 224, 3))
  for i in files: 
    ipath = path_+class_+i
    im = load_img(ipath, target_size=(224, 224))
    # convert the image pixels to a numpy array
    im = img_to_array(im)
    # reshape data for the model
    im = im.reshape((1, im.shape[0], im.shape[1], im.shape[2]))
    # prepare the image for the VGG model
    im = preprocess_input(im)
    #stack onto the overall list
    imgs = np.vstack((imgs,im))
  return imgs

path_ = '/content/drive/MyDrive/fashion-data'

men_ = read_img_vgg(m_files[:500],'/men/',path_)
women_ = read_img_vgg(w_files[:500],'/women/',path_)


## Comparison 