In [1]:
from sklearn.datasets import load_files       
from keras.utils import np_utils
import numpy as np
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import tarfile 
import scipy.io
import cv2
import os

%matplotlib inline 

Using TensorFlow backend.


In [6]:
# creation of train, validation and test folders

import os
import random
from shutil import copyfile

def img_train_test_split(img_source_dir, train_size, validation_size):
    """
    Randomly splits images over a train and validation folder, while preserving the folder structure
    
    Parameters
    ----------
    img_source_dir : string
        Path to the folder with the images to be split. Can be absolute or relative path   
        
    train_size : float
        Proportion of the original images that need to be copied in the subdirectory in the train folder
    """    
    if not (isinstance(img_source_dir, str)):
        raise AttributeError('img_source_dir must be a string')
        
    if not os.path.exists(img_source_dir):
        raise OSError('img_source_dir does not exist')
        
    if not (isinstance(train_size, float)):
        raise AttributeError('train_size must be a float')
        
    # Set up empty folder structure if not exists
    if not os.path.exists('../data'):
        os.makedirs('../data')
    else:
        if not os.path.exists('../data/train'):
            os.makedirs('../data/train')
        if not os.path.exists('../data/validation'):
            os.makedirs('../data/validation')
        if not os.path.exists('../data/test'):
            os.makedirs('../data/test')
            
    # Get the subdirectories in the main image folder
    subdirs = [subdir for subdir in os.listdir(img_source_dir) if os.path.isdir(os.path.join(img_source_dir, subdir))]

    for subdir in subdirs:
        subdir_fullpath = os.path.join(img_source_dir, subdir)
        if len(os.listdir(subdir_fullpath)) == 0:
            print(subdir_fullpath + ' is empty')
            break

        train_subdir = os.path.join('../data/train', subdir)
        validation_subdir = os.path.join('../data/validation', subdir)
        test_subdir = os.path.join('../data/test', subdir)

        # Create subdirectories in train and validation folders
        if not os.path.exists(train_subdir):
            os.makedirs(train_subdir)

        if not os.path.exists(validation_subdir):
            os.makedirs(validation_subdir)
            
        if not os.path.exists(test_subdir):
            os.makedirs(test_subdir)

        train_counter = 0
        validation_counter = 0
        test_counter = 0

        # Randomly assign an image to train or validation folder
        for filename in os.listdir(subdir_fullpath):
            if filename.endswith(".jpg") or filename.endswith(".png"): 
                fileparts = filename.split('.')

                if random.uniform(0, 1) <= train_size:
                    copyfile(os.path.join(subdir_fullpath, filename),
                             os.path.join(train_subdir, str(train_counter) + '.' + fileparts[1]))
                    train_counter += 1
                elif random.uniform(0, 1) <= validation_size:
                    copyfile(os.path.join(subdir_fullpath, filename),
                             os.path.join(validation_subdir, str(validation_counter) + '.' + fileparts[1]))
                    validation_counter += 1
                else :
                    copyfile(os.path.join(subdir_fullpath, filename),
                             os.path.join(test_subdir, str(test_counter) + '.' + fileparts[1]))
                    test_counter += 1
                    

img_train_test_split('../data/Images', 0.7, 0.4)

In [2]:
# define function to load train, test, and validation datasets
def load_dataset(path):
    data = load_files(path)
    dog_files = np.array(data['filenames'])
    dog_targets = np_utils.to_categorical(np.array(data['target']))
    return dog_files, dog_targets

# load train, test, and validation datasets
train_files, train_targets = load_dataset('../data/train')
valid_files, valid_targets = load_dataset('../data/validation')
test_files, test_targets = load_dataset('../data/test')

# load list of dog names
# the [20:-1] portion simply removes the filepath and folder number
dog_names = [item[25:-1] for item in sorted(glob("../data/train/*/"))]

# print statistics about the dataset
print('There are %d total dog categories.' % len(dog_names))
print('There are %s total dog images.\n' % len(np.hstack([train_files, valid_files, test_files])))
print('There are %d training dog images.' % len(train_files))
print('There are %d validation dog images.' % len(valid_files))
print('There are %d test dog images.'% len(test_files))

There are 120 total dog categories.
There are 20580 total dog images.

There are 14531 training dog images.
There are 4232 validation dog images.
There are 1817 test dog images.


In [3]:
# return a dictionary that holds all images category by category. 
def load_images_from_folder(folder):
    images = {}
    for filename in os.listdir(folder):
        category = []
        path = folder + "/" + filename
        for dog in os.listdir(path):
            #img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            img = cv2.imread(path + "/" + dog,0)
            if img is not None:
                category.append(img)
        images[filename] = category
    return images

images = load_images_from_folder('../data/train')  # take all images category by category 
test = load_images_from_folder("../data/validation") # take test images 

In [4]:
len(images.get('n02085620-Chihuahua'))

96

In [None]:
# Return an array whose first index holds the decriptor_list without an order
# And the second index holds the sift_vectors dictionary which holds the descriptors but this is seperated 
# class by class
def sift_features(images):
    sift_vectors = {}
    descriptor_list = []
    sift = cv2.xfeatures2d.SIFT_create()
    for key,value in images.items():
        features = []
        for img in value:
            keypoints, desc = sift.detectAndCompute(img,None)
           
            desc = desc.astype('int32')
            descriptor_list.extend(desc)
            features.append(desc)
        sift_vectors[key] = features
    return [descriptor_list, sift_vectors]

sifts = sift_features(images) 
# Takes the descriptor list which is unordered one
descriptor_list = sifts[0] 
# Takes the sift features that is seperated class by class for train data
all_bovw_feature = sifts[1] 
# Takes the sift features that is seperated class by class for test data
test_bovw_feature = sift_features(test)[1] 

In [None]:
# A k-means clustering algorithm who takes 2 parameter which is number 
# of cluster(k) and the other is descriptors list(unordered 1d array)
# Returns an array that holds central points.

from sklearn.cluster import MiniBatchKMeans

def kmeans(k, descriptor_list):
    kmeans = MiniBatchKMeans(n_clusters=500,
                             random_state=0,
                             batch_size=300)
    kmeans.fit(descriptor_list)
    visual_words = kmeans.cluster_centers_ 
    return visual_words
    
# Takes the central points which is visual words    
visual_words = kmeans(500, descriptor_list) 

In [24]:
type(all_bovw_feature)

dict

In [23]:
all_bovw_feature

{'n02085620-Chihuahua': [array([[  1,   4,  11, ...,   0,   0,   0],
         [108, 119,   0, ...,  12,   0,   0],
         [ 12,   1,   0, ...,   1,   0,   0],
         ...,
         [105,  23,   0, ...,   0,   0,   3],
         [  5,  41,  21, ...,   0,   0,   0],
         [ 45,   4,   0, ...,   1,   1,   5]], dtype=int32),
  array([[  3,   2,   0, ...,   4,  14,  47],
         [ 74, 137,  11, ...,   0,   0,   0],
         [  5,   0,   1, ...,   2,   1,   1],
         ...,
         [  0,   0,   0, ...,  98,   2,   3],
         [  0,   0,   0, ...,  16,   1,   0],
         [ 19, 118,  41, ...,   0,   0,   0]], dtype=int32),
  array([[ 1, 16, 38, ..., 50, 12,  0],
         [ 4, 20, 89, ...,  4,  0,  0],
         [ 4, 13, 19, ..., 10,  0,  0],
         ...,
         [ 0,  0,  2, ...,  4,  1,  2],
         [24,  4,  3, ...,  0,  5, 23],
         [32,  1,  0, ..., 35,  0,  0]], dtype=int32),
  array([[ 5,  0,  0, ...,  0,  0,  0],
         [59,  0,  0, ...,  0,  0,  0],
         [ 1,  0, 

In [33]:
visual_words[0]

array([  9.43359266,   8.9952281 ,  10.41348386,  12.99935013,
        11.57903337,   9.09712758,   8.53675474,   9.24839854,
        30.20682548,  12.44699853,   9.84087491,  10.15283064,
        10.182279  ,   9.53738604,  12.01622816,  27.28527397,
        98.1526821 ,  10.25825798,   3.76671556,   2.55324285,
         2.2393375 ,   2.69214772,  14.07360232, 123.31045175,
        62.02395232,   6.35898769,   1.80884565,   1.91436582,
         3.5512561 ,   6.88980077,  29.04472956, 114.20131088,
        13.99461537,  11.49312067,  10.35011233,  11.63261972,
        11.7909464 ,  10.8437529 ,  10.61744991,  12.66182669,
        79.07599755,  20.9637373 ,  11.87905008,  12.15231075,
        12.74237332,  11.00404776,   9.29983103,  22.99422545,
       144.45951316,  24.36266409,   2.34405927,   1.3235234 ,
         1.153889  ,   1.27142247,   5.96143491, 110.41409659,
        80.78678723,  14.15054682,   5.0647641 ,   6.66585588,
        10.23293165,   9.99364985,  17.17602169,  80.14

In [21]:
visual_words

array([[  9.43359266,   8.9952281 ,  10.41348386, ...,   1.86776092,
          1.74794363,   5.83012422],
       [ 47.36617867, 102.16407068,  44.01468458, ...,  14.89018874,
         10.29170845,  11.07580721],
       [ 76.56027613,  32.14735099,   9.33875856, ...,  15.89965204,
          4.08682231,   4.41449658],
       ...,
       [ 16.47514245,  15.37514245,  16.3997151 , ...,  21.86887464,
         53.37001425,  84.91196581],
       [ 21.81871778,  40.94479187,  39.09031888, ...,  10.94209471,
          8.60344663,  10.92357266],
       [ 21.72171295,  21.58451859,  20.84124901, ...,  13.05623637,
         15.24567227,  16.38938958]])

In [8]:
self.mega_histogram = np.array([np.zeros(self.n_clusters) for i in range(n_images)])
old_count = 0
for i in range(n_images):
    l = len(descriptor_list[i])
    for j in range(l):
        if kmeans_ret is None:
            idx = self.kmeans_ret[old_count+j]
        else:
            idx = kmeans_ret[old_count+j]
        self.mega_histogram[i][idx] += 1
        old_count += l
    print "Vocabulary Histogram Generated"

SyntaxError: Missing parentheses in call to 'print' (<ipython-input-8-97b9541f8a05>, line 12)

In [None]:
for key,value in all_bovw_feature.items():
        category = []
        for img in value:
            histogram = np.zeros(len(visual_words))
            c = 0
            for each_feature in img:
                c += 1
                print(c)
                print(each_feature)

In [9]:
def find_index(image, center):
    count = 0
    ind = 0
    for i in range(len(center)):
        if(i == 0):
            count = distance.euclidean(image, center[i]) 
            #count = L1_dist(image, center[i])
        else:
            dist = distance.euclidean(image, center[i]) 
            #dist = L1_dist(image, center[i])
            if(dist < count):
                ind = i
                count = dist
    return ind

In [10]:
# Takes 2 parameters. The first one is a dictionary that holds the descriptors that are separated class by class 
# And the second parameter is an array that holds the central points (visual words) of the k means clustering
# Returns a dictionary that holds the histograms for each images that are separated class by class. 
from scipy import ndimage
from scipy.spatial import distance

def image_class(all_bovw, centers):
    dict_feature = {}
    for key,value in all_bovw.items():
        print(key)
        category = []
        for img in value:
            histogram = np.zeros(len(centers))
            for each_feature in img:
                ind = find_index(each_feature, centers)
                histogram[ind] += 1
            category.append(histogram)
        dict_feature[key] = category
    return dict_feature
    
# Creates histograms for train data    
bovw_train = image_class(all_bovw_feature, visual_words) 
# Creates histograms for test data
bovw_test = image_class(test_bovw_feature, visual_words) 

n02107908-Appenzeller
n02093991-Irish_terrier
n02096585-Boston_bull
n02086910-papillon
n02110185-Siberian_husky
n02108000-EntleBucher
n02108915-French_bulldog
n02086079-Pekinese
n02109525-Saint_Bernard
n02106166-Border_collie
n02089867-Walker_hound
n02088238-basset
n02096177-cairn
n02099601-golden_retriever
n02091831-Saluki
n02097130-giant_schnauzer
n02085620-Chihuahua
n02086646-Blenheim_spaniel
n02092002-Scottish_deerhound
n02098105-soft-coated_wheaten_terrier
n02106030-collie
n02095314-wire-haired_fox_terrier
n02104029-kuvasz
n02094114-Norfolk_terrier
n02100236-German_short-haired_pointer
n02105855-Shetland_sheepdog
n02115913-dhole
n02111277-Newfoundland
n02099267-flat-coated_retriever
n02097047-miniature_schnauzer
n02113624-toy_poodle
n02091134-whippet
n02100583-vizsla
n02110063-malamute
n02089078-black-and-tan_coonhound
n02097209-standard_schnauzer
n02115641-dingo
n02110806-basenji
n02094258-Norwich_terrier
n02105162-malinois
n02099429-curly-coated_retriever
n02086240-Shih-Tzu
n020

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler



In [25]:
x = bovw_train.get('n02107908-Appenzeller')
y = bovw_train.get('n02088238-basset')

In [27]:
len(x)

112

In [30]:
x[1]

array([ 3.,  2.,  4.,  1.,  0.,  2.,  4.,  0.,  1.,  3.,  6.,  3.,  2.,
        7.,  1.,  3.,  2.,  3.,  4.,  5.,  5.,  9.,  7.,  2.,  8.,  2.,
        0., 10.,  5.,  6.,  8.,  1.,  6.,  2.,  4.,  2.,  3.,  0.,  7.,
        0.,  2.,  4.,  4.,  0.,  5.,  7.,  1.,  2.,  1.,  6.,  3.,  0.,
        3.,  0.,  3.,  2.,  1.,  1.,  9.,  4.,  0.,  1.,  2.,  2.,  0.,
        2.,  1.,  5.,  3.,  4.,  0.,  7.,  1.,  3.,  2.,  2.,  0.,  2.,
        5.,  5.,  0.,  5.,  3.,  0.,  1.,  3.,  3.,  4.,  4.,  3.,  0.,
        3.,  1.,  3.,  6.,  7.,  2.,  8.,  0.,  4.,  5.,  1.,  4.,  3.,
        6.,  0.,  2.,  2.,  3.,  0.,  8.,  8.,  0.,  2.,  5., 12.,  5.,
       10.,  2.,  2.,  0.,  6.,  7.,  1.,  2.,  6.,  1.,  3.,  5.,  1.,
        4.,  3.,  0.,  1.,  0., 11.,  2.,  7.,  4.,  4.,  4.,  5.,  3.,
        1.,  2.,  4.,  1.,  3.,  0.,  1.,  1.,  6.,  4.,  4.,  2.,  2.,
        3.,  1.,  3.,  6.,  3.,  8.,  5.,  1.,  2.,  4.,  1.,  0.,  3.,
        3.,  2.,  2.,  1.,  0.,  7.,  3.,  6.,  5.,  0.,  3.,  6