# Associative learning without a teacher - Techniques for grouping high dimensional objects #

In this tutorial, we will reduce the high-dimensional feature vector representations of our image objects to two dimensions and explore geometrical and probabilistic ways of grouping or classifying the objects independently of any prior group information (i.e., class labels). This approach to learning the structure of the data is often referred to as unsupervised learning (as opposed to supervised and semi-supervised learning): a learning task that discovers patterns in the data without an explicit teacher.

In [None]:
# organize imports
import os
import glob
import warnings
import numpy as np
import cv2
import mahotas
import h5py
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [None]:
# every image is resized to this (width, height) before feature extraction
fixed_size = (250, 250)

# directory that is searched for the input images
datapath = "dataset"

# number of bins per channel of the color histogram
bins = 8

# random seed, kept so that results can be reproduced
seed = 0

In [None]:
def fd_hu_moments(image):
    """
    Feature descriptor 1: Hu Moments for shape

    The BGR input image is converted to grayscale, its image moments are
    computed, and the Hu moments derived from them are returned as a
    flattened array.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    moments = cv2.moments(gray)
    hu_moments = cv2.HuMoments(moments)
    return hu_moments.flatten()

def fd_haralick(image):
    """
    Feature descriptor 2: Haralick Texture for surface texture

    The BGR input image is converted to grayscale and the Haralick feature
    matrix returned by mahotas is averaged over its first axis.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    texture = mahotas.features.haralick(grayscale)
    # collapse the first axis so a single feature vector is returned
    return texture.mean(axis=0)

def fd_histogram(image, mask=None):
    """
    Feature descriptor 3: Color histogram for color

    Parameters:
        image: BGR image as loaded by OpenCV.
        mask: optional mask forwarded to cv2.calcHist; with the default
            None the whole image contributes to the histogram.

    Returns:
        Flattened, normalized 3-D HSV histogram with `bins` bins per
        channel (bins ** 3 values).
    """
    # convert the image to HSV color-space
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # compute the color histogram; the caller's mask is honored instead of
    # being silently discarded
    # NOTE(review): for 8-bit images OpenCV stores hue in [0, 180), so with
    # the [0, 256] range the upper hue bins stay empty -- the range is left
    # as-is to keep the feature values unchanged; confirm before altering.
    hist = cv2.calcHist(
        [hsv], [0, 1, 2], mask, [bins, bins, bins], [0, 256, 0, 256, 0, 256]
    )
    # normalize the histogram in place
    cv2.normalize(hist, hist)
    # return the histogram as a 1-D feature vector
    return hist.flatten()

In [None]:
# Extract one concatenated feature vector (color histogram, Haralick
# texture, Hu moments) per JPEG image found in `datapath`.
image_names = []
global_features = []

for fname in sorted(glob.glob(os.path.join(datapath, "*.jpg"))):
    print(fname)
    image = cv2.imread(fname)
    if image is None:
        # cv2.imread signals an unreadable/corrupt file by returning None
        # instead of raising; skip it rather than crash in cv2.resize
        warnings.warn("could not read image, skipping: {}".format(fname))
        continue
    image = cv2.resize(image, fixed_size)
    fv_hu_moments = fd_hu_moments(image)
    fv_haralick = fd_haralick(image)
    fv_histogram = fd_histogram(image)
    global_feature = np.hstack([fv_histogram, fv_haralick, fv_hu_moments])
    global_features.append(global_feature)
    # file name up to its first dot, without the directory part;
    # os.path.basename handles the platform's path separator
    image_names.append(os.path.basename(fname).split(".")[0])

In [None]:
# rescale the features to the [0, 1] range so that no descriptor dominates
# merely because of its numeric scale
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(global_features)
rescaled_features = scaler.transform(global_features)

# from here on, global_features is the rescaled feature matrix
global_features = np.array(rescaled_features)

In [None]:
# sanity check: the collected image names, then the feature matrix shape
print(image_names, global_features.shape, sep="\n")

## Group data

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

In [None]:
# report how many objects there are and the length of each feature vector
print("Each of our {} objects are represented as a 1x{} array".format(*global_features.shape))

## Dimensionality reduction

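Dimensionality reduction is the process of reducing the number of random variables under consideration by obtaining a set of principal variables. Principal variables are a subset of the original variables and preserve, to some extent, the structure and information carried by the original variables. Principal component analysis (PCA), used below, takes a related route: rather than selecting a subset, it constructs new variables as linear combinations of the original ones, ordered by how much of the variance in the data they capture.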

In [None]:
from sklearn.decomposition import PCA

# Project every object onto the first two principal components of the
# feature space. PCA is fitted on the (n_objects, n_features) matrix itself:
# fitting on the transpose and plotting `components_` would treat objects as
# features and yield unit-norm loadings rather than projected coordinates.
pca = PCA(n_components=2, random_state=0)

# transpose so that Y keeps a (2, n_objects) layout:
# row 0 holds the first-component coordinates, row 1 the second
Y = pca.fit_transform(global_features).T
print(Y.shape)

x_coords = Y[0, :]
y_coords = Y[1, :]

# the text before the first underscore of each image name is used as the
# group name (presumably names look like "<artist>_<...>" -- verify
# against the dataset)
artists = [name.split("_")[0] for name in image_names]

# integer-encode the group names so they can be used as scatter colors
le = LabelEncoder()
labels = le.fit_transform(artists)

artist_label = list(zip(labels, artists))

plt.scatter(x_coords, y_coords, c=labels)
print(artist_label)

# annotate every point with its group name
for x, y, artist in zip(x_coords, y_coords, artists):
    plt.text(x, y, artist)

### Cluster-based grouping

K-means clustering aims to partition n observations into k clusters such that each observation belongs to the cluster with the nearest mean (the cluster centroid), which serves as a prototype of the cluster.

In [None]:
# number of clusters to look for
k = 2

# cluster the objects in the full feature space (not in the 2-D projection)
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(global_features)
cluster_ids = kmeans.labels_

# reuse the 2-D coordinates for display, colored by assigned cluster
plt.scatter(x_coords, y_coords, c=cluster_ids)

# annotate every point with its artist name
for px, py, artist in zip(x_coords, y_coords, artists):
    plt.text(px, py, artist)