# Bag of Visual Words (Features)

In [2]:
import pandas as pd
import copyreg
import pickle

import numpy as np
import cv2
import os
from scipy import ndimage
from scipy.spatial import distance
from sklearn.cluster import KMeans

In [65]:
# Parse the training split file: every line is split on whitespace into a
# (filename, rating) pair. Ratings are kept as the strings read from disk.
with open("data/SCUT-FBP5500_v2/train_test_files/split_of_60%training and 40%testing/train.txt", "r") as train:
    train_info = dict(line.split() for line in train)
    

In [68]:
# Parse the test split file: every line is split on whitespace into a
# (filename, rating) pair. Ratings are kept as the strings read from disk.
with open("data/SCUT-FBP5500_v2/train_test_files/split_of_60%training and 40%testing/test.txt", "r") as test:
    test_info = dict(line.split() for line in test)

In [66]:
# Load all the pickled dataframes (about 10 GB) and stack them into one frame.
path = 'data/SCUT-FBP5500_v2/reduced/'
# Filter out the macOS '.DS_Store' entry instead of calling list.remove(),
# which raises ValueError when the entry is not present.
fl = sorted(name for name in os.listdir(path) if name != '.DS_Store')

# Read every file first and concatenate once: calling pd.concat inside the
# loop re-copies the ever-growing frame on each iteration.
frames = [pd.read_pickle(path + p) for p in fl]
# pd.concat rejects an empty list, so fall back to an empty frame.
comp_df = pd.concat(frames) if frames else pd.DataFrame()
del frames  # drop the extra references to the per-file frames
print('done')

done


In [69]:
# Split the combined frame row by row: a row goes to the training list when
# the value in its first column (the filename) is a key of train_info, and
# to the test list otherwise.
train_df_list = []
test_df_list = []
for ix, row in comp_df.iterrows():
    # Use explicit positional access: plain row[0] on a Series with string
    # labels relies on a deprecated integer-key fallback in pandas.
    if row.iloc[0] in train_info:
        train_df_list.append(row)
    else:
        test_df_list.append(row)
        

In [20]:
# okay let's check the length to make sure we grabbed everything
len(train_df_list) == len(train_info)

True

In [70]:
# Rebuild one dataframe per split from the collected rows, reusing the
# column labels of the combined frame.
train_df, test_df = (
    pd.DataFrame(rows, columns=comp_df.columns)
    for rows in (train_df_list, test_df_list)
)

In [71]:
# now to delete the comp_df
del comp_df

In [23]:
train_df.columns

Index(['Filename', 'orb_kp', 'orb_dec', 'male', 'asian', 'PCA_1', 'PCA_2'], dtype='object')

In [29]:
train_df['orb_dec'].sample()

0    [[218, 194, 44, 191, 13, 213, 156, 173, 140, 2...
Name: orb_dec, dtype: object

In [30]:
def orb_extractor(df):
    """Collect ORB descriptors from a dataframe, per image and pooled.

    Reads the 'Filename' and 'orb_dec' columns of ``df``.

    Returns a tuple ``(image_vectors, descriptor_list)`` where

    * ``image_vectors`` maps each filename to that row's 'orb_dec' value, and
    * ``descriptor_list`` is one flat list holding every element of every
      row's 'orb_dec' value, in row order.
    """
    image_vectors = {}
    descriptor_list = []
    # Walk the two columns in lockstep; no other column is needed here.
    for filename, descriptors in zip(df['Filename'], df['orb_dec']):
        descriptor_list.extend(descriptors)
        image_vectors[filename] = descriptors
    return image_vectors, descriptor_list

In [72]:
# Extract what we need for feature engineering: for each split, a dict of
# per-image ORB descriptors keyed by filename, plus one flat list pooling
# the descriptors of every image in that split.
train_orb_dict, train_orbs = orb_extractor(train_df)
test_orb_dict, test_orbs = orb_extractor(test_df)

In [74]:
# Before we begin, checkpoint the training descriptors to disk: the pooled
# list first, then the per-image dict.
for target, payload in (
    ('data/SCUT-FBP5500_v2/train_bovw_list', train_orbs),
    ('data/SCUT-FBP5500_v2/train_orb_dict', train_orb_dict),
):
    with open(target, 'wb') as d:
        pickle.dump(payload, d)

# We have 631,264 feature vectors (ORB descriptors) for our images
## Time to cluster them into visual words with k-means (unsupervised clustering)

In [5]:
# Reload the pooled training descriptors. The context manager guarantees the
# file handle is closed once the data has been read.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('data/SCUT-FBP5500_v2/train_bovw_list', 'rb') as f:
    train_orbs = pickle.load(f)

In [11]:
# Reload the per-image training descriptors. The context manager guarantees
# the file handle is closed once the data has been read.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('data/SCUT-FBP5500_v2/train_orb_dict', 'rb') as f:
    train_orb_dict = pickle.load(f)

In [3]:
def kmeans(k, descriptor_list):
    """Cluster descriptors with k-means and return the cluster centres.

    Parameters
    ----------
    k : number of clusters to fit (one cluster centre per visual word).
    descriptor_list : the descriptors to cluster; handed unchanged to
        ``KMeans.fit``.

    Returns
    -------
    The fitted model's ``cluster_centers_`` array.
    """
    # n_init=10 is set explicitly rather than relying on the library default.
    model = KMeans(n_clusters=k, n_init=10)
    model.fit(descriptor_list)
    return model.cluster_centers_
 

In [51]:
# Fit the visual vocabulary: the k-means cluster centres are the visual words.
# Since this is one subject we can greatly reduce this space
# (presumably all images show the same kind of subject - confirm).
# Start with k = 86, the number of original points.
visual_words = kmeans(86, train_orbs)

In [52]:
# now let's assume everyone has a single identifying property
# this K is too high
#visual_words_2200 = kmeans(2200, train_orbs)

In [75]:
# Fit a 500-word vocabulary on the pooled training descriptors; these are
# the centres used to build the train and test histograms further down.
visual_words_500 = kmeans(500, train_orbs)

In [7]:
#with open('data/SCUT-FBP5500_v2/visual_words_500_centers', 'wb') as d:
#    pickle.dump(visual_words_500,d)

In [8]:
type(visual_words_500)

numpy.ndarray

In [53]:
# Now time for the histogram plotting to analyze BOVW

def image_class_normed(all_bovw, centers):
    """Build an L2-normalised visual-word histogram for every image.

    Parameters
    ----------
    all_bovw : dict mapping an image key to that image's descriptors
        (a 2-D array-like, one descriptor per row).
    centers : array of cluster centres (visual words), one per row.

    Returns
    -------
    dict mapping each image key to a 1-D float array of length
    ``len(centers)``. Entry ``j`` counts the descriptors whose nearest
    centre (Euclidean distance) is ``j``, divided by the histogram's L2
    norm. An image with no descriptors gets an all-zero histogram rather
    than NaNs, and an empty ``all_bovw`` yields an empty dict.
    """
    centers = np.asarray(centers)
    n_words = len(centers)
    keys = list(all_bovw)

    # One histogram row per image, in dict order.
    feats = np.zeros((len(keys), n_words), dtype=float)
    for row, descriptors in enumerate(all_bovw.values()):
        # Nothing to count: leave the row at zero instead of calling cdist
        # on missing or empty input.
        if descriptors is None or len(descriptors) == 0:
            continue
        # Distance of every descriptor to every centre, then the index of
        # the closest centre for each descriptor.
        dist = distance.cdist(descriptors, centers, metric='euclidean')
        nearest = np.argmin(dist, axis=1)
        # Count how many descriptors were assigned to each visual word.
        feats[row] = np.bincount(nearest, minlength=n_words)

    # L2-normalise every row; a zero norm is replaced by 1 so that an
    # all-zero histogram stays zero instead of turning into 0/0 = NaN.
    norms = np.linalg.norm(feats, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    feats /= norms

    return dict(zip(keys, feats))

In [76]:
# Build the normalised visual-word histograms for the training images.
# Result: dict mapping each image filename to its histogram vector.
bovw_train = image_class_normed(train_orb_dict, visual_words_500) 


In [77]:
# Build the normalised visual-word histograms for the test images, using
# the same vocabulary (cluster centres) as for the training images.
bovw_test = image_class_normed(test_orb_dict, visual_words_500)

In [78]:
# Attach the rating label and the BoVW histogram to every training row,
# looked up by filename (a filename missing from either dict raises KeyError).
train_df['rating'] = train_df['Filename'].apply(lambda x: train_info[x])
train_df['bovw'] = train_df['Filename'].apply(lambda x: bovw_train[x])

# Same two columns for the test rows, looked up in the test dicts.
test_df['bovw'] = test_df['Filename'].apply(lambda x: bovw_test[x])
test_df['rating'] = test_df['Filename'].apply(lambda x: test_info[x])

In [79]:
# Select the same column subset, in the same order, from both frames.
keep_cols = ['Filename', 'male', 'asian', 'bovw', 'PCA_1', 'PCA_2', 'rating']
train_df_flt = train_df[keep_cols]
test_df_flt = test_df[keep_cols]

In [80]:
# Save the filtered train and test frames as pickles.
for target, frame in (
    ('data/SCUT-FBP5500_v2/train_df', train_df_flt),
    ('data/SCUT-FBP5500_v2/test_df', test_df_flt),
):
    with open(target, 'wb') as d:
        pickle.dump(frame, d)

In [64]:
train_df_flt.shape

(2200, 7)

In [63]:
test_df_flt.shape

(3301, 6)