# ML Notebook 3: Unsupervised Learning, Bag of Visual Features ("Words")

In [2]:
import pandas as pd
import copyreg
import pickle

import numpy as np
import cv2
import os
from scipy import ndimage
from scipy.spatial import distance
from sklearn.cluster import KMeans

from tqdm import tqdm 

In [65]:
# Load the predefined 60/40 split. Every line of a split file is split on
# whitespace into a (key, value) pair; the filename keys are later used to look
# up each image's rating.

# Training
with open("data/SCUT-FBP5500_v2/train_test_files/split_of_60%training and 40%testing/train.txt", "r") as train:
    train_info = dict(line.split() for line in train)

# Test
with open("data/SCUT-FBP5500_v2/train_test_files/split_of_60%training and 40%testing/test.txt", "r") as test:
    test_info = dict(line.split() for line in test)

In [66]:
# Load all the dataframes (about 10 GB, per the original note) written by
# 01_Unsupervised_PCA_ML_pipeline.ipynb and stack them into a single frame.
path = 'data/SCUT-FBP5500_v2/reduced/'
# Filter out the macOS Finder metadata file instead of calling list.remove(),
# which raises ValueError on machines where '.DS_Store' does not exist.
fl = sorted(name for name in os.listdir(path) if name != '.DS_Store')

# NOTE: pd.read_pickle unpickles arbitrary objects -- only use it on trusted files.
# Concatenate once at the end: growing a frame inside the loop re-copies every
# previously loaded row on each iteration.
frames = [pd.read_pickle(path + p) for p in fl]
comp_df = pd.concat(frames) if frames else pd.DataFrame()
del frames
print('done')

done


In [69]:
# Go through each row and route it by membership in the training split.
# Any row whose filename is not a key of train_info is treated as a test row.
train_df_list = []
test_df_list = []
for _, row in comp_df.iterrows():
    # Look the filename up by label: positional `row[0]` on a label-indexed
    # Series is deprecated in pandas 2.x. 'Filename' is the first column.
    if row['Filename'] in train_info:
        train_df_list.append(row)
    else:
        test_df_list.append(row)

# okay let's check the length to make sure we grabbed everything
len(train_df_list) == len(train_info)

In [70]:
# Rebuild one dataframe per split from the collected rows, keeping the
# column layout of the combined frame.
train_df, test_df = (
    pd.DataFrame(rows, columns=comp_df.columns)
    for rows in (train_df_list, test_df_list)
)

In [71]:
# The combined frame is not used again in this notebook; drop the name
# to reduce the memory overhead.
del comp_df

In [23]:
# Show the feature (column) names for reference; as the last expression in the
# cell, its value is displayed as the cell output.
train_df.columns

Index(['Filename', 'orb_kp', 'orb_dec', 'male', 'asian', 'PCA_1', 'PCA_2'], dtype='object')

# SIFT ORB: Visual Concept Detection
## The bag-of-features model is one of the most popular and promising approaches for extracting the underlying semantics from image databases. Essentially, this classification approach borrows from the bag-of-words concept and uses visual keypoints as features ("words") to build a dictionary of visual descriptors for an image class database. We will then use this dictionary to model the "topics" of an image. We hope that these topics will give us a paradigm in which we can analyze beauty.


In [30]:
# Create a helper function
def orb_extractor(df):
    """
    Gather the ORB descriptors stored in an image dataframe.

    Params:
    df : dataframe with a 'Filename' column and an 'orb_dec' column that
         holds the descriptors of each image

    Return:
    [0] image_vectors : dict mapping each image filename to that image's descriptors
    [1] descriptor_list : flat list of every descriptor of every image, in row order
    """
    # flatten the per-image descriptor collections into one long list
    descriptor_list = [
        descriptor
        for image_descriptors in df['orb_dec']
        for descriptor in image_descriptors
    ]
    # filename -> descriptors of that image
    image_vectors = dict(zip(df['Filename'], df['orb_dec']))

    return (image_vectors, descriptor_list)

In [72]:
# Extract what we need for feature engineering from each split.
# Each call returns (dict of filename -> descriptors, flat list of all descriptors).
train_orb_dict, train_orbs = orb_extractor(train_df)
test_orb_dict, test_orbs = orb_extractor(test_df)

In [74]:
# Checkpoint the training descriptors before the expensive clustering step:
# first the flat descriptor list, then the per-image dictionary.
for target, payload in (
    ('data/SCUT-FBP5500_v2/train_bovw_list', train_orbs),
    ('data/SCUT-FBP5500_v2/train_orb_dict', train_orb_dict),
):
    with open(target, 'wb') as sink:
        pickle.dump(payload, sink)

# Unsupervised classification using Kmeans
## We have extracted 631,264 feature vectors (words) for our images
### Next we will take these features (words) and model them into topics via clustering. Essentially, we will plot all the keypoints and then find N cluster centers around the keypoints. Each cluster center represents a "topic", and each topic has many keypoints that model its semantics.

### We use K-means for clustering here instead of a density-based approach because we believe that the outliers that are lost in density-based clustering might be very informative in our beauty paradigm. Conceptually, think of these outliers as rarely used words that strongly reference a certain topic.


In [5]:
# Only use training data for dictionary creation otherwise there is leakage.
# NOTE: pickle.load can execute arbitrary code from the file -- only load
# files that were written by this notebook.
# The context managers close the handles instead of leaving them open
# until garbage collection.
with open('data/SCUT-FBP5500_v2/train_bovw_list', 'rb') as fh:
    train_orbs = pickle.load(fh)
with open('data/SCUT-FBP5500_v2/train_orb_dict', 'rb') as fh:
    train_orb_dict = pickle.load(fh)

In [3]:
def kmeans(k, descriptor_list, random_state=None):
    """
    Cluster the descriptors with k-means and return the cluster centers.

    Params:
    k : number of clusters (visual words) to fit
    descriptor_list : the descriptors to cluster; passed unchanged to KMeans.fit
    random_state : optional seed forwarded to KMeans so a vocabulary can be
                   reproduced; the default (None) keeps the unseeded behaviour

    Return:
    visual_words : the fitted model's cluster_centers_ (the central points)
    """
    # named `model` so the local does not shadow this function's own name
    model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    model.fit(descriptor_list)
    return model.cluster_centers_

In [51]:
# Fit a first vocabulary: the returned central points are the visual words.
# Since this is one subject we can greatly reduce this space;
# let's start with the number of original points, i.e. 86.
visual_words = kmeans(86, train_orbs)

In [52]:
# now let's assume everyone has a single identifying property
# this K is too high for Kmeans and our dataset
#visual_words_2200 = kmeans(2200, train_orbs)

## We landed on representing 500 topics for our image database

In [75]:
# We landed on representing 500 topics.
# visual_words_500 holds the cluster centers returned by kmeans, one per topic.
visual_words_500 = kmeans(500, train_orbs)

In [7]:
# this cell exports our cluster centers
# do not uncomment, as this will overwrite the result of a time-consuming process
#with open('data/SCUT-FBP5500_v2/visual_words_500_centers', 'wb') as d:
#    pickle.dump(visual_words_500,d)

In [53]:
# Now it is time to generate the topic frequencies for each image -- analyzing BOVF (Bag Of Visual Features).
def image_class_normed(all_bovw, centers):
    """
    Build an L2-normalized visual-topic histogram for every image.

    Every descriptor of an image is assigned to its nearest cluster center
    (euclidean distance) and the assignments are counted per center.
    Essentially, this generates a document-term frequency matrix, but for images.

    Params:
    all_bovw: dict mapping image filename -> that image's descriptors
              (one descriptor per row, as required by scipy's cdist)
    centers: array holding the central points (visual topics) of the k-means
             clustering, one center per row

    Return:
    feat_dict : dict mapping image filename -> histogram with len(centers) bins,
                scaled to unit euclidean length. An image without descriptors
                gets an all-zero histogram; an empty input gives an empty dict.
    """
    n_centers = len(centers)
    dict_keys = []
    feats = []
    for key, value in all_bovw.items():
        dict_keys.append(key)

        if value is None or len(value) == 0:
            # no descriptors for this image: nothing to assign, keep empty bins
            feats.append(np.zeros(n_centers))
            continue

        # distance of every descriptor to every center
        dist = distance.cdist(value, centers, metric='euclidean')

        # index of the closest center (visual topic) for each descriptor
        bin_assignment = np.argmin(dist, axis=1)

        # count how many descriptors fell into each topic
        feats.append(np.bincount(bin_assignment, minlength=n_centers).astype(float))

    if not feats:
        # np.linalg.norm(..., axis=1) cannot handle the empty 1-D array
        # that np.asarray([]) would produce
        return {}

    # normalize each histogram to unit euclidean length
    feats = np.asarray(feats)
    feats_norm = np.linalg.norm(feats, axis=1, keepdims=True)
    # an all-zero histogram has norm 0; divide it by 1 so it stays zero
    # instead of turning into NaN
    feats_norm[feats_norm == 0] = 1.0
    feats = feats / feats_norm

    # feats now holds all the image features, in the same order as dict_keys
    return dict(zip(dict_keys, feats))

In [76]:
# Creates histograms for train data
# returns a dict keyed by image filename; each value is that image's normalized topic histogram
bovw_train = image_class_normed(train_orb_dict, visual_words_500) 

# Creates histograms for test data, binned against the same cluster centers
bovw_test = image_class_normed(test_orb_dict, visual_words_500)


In [78]:
# Attach the aggregated user rating and the BOVF histogram to every train row;
# both are looked up by image filename.
train_names = train_df['Filename']
train_df['rating'] = train_names.apply(lambda name: train_info[name])
train_df['bovw'] = train_names.apply(lambda name: bovw_train[name])

# Same for the test rows (histogram first, then rating).
test_names = test_df['Filename']
test_df['bovw'] = test_names.apply(lambda name: bovw_test[name])
test_df['rating'] = test_names.apply(lambda name: test_info[name])

In [79]:
# Keep only the modelling columns. The explicit .copy() marks each filtered
# frame as a standalone object, so assigning columns on it later does not
# raise pandas' SettingWithCopyWarning.
flt_columns = ['Filename', 'male', 'asian', 'bovw', 'PCA_1', 'PCA_2', 'rating']
train_df_flt = train_df[flt_columns].copy()
test_df_flt = test_df[flt_columns].copy()

In [80]:
# save our files
# Can be read in from train_df & test_df

#with open('data/SCUT-FBP5500_v2/train_df', 'wb') as d:
#    pickle.dump(train_df_flt,d)
    
#with open('data/SCUT-FBP5500_v2/test_df', 'wb') as d:
#    pickle.dump(test_df_flt,d)

In [None]:
# The feature mapping left missing values in the binary indicator columns;
# replace them with 0 in both splits (test first, then train).
for frame in (test_df_flt, train_df_flt):
    for flag in ('male', 'asian'):
        frame[flag] = frame[flag].fillna(0)

# Final Steps
## We need to explode our feature arrays into columns

### Training Data: All Features

In [None]:
n = 250  # chunk row size
list_df_train = [train_df_flt[i:i+n] for i in range(0, train_df_flt.shape[0], n)]

# set compression: each batch is written as a zip archive holding one out.csv
compression_opts = dict(method='zip', archive_name='out.csv')

# The array-valued columns need to be unravelled into one scalar column each.
# Fixed: the loop used to iterate `list_df`, a name that is never defined in
# this notebook; the chunks built above live in `list_df_train`.
for ix, batch in enumerate(list_df_train):

    print(f"\nstarting batch {ix}")
    # collect plain lists and build the frame once, because repeated concat is expensive
    tmp_df = [
        [
            row['rating'],
            row['Filename'],
            row['male'],
            row['asian'],
            *np.ravel(row['bovw']),
            *np.ravel(row['PCA_1']),
            *np.ravel(row['PCA_2']),
        ]
        for _, row in batch.iterrows()
    ]

    # build output; only the first four columns get names, the flattened
    # feature columns keep their integer positions
    rebuilt = pd.DataFrame(tmp_df)
    rebuilt.rename(columns={0: 'rating', 1: 'filename', 2: 'male', 3: 'asian'}, inplace=True)

    # Save output
    with open(f"data/SCUT-FBP5500_v2/train_batch_00{ix}.zip", 'wb') as out:
        rebuilt.to_csv(out, index=False, compression=compression_opts)

    print(f'done with batch {ix}\n')

#rebuilt = pd.concat([rebuilt, pd.DataFrame([tmp])])
#

### Train Data: Bag of Visual Features and no PCA

In [None]:
# Variant that keeps only the BOVW histogram (no PCA columns).
# Each output row is: rating, filename, male, asian, then the flattened histogram.
tmp_df = [
    [row['rating'], row['Filename'], row['male'], row['asian'], *np.ravel(row['bovw'])]
    for _, row in train_df_flt.iterrows()
]

# build output; the flattened feature columns keep their integer positions as names
rebuilt = pd.DataFrame(tmp_df).rename(
    columns={0:'rating',1:'filename',2:'male',3:'asian'}
)

# Save output
with open(f"data/SCUT-FBP5500_v2/train_bovw.zip", 'wb') as out:
    rebuilt.to_csv(out, index=False, compression=compression_opts)
print('done')

### Test Data: All Features

In [64]:
n = 250  # rows per chunk
list_df_test = [test_df_flt[start:start + n] for start in range(0, test_df_flt.shape[0], n)]

# every batch is written as a zip archive that contains a single out.csv
compression_opts = {'method': 'zip', 'archive_name': 'out.csv'}

# unravel the array-valued columns so each value gets its own column
for ix, batch in enumerate(list_df_test):

    print(f"\nstarting batch {ix}")
    # gather plain lists first; building the frame once is cheaper than concatenating per row
    flat_rows = [
        [
            row['rating'],
            row['Filename'],
            row['male'],
            row['asian'],
            *np.ravel(row['bovw']),
            *np.ravel(row['PCA_1']),
            *np.ravel(row['PCA_2']),
        ]
        for _, row in batch.iterrows()
    ]

    # name the four leading columns; the feature columns keep their integer positions
    rebuilt = pd.DataFrame(flat_rows).rename(
        columns={0:'rating',1:'filename',2:'male',3:'asian'}
    )

    # write the batch
    with open(f"data/SCUT-FBP5500_v2/test_batch_00{ix}.zip", 'wb') as out:
        rebuilt.to_csv(out, index=False, compression=compression_opts)

    print(f'done with batch {ix}\n')

#rebuilt = pd.concat([rebuilt, pd.DataFrame([tmp])])
#

(2200, 7)

### Test Data: Bag of Visual Features and no PCA

In [None]:
# Variant that keeps only the BOVW histogram (no PCA columns).
# Each output row is: rating, filename, male, asian, then the flattened histogram.
tmp_df = [
    [row['rating'], row['Filename'], row['male'], row['asian'], *np.ravel(row['bovw'])]
    for _, row in test_df_flt.iterrows()
]

# build output; the flattened feature columns keep their integer positions as names
rebuilt = pd.DataFrame(tmp_df).rename(
    columns={0:'rating',1:'filename',2:'male',3:'asian'}
)

# Save output
with open(f"data/SCUT-FBP5500_v2/test_bovw.zip", 'wb') as out:
    rebuilt.to_csv(out, index=False, compression=compression_opts)
print('done')

# Done