In [2]:
"""
Authors      : Aditya Jain and Safwan Jamal
Date started : November 15, 2022
About        : Convex Optimization project; generate training data lists for different lambda
"""

import os
import glob
import numpy as np
import pickle
import cvxpy as cp
import pandas as pd
import random
import seaborn as sns

#### Save list for test images

In [33]:
def test_list(data_dir, save_dir, no_imgs_per_class=40):
    """
    saves test images list 
    
    Args:
        data_dir         : root directory containing the data
        no_imgs_per_class: number of images to save; Optional; default=150 
        save_dir         : directory to save the list
    Returns:
        None
    """
    
    data = []    
    for category in os.listdir(data_dir): 
        image_names = os.listdir(data_dir + category)[:no_imgs_per_class]        
        for image in image_names:
            data.append([image, category])
    
    data_df = pd.DataFrame(data, columns =['image', 'category'])
    data_df.to_csv(save_dir + 'test.csv', index=False)
    
cifar_data_dir = './cifar-10-images/test/'
save_list_dir  = './image_list_cifar/'
test_list(cifar_data_dir, save_list_dir)

#### Save list for all train images

In [34]:
def train_list_all(data_dir, save_dir, no_imgs_per_class=150):
    """
    saves training images list for all of the training set
    
    Args:
        data_dir         : root directory containing the data        
        save_dir         : directory to save the list
        no_imgs_per_class: number of images to save; Optional; default=150 
    Returns:
        None
    """
    
    data = []    
    for category in os.listdir(data_dir): 
        image_names = sorted(os.listdir(data_dir + category))[:no_imgs_per_class]        
        for image in image_names:
            data.append([image, category])
    
    data_df = pd.DataFrame(data, columns =['image', 'category'])
    data_df.to_csv(save_dir + 'train_full.csv', index=False)
    
cifar_data_dir = './cifar-10-images/train/'
save_list_dir = './image_list_cifar/'
train_list_all(cifar_data_dir, save_list_dir)

#### Save list for random train images

In [35]:
def train_list_random(data_dir, save_dir, percent, no_imgs_per_class=150):
    """
    saves training images list for a random subset of the training set
    
    Args:
        data_dir : root directory containing the data 
        save_dir : directory to save the list
        percent  : percentage of samples to sample from training set
        no_imgs_per_class: number of images in full training set; Optional; default=150 
    Returns:
        None
    """
    
    data = []    
    for category in os.listdir(data_dir): 
        image_names  = sorted(os.listdir(data_dir + category))[:no_imgs_per_class] 
        random.shuffle(image_names)
        total_images = len(image_names)
        image_names  = image_names[:int(percent*total_images)]
        for image in image_names:
            data.append([image, category])
    
    data_df = pd.DataFrame(data, columns =['image', 'category'])
    data_df.to_csv(save_dir + 'train_random_per_' + str(int(percent*100)) + '.csv', index=False)
    
cifar_data_dir = './cifar-10-images/train/'
save_list_dir = './image_list_cifar/'
percent        = 0.8
train_list_random(cifar_data_dir, save_list_dir, percent)

#### Save list for exemplar train images

In [24]:
def train_list_cvopt_exemplar(image_data_dir, diss_data_dir, category, save_dir, percent, lambda_fac):
    """
    saves training images list for a particular class given the percetage value needed
    
    Args:
        image_data_dir   : root directory containing the image data
        diss_data_dir    : directory containing the dissimilarity data
        category         : class for which optimization needs to be done
        save_dir         : directory to save the list
        percent          : percentage of training points needed
        lambda_fac       : lambda factor to be applied to lambda_max
    """
    
    filename  = open(diss_data_dir + category + '_dissimilarity_matrix_150x150.pickle', 'rb')
    D         = pickle.load(filename)
    n         = len(D)
    
    # calculate lambda_max
    lambda_max = 0
    for i in range(n):
        for j in range(i+1, n):
            lambda_cur = np.linalg.norm(D[i, :] - D[j, :], ord=1)/2  #l-infinity norm
        if lambda_cur>lambda_max:
            lambda_max = lambda_cur
    
    ## convex optimization
    # variable definitions
    Z             = cp.Variable((n, n))
    lambda_t      = lambda_fac*lambda_max

    # objective function
    cost_encoding = cp.trace(D.T@Z)        # cost of encoding all data points using representatives
    cost_no_repr  = 0                      # cost associated with no. of representatives
    for i in range(n):
        cost_no_repr += cp.max(Z[i, :])  # l-infinity norm
    cost_no_repr = lambda_t*cost_no_repr

    # objective function
    objective    = cp.Minimize(cost_encoding + cost_no_repr)

    # constraints
    # probab. should be >=0
    # probabilities should sum to one for every column
    constraints = [Z>=0, np.ones((1,n))@Z == np.ones((1,n))]

    # optimization program
    prob = cp.Problem(objective, constraints)
    prob.solve()    
    Z = np.round(Z.value, 2)
    
    # fetching the specific image names from original data
    filename  = open(diss_data_dir + category + '_image_list_150x150.pickle', 'rb')
    data      = pickle.load(filename)
    orig_image_set = sorted(data)
    
    # check for non-zero rows
    num_non_zeros_rows = 0
    img_idx_list       = []
    data               = [] 
    for i in range(n):
        z_row = Z[i, :]
        if np.any(z_row):
            num_non_zeros_rows += 1
            img_idx_list.append(i)
            data.append([orig_image_set[i], category]) 
            
    print(f'Number of non-zero rows are {num_non_zeros_rows}')
#     print(img_idx_list)
    
    # save training list
    data_df = pd.DataFrame(data, columns =['image', 'category'])
    data_df.to_csv(save_dir + 'train_exemplar_' + category + '_per_' + str(int(percent*100)) + '.csv', index=False)
    

cifar_data_dir = './cifar-10-images/train/'
diss_data_dir  = './dissimilarity_data_cifar/'
category       = 'airplane'
save_list_dir  = './image_list_cifar/'
percent        = 0.2
lambda_fac     = 0.05
train_list_cvopt_exemplar(cifar_data_dir, diss_data_dir, category, save_list_dir, percent, lambda_fac)

Number of non-zero rows are 10
