# Pattern recognition CW2

In [1]:
from scipy.io import loadmat
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
import time

### Import file with 6 main components:

In [4]:
""" 
camId : which camera was used to get the shot (1 or 2)
filelist: names of the images (with format x_label_camId_index.png)
labels: class of the image (which person's image is it?)
query_idx: indexes of test set
gallery_idx: indexes of test set used for kNN 
train_idx: indexes of training and validation set
"""   
train_idxs = loadmat('cuhk03_new_protocol_config_labeled.mat')

### Import file with the feature vectors of all images:

In [5]:
import json
with open('PR_data/feature_data.json', 'r') as f:
    features = json.load(f)
features_np = np.array(features) #list of features converted to an array 
features_np.shape

(14096, 2048)

In [241]:
train_idxs['labels'].shape

array([[   1],
       [   1],
       [   1],
       ...,
       [1467],
       [1467],
       [1467]], dtype=uint16)

In [240]:
train_idxs['gallery_idx'].shape

array([[   21],
       [   23],
       [   24],
       ...,
       [14062],
       [14064],
       [14065]], dtype=uint16)

In [242]:
train_idxs['train_idx']

array([[    1],
       [    2],
       [    3],
       ...,
       [14094],
       [14095],
       [14096]], dtype=uint16)

In [9]:
7368+5328

12696

In [10]:
train_idxs['query_idx'].shape

(1400, 1)

In [11]:
12696+1400

14096

## kNN Classification for query set on gallery set

In [108]:
def extract_info_from_filename(image_index): 
    name = str(train_idxs['filelist'][image_index][0][0])
    [dc, label, camId, index] = name.split('_')
    return label, camId

def delete_images(query_image, query_index, gallery_images, gallery_labels):
    gallery_names = train_idxs['filelist'][train_idxs['gallery_idx'].flatten()]
    query_label, query_camId = extract_info_from_filename(query_index)
    new_gallery_images = []
    new_gallery_labels = []
    for img, name in zip(gallery_images, gallery_names):
        [dc, label_g, camId_g, index] = str(name[0][0]).split('_')
        if [int(label_g), int(camId_g)] != [int(query_label), int(query_camId)]:
            new_gallery_images.append(img)
            new_gallery_labels.append(int(label_g))
    return new_gallery_images, new_gallery_labels

def score_rank(query_label, rank_k, new_gallery_labels):
    #print(rank_k)
    rank_k_labels = np.array(new_gallery_labels)[rank_k #extract the labels which are of rank k
    return (query_label[0] in rank_k_labels) #return true if query label in rank k, else false

In [229]:
train_idxs['gallery_idx'].flatten()

array([   21,    23,    24, ..., 14062, 14064, 14065], dtype=uint16)

In [109]:
def NNClassification_deletions(query_image, query_index, gallery_images, gallery_labels, k):
    neigh = NearestNeighbors(n_neighbors=k, n_jobs=-1)
    new_gallery_images, new_gallery_labels = delete_images(query_image, query_index, gallery_images, gallery_labels)
    neigh.fit(new_gallery_images)   #new_gallery_labels)
    distances, indices = neigh.kneighbors(query_image, k)
    return distances, indices, new_gallery_labels #neigh.score(query_image, query_label), neigh.predict(query_image), 

In [110]:
gallery_labels = train_idxs['labels'][train_idxs['gallery_idx'].flatten()]
query_labels = train_idxs['labels'][train_idxs['query_idx'].flatten()]

query_images = features_np[train_idxs['query_idx'].flatten()]
gallery_images = features_np[train_idxs['gallery_idx'].flatten()]

def get_accuracy_for_k_ranks(k):
    list_of_truths = []
    start = time.time() #time tracking - start time of process
    for index, query_index in enumerate(train_idxs['query_idx'].flatten().tolist()):
        if(index < 101):
            if (index % 10 == 0):
                print("Index: ", index, " & time taken: ", time.time() - start)

            #perform NN Classification and extract top k ranks 
            dist, rank_k_indices, new_gallery_labels = NNClassification_deletions(query_images[index].reshape(1, 2048), query_index, gallery_images, gallery_labels, k)
            #calculate the score (true/false) for top k images and query image recognition
            score = score_rank(query_labels[index], rank_k_indices, new_gallery_labels)
            #create a list of scores to get overall accuracy later on
            list_of_truths.append(score)
    
    #over-all accuracy  
    acc = sum(list_of_truths)/len(list_of_truths)

In [111]:
train_idxs['gallery_idx'].flatten()

array([   21,    23,    24, ..., 14062, 14064, 14065], dtype=uint16)

In [235]:
train_idxs['filelist'][204]

array([array(['1_022_2_06.png'], dtype='<U14')], dtype=object)

In [228]:
query_images

array([[0.98361254, 0.33195475, 0.26549727, ..., 0.02350602, 0.08491972,
        0.00514889],
       [1.30696166, 0.22219124, 0.62375975, ..., 0.02954952, 0.37348923,
        0.05303315],
       [0.29322624, 0.55588913, 0.06961903, ..., 0.15892459, 0.21527511,
        0.09551907],
       ...,
       [0.7593497 , 0.24537075, 2.19513273, ..., 0.47061139, 1.35971546,
        0.46198806],
       [0.48667279, 0.49618778, 0.30824399, ..., 0.04450084, 1.28799629,
        0.37683338],
       [0.05719076, 0.60635394, 0.58300596, ..., 0.40994745, 1.37061644,
        0.55975795]])

In [359]:
gallery_labels = train_idxs['labels'][train_idxs['gallery_idx'].flatten()]
query_labels = train_idxs['labels'][train_idxs['query_idx'].flatten()]

query_images = features_np[train_idxs['query_idx'].flatten()]
gallery_images = features_np[train_idxs['gallery_idx'].flatten()]

N=100


def remove_indices(n_indices, query_index):
    query_label, query_camId = extract_info_from_filename(query_index)
    n_names = train_idxs['filelist'][n_indices]
    #k_images = features_np[k_indices]
    #print(k_names.shape, k_names[0], k_images[0])
    final_n_indices = []
    final_n_labels = []
    for index, name in zip(n_indices, n_names):
        [dc, label_g, camId_g, index_g] = str(name[0][0]).split('_')
        #print(str(name[0][0]).split('_'))
        #print(index)
        if [int(label_g), int(camId_g)] != [int(query_label), int(query_camId)]:
            final_n_indices.append(index)
            final_n_labels.append(int(label_g))
    return final_n_indices, final_n_labels

def get_accuracy(k, N):
    list_of_truths = []
    start = time.time() #time tracking - start time of process
    neigh = NearestNeighbors(n_neighbors=N, n_jobs=-1)
    neigh.fit(gallery_images)   #new_gallery_labels)
    for index, query_index in enumerate(train_idxs['query_idx'].flatten().tolist()):
        if (index % 10 == 0):
            print("Index: ", index, " & time taken: ", time.time() - start)
        N_distances, N_indices = neigh.kneighbors(query_images[index].reshape(1, 2048), N)
        #print(query_labels[index], query_index, N_indices)
        topN_gallery_images = gallery_images[N_indices[0]]
        topN_gallery_indices = []
        for image in topN_gallery_images:
            topN_gallery_indices.append(np.where(np.all(features_np == image, axis=1))[0][0])
        print('topN indices', topN_gallery_indices)
        print('n_indices', N_indices[0])
        reduced_topN_indices, reduced_topN_labels = remove_indices(topN_gallery_indices, query_index)
        if query_labels[index][0] in reduced_topN_labels[:k]:
            list_of_truths.append(True)
        else:
            list_of_truths.append(False)
        if index == 30:
            break
    #over-all accuracy  
    acc = sum(list_of_truths)/len(list_of_truths) 
    print(acc)

In [360]:
features_np[[2, 3]].shape

(2, 2048)

In [361]:
%prun get_accuracy(100, 100)

Index:  0  & time taken:  1.3450255393981934
topN indices [23, 21, 483, 7193, 480, 13227, 481, 13941, 478, 486, 13738, 13231, 7189, 1141, 13736, 5039, 24, 28, 485, 7061, 13739, 26, 129, 13938, 1140, 6326, 13229, 13230, 13940, 7064, 7060, 482, 734, 733, 130, 25, 990, 13735, 484, 13490, 13733, 7063, 127, 6430, 943, 1139, 7192, 4012, 13555, 946, 945, 818, 6432, 949, 6434, 3732, 8235, 7195, 6436, 5098, 2592, 6325, 5185, 5037, 7051, 13234, 6518, 7191, 9826, 5699, 4604, 736, 13226, 7194, 5280, 2462, 6596, 5698, 1651, 1753, 7196, 13051, 13050, 7488, 2460, 7074, 1751, 2465, 5701, 7494, 13224, 4610, 4011, 7489, 10197, 2459, 6520, 13152, 8558, 2277]
n_indices [   1    0  150 2775  147 5048  148 5284  146  153 5218 5051 2772  387
 5217 1968    2    5  152 2725 5219    4   31 5282  386 2420 5049 5050
 5283 2727 2724  149  243  242   32    3  339 5216  151 5151 5214 2726
   30 2464  309  385 2774 1590 5161  312  311  271 2465  315 2467 1479
 3213 2777 2469 2014 1043 2419 2037 1966 2717 5053 2495 27

KeyboardInterrupt: 

In [None]:
get_accuracy_for_k_ranks(5)

In [41]:
%prun get_accuracy_for_k_ranks(10)

Index:  0  & time taken:  0.0
Index:  10  & time taken:  23.64891242980957
Index:  20  & time taken:  47.139809370040894
Index:  30  & time taken:  71.5410213470459
Index:  40  & time taken:  95.63419580459595
Index:  50  & time taken:  119.49739098548889
Index:  60  & time taken:  142.7541127204895
Index:  70  & time taken:  165.4745020866394
Index:  80  & time taken:  188.19400596618652
Index:  90  & time taken:  211.17892265319824
Index:  100  & time taken:  233.9523696899414
 