In [1]:
import numpy as np
from sklearn import svm
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier

In [2]:
def get_str2numb_numb2dict(vect):
    str_to_ind_dict = {}
    count = 0
    for v in vect:
        if v not in str_to_ind_dict.keys():
            str_to_ind_dict[v] = count
            count += 1
    reverse_dict = {v:k for k, v in str_to_ind_dict.items()}
    return str_to_ind_dict, reverse_dict

def apply_dict(dict_keys, X):
    res = []
    for x in X:
        res.append(dict_keys[x])
    return res

def arithmetic_round(x):
    a = int(x)
    b = x - a
    if (b < 0.5): 
        return round(x)
    else:
        return round(x + 0.01)

In [3]:
def identify_knn(X, y, test_features, test_y, ind_to_str_dict, metric='chebyshev'):
    neigh = KNeighborsClassifier(n_neighbors=2, metric=metric)
    neigh.fit(X, y) 
    res = neigh.predict(test_features) 
    predict = apply_dict(ind_to_str_dict, res)
    count = 0
    for i,j in zip(predict, test_y):
        if i == j:
            count += 1

    print("top-1 autor accuracy: {}%".format(arithmetic_round(100 * count / len(test_y))))

    predict = neigh.predict_proba(test_features)
    count = 0
    for i, res in enumerate(predict):
        ind = np.argpartition(res, -4)[-4:]
        ind = apply_dict(ind_to_str_dict , ind)
        if test_y[i] in ind:
            count += 1

    print("top-5 autor accuracy: {}%".format(arithmetic_round(100 * count / len(test_y))))

# Load codebook embeddings with kernel_size 13

In [4]:
train_features = np.load('data/train_embeddings_13.npy')
val_features = np.load('data/val_embeddings_13.npy')
test_features = np.load('data/test_embeddings_13.npy')

train_y = np.load('data/train_y.npy')
val_y = np.load('data/val_y.npy')
test_y = np.load('data/test_y.npy')

In [5]:
str_to_ind_dict, ind_to_str_dict = get_str2numb_numb2dict(train_y)

X, y = train_features, train_y 
y = np.array(apply_dict(str_to_ind_dict, y))

# Fit KNN:

### Euclidean distance:

In [6]:
identify_knn(X, y, test_features, test_y, ind_to_str_dict, metric='euclidean')

top-1 autor accuracy: 37%
top-5 autor accuracy: 64%


### Manhattan distance:

In [7]:
identify_knn(X, y, test_features, test_y, ind_to_str_dict, metric='manhattan')

top-1 autor accuracy: 43%
top-5 autor accuracy: 69%


### Chebyshev distance:

In [8]:
identify_knn(X, y, test_features, test_y, ind_to_str_dict, metric='chebyshev')

top-1 autor accuracy: 21%
top-5 autor accuracy: 39%


# Increase the number of training data

In [9]:
X = np.concatenate((train_features, val_features), axis=0)
y = np.concatenate((train_y, val_y)) 
y = np.array(apply_dict(str_to_ind_dict, y))

# Fit KNN:

### Euclidean distance:

In [10]:
identify_knn(X, y, test_features, test_y, ind_to_str_dict, metric='euclidean')

top-1 autor accuracy: 54%
top-5 autor accuracy: 68%


### Manhattan distance:

In [11]:
identify_knn(X, y, test_features, test_y, ind_to_str_dict, metric='manhattan')

top-1 autor accuracy: 56%
top-5 autor accuracy: 75%


### Chebyshev distance:

In [12]:
identify_knn(X, y, test_features, test_y, ind_to_str_dict, metric='chebyshev')

top-1 autor accuracy: 29%
top-5 autor accuracy: 46%


# Add OBIFs

In [13]:
train_obifs = np.load('OBIFs/train_obifs.npy')
val_obifs = np.load('OBIFs/val_obifs.npy')
test_obifs = np.load('OBIFs/test_obifs.npy')


train_features = np.concatenate((train_features, train_obifs), axis = 1)
val_features = np.concatenate((val_features, val_obifs), axis = 1)
test_features = np.concatenate((test_features, test_obifs), axis = 1)

In [14]:
X, y = train_features, train_y 
y = np.array(apply_dict(str_to_ind_dict, y))

# Fit KNN with OBIFs:

### Euclidean distance:

In [15]:
identify_knn(X, y, test_features, test_y, ind_to_str_dict, metric='euclidean')

top-1 autor accuracy: 40%
top-5 autor accuracy: 76%


### Manhattan distance:

In [16]:
identify_knn(X, y, test_features, test_y, ind_to_str_dict, metric='manhattan')

top-1 autor accuracy: 51%
top-5 autor accuracy: 80%


### Chebyshev distance:

In [17]:
identify_knn(X, y, test_features, test_y, ind_to_str_dict, metric='chebyshev')

top-1 autor accuracy: 35%
top-5 autor accuracy: 64%


# Increase the number of training data

In [18]:
X = np.concatenate((train_features, val_features), axis=0)
y = np.concatenate((train_y, val_y)) 
y = np.array(apply_dict(str_to_ind_dict, y))

# Fit KNN with OBIFs:

### Euclidean distance:

In [19]:
identify_knn(X, y, test_features, test_y, ind_to_str_dict, metric='euclidean')

top-1 autor accuracy: 71%
top-5 autor accuracy: 89%


### Manhattan distance:

In [20]:
identify_knn(X, y, test_features, test_y, ind_to_str_dict, metric='manhattan')

top-1 autor accuracy: 75%
top-5 autor accuracy: 92%


### Chebyshev distance:

In [21]:
identify_knn(X, y, test_features, test_y, ind_to_str_dict, metric='chebyshev')

top-1 autor accuracy: 60%
top-5 autor accuracy: 74%
