This supervised variant of NPE adds a regularization term to the loss function that encourages data points of the same class to be embedded closer to the same point.

https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6889368

In [41]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt


In [31]:
DATA_DIR = "./data/News/"

X_train = np.load(f"{DATA_DIR}train_embeddings.npy")
y_train = np.load(f"{DATA_DIR}train_labels.npy")
X_test = np.load(f"{DATA_DIR}test_embeddings.npy")
y_test = np.load(f"{DATA_DIR}test_labels.npy")

In [38]:
def p1solver(data, x, neighbors):
    Z = data[neighbors].T
    Z = Z - np.repeat(x.T, len(neighbors), axis=0).reshape(data.shape[1], len(neighbors))
    C = Z.T @ Z

    #conditioning C
    C = C + (np.eye(C.shape[0]) * 1e-5)
    
    w = np.linalg.solve(C, np.ones(len(neighbors)))
    w_final = np.zeros(len(data))
    w_final[neighbors] = w
    return w_final / np.sum(w_final)

def knn(k, data, test):
    return np.argsort(np.sum(data**2, axis=1) - 2 * test.dot(data.T), axis=0)[1:k+1]

def SNPE(X, y, n_neighbors, beta, c):
    '''
    X: n x m data matrix
    y: n x 1 labels vector
    n_neighbors: # neighbors to use for constructing KNN graph
    beta: hyperparam for importance of label information regularization term
    c: # of different classes

    returns: A: m x c matrix. Take X @ A to be the embedding of data X in c dimensions.
    '''

    W = []
    for i in range((len(X))): 
        neighbors = knn(n_neighbors, X, X[i])
        W.append(p1solver(X, X[i], neighbors))
    W = np.asarray(W)
    I = np.eye(X.shape[0])
    M = (I - W).T @ (I - W)

    H = np.zeros((c, len(y)))
    H[y, np.arange(len(y))] = 1

    return np.linalg.inv(X.T @ M @ X + beta * X.T @ X).T @ X.T @ H.T

In [39]:
classifier = KNeighborsClassifier(n_neighbors=20)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
baseline_acc = np.mean(y_test == y_pred)
print(f'Baseline accuracy: {baseline_acc}')

Baseline accuracy: 0.7192256341789052


In [44]:
betas = [1, 5, 10, 100]
n_neighbors_arr = [10, 20, 30, 50, 100, 200, 300, 500]
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
best_models = {} # maps KNN n_neighbors --> (best beta, test_accuracy) 

for n_neighbors in n_neighbors_arr:
    best_acc = 0
    for beta in betas:
        A = SNPE(X_train, y_train, n_neighbors=100, beta=beta, c=20)
        X_train_embed = X_train @ A
        classifier = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-2)
        classifier.fit(X_train_embed, y_train)
        X_val_embed = X_val @ A
        y_pred = classifier.predict(X_val_embed)
        val_acc = np.mean(y_val == y_pred)
        if val_acc > best_acc: 
            best_acc, best_A, best_beta = val_acc, A, beta

    X_test_embed = X_test @ best_A
    y_pred = classifier.predict(X_test_embed)
    test_acc = np.mean(y_test == y_pred)
    best_models[n_neighbors] = (best_beta, test_acc)
    print(f'KNN on SNPE embeddings classification accuracy, n_neighbors = {n_neighbors}: best_beta={best_beta}, test_acc={test_acc}')


KNN on SNPE embeddings classification accuracy, n_neighbors = 10: best_beta=5, test_acc=0.706675567423231
KNN on SNPE embeddings classification accuracy, n_neighbors = 20: best_beta=1, test_acc=0.7070761014686249
KNN on SNPE embeddings classification accuracy, n_neighbors = 30: best_beta=10, test_acc=0.7052069425901202
KNN on SNPE embeddings classification accuracy, n_neighbors = 50: best_beta=10, test_acc=0.7049399198931909
KNN on SNPE embeddings classification accuracy, n_neighbors = 100: best_beta=5, test_acc=0.7050734312416556
KNN on SNPE embeddings classification accuracy, n_neighbors = 200: best_beta=5, test_acc=0.7054739652870494
KNN on SNPE embeddings classification accuracy, n_neighbors = 300: best_beta=5, test_acc=0.702803738317757
KNN on SNPE embeddings classification accuracy, n_neighbors = 500: best_beta=10, test_acc=0.6975967957276369
