In [9]:
%load_ext autoreload
%autoreload 2

import numpy as np
# import pandas as pd

# import pathlib
# import string
import os
import sys
# Absolute path of the sibling '../NIR' directory, resolved against the current working directory.
module_path = os.path.abspath('../NIR')
# Register the directory on the import path; the path used to be computed but never used.
# NOTE(review): presumably this is where the project modules imported below
# (models, methods, embedding, loader, metric) live — confirm.
# Appending (rather than prepending) keeps already-resolvable packages from being shadowed.
if module_path not in sys.path:
    sys.path.append(module_path)

import torch

from sklearn.neighbors import KDTree
from numpy.linalg import norm
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from models import Segmentation

import methods
import embedding
import loader
import metric

# class AttrDict(dict):
#     __getattr__ = dict.__getitem__
#     __setattr__ = dict.__setitem__
#     __delattr__ = dict.__delitem__

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Создание модели UV-Net Segmentation

In [2]:
# Location of the trained segmentation checkpoint (machine-specific absolute path).
checkpoint = 'D:/NIR/results/segmentation100/0925/222631/best.ckpt'
# All inference in this notebook runs on the CPU.
device = torch.device('cpu:0')
# map_location remaps tensors stored in the checkpoint onto `device` while loading, so a
# checkpoint written on a GPU machine can still be restored on a CPU-only one.
# NOTE(review): assumes Segmentation.load_from_checkpoint is the PyTorch Lightning classmethod — confirm.
# Only the wrapped `.model` attribute is kept and moved to `device`.
model = Segmentation.load_from_checkpoint(checkpoint, map_location=device).model.to(device = device)

  stream(template_mgs % msg_args)


## Загрузка датасета SolidLetters

In [3]:
# Load the dataset through the project's loader.get_loaders helper (the section header
# names it SolidLetters).
# NOTE(review): the model is passed to the loader; why it needs it is not visible here — confirm.
dataset = loader.get_loaders(model)

D:\NIR\SolidLetters\graph_with_eattr\a_Martel_lower.bin True


100%|██████████████████████████████████████████████████████████████████████████████| 2600/2600 [00:29<00:00, 87.06it/s]


Done loading 2600 files
D:\NIR\SolidLetters\graph_with_eattr\a_Frijole_upper.bin True


100%|██████████████████████████████████████████████████████████████████████████████| 2600/2600 [00:26<00:00, 96.36it/s]


Done loading 2600 files


## Получение векторных представлений

In [4]:
# Compute the embeddings and their labels with the project's embedding.get_embs helper.
# NOTE(review): later cells index embs[i] for i in range(len(embs)) and take row norms with
# axis=1, so `embs` looks like a sequence of 2-D arrays — confirm the layout in get_embs.
embs, label = embedding.get_embs(dataset, model, device)
# Neighbour count used by the searches below: the sklearn KD-Tree and HNSW cells request
# K+1 results, the cosine KNN cells are constructed with K.
K=5

# Расчет метрики

### L2-Нормализация

In [10]:
# Number of embedding sets; every loop in the following cells iterates over this range.
iteration = len(embs)
# Scale each row of every embedding matrix to unit Euclidean length
# (row norms are broadcast back over the columns).
norm_embs = [
    matrix / norm(matrix, axis=1)[:, np.newaxis]
    for matrix in (embs[idx] for idx in range(iteration))
]

### Понижение размерности PCA

In [11]:
# Reduce every normalised embedding set to 64 principal components.
# One estimator object is reused; fit_transform re-fits it on each set.
transformer = PCA(n_components=64)
norm_pca = [transformer.fit_transform(norm_embs[idx]) for idx in range(iteration)]

### Понижение размерности TSNE

In [12]:
# Embed every normalised embedding set into 3 dimensions with t-SNE (cosine metric, PCA init).
# t-SNE is stochastic: without a fixed seed each run yields a different projection and thus a
# different downstream metric. random_state pins the result so the notebook is reproducible.
# NOTE(review): perplexity=1 is far below the commonly used range — confirm it is intentional.
norm_tsne = []
for i in range(iteration):
    norm_tsne.append(TSNE(n_components=3, learning_rate='auto', metric='cosine', init='pca', perplexity=1,
                          random_state=0)
                     .fit_transform(norm_embs[i]))

### Поиск ближайших соседей: KD-Tree с косинусным расстоянием

In [13]:
# For each set, build the project's KD-Tree over one representation (normalised, PCA, t-SNE)
# and query it with the very same points. The three searches run in that order for every set.
# NOTE(review): the meaning of the second KDTree argument (0) is defined in methods.KDTree — confirm.
norm_ind, norm_pca_ind, norm_tsne_ind = [], [], []
for i in range(iteration):
    for vectors, found in (
        (norm_embs[i], norm_ind),
        (norm_pca[i], norm_pca_ind),
        (norm_tsne[i], norm_tsne_ind),
    ):
        cosine_tree = methods.KDTree(vectors, 0)
        found.append(cosine_tree.search(vectors))

### Метрика

In [14]:
# Score each set of neighbour indices against the labels with metric.calc_map,
# in the order: normalised embeddings, PCA, t-SNE.
norm_metric, norm_pca_metric, norm_tsne_metric = (
    metric.calc_map(found, label)
    for found in (norm_ind, norm_pca_ind, norm_tsne_ind)
)
scores = (norm_metric, norm_pca_metric, norm_tsne_metric)
print("Исходные эмбеддинги - %s\nPCA - %s\nTSNE - %s" % scores)

Исходные эмбеддинги - 0.7382552083333332
PCA - 0.7346419270833333
TSNE - 0.8526570638020834


# HNSW vs KD-Tree vs Точный подсчет

## Точный подсчет

In [15]:
# Exact baseline: the full matrix of pairwise dot products of the unit-length rows
# (i.e. cosine similarities) is handed to metric.get_nearest_idxs.
# NOTE(review): presumably get_nearest_idxs turns the similarity matrix into neighbour indices — confirm.
acc_ind = []
for vectors in (norm_embs[idx] for idx in range(iteration)):
    acc_ind.append(metric.get_nearest_idxs(vectors @ vectors.T))
acc_metr = metric.calc_map(acc_ind, label)

## KD-Tree

In [16]:
# Project KD-Tree over the normalised embeddings, queried with the same points.
norm_kdtree_ind = [
    methods.KDTree(norm_embs[idx], 0).search(norm_embs[idx])
    for idx in range(iteration)
]
kd_tree_metr = metric.calc_map(norm_kdtree_ind, label)

## HNSW

In [17]:
# HNSW index over the normalised embeddings; the same matrix is passed as both
# arguments of search and K+1 results are requested per query.
ind = []
for idx in range(iteration):
    vectors = norm_embs[idx]
    ind.append(methods.HNSW(vectors).search(vectors, vectors, K+1))
hnsw_metr = metric.calc_map(ind, label)

## Метрика

In [18]:
# Side-by-side report of the metric for exact search, KD-Tree and HNSW
# on the normalised embeddings.
print("Точный подсчет - %s\nKD-Tree - %s\nHNSW - %s" % (acc_metr, kd_tree_metr, hnsw_metr))

Точный подсчет - 0.7399457465277778
KD-Tree - 0.7382552083333332
HNSW - 0.7328995768229166


# Исходные ненормализованные эмбеддинги

### Sklearn KD-Tree

In [20]:
# scikit-learn KD-Tree over the raw embeddings, queried with the indexed points themselves.
# NOTE(review): K+1 presumably because each query point is returned as its own nearest neighbour — confirm.
indexes = []
for idx in range(iteration):
    points = embs[idx]
    _, neighbours = KDTree(points, leaf_size=40).query(points, k=K+1)
    indexes.append(neighbours)
print("Значение метрики", metric.calc_map(indexes, label))

Значение метрики 0.7313761393229166


### KD-Tree with cosine distance

In [21]:
# Project KD-Tree (constructed with its default second argument) over the raw embeddings,
# queried with the same points.
indexes = [
    methods.KDTree(embs[idx]).search(embs[idx])
    for idx in range(iteration)
]
print("Значение метрики", metric.calc_map(indexes, label))

Значение метрики 0.6737527126736111


### KNN with cosine distance

In [22]:
# Cosine KNN from the project's methods module on the raw embeddings.
# fit returns a pair; only its first element (the neighbour indices) is kept.
indexes = []
for idx in range(iteration):
    neighbours, _ = methods.DistributedCosineKnn(K).fit(embs[idx])
    indexes.append(neighbours)
print("Значение метрики", metric.calc_map(indexes, label))

Значение метрики 0.7399457465277778


### HNSW

In [23]:
# HNSW index over the raw embeddings, searched with the same points for K+1 results each.
ind = [
    methods.HNSW(vectors).search(vectors, vectors, K+1)
    for vectors in (embs[idx] for idx in range(iteration))
]
print("Значение метрики", metric.calc_map(ind, label))

Значение метрики 0.733205837673611


# PCA

In [24]:
# Reduce every raw (non-normalised) embedding set to 64 principal components.
# The estimator is re-fitted on each set by fit_transform.
transformer = PCA(n_components=64)
pca_data = [transformer.fit_transform(embs[idx]) for idx in range(iteration)]

### Sklearn KD-Tree

In [25]:
# scikit-learn KD-Tree over the PCA-reduced embeddings, queried with the same points.
indexes = []
for idx in range(iteration):
    reduced = pca_data[idx]
    kd_index = KDTree(reduced, leaf_size=40)
    _, neighbours = kd_index.query(reduced, k=K+1)
    indexes.append(neighbours)
print("Значение метрики", metric.calc_map(indexes, label))

Значение метрики 0.7203445095486111


### Cosine KD-Tree

In [26]:
# Project KD-Tree (second argument 0) over the PCA-reduced embeddings, self-queried.
ind = [
    methods.KDTree(pca_data[idx], 0).search(pca_data[idx])
    for idx in range(iteration)
]
print("Значение метрики", metric.calc_map(ind, label))

Значение метрики 0.6579511176215278


### KNN with cosine distance

In [27]:
# Cosine KNN on the PCA-reduced embeddings; the first element of fit's result is kept.
indexes = []
for reduced in (pca_data[idx] for idx in range(iteration)):
    neighbours, _ = methods.DistributedCosineKnn(K).fit(reduced)
    indexes.append(neighbours)
print("Значение метрики", metric.calc_map(indexes, label))

Значение метрики 0.7159098307291666


### HNSW

In [28]:
# HNSW index over the PCA-reduced embeddings, searched with the same points (K+1 results).
ind = []
for idx in range(iteration):
    reduced = pca_data[idx]
    ind.append(methods.HNSW(reduced).search(reduced, reduced, K+1))
print("Значение метрики", metric.calc_map(ind, label))

Значение метрики 0.707060818142361


# TSNE

In [29]:
# Embed every raw (non-normalised) embedding set into 3 dimensions with t-SNE
# (cosine metric, PCA init).
# t-SNE is stochastic: a fixed random_state makes the projection, and every metric computed
# from it in the following cells, reproducible across runs.
# NOTE(review): perplexity=1 is far below the commonly used range — confirm it is intentional.
X_embedded = []
for i in range(iteration):
    X_embedded.append(TSNE(n_components=3, learning_rate='auto', metric='cosine', init='pca', perplexity=1,
                           random_state=0)
                     .fit_transform(embs[i]))

### Sklearn KD-Tree

In [30]:
# scikit-learn KD-Tree over the t-SNE projection, queried with the same points.
indexes = []
for projected in (X_embedded[idx] for idx in range(iteration)):
    _, neighbours = KDTree(projected, leaf_size=40).query(projected, k=K+1)
    indexes.append(neighbours)
print("Значение метрики", metric.calc_map(indexes, label))

Значение метрики 0.7913316514756945


### Cosine KD-Tree

In [31]:
# Project KD-Tree (second argument 0) over the t-SNE projection, self-queried.
ind = [
    methods.KDTree(X_embedded[idx], 0).search(X_embedded[idx])
    for idx in range(iteration)
]
print("Значение метрики", metric.calc_map(ind, label))

Значение метрики 0.8553398980034722


### KNN with cosine distance

In [32]:
# Cosine KNN on the t-SNE projection; the first element of fit's result is kept.
indexes = []
for idx in range(iteration):
    knn_result = methods.DistributedCosineKnn(K).fit(X_embedded[idx])
    neighbours, _ = knn_result
    indexes.append(neighbours)
print("Значение метрики", metric.calc_map(indexes, label))

Значение метрики 0.8108884006076389


### HNSW

In [33]:
# HNSW index over the t-SNE projection, searched with the same points (K+1 results).
ind = [
    methods.HNSW(projected).search(projected, projected, K+1)
    for projected in (X_embedded[idx] for idx in range(iteration))
]
print("Значение метрики", metric.calc_map(ind, label))

Значение метрики 0.8168608940972222
