In [2]:
# Install the `unrar` package from PyPI before anything else runs.
# NOTE(review): confirm this install is what provides the `unrar` command that
# extracts the dataset at the end of this cell.
!pip install unrar

import itertools
import math
import random

import matplotlib.pyplot as plt
import numpy as np
import torch

def set_random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (through ``torch.manual_seed`` and ``torch.cuda.manual_seed``), and sets
    the cuDNN ``deterministic`` flag.

    Args:
        seed: Seed value handed unchanged to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True


# Fix all random generators so the stochastic steps below are repeatable.
set_random_seed(42)

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Extract the dataset archive; its .npy files are loaded from /content in later cells.
!unrar x /content/dataset.rar


UNRAR 6.11 beta 1 freeware      Copyright (c) 1993-2022 Alexander Roshal


Extracting from /content/dataset.rar

Extracting  x_train_red.npy                                               65% 78%  OK 
Extracting  x_val_red.npy                                                 97%  OK 
Extracting  y_test_red.npy                                                97%  OK 
Extracting  y_train_red.npy                                               97%  OK 
Extracting  y_val_red.npy                                                 97%  OK 
Extracting  x_test_red.npy                                               100%  OK 
All OK


In [3]:
from google.colab import files

# Ask for the Python module that defines the network.  `files.upload()` returns
# a mapping whose values are the uploaded bytes; only the first file is used.
uploaded = files.upload()
src = list(uploaded.values())[0]

# Colab may store the upload under a suffixed name (e.g. "data_and_model (1).py"),
# so the bytes are written out under the exact module name.  The context manager
# guarantees the file is flushed and closed before the import below reads it.
with open('data_and_model.py', 'wb') as module_file:
  module_file.write(src)

from data_and_model import SiameseNet

Saving data_and_model.py to data_and_model (1).py


In [4]:
# Embedding size: passed to SiameseNet and used to size the per-class
# embedding buffers in later cells.
latent_dim = 64

# Тест (знакомые классы)

In [5]:
# Build the network on the selected device and restore the trained weights.
# `map_location=device` remaps tensors that were saved from a GPU, so the
# checkpoint also loads when `device` fell back to the CPU.
model = SiameseNet(latent_dim).to(device)
model.load_state_dict(torch.load('/content/model_weights', map_location=device))

<All keys matched successfully>

In [None]:
def accuracy_(pred, cor):
  """Cumulative "at least k components identified" accuracy for mixtures.

  For every mixture the number of true labels that also occur in the
  prediction is counted with ``np.intersect1d`` (order is ignored and a
  repeated label is counted once).  Entry ``k - 1`` of the result is the
  fraction of mixtures with at least ``k`` such matches.

  The number of components is read from ``cor.shape[1]`` rather than being
  hard-coded, so one definition serves mixtures of any size.

  Args:
    pred: Array of shape (n_mixtures, n_components) with predicted labels.
    cor: Array of shape (n_mixtures, n_components) with the true labels.

  Returns:
    1-D float array of length ``n_components``.
  """
  cor = np.asarray(cor)
  n_components = cor.shape[1]
  accuracy = np.zeros(n_components)
  for i in range(cor.shape[0]):
    # Size of the intersection == number of correctly identified labels.
    matched = len(np.intersect1d(cor[i], pred[i]))
    # A mixture with `matched` hits counts towards "1 of n" .. "matched of n".
    accuracy[:matched] += 1
  return accuracy / cor.shape[0]

In [None]:
# Reference samples used by the embedding cells.  No other cell defines
# `x_train`, so without this load a fresh "Restart & Run All" stops with a
# NameError as soon as the reference embeddings are built.
# NOTE(review): assumes the reference samples are the extracted
# x_train_red.npy with 20 consecutive rows per class — confirm.
x_train = np.load('/content/x_train_red.npy')

# Samples and labels the test mixtures are assembled from.
x_test = np.load('/content/x_test_red.npy')
y_test = np.load('/content/y_test_red.npy')

In [None]:
# Keep only the rows from index 2000 onwards for this experiment.
# NOTE(review): presumably these are the samples of classes 400-499, the
# labels used for the reference embeddings below — confirm against y_test.
# Re-running this cell without reloading slices the arrays a second time.
x_test = x_test[2000:]
y_test = y_test[2000:]

In [None]:
# Reference embeddings: one unit-length vector per class label 400..499,
# obtained by averaging the model's embeddings of that class's noisy samples.
samples_per_class = 20  # consecutive rows of x_train consumed per label
molecules_embedding = []
label = []
for i in range(400, 500):
  molecules = []
  for j in range(samples_per_class):
    sample = x_train[samples_per_class*i + j]
    # Scale to a maximum of 1 and add Gaussian noise; the noise vector is
    # sized from the sample itself, as the test-mixture cell does.
    molecule = sample/np.max(sample) + np.random.normal(0, 0.005, sample.shape[0])
    molecules.append(molecule)
  molecules = torch.Tensor(np.array(molecules)).to(device)
  # Adding to a zero array applies NumPy's type promotion before averaging.
  embedding = np.zeros((samples_per_class, latent_dim)) + model.predict(molecules).detach().cpu().numpy()
  embedding = np.mean(embedding, axis = 0)
  molecules_embedding.append(embedding/np.linalg.norm(embedding))
  label.append(i)
label = np.array(label)

In [None]:
# Synthetic reference library: one row for every unordered set of 4 reference
# classes, holding the normalised sum of their embeddings and their labels.
n_components = 4
n_mixtures = math.comb(len(label), n_components)
label_train = np.zeros(shape = (n_mixtures, n_components))
# Width follows `latent_dim` so the library always matches the embedding size.
mixture_train = np.zeros(shape = (n_mixtures, latent_dim))
# combinations() yields index tuples in the same lexicographic order as
# nested i < j < k < m loops, so row order is unchanged.
class_sets = itertools.combinations(range(len(molecules_embedding)), n_components)
for l, (i, j, k, m) in enumerate(class_sets):
  emb = molecules_embedding[i] + molecules_embedding[j] + molecules_embedding[k] + molecules_embedding[m]
  mixture_train[l] = emb/np.linalg.norm(emb)
  label_train[l] = np.array([label[i], label[j], label[k], label[m]])

In [None]:
# Assemble 500 synthetic 4-component test mixtures and embed them with the model.
mixtures_test = []
correct_test = []
all_rows = np.arange(x_test.shape[0])  # loop-invariant, built once
for i in range(0, 500):
  a = x_test[i]
  # Candidate partners: every row outside the block of 5 consecutive rows
  # that contains row i.
  # NOTE(review): presumably each block of 5 rows is one class — confirm.
  block_start = i//5 * 5
  idxs = np.concatenate((all_rows[:block_start], all_rows[block_start + 5:]))
  # NOTE(review): the partners are drawn independently, so two of them can
  # fall into the same block, while the reference library only contains sets
  # of distinct classes — confirm whether that is intended.
  b_idx = np.random.choice(idxs)
  b = x_test[b_idx]
  c_idx = np.random.choice(idxs)
  c = x_test[c_idx]
  d_idx = np.random.choice(idxs)
  d = x_test[d_idx]
  mixture = a + b + c + d
  # Scale to a maximum of 1, then add Gaussian noise of the sample's length.
  mixtures_test.append(mixture/np.max(mixture) + np.random.normal(0,0.005,a.shape[0]))
  correct_test.append([y_test[i], y_test[b_idx], y_test[c_idx], y_test[d_idx]])
# Stack into one ndarray first: creating a tensor directly from a list of
# arrays is the slow path PyTorch warns about.
mixtures_test = torch.Tensor(np.array(mixtures_test)).to(device)
classification_data_test = model.predict(mixtures_test).detach().cpu().numpy()
correct_test = np.array(correct_test).reshape(-1, 4)

  mixtures_test = torch.Tensor(mixtures_test).to(device)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour lookup: each test embedding receives the label set of the
# closest row in the synthetic reference library.
neigh = KNeighborsClassifier(n_neighbors=1).fit(mixture_train, label_train)
preds = neigh.predict(classification_data_test)

In [None]:
# Score the nearest-neighbour predictions against the true component labels.
accuracy = accuracy_(preds, correct_test)

In [None]:
# Print one line per threshold: share of mixtures with at least k matches.
print('.....Accuracy.....')
captions = ('1 of 4 ......', '2 of 4 ......', '3 of 4 ......', '4 of 4 ......')
for position, caption in enumerate(captions):
  print(caption, accuracy[position])

.....Accuracy.....
1 of 4 ...... 1.0
2 of 4 ...... 0.998
3 of 4 ...... 0.962
4 of 4 ...... 0.822


# Тест (новые классы)

In [None]:
# Re-create the network and restore the trained weights for this experiment.
# `map_location=device` remaps tensors that were saved from a GPU, so the
# checkpoint also loads when `device` fell back to the CPU.
model = SiameseNet(latent_dim).to(device)
model.load_state_dict(torch.load('/content/model_weights', map_location=device))

In [None]:
def accuracy_(pred, cor):
  """Cumulative "at least k components identified" accuracy for mixtures.

  For every mixture the number of true labels that also occur in the
  prediction is counted with ``np.intersect1d`` (order is ignored and a
  repeated label is counted once).  Entry ``k - 1`` of the result is the
  fraction of mixtures with at least ``k`` such matches.

  The number of components is read from ``cor.shape[1]`` rather than being
  hard-coded, so one definition serves mixtures of any size.

  Args:
    pred: Array of shape (n_mixtures, n_components) with predicted labels.
    cor: Array of shape (n_mixtures, n_components) with the true labels.

  Returns:
    1-D float array of length ``n_components``.
  """
  cor = np.asarray(cor)
  n_components = cor.shape[1]
  accuracy = np.zeros(n_components)
  for i in range(cor.shape[0]):
    # Size of the intersection == number of correctly identified labels.
    matched = len(np.intersect1d(cor[i], pred[i]))
    # A mixture with `matched` hits counts towards "1 of n" .. "matched of n".
    accuracy[:matched] += 1
  return accuracy / cor.shape[0]

In [None]:
# Reload the full test arrays; the previous experiment replaced `x_test` and
# `y_test` with a slice of themselves.
x_test = np.load('/content/x_test_red.npy')
y_test = np.load('/content/y_test_red.npy')

In [None]:
# Restrict this experiment to the first 500 test rows.
# NOTE(review): presumably these are the samples of classes 0-99, the labels
# used for the reference embeddings below — confirm against y_test.
x_test = x_test[:500]
y_test = y_test[:500]

In [None]:
# Reference embeddings: one unit-length vector per class label 0..99,
# obtained by averaging the model's embeddings of that class's noisy samples.
samples_per_class = 20  # consecutive rows of x_train consumed per label
molecules_embedding = []
label = []
for i in range(0, 100):
  molecules = []
  for j in range(samples_per_class):
    sample = x_train[samples_per_class*i + j]
    # Scale to a maximum of 1 and add Gaussian noise; the noise vector is
    # sized from the sample itself, as the test-mixture cell does.
    molecule = sample/np.max(sample) + np.random.normal(0, 0.005, sample.shape[0])
    molecules.append(molecule)
  molecules = torch.Tensor(np.array(molecules)).to(device)
  # Adding to a zero array applies NumPy's type promotion before averaging.
  embedding = np.zeros((samples_per_class, latent_dim)) + model.predict(molecules).detach().cpu().numpy()
  embedding = np.mean(embedding, axis = 0)
  molecules_embedding.append(embedding/np.linalg.norm(embedding))
  label.append(i)
label = np.array(label)

In [None]:
# Synthetic reference library: one row for every unordered set of 4 reference
# classes, holding the normalised sum of their embeddings and their labels.
n_components = 4
n_mixtures = math.comb(len(label), n_components)
label_train = np.zeros(shape = (n_mixtures, n_components))
# Width follows `latent_dim` so the library always matches the embedding size.
mixture_train = np.zeros(shape = (n_mixtures, latent_dim))
# combinations() yields index tuples in the same lexicographic order as
# nested i < j < k < m loops, so row order is unchanged.
class_sets = itertools.combinations(range(len(molecules_embedding)), n_components)
for l, (i, j, k, m) in enumerate(class_sets):
  emb = molecules_embedding[i] + molecules_embedding[j] + molecules_embedding[k] + molecules_embedding[m]
  mixture_train[l] = emb/np.linalg.norm(emb)
  label_train[l] = np.array([label[i], label[j], label[k], label[m]])

In [None]:
# Assemble 500 synthetic 4-component test mixtures and embed them with the model.
mixtures_test = []
correct_test = []
all_rows = np.arange(x_test.shape[0])  # loop-invariant, built once
for i in range(0, 500):
  a = x_test[i]
  # Candidate partners: every row outside the block of 5 consecutive rows
  # that contains row i.
  # NOTE(review): presumably each block of 5 rows is one class — confirm.
  block_start = i//5 * 5
  idxs = np.concatenate((all_rows[:block_start], all_rows[block_start + 5:]))
  # NOTE(review): the partners are drawn independently, so two of them can
  # fall into the same block, while the reference library only contains sets
  # of distinct classes — confirm whether that is intended.
  b_idx = np.random.choice(idxs)
  b = x_test[b_idx]
  c_idx = np.random.choice(idxs)
  c = x_test[c_idx]
  d_idx = np.random.choice(idxs)
  d = x_test[d_idx]
  mixture = a + b + c + d
  # Scale to a maximum of 1, then add Gaussian noise of the sample's length.
  mixtures_test.append(mixture/np.max(mixture) + np.random.normal(0,0.005,a.shape[0]))
  correct_test.append([y_test[i], y_test[b_idx], y_test[c_idx], y_test[d_idx]])
# Stack into one ndarray first: creating a tensor directly from a list of
# arrays is the slow path PyTorch warns about.
mixtures_test = torch.Tensor(np.array(mixtures_test)).to(device)
classification_data_test = model.predict(mixtures_test).detach().cpu().numpy()
correct_test = np.array(correct_test).reshape(-1, 4)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour lookup: each test embedding receives the label set of the
# closest row in the synthetic reference library.
neigh = KNeighborsClassifier(n_neighbors=1).fit(mixture_train, label_train)
preds = neigh.predict(classification_data_test)

In [None]:
# Score the nearest-neighbour predictions against the true component labels.
accuracy = accuracy_(preds, correct_test)

In [None]:
# Print one line per threshold: share of mixtures with at least k matches.
print('.....Accuracy.....')
captions = ('1 of 4 ......', '2 of 4 ......', '3 of 4 ......', '4 of 4 ......')
for position, caption in enumerate(captions):
  print(caption, accuracy[position])

.....Accuracy.....
1 of 4 ...... 1.0
2 of 4 ...... 0.986
3 of 4 ...... 0.928
4 of 4 ...... 0.722


# Тест (неравные количества компонентов в смесях)

In [None]:
# Re-create the network and restore the trained weights for this experiment.
# `map_location=device` remaps tensors that were saved from a GPU, so the
# checkpoint also loads when `device` fell back to the CPU.
model = SiameseNet(latent_dim).to(device)
model.load_state_dict(torch.load('/content/model_weights', map_location=device))

In [None]:
def accuracy_(pred, cor):
  """Cumulative "at least k components identified" accuracy for mixtures.

  For every mixture the number of true labels that also occur in the
  prediction is counted with ``np.intersect1d`` (order is ignored and a
  repeated label is counted once).  Entry ``k - 1`` of the result is the
  fraction of mixtures with at least ``k`` such matches.

  The number of components is read from ``cor.shape[1]`` rather than being
  hard-coded, so one definition serves mixtures of any size.

  Args:
    pred: Array of shape (n_mixtures, n_components) with predicted labels.
    cor: Array of shape (n_mixtures, n_components) with the true labels.

  Returns:
    1-D float array of length ``n_components``.
  """
  cor = np.asarray(cor)
  n_components = cor.shape[1]
  accuracy = np.zeros(n_components)
  for i in range(cor.shape[0]):
    # Size of the intersection == number of correctly identified labels.
    matched = len(np.intersect1d(cor[i], pred[i]))
    # A mixture with `matched` hits counts towards "1 of n" .. "matched of n".
    accuracy[:matched] += 1
  return accuracy / cor.shape[0]

In [None]:
# Reload the full test arrays; the previous experiment replaced `x_test` and
# `y_test` with a slice of themselves.
x_test = np.load('/content/x_test_red.npy')
y_test = np.load('/content/y_test_red.npy')

In [None]:
# Restrict this experiment to the first 500 test rows.
# NOTE(review): presumably these are the samples of classes 0-99, the labels
# used for the reference embeddings below — confirm against y_test.
x_test = x_test[:500]
y_test = y_test[:500]

In [None]:
# Reference embeddings for labels 0..99: the 20 rows of x_train belonging to
# a label are scaled to a maximum of 1, embedded, averaged and normalised.
# NOTE(review): the other experiments add Gaussian noise to each sample at
# this point; confirm that leaving it out here is intentional.
molecules_embedding, label = [], []
for i in range(0, 100):
  molecules = np.array([x_train[20*i + j]/np.max(x_train[20*i + j]) for j in range(0, 20)])
  molecules = torch.Tensor(molecules).to(device)
  predicted = model.predict(molecules).detach().cpu().numpy()
  # Adding to a zero array applies NumPy's type promotion before averaging.
  embedding = np.mean(np.zeros((20, latent_dim)) + predicted, axis = 0)
  molecules_embedding.append(embedding/np.linalg.norm(embedding))
  label.append(i)
label = np.array(label)

In [None]:
# Synthetic reference library: one row for every unordered set of 4 reference
# classes, holding the normalised sum of their embeddings and their labels.
n_components = 4
n_mixtures = math.comb(len(label), n_components)
label_train = np.zeros(shape = (n_mixtures, n_components))
# Width follows `latent_dim` so the library always matches the embedding size.
mixture_train = np.zeros(shape = (n_mixtures, latent_dim))
# combinations() yields index tuples in the same lexicographic order as
# nested i < j < k < m loops, so row order is unchanged.
class_sets = itertools.combinations(range(len(molecules_embedding)), n_components)
for l, (i, j, k, m) in enumerate(class_sets):
  emb = molecules_embedding[i] + molecules_embedding[j] + molecules_embedding[k] + molecules_embedding[m]
  mixture_train[l] = emb/np.linalg.norm(emb)
  label_train[l] = np.array([label[i], label[j], label[k], label[m]])

In [None]:
# Assemble 500 synthetic 4-component test mixtures with unequal proportions
# (weights 0.15 / 0.22 / 0.23 / 0.4) and embed them with the model.
mixtures_test = []
correct_test = []
all_rows = np.arange(x_test.shape[0])  # loop-invariant, built once
for i in range(0, 500):
  a = 0.15*x_test[i]
  # Candidate partners: every row outside the block of 5 consecutive rows
  # that contains row i.
  # NOTE(review): presumably each block of 5 rows is one class — confirm.
  block_start = i//5 * 5
  idxs = np.concatenate((all_rows[:block_start], all_rows[block_start + 5:]))
  # NOTE(review): the partners are drawn independently, so two of them can
  # fall into the same block, while the reference library only contains sets
  # of distinct classes — confirm whether that is intended.
  b_idx = np.random.choice(idxs)
  b = 0.22*x_test[b_idx]
  c_idx = np.random.choice(idxs)
  c = 0.23*x_test[c_idx]
  d_idx = np.random.choice(idxs)
  d = 0.4*x_test[d_idx]
  mixture = a + b + c + d
  # Scale to a maximum of 1, then add Gaussian noise of the sample's length.
  mixtures_test.append(mixture/np.max(mixture) + np.random.normal(0,0.005,a.shape[0]))
  correct_test.append([y_test[i], y_test[b_idx], y_test[c_idx], y_test[d_idx]])
# Stack into one ndarray first: creating a tensor directly from a list of
# arrays is the slow path PyTorch warns about.
mixtures_test = torch.Tensor(np.array(mixtures_test)).to(device)
classification_data_test = model.predict(mixtures_test).detach().cpu().numpy()
correct_test = np.array(correct_test).reshape(-1, 4)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour lookup: each test embedding receives the label set of the
# closest row in the synthetic reference library.
neigh = KNeighborsClassifier(n_neighbors=1).fit(mixture_train, label_train)
preds = neigh.predict(classification_data_test)

In [None]:
# Score the nearest-neighbour predictions against the true component labels.
accuracy = accuracy_(preds, correct_test)

In [None]:
# Print one line per threshold: share of mixtures with at least k matches.
print('.....Accuracy.....')
captions = ('1 of 4 ......', '2 of 4 ......', '3 of 4 ......', '4 of 4 ......')
for position, caption in enumerate(captions):
  print(caption, accuracy[position])

.....Accuracy.....
1 of 4 ...... 0.996
2 of 4 ...... 0.942
3 of 4 ...... 0.726
4 of 4 ...... 0.398


# Бинарные смеси

In [None]:
# Re-create the network and restore the trained weights for this experiment.
# `map_location=device` remaps tensors that were saved from a GPU, so the
# checkpoint also loads when `device` fell back to the CPU.
model = SiameseNet(latent_dim).to(device)
model.load_state_dict(torch.load('/content/model_weights', map_location=device))

In [None]:
def accuracy_(pred, cor):
  """Cumulative "at least k components identified" accuracy for mixtures.

  For every mixture the number of true labels that also occur in the
  prediction is counted with ``np.intersect1d`` (order is ignored and a
  repeated label is counted once).  Entry ``k - 1`` of the result is the
  fraction of mixtures with at least ``k`` such matches.

  The number of components is read from ``cor.shape[1]`` rather than being
  hard-coded, so one definition serves mixtures of any size.

  Args:
    pred: Array of shape (n_mixtures, n_components) with predicted labels.
    cor: Array of shape (n_mixtures, n_components) with the true labels.

  Returns:
    1-D float array of length ``n_components``.
  """
  cor = np.asarray(cor)
  n_components = cor.shape[1]
  accuracy = np.zeros(n_components)
  for i in range(cor.shape[0]):
    # Size of the intersection == number of correctly identified labels.
    matched = len(np.intersect1d(cor[i], pred[i]))
    # A mixture with `matched` hits counts towards "1 of n" .. "matched of n".
    accuracy[:matched] += 1
  return accuracy / cor.shape[0]

In [None]:
# Reload the full test arrays; the previous experiment replaced `x_test` and
# `y_test` with a slice of themselves.
x_test = np.load('/content/x_test_red.npy')
y_test = np.load('/content/y_test_red.npy')

In [None]:
# Restrict this experiment to the first 500 test rows.
# NOTE(review): presumably these are the samples of classes 0-99, the labels
# used for the reference embeddings below — confirm against y_test.
x_test = x_test[:500]
y_test = y_test[:500]

In [None]:
# Reference embeddings: one unit-length vector per class label 0..99,
# obtained by averaging the model's embeddings of that class's noisy samples.
samples_per_class = 20  # consecutive rows of x_train consumed per label
molecules_embedding = []
label = []
for i in range(0, 100):
  molecules = []
  for j in range(samples_per_class):
    sample = x_train[samples_per_class*i + j]
    # Scale to a maximum of 1 and add Gaussian noise; the noise vector is
    # sized from the sample itself, as the test-mixture cell does.
    molecule = sample/np.max(sample) + np.random.normal(0, 0.005, sample.shape[0])
    molecules.append(molecule)
  molecules = torch.Tensor(np.array(molecules)).to(device)
  # Adding to a zero array applies NumPy's type promotion before averaging.
  embedding = np.zeros((samples_per_class, latent_dim)) + model.predict(molecules).detach().cpu().numpy()
  embedding = np.mean(embedding, axis = 0)
  molecules_embedding.append(embedding/np.linalg.norm(embedding))
  label.append(i)
label = np.array(label)

In [None]:
# Synthetic reference library: one row for every unordered pair of reference
# classes, holding the normalised sum of their embeddings and their labels.
n_components = 2
n_mixtures = math.comb(len(label), n_components)
label_train = np.zeros(shape = (n_mixtures, n_components))
# Width follows `latent_dim` so the library always matches the embedding size.
mixture_train = np.zeros(shape = (n_mixtures, latent_dim))
# combinations() yields index pairs in the same lexicographic order as
# nested i < j loops, so row order is unchanged.
class_sets = itertools.combinations(range(len(molecules_embedding)), n_components)
for l, (i, j) in enumerate(class_sets):
  emb = molecules_embedding[i] + molecules_embedding[j]
  mixture_train[l] = emb/np.linalg.norm(emb)
  label_train[l] = np.array([label[i], label[j]])

In [None]:
# Assemble 500 synthetic 2-component test mixtures and embed them with the model.
mixtures_test = []
correct_test = []
all_rows = np.arange(x_test.shape[0])  # loop-invariant, built once
for i in range(0, 500):
  a = x_test[i]
  # Candidate partners: every row outside the block of 5 consecutive rows
  # that contains row i.
  # NOTE(review): presumably each block of 5 rows is one class — confirm.
  block_start = i//5 * 5
  idxs = np.concatenate((all_rows[:block_start], all_rows[block_start + 5:]))
  b_idx = np.random.choice(idxs)
  b = x_test[b_idx]
  mixture = a + b
  # Scale to a maximum of 1, then add Gaussian noise of the sample's length.
  mixtures_test.append(mixture/np.max(mixture) + np.random.normal(0,0.005,a.shape[0]))
  correct_test.append([y_test[i], y_test[b_idx]])
# Stack into one ndarray first: creating a tensor directly from a list of
# arrays is the slow path PyTorch warns about.
mixtures_test = torch.Tensor(np.array(mixtures_test)).to(device)
classification_data_test = model.predict(mixtures_test).detach().cpu().numpy()
correct_test = np.array(correct_test).reshape(-1, 2)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour lookup: each test embedding receives the label pair of the
# closest row in the synthetic reference library.
neigh = KNeighborsClassifier(n_neighbors=1).fit(mixture_train, label_train)
preds = neigh.predict(classification_data_test)

In [None]:
# Score the nearest-neighbour predictions against the true component labels.
accuracy = accuracy_(preds, correct_test)

In [None]:
# Print one line per threshold: share of mixtures with at least k matches.
print('.....Accuracy.....')
captions = ('1 of 2 ......', '2 of 2 ......')
for position, caption in enumerate(captions):
  print(caption, accuracy[position])

.....Accuracy.....
1 of 2 ...... 0.998
2 of 2 ...... 0.966


# Тройные смеси

In [None]:
# Re-create the network and restore the trained weights for this experiment.
# `map_location=device` remaps tensors that were saved from a GPU, so the
# checkpoint also loads when `device` fell back to the CPU.
model = SiameseNet(latent_dim).to(device)
model.load_state_dict(torch.load('/content/model_weights', map_location=device))

In [None]:
def accuracy_(pred, cor):
  """Cumulative "at least k components identified" accuracy for mixtures.

  For every mixture the number of true labels that also occur in the
  prediction is counted with ``np.intersect1d`` (order is ignored and a
  repeated label is counted once).  Entry ``k - 1`` of the result is the
  fraction of mixtures with at least ``k`` such matches.

  The number of components is read from ``cor.shape[1]`` rather than being
  hard-coded, so one definition serves mixtures of any size.

  Args:
    pred: Array of shape (n_mixtures, n_components) with predicted labels.
    cor: Array of shape (n_mixtures, n_components) with the true labels.

  Returns:
    1-D float array of length ``n_components``.
  """
  cor = np.asarray(cor)
  n_components = cor.shape[1]
  accuracy = np.zeros(n_components)
  for i in range(cor.shape[0]):
    # Size of the intersection == number of correctly identified labels.
    matched = len(np.intersect1d(cor[i], pred[i]))
    # A mixture with `matched` hits counts towards "1 of n" .. "matched of n".
    accuracy[:matched] += 1
  return accuracy / cor.shape[0]

In [None]:
# Reload the full test arrays; the previous experiment replaced `x_test` and
# `y_test` with a slice of themselves.
x_test = np.load('/content/x_test_red.npy')
y_test = np.load('/content/y_test_red.npy')

In [None]:
# Restrict this experiment to the first 500 test rows.
# NOTE(review): presumably these are the samples of classes 0-99, the labels
# used for the reference embeddings below — confirm against y_test.
x_test = x_test[:500]
y_test = y_test[:500]

In [None]:
# Reference embeddings: one unit-length vector per class label 0..99,
# obtained by averaging the model's embeddings of that class's noisy samples.
samples_per_class = 20  # consecutive rows of x_train consumed per label
molecules_embedding = []
label = []
for i in range(0, 100):
  molecules = []
  for j in range(samples_per_class):
    sample = x_train[samples_per_class*i + j]
    # Scale to a maximum of 1 and add Gaussian noise; the noise vector is
    # sized from the sample itself, as the test-mixture cell does.
    molecule = sample/np.max(sample) + np.random.normal(0, 0.005, sample.shape[0])
    molecules.append(molecule)
  molecules = torch.Tensor(np.array(molecules)).to(device)
  # Adding to a zero array applies NumPy's type promotion before averaging.
  embedding = np.zeros((samples_per_class, latent_dim)) + model.predict(molecules).detach().cpu().numpy()
  embedding = np.mean(embedding, axis = 0)
  molecules_embedding.append(embedding/np.linalg.norm(embedding))
  label.append(i)
label = np.array(label)

In [None]:
# Synthetic reference library: one row for every unordered triple of reference
# classes, holding the normalised sum of their embeddings and their labels.
n_components = 3
n_mixtures = math.comb(len(label), n_components)
label_train = np.zeros(shape = (n_mixtures, n_components))
# Width follows `latent_dim` so the library always matches the embedding size.
mixture_train = np.zeros(shape = (n_mixtures, latent_dim))
# combinations() yields index triples in the same lexicographic order as
# nested i < j < k loops, so row order is unchanged.
class_sets = itertools.combinations(range(len(molecules_embedding)), n_components)
for l, (i, j, k) in enumerate(class_sets):
  emb = molecules_embedding[i] + molecules_embedding[j] + molecules_embedding[k]
  mixture_train[l] = emb/np.linalg.norm(emb)
  label_train[l] = np.array([label[i], label[j], label[k]])

In [None]:
# Assemble 500 synthetic 3-component test mixtures and embed them with the model.
mixtures_test = []
correct_test = []
all_rows = np.arange(x_test.shape[0])  # loop-invariant, built once
for i in range(0, 500):
  a = x_test[i]
  # Candidate partners: every row outside the block of 5 consecutive rows
  # that contains row i.
  # NOTE(review): presumably each block of 5 rows is one class — confirm.
  block_start = i//5 * 5
  idxs = np.concatenate((all_rows[:block_start], all_rows[block_start + 5:]))
  # NOTE(review): the partners are drawn independently, so both can fall into
  # the same block, while the reference library only contains sets of
  # distinct classes — confirm whether that is intended.
  b_idx = np.random.choice(idxs)
  b = x_test[b_idx]
  c_idx = np.random.choice(idxs)
  c = x_test[c_idx]
  mixture = a + b + c
  # Scale to a maximum of 1, then add Gaussian noise of the sample's length.
  mixtures_test.append(mixture/np.max(mixture) + np.random.normal(0,0.005,a.shape[0]))
  correct_test.append([y_test[i], y_test[b_idx], y_test[c_idx]])
# Stack into one ndarray first: creating a tensor directly from a list of
# arrays is the slow path PyTorch warns about.
mixtures_test = torch.Tensor(np.array(mixtures_test)).to(device)
classification_data_test = model.predict(mixtures_test).detach().cpu().numpy()
correct_test = np.array(correct_test).reshape(-1, 3)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour lookup: each test embedding receives the label triple of
# the closest row in the synthetic reference library.
neigh = KNeighborsClassifier(n_neighbors=1).fit(mixture_train, label_train)
preds = neigh.predict(classification_data_test)

In [None]:
# Score the nearest-neighbour predictions against the true component labels.
accuracy = accuracy_(preds, correct_test)

In [None]:
# Print one line per threshold: share of mixtures with at least k matches.
print('.....Accuracy.....')
captions = ('1 of 3 ......', '2 of 3 ......', '3 of 3 ......')
for position, caption in enumerate(captions):
  print(caption, accuracy[position])

.....Accuracy.....
1 of 3 ...... 1.0
2 of 3 ...... 0.994
3 of 3 ...... 0.868
