In [1]:
import math as m
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv('../data/embeddings.csv', chunksize=10000, names=['image', 'label'])

In [5]:
from json import loads

In [6]:
from torch.nn.functional import cosine_similarity
from torch import nn
import torch
from torch.nn.functional import normalize

In [7]:
device = torch.device('cuda:1')

In [8]:
# from sklearn.metrics.pairwise import cosine_similarity
# def new_euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False): 
#     return cosine_similarity(X,Y)

# # monkey patch (ensure cosine dist function is used)
# from sklearn.cluster import k_means_
# k_means_.euclidean_distances = new_euclidean_distances

In [9]:
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
embeddings_file = '../data/embeddings.csv'
train_file = '../data/train.csv'
validation_file = '../data/validation.csv'

In [11]:
# df = pd.DataFrame.from_records([], columns=['image', 'label'])
# df.to_csv(train_file, index=False)
# df.to_csv(validation_file, index=False)

In [12]:
def split(file_path, train_file, validation_file, chunksize=10000):
    count = 0
    df = pd.read_csv(file_path, chunksize=chunksize, header=0)
    for i, chunk in enumerate(df):
        print(count + len(chunk))
        X = np.array([np.fromstring(item[1:-1], dtype=np.float32, sep=' ') for item in chunk['image'].values])
        X_train, X_test, y_train, y_test = train_test_split(X, chunk['label'].values, test_size=0.3,
                                                            random_state=42, shuffle=True)
#         print(X_train)
        
        data = list(zip(X_train, y_train))
        df = pd.DataFrame.from_records(data, columns=['image', 'label'])
        df.to_csv(train_file, index=False, header=False, mode='a')
        
        data = list(zip(X_test, y_test))
        df = pd.DataFrame.from_records(data, columns=['image', 'label'])
        df.to_csv(validation_file, index=False, header=False, mode='a')
        
        count += len(chunk)

In [13]:
# split(embeddings_file, train_file, validation_file)

In [14]:
from sklearn.metrics import balanced_accuracy_score

In [15]:
class BalancedAccuracyMeter():
    def __init__(self):
        self.y = []
        self.y_hat = []
        
    def add(self, y, y_hat):
        self.y.append(y)
        self.y_hat.append(y_hat)
        
    def calculate(self):
        y = torch.cat(self.y, 0)
        y_hat = torch.cat(self.y_hat, 0)
        
        return torch.tensor([self.accuracy(y, y_hat) 
                            for y, y_hat in list(zip(y, y_hat))])
    def accuracy(self, y, y_hat):
        tp = torch.sum(y & y_hat).type(torch.float32)
        tn = torch.sum(~y & ~y_hat).type(torch.float32)
        p_count = torch.sum(y).type(torch.float32)
        n_count = torch.sum(~y).type(torch.float32)
        
        return (tp / p_count + tn / n_count) / 2
    def reset(self):
        self.__init__()

In [16]:
def cosine_distance(x1, x2, dim=1):
    x1 = normalize(x1, dim=dim)
    x2 = normalize(x2, dim=dim)
    return 1.0 - torch.sum(x1 * x2, dim=dim)

In [17]:
class CosineDistance(nn.Module):
    def __init__(self, dim=1):
        super(CosineDistance, self).__init__()
        self.dim = dim
        
    def forward(self, x1, x2):
        return cosine_distance(x1, x2, dim=self.dim)

In [18]:
def read_string_array(arr, dtype=None):
    return np.fromstring(arr[1:-1], dtype=None, sep=' ')


In [19]:
def read_string_ndarray(arr, dtype=None):
    return np.array([read_string_array(target, dtype=None) for target in arr])

In [20]:
def read_chunk(chunk, device=torch.device('cpu')):
    X = np.array(chunk['image'].values.tolist(), dtype=np.float32)
    y = chunk['label'].to_numpy(dtype=np.int)
    
    
    X = torch.from_numpy(X).to(device)
    y = torch.from_numpy(y).to(device)
    
    return X, y

In [21]:
# def get_y_true(y_train, y_test):
#     return y_test[:, None] == y_train

In [22]:
# def get_y_hat(similiarity, treshold):
#     return similiarity > treshold

In [23]:
# def predict(X_train, y_train, X_test, y_test, treshold):
#     sim = cosine_similarity(target, X_train)
                
#     y_hat = get_y_hat(sim, treshold)
#     y = get_y_true(y_train, y_test)
    
#     return y, y_hat

In [24]:
class Counter:
    def __init__(self):
        self.__count = 0
    
    def count(self, arr):
        self.__count += len(arr)
        
        return self.get()

    def get(self):
        return self.__count
    
    def reset(self):
        self.__init__()

In [25]:
class Model:
    def __init__(self):
        self.distance = torch.nn.CosineSimilarity(dim=1)
        self.distance = nn.DataParallel(self.distance, device_ids=[1, 2, 3])
        
    def predict(self, X_train, y_train, X_test, y_test, treshold):
        start = time()
        sim = torch.stack([self.distance(torch.stack([X] * 3), X_train) for X in X_test])
        
                
        y_hat = self._get_y_hat(sim, treshold)
        y = self._get_y_true(y_train, y_test)

        return y, y_hat
    def _get_y_hat(self, sim, treshold):
        return sim > torch.tensor(treshold, device=device)
    def _get_y_true(self, y_train, y_test):
        return y_test[:, None] == y_train

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
from time import time

In [28]:
class CV:
    def __init__(self,  model, tresholds):
        self.tresholds = tresholds
        self.model = model
        
    def calculate(self, train_file, validation_file, chunksize=100000, validation_chunksize=1, validation_limit=None):
        scores = []
        
        acc = BalancedAccuracyMeter()
        counter = Counter()
        validation_counter = Counter()
        
        for treshold in tresholds:
            score = []
            
            validation_df = pd.read_csv(validation_file, 
                                        chunksize=validation_chunksize,
                                        header=0,
                                        converters={'image': read_string_array})
            
            for validation_chunk in validation_df:
                start = time()
                X_test, y_test = read_chunk(validation_chunk, device)
                print('Validation read:', time() - start)
                
                if validation_limit and validation_counter.count(X_test) > validation_limit:
                    break
                
                
                
                for chunk in train_df:
                    print('Treshold', treshold,
                          'Train:', counter.count(chunk),
                          'Validation:', validation_counter.get())

                    start = time()
                    X_train, y_train = read_chunk(chunk, device)
                    print('Train read:', time() - start)
                    y, y_hat = self.model.predict(X_train, y_train, X_test, y_test, treshold)
                    del X_train
                    del y_train
                    acc.add(y, y_hat)

                accuracy = acc.calculate()
                score.append(accuracy)
                print('Accuracy:', torch.mean(accuracy))
        
                acc.reset()
                counter.reset()
                
                del X_test
                del y_test
            score = torch.cat(score, 0)
            print(score)
            mean_score = torch.mean(score)
            scores.append(mean_score)
            validation_counter.reset()
            
        scores = torch.tensor(scores)
        best_index = torch.argmax(scores)
        best = self.tresholds[best_index]
        best_score = scores[best_index]
        
        return best, best_score.cpu().numpy(), scores.cpu().numpy()

In [29]:
#                          start end step
tresholds = list(np.arange(0.0, 1.1, 0.1))

In [30]:
model = Model()

In [31]:
cv = CV(model, tresholds)

In [32]:
def save_treshold(treshold, file='../data/treshold.npy'):
    np.save(file, [treshold])
def load_treshold(file='../data/treshold.npy'):
    return np.load(file)[0]
    

In [33]:
torch.cuda.empty_cache()

In [34]:
best, best_score, scores = cv.calculate(train_file, validation_file, chunksize=350000, validation_chunksize=3000, validation_limit=9000)
# best, best_score = cv.calculate(train_file, validation_file, chunksize=100, validation_chunksize=10)
best, best_score, scores

Validation read: 2.844519853591919
Treshold 0.0 Train: 343350 Validation: 3000
Train read: 20.031020879745483
Accuracy: tensor(0.7309)
Validation read: 0.004582405090332031
Treshold 0.0 Train: 343350 Validation: 6000
Train read: 1.2230401039123535


RuntimeError: CUDA out of memory. Tried to allocate 3.84 GiB (GPU 1; 10.92 GiB total capacity; 6.42 GiB already allocated; 2.83 GiB free; 6.44 GiB reserved in total by PyTorch)

In [None]:
save_treshold(best.cpu().numpy())

In [None]:
plt.figure()
plt.xlabel('Treshold')
plt.ylabel('Accuracy')
plt.plot(tresholds, scores)
plt.savefig('0.jpg')

In [None]:
tresholds = list(np.arange( float(best - 0.1), float(best + 0.09), 0.01))

In [None]:
best, best_score, scores = cv.calculate(train_file, validation_file, chunksize=100000, validation_chunksize=1000, validation_limit=9000)
best, best_score, scores

In [None]:
save_treshold(best.cpu().numpy())

In [None]:
plt.figure()
plt.xlabel('Treshold')
plt.ylabel('Accuracy')
plt.plot(tresholds, scores)
plt.savefig('1.jpg')

In [None]:
treshold = load_treshold()
treshold