# Hierarchical Vector Concatenation

In this notebook, we are going to simulate the concatenation of two vectors using the HVC technique

### defining functions

In [30]:
import pandas as pd
import random
from itertools import combinations, product
from scipy.spatial import distance
import numpy as np
from sklearn.neighbors import NearestNeighbors
import json
import math

def vector_(vector, new_vector_length, x):
    vector = [(vector[i], vector[i]*x)[i < new_vector_length] for i in range(len(vector))]
    vector = np.array(vector)
    return vector

def test_knn(df, class_n, minimum_scale_coefficient, test_amount, sample_id=None, minimum_sample_n=0, verbose=False):
    '''
    tests each individual class sample vs all the other sample in the dataset
    when minimum_scale_coefficient==-1 means calculation cannot be done: WE DO NOT RAISE ERROR
    '''
    if minimum_scale_coefficient == -1:
        if verbose:
            print('minimum_scale_coefficient == -1')
            print('inconsistencies:', -1)
        return True

    class Found(Exception): pass
    # df = pd.read_parquet(f'{exp}/df_hierarchical_vector.parquet') #we reset the vectors so they do not kee scaling per iteration
    df_ = df.copy()

    # test for inefficiencies
    key1 = class_n
    if sample_id:
        samples = [sample_id]
    else:
        samples = [*df_[df_['class_1']==key1].index]

    for r in samples:
        if r >= minimum_sample_n:

            if verbose : print('**', '\tsample:', r, '\tmsc:', minimum_scale_coefficient)
            
            for msc_variation in [['less_than', minimum_scale_coefficient-test_amount], ['more_than', minimum_scale_coefficient+test_amount]]:
                
                # df = pd.read_parquet(f'{exp}/df_hierarchical_vector.parquet') #we reset the vectors so they do not keep scaling per iteration
                df_ = df.copy()
                df_['vector_full'] = df_['vector_full'].apply(lambda x : vector_(x, new_vector_length, msc_variation[1]))

                # classes
                nbrs = NearestNeighbors(n_neighbors=len(df_), metric='euclidean').fit(df_['vector_full'].values.tolist())
                distances, indices = nbrs.kneighbors([df_['vector_full'].iloc[r]]) # EDIT VECTOR 0 HERE

                list1 = [df_.iloc[x]['class_1'].values.tolist() for x in indices][0]
                counter = 1
                for i in range(len(list1)-1):
                    # count contaminations
                    el = list1[i], list1[i+1]
                    if el[0] != el[1]:
                        # print(r, indices[0][i:i+1], list1[i:i+1], distance.euclidean(df_.iloc[indices[0][0]]['vector_full'], df_.iloc[indices[0][i]]['vector_full']))
                        counter += 1

                inconsistencies = counter-n_classes
                if msc_variation[0] == 'less_than':
                    # should have inconsistencies, because we test it on a scaler < msc+test_amount

                    if inconsistencies == 0:
                        if verbose : 
                            print('<', 'ERR')
                            print('<', 'inconsistencies:', inconsistencies)
                        # raise Found
                        return False
                    else:
                        if verbose :
                            print('<', 'inconsistencies:', inconsistencies)
                        
                if msc_variation[0] == 'more_than':
                    # should not have inconsistencies, because we test it on a scaler > msc+test_amount

                    if inconsistencies == 0:
                        if verbose : 
                            print('>', 'inconsistencies:', inconsistencies)
                        
                    else:
                        if verbose :
                            print('>', 'ERR')
                            print('>', 'inconsistencies:', inconsistencies)
                        return False
                        # raise Found
                
            # has not encountered exit because of errors, so far
            return True
                        
        if sample_id is not None:
            return True
            
def triangular_extension(A, B, C, concat_vector_length):
    
    if not len(A)==len(B)==len(C):
        raise Exception('all vectors must be of the same length')

    n = len(A)
    m = concat_vector_length
    numerator = 0
    for i in range(m, n):
        numerator += (2*A[i]*(B[i]-C[i]) + C[i]**2 - B[i]**2)

    denominator = 0
    for i in range(m):
        denominator += ( 2*A[i]*(C[i]-B[i]) + B[i]**2 - C[i]**2 )
    
    try:
        # AB will never be larger than AC
        # issue 1: negative quotient (sqrt is impossible)
        # issue 2: denominator is 0
        total = numerator/denominator
        total = math.sqrt(total)
        return total   
    except:
        # AB can be larger than AC
        return -1
    
def get_msc(df_total, sample_id, verbose=False):
    """
    we get the exact minimum scale coefficient given a sample.
    in total, a given A, B, C will have the same samples with an msc score for each step of a single path
    for a given A, we need to get the highest msc of the sample for it to work
    """
    df_ = df_total[(df_total['index']==sample_id)]
    if verbose :
        display(df_)
    return df_.minimum_scale_coefficient.max()

letters = 'abcdefghijklmnopqrstwxyz'
letters = letters.upper()
def create_class(k):
    class_name = ''.join(random.choices(letters, k=k))
    return class_name

### creating sample dataset

In [None]:
#   non-labeled df
initial_vector_n = 2000 # 1h computation
initial_vector_length = 80
df = pd.DataFrame([[[random.random()*2-1 for x in range(initial_vector_length)] for k in range(initial_vector_n)]]).T
df.columns = ['vector_0']
df['vector_full'] = df['vector_0'] 
df

#   labeled df
for depth in range(1):
    # we start counting from 1, the previous vector start from depth 0
    depth += 1
    n_classes = 8
    new_vector_length = 100

    # the vector for each class has to be identical
    classes = [(str(x), '0'+str(x))[x<10] for x in range(n_classes)]
    # classes = ['A', 'B', 'C', 'D']
    arr1 = {key:[random.random()*2-1 for x in range(new_vector_length)] for key in classes}
    df_new = pd.DataFrame([[x, arr1[x]] for x in random.choices(classes, k=len(df))])
    df_new.columns = [f'class_{depth}', f'vector_{depth}']
    df_new_clean = df_new.drop_duplicates(f'class_{depth}')

    df = pd.concat([df_new, df], axis=1)

    # for now we just sum it: we scale it later
    df['vector_full'] = df[f'vector_{depth}'] + df['vector_full']
    df['vector_new'] = df[f'vector_{depth}']

exp = 'exp1'

# save records
df.to_parquet(f'{exp}/df_hierarchical_vector.parquet', index=None)

### computing msc for every sample in the dataset

In [4]:
exp = 'exp1'
depth = 1
df = pd.read_parquet(f'{exp}/df_hierarchical_vector.parquet')
df_new_clean = df[['class_1', 'vector_1']].drop_duplicates(f'class_{depth}')
n_classes = len(df_new_clean.class_1.unique())
new_vector_length = len(df.vector_1[0])
selection_method = 'precise' # 'approximate' 'precise'

seq = dict()
for key_A in [*df_new_clean['class_1']]:
    nbrs = NearestNeighbors(n_neighbors=len(df_new_clean), metric='euclidean').fit(df_new_clean['vector_1'].values.tolist())
    distances, indices = nbrs.kneighbors(df_new_clean[df_new_clean['class_1']==key_A]['vector_1'].values.tolist())
    seq[key_A] = [*df_new_clean['class_1'].iloc[indices[0]].values]

t = list()
total = list()
for key_A in [*seq.keys()]:
    if key_A not in ['m']: #[*df_total[0].unique()][:-1]:

        df = pd.read_parquet(f'{exp}/df_hierarchical_vector.parquet')

        for _ in seq[key_A]:
            # find the path from key_A to all the following keys in order of distance
            sequence_ = list()
            for index in range(len(seq[key_A])-1):
                sequence_.append([seq[key_A][index], seq[key_A][index+1]])
            sequence_

        for s in sequence_:
            key_B = s[0]
            key_C = s[1]

            if selection_method == 'precise':
                # we take into account ALL THE OTHER POINTS: heavy computations required
                nbrs_B = NearestNeighbors(n_neighbors=len(df[df['class_1']==key_B]), metric='euclidean').fit(df[df['class_1']==key_B]['vector_full'].values.tolist())
                nbrs_C = NearestNeighbors(n_neighbors=len(df[df['class_1']==key_C]), metric='euclidean').fit(df[df['class_1']==key_C]['vector_full'].values.tolist())
            elif selection_method == 'approximate':
                # we apply k-means to every class
                # df_sample = 
                pass
                
            counter = 0
            for k in df[df['class_1']==key_A].reset_index().values.tolist():
                point_A = k[0] # index is the absolute one
                point_A_vector = k[4]

                distances_B, indices_B = nbrs_B.kneighbors([point_A_vector])
                distances_C, indices_C = nbrs_C.kneighbors([point_A_vector])

                # given point_A, find the furthest in cluster_B
                point_B = indices_B[0][-1]
                point_B_vector = df[df['class_1']==key_B]['vector_full'].iloc[point_B].tolist()
                longest_vectors = (point_A_vector, point_B_vector)

                # given point_A, find the shortest in cluster_C
                point_C = indices_C[0][0]
                point_C_vector = df[df['class_1']==key_C]['vector_full'].iloc[point_C].tolist()
                shortest_vectors = (point_A_vector, point_C_vector)

                x = triangular_extension(
                    point_A_vector, 
                    point_B_vector,
                    point_C_vector, 
                    concat_vector_length=new_vector_length
                )

                total.append([key_A, key_B, key_C, point_A, x])
                t.append([point_A_vector, point_B_vector, point_C_vector])

                # print(point_A_vector, point_B_vector, point_C_vector)
                counter += 1           

            # print insights
            df_total = pd.DataFrame(total)
            df_total.columns = ['A', 'B', 'C', 'index', 'minimum_scale_coefficient']
            minimum_scale_coefficient_key_B = df_total[(df_total['A']==key_A) & (df_total['B']==key_B)]['minimum_scale_coefficient'].max()
            print('A:', key_A, '\tB:', key_B, '\tC:', key_C, '\t**', minimum_scale_coefficient_key_B)

df_total.to_parquet(f'{exp}/total.parquet', index=None)

A: 06 	B: 06 	C: 00 	** 0.9913065877383754
A: 06 	B: 00 	C: 02 	** 7.555059784706563
A: 06 	B: 02 	C: 07 	** 2.8713008958269493
A: 06 	B: 07 	C: 04 	** 14.298526483669757
A: 06 	B: 04 	C: 03 	** 7.432602857308244
A: 06 	B: 03 	C: 01 	** 15.224730932063627
A: 06 	B: 01 	C: 05 	** 2.0379467940107223
A: 04 	B: 04 	C: 02 	** 1.0681853264605454
A: 04 	B: 02 	C: 07 	** 2.8806884968049773
A: 04 	B: 07 	C: 00 	** 3.8841708656663663
A: 04 	B: 00 	C: 06 	** 7.859493981125199
A: 04 	B: 06 	C: 01 	** 3.51339300834529
A: 04 	B: 01 	C: 05 	** 4.586826363396587
A: 04 	B: 05 	C: 03 	** 3.178070112717601
A: 02 	B: 02 	C: 04 	** 1.018051413885821
A: 02 	B: 04 	C: 06 	** 3.800601126084943
A: 02 	B: 06 	C: 00 	** 2.2572550229747934
A: 02 	B: 00 	C: 05 	** 5.486983854769195
A: 02 	B: 05 	C: 03 	** 13.158939926284905
A: 02 	B: 03 	C: 01 	** 12.068375689970424
A: 02 	B: 01 	C: 07 	** 9.27043678279616
A: 05 	B: 05 	C: 07 	** 0.9104126771652138
A: 05 	B: 07 	C: 00 	** 3.3305959902558055
A: 05 	B: 00 	C: 02 	**

In [42]:
df_total

Unnamed: 0,A,B,C,index,minimum_scale_coefficient,pass
0,06,06,00,0,0.916085,False
1,06,06,00,3,0.842967,False
2,06,06,00,13,0.895831,False
3,06,06,00,33,0.831733,False
4,06,06,00,37,0.857994,False
...,...,...,...,...,...,...
13995,01,05,07,1949,5.130371,False
13996,01,05,07,1956,4.680801,False
13997,01,05,07,1962,4.649634,False
13998,01,05,07,1967,4.495262,False


### testing a single sample
By using the test knn function on a sample (0), we are going to measure the number of concatenations after multiplying the concatenated vector with msc.

In [31]:
exp = 'exp1'
depth = 1
df = pd.read_parquet(f'{exp}/df_hierarchical_vector.parquet')
df_new_clean = df[['class_1', 'vector_1']].drop_duplicates(f'class_{depth}')
n_classes = len(df_new_clean.class_1.unique())
new_vector_length = len(df.vector_1[0])

In [32]:
df = pd.read_parquet(f'{exp}/df_hierarchical_vector.parquet') #we reset the vectors so they do not kee scaling per iteration
df_total = pd.read_parquet(f'{exp}/total.parquet')

# test msc of single sample
sample_id = 0
# IF WE ADD TO OR SUBTRACT ANY NUMBER FROM msc, it should fail
msc = get_msc(df_total, sample_id=sample_id, verbose=False)

test_amount = 0.00001
class_n = [*df_total[df_total['index']==sample_id]['A']][0]
test_knn(df, class_n=class_n, minimum_scale_coefficient=msc, test_amount=test_amount, sample_id=sample_id, verbose=False)

True

### visualize class separation

In [35]:
df_ = df.copy()

#
nbrs = NearestNeighbors(n_neighbors=len(df_), metric='euclidean').fit(df_['vector_full'].values.tolist())
distances, indices = nbrs.kneighbors([df_['vector_full'].iloc[sample_id]]) # EDIT VECTOR 0 HERE
print('regular concatenation')
print(df_['class_1'].iloc[indices[0]].values.tolist()[0:1000], '\n')

#
msc = df_total['minimum_scale_coefficient'].max()
df_['vector_full'] = df_['vector_full'].apply(lambda x : vector_(x, new_vector_length, msc))
nbrs = NearestNeighbors(n_neighbors=len(df_), metric='euclidean').fit(df_['vector_full'].values.tolist())
distances, indices = nbrs.kneighbors([df_['vector_full'].iloc[sample_id]]) # EDIT VECTOR 0 HERE
print('hierarchical concatenation')
print(df_['class_1'].iloc[indices[0]].values.tolist()[0:1000])

regular concatenation
['06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06', '06',

### counting geometrical impossibilities, if any

In [9]:
# count geometrical impossibilities
len(df_total[df_total['minimum_scale_coefficient']==-1])

0

### testing the entire dataset by adding/removing epsilon (test_score)

In [60]:
from tqdm import tqdm

class Found(Exception): pass

for key_A in [*df_total['A'].unique()]:
    for sample_index in tqdm(range(len(df_total[df_total['A']==key_A]['index'])), desc=f'class: {key_A}'):
        sample_id = df_total[df_total['A']==key_A]['index'].iloc[sample_index]
        msc = get_msc(df_total, sample_id=sample_id, verbose=False)
        test = test_knn(
            df, 
            class_n=key_A, 
            minimum_scale_coefficient=msc, 
            test_amount=test_amount,
            sample_id=sample_id, 
            verbose=False
        )
        if test == False : 
            raise Found

class: 06: 100%|██████████| 1701/1701 [07:55<00:00,  3.57it/s]
class: 04: 100%|██████████| 1792/1792 [08:17<00:00,  3.60it/s]
class: 02: 100%|██████████| 1799/1799 [08:38<00:00,  3.47it/s]
class: 05: 100%|██████████| 1736/1736 [08:19<00:00,  3.47it/s]
class: 03: 100%|██████████| 1848/1848 [08:41<00:00,  3.54it/s]
class: 07: 100%|██████████| 1743/1743 [08:11<00:00,  3.54it/s]
class: 00: 100%|██████████| 1631/1631 [07:42<00:00,  3.53it/s]
class: 01: 100%|██████████| 1750/1750 [08:16<00:00,  3.52it/s]


### computing merge function

In [93]:
import plotly.graph_objects as go

def plot_series(series):
    x_values = [item[0] for item in series]
    y_values = [item[1] for item in series]
    
    fig = go.Figure(data=go.Scatter(x=x_values, y=y_values, mode='lines'))
    fig.update_layout(
        title='HVC-function',
        xaxis_title='msc',
        yaxis_title='contaminations',
        template='plotly_dark'
    )
    fig.show()

# count contaminationsA
fn_passing = list()
passing_score = 0
while passing_score < df_total['minimum_scale_coefficient'].max():
    passing_score += 0.05
    df_total['pass'] = df_total['minimum_scale_coefficient'].apply(lambda x : x > passing_score)
    merge_intensity = len(df_total[df_total['pass']]) #/len(df_total)
    fn_passing.append([passing_score, merge_intensity])

plot_series(fn_passing)

By inverting the function we can input the desired number of contaminations, and get the msc required to obtain it

In [94]:
import plotly.graph_objects as go

def plot_series(series):
    x_values = [item[1] for item in series]
    y_values = [item[0] for item in series]
    
    fig = go.Figure(data=go.Scatter(x=x_values, y=y_values, mode='lines'))
    fig.update_layout(
        title='HVC-function',
        xaxis_title='contaminations',
        yaxis_title='msc',
        template='plotly_dark'
    )
    fig.show()

# count contaminationsA
fn_passing = list()
passing_score = 0
while passing_score < df_total['minimum_scale_coefficient'].max():
    passing_score += 0.05
    df_total['pass'] = df_total['minimum_scale_coefficient'].apply(lambda x : x > passing_score)
    merge_intensity = len(df_total[df_total['pass']]) #/len(df_total)
    fn_passing.append([passing_score, merge_intensity])

plot_series(fn_passing)

After normalization, we can simply input the desired merge intensity and get the msc required to obtain it

In [95]:
def plot_series(series):
    x_values = [item[0] for item in series]
    y_values = [item[1] for item in series]
    
    fig = go.Figure(data=go.Scatter(x=x_values, y=y_values, mode='lines'))
    fig.update_layout(
        title='HVC-function',
        xaxis_title='merge_intensity',
        yaxis_title='msc',
        template='plotly_dark'
    )
    fig.show()

# count contaminations
fn_passing = list()
passing_score = 0
while passing_score < df_total['minimum_scale_coefficient'].max():
    passing_score += 0.05
    df_total['pass'] = df_total['minimum_scale_coefficient'].apply(lambda x : x > passing_score)
    merge_intensity = len(df_total[df_total['pass']])/len(df_total)
    fn_passing.append([merge_intensity, passing_score])

plot_series(fn_passing)