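To run this notebook, install the following libraries with pip:
- sentence_transformers==3.0.1
- pandas==2.2.2
- numpy==1.26.4
- tqdm==4.66.2
- plotly==5.23.0
- scikit-learn (imported below for `NearestNeighbors`; no version was pinned in the original list)
- a parquet engine for pandas, e.g. pyarrow (required by `pd.read_parquet` / `DataFrame.to_parquet`)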

In [3]:
import pandas as pd
from quantization import *

# Read the games table and split each row's comma-separated tag string into a list.
df = pd.read_parquet('games.parquet')
df['Tags'] = df['Tags'].map(lambda tags: tags.split(','))
# Show only the game names as the cell output.
df[['Name']]

Unnamed: 0,Name
0,Galactic Bowling
1,Train Bandit
2,TD Worlds
3,MazM: Jekyll and Hyde
4,Deadlings: Rotten Edition
...,...
41931,Drop Doll
41932,Ant Farm Simulator
41933,The Holyburn Witches
41934,Digital Girlfriend


Regrettably, the vector file is over 100 MB, so I could not upload it to GitHub. <br>
If you wish to replicate the experiment, note that performing the encoding from scratch can take around 1h.

In [None]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np
from pathlib import Path

# Registers `progress_apply` on pandas objects; kept for any code that still relies on it.
tqdm.pandas()

# Location of the cached embeddings. Delete this file to force a fresh encoding run.
ENCODED_PATH = Path('encoded_name.parquet')

# The model is always created: the k-NN cells further down call `model.encode` on the query text.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cpu')

if ENCODED_PATH.exists():
	# Encoding only has to be done once: reuse the stored frame (it already holds the 'vector' column).
	df = pd.read_parquet(ENCODED_PATH)
else:
	# Encode all names in one batched call instead of one `encode` call per row.
	# str() mirrors the per-row conversion so non-string names are handled the same way.
	names = [str(name) for name in df['Name']]
	embeddings = model.encode(names, show_progress_bar=True)
	# One 1-D vector per row, as before.
	df['vector'] = list(embeddings)
	df.to_parquet(ENCODED_PATH, index=None)

In [None]:
# Reload the encoded frame from disk (lets the notebook resume after a shutdown)
# and gather the per-row vectors into a single array.
df = pd.read_parquet('encoded_name.parquet')
vector_rows = df['vector'].tolist()
sample_vectors = np.array(vector_rows)

### ft-29 visualization

In [None]:
import plotly.graph_objects as go

# Shared iterator over 0 .. len(sample_vectors) - 1.
numbers = iter(range(len(sample_vectors)))


def get_next_number():
	"""Return the next value of the shared `numbers` iterator."""
	return next(numbers)

In [1218]:
# Find the non-normalized features: pair every feature's mean with its index,
# sort ascending and keep the five lowest means.
# The feature count is read from the data instead of being hard-coded.
sorted([[sample_vectors[:, x].mean(), x] for x in range(sample_vectors.shape[-1])])[0:5]

[[-0.06290641, 29],
 [-0.05425583, 218],
 [-0.045538213, 215],
 [-0.0421522, 165],
 [-0.04160071, 296]]

In [1219]:
# Histogram of one embedding dimension: column 29 of `sample_vectors`.
# `next_n` is pinned to 1 here; use `next_n = get_next_number()` to step through the counter instead.
# NOTE(review): the figure title is `next_n`, not the plotted feature index - confirm this is intended.
next_n = 1
array = sample_vectors[:, 29]

# Fixed bin width of 0.01.
histogram_trace = go.Histogram(x=array, xbins=dict(size=0.01))
fig = go.Figure(data=[histogram_trace])

# Layout: title, axis labels, and an x axis limited to [-1, 1].
layout_options = dict(
	title=str(next_n),
	xaxis_title='Value',
	yaxis_title='Frequency',
	xaxis=dict(range=[-1, 1], nticks=100),
)
fig.update_layout(**layout_options)

# Render the figure.
fig.show()

In [None]:
# Fit one quantizer per embedding dimension with feature_mapping (method 'density', data type 'int8').
# NOTE(review): the binary ft-Q cell further down rebuilds `quantizer_list` and `quantized_tfQ`.
quantizer_list = []
for feature_idx in range(sample_vectors.shape[-1]):
	# feature_mapping receives a copy of the column, never a view into sample_vectors.
	feature_values = sample_vectors[:, feature_idx].copy()
	mapping, mapping_q = feature_mapping(feature_values, method='density', data_type='int8')
	quantizer_list.append(dict(scaled_arr=mapping, scaled_arr_q=mapping_q))

# Quantize every vector with the fitted quantizers, round, then map the result back.
quantized_tfQ = quantize_vector(sample_vectors, quantizer_list).round()
dequantized_tfQ = dequantize_vector(quantized_tfQ, quantizer_list)

# knn

### regular quantization

In [14]:
# Baseline: quantize all vectors with quantize_regular (data type 'binary').
quantized_regular = quantize_regular(
	sample_vectors,
	method='quantize',
	data_type='binary',
)

In [12]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Index the regular-quantized vectors (cast to bool) for 10-nearest-neighbour search with cosine distance.
knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(quantized_regular.astype('bool'))

# Encode the query text, quantize it the same way as the corpus and add a leading batch axis.
vector = quantize_regular(model.encode('medieval'), method='quantize', data_type='binary')
new_point = vector.astype('bool')[np.newaxis]
distances, indices = knn.kneighbors(new_point)

# Names of the retrieved neighbours, in the order returned by kneighbors.
df['Name'].iloc[indices[0]].tolist()

['Going Medieval',
 'Medievalien',
 'Medieval Towns',
 'Surviving Medieval',
 'Medieval Dynasty',
 'Medieval Battlefields',
 'Medieval Story',
 'Medieval Monarch',
 'Medieval Playground',
 'Medieval Wars']

### ft-Q

In [17]:
# ft-Q: fit one quantizer per embedding dimension with feature_mapping (method 'density', data type 'binary').
quantizer_list = []
for feature_idx in range(sample_vectors.shape[-1]):
	# feature_mapping receives a copy of the column, never a view into sample_vectors.
	feature_values = sample_vectors[:, feature_idx].copy()
	mapping, mapping_q = feature_mapping(feature_values, method='density', data_type='binary')
	quantizer_list.append(dict(scaled_arr=mapping, scaled_arr_q=mapping_q))

# Quantize every vector with the fitted quantizers, round, then map the result back.
quantized_tfQ = quantize_vector(sample_vectors, quantizer_list).round()
dequantized_tfQ = dequantize_vector(quantized_tfQ, quantizer_list)

In [18]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

def quantize_query(array, quantizers=None):
	"""Quantize a single query vector feature by feature.

	Args:
		array: indexable sequence holding one value per feature; entry ``i`` is
			quantized with the ``i``-th quantizer.
		quantizers: optional list of dicts with the keys ``'scaled_arr'`` and
			``'scaled_arr_q'``. Defaults to the notebook-level ``quantizer_list``.

	Returns:
		numpy.ndarray: the per-feature results of ``quantize_feature``, collected
		into an array, transposed and rounded.
	"""
	if quantizers is None:
		# Looked up at call time so the most recently fitted quantizers are used.
		quantizers = quantizer_list
	# The number of features follows the quantizers instead of a hard-coded 384.
	quantized_features = [
		quantize_feature(array[feature_index], quantizer['scaled_arr'], quantizer['scaled_arr_q'])
		for feature_index, quantizer in enumerate(quantizers)
	]
	return np.array(quantized_features).T.round()

# Index the ft-Q quantized vectors (cast to bool) for 10-nearest-neighbour search with cosine distance.
knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(quantized_tfQ.astype('bool'))

# Encode the query text, quantize it with the fitted per-feature quantizers and add a leading batch axis.
vector = quantize_query(model.encode('medieval'))
new_point = vector.astype('bool')[np.newaxis]
distances, indices = knn.kneighbors(new_point)

# Names of the retrieved neighbours, in the order returned by kneighbors.
df['Name'].iloc[indices[0]].tolist()

['Going Medieval',
 'Surviving Medieval',
 'Medieval Wars',
 'Medieval Dynasty',
 'Medievalien',
 'Medieval Towns',
 'Medieval Monarch',
 'Grand Ages: Medieval',
 'Medieval Story',
 'Medieval Battlefields']

### validation

In [None]:
# Baseline: quantize all vectors with quantize_regular (data type 'binary').
quantized_regular = quantize_regular(
	sample_vectors,
	method='quantize',
	data_type='binary',
)

# ft-Q: fit one quantizer per embedding dimension (method 'density', data type 'binary').
quantizer_list = []
for feature_idx in range(sample_vectors.shape[-1]):
	# feature_mapping receives a copy of the column, never a view into sample_vectors.
	feature_values = sample_vectors[:, feature_idx].copy()
	mapping, mapping_q = feature_mapping(feature_values, method='density', data_type='binary')
	quantizer_list.append(dict(scaled_arr=mapping, scaled_arr_q=mapping_q))
quantized_tfQ = quantize_vector(sample_vectors, quantizer_list).round()
dequantized_tfQ = dequantize_vector(quantized_tfQ, quantizer_list)

# Error of each approach: 0.5 minus (sum of the quantized values / number of entries in sample_vectors).
total_entries = sample_vectors.size
err_regular = .5 - quantized_regular.sum() / total_entries
err_ftQ = .5 - quantized_tfQ.sum() / total_entries
# Difference of the absolute errors; a positive value means the regular approach deviates more than ft-Q.
err_total = abs(err_regular) - abs(err_ftQ)
err_total

0.012901293538566672