In [1]:
import os
import warnings

# Move the working directory two levels up, to the root of this repo, so that
# the repo-relative data paths used by the following cells resolve.
# os.path.dirname is used instead of splitting the path on '\\': on systems whose
# separator is '/', the split left nothing to join and os.chdir('') raised.
# NOTE: the move is relative to the current directory, so running this cell
# again without restarting the kernel climbs two more levels.
os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))
# silence every warning for the rest of the session
warnings.simplefilter("ignore")

import pandas as pd
import numpy as np
from simtag.filter import simtag_filter




In [2]:
# load the raw games table and discard every row that has a missing value
df = pd.read_parquet('notebooks/steam-games/games.parquet').dropna()

# 'Tags' and 'Genres' hold comma-separated strings; split each one into a list
for column_name in ('Tags', 'Genres'):
    df[column_name] = df[column_name].apply(lambda joined: joined.split(','))

# remove the columns that are not needed in this notebook
dropped_columns = ['game_vector', 'game_indices', 'Score', 'Recommendations']
df = df.drop(columns=dropped_columns)

# one list of tags per remaining row
sample_list = df['Tags'].tolist()

### client

In [25]:
# create the recommender engine from the per-game tag lists, naming the model
# to use and the length of the covariate vectors
engine = simtag_filter(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    covariate_vector_length=384,
    sample_list=sample_list,
)

In [26]:
# Compute M from scratch (the path to use when no saved copy exists).
# compute_M returns a pair: M and a dataframe df_M; a later cell reads
# df_M['vector_tags'].
M, df_M = engine.compute_M(method='encoding')
# Optional: save df_M so that later runs can skip the computation.
# df_M.to_parquet('notebooks/steam-games/M.parquet')

# Alternative when a saved copy exists: load it instead of recomputing.
# df_M = pd.read_parquet('notebooks/steam-games/M.parquet')
# engine.load_M(df_M) # TODO: the encryption option still has to be applied on this path

100%|██████████| 446/446 [00:08<00:00, 52.92it/s]


In [27]:
def shuffle_columns(M, rng=None):
    """Randomly permute the rows of a 2-D matrix.

    Despite the name (kept so that existing callers keep working), the
    permutation is applied along axis 0, i.e. to the rows of ``M``; the
    columns stay in place.

    Parameters
    ----------
    M : numpy.ndarray
        Two-dimensional array whose rows are reordered.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the global ``np.random`` state is
        used; pass a seeded generator to get a reproducible shuffle.

    Returns
    -------
    M_shuffled : numpy.ndarray
        Array with ``M_shuffled[i] == M[permutation[i]]`` for every row ``i``.
    permutation : numpy.ndarray
        The row order that was applied; ``permutation[i]`` is the original
        index of the row now stored at position ``i``.
    """
    n_rows = M.shape[0]
    if rng is None:
        permutation = np.random.permutation(n_rows)
    else:
        permutation = rng.permutation(n_rows)
    M_shuffled = M[permutation, :]
    return M_shuffled, permutation

def generate_key(size):
    """Return a random ``size`` x ``size`` matrix of uint8 values."""
    key_shape = (size, size)
    # the upper bound of randint is exclusive, so values range from 0 to 255
    return np.random.randint(low=0, high=256, size=key_shape, dtype=np.uint8)

def orthogonal_transformation(M_shuffled, A):
    """Right-multiply ``M_shuffled`` by ``A`` and return the product.

    Only the product is computed; nothing here checks that ``A`` is orthogonal.
    """
    return np.dot(M_shuffled, A)

In [28]:
# attach the tag dataframe to the engine and rebuild M as an array from its
# 'vector_tags' column
engine.df_M = df_M
engine.M = np.array(df_M['vector_tags'].tolist())

### encryption-1: shuffling
# shuffle_columns permutes the rows of M; the permutation is stored on the
# engine because encrypt_tags later uses it to map a tag index to its shuffled
# position
# NOTE(review): no random seed is set in the visible cells, so the permutation
# and A below differ on every run
engine.M_shuffled, engine.permutation = shuffle_columns(engine.M)

### encryption-2: orthogonal_transformation
# A is the Q factor of the QR decomposition of a random square Gaussian matrix,
# which makes it an orthogonal matrix
A, _ = np.linalg.qr(np.random.randn(engine.covariate_vector_length, engine.covariate_vector_length))
engine.M_transformed = orthogonal_transformation(engine.M_shuffled, A)

# from here on the engine holds the shuffled and transformed matrix as M
engine.M = engine.M_transformed
# mean over axis 0 of the encrypted matrix; encode_query_encrypted adds the
# adjusted query vector to it
engine.M_encrypted_mean = np.mean(engine.M, axis=0)

# we either compress or expand M
engine.compute_adjusting_transformation()
# pca is now computed (presumably available as engine.pca, which a later cell reads — confirm)
# pca is now computed

In [29]:
from tqdm import tqdm

def encrypt_tags(query_tag_list, permutation):
    """Map tags to their plain and their shuffled index positions.

    Every tag is looked up in ``engine.tag_list`` (``engine`` is a module-level
    object) to obtain its plain index. Its encrypted index is the first
    position of ``permutation`` that holds that plain index.

    Returns the pair ``(indexes, encrypted_indexes)``: two lists of ints in the
    order of ``query_tag_list``. A tag missing from ``engine.tag_list`` raises
    ``ValueError``.
    """
    known_tags = engine.tag_list
    indexes = [known_tags.index(tag) for tag in query_tag_list]

    encrypted_indexes = []
    for plain_index in indexes:
        # positions of the permutation whose value is this plain index
        positions = np.where(permutation == plain_index)[0]
        encrypted_indexes.append(positions.tolist()[0])

    return indexes, encrypted_indexes

def encode_samples_encrypted(sample_list_encrypted, n_tags):
    """Encode every sample (a list of encrypted tag indexes) as an adjusted vector.

    For each sample, a zero vector of length ``n_tags`` is created, the entries
    at the sample's indexes are set to 1, and the vector is passed through
    ``engine.adjust_oneshot_vector`` (``engine`` is a module-level object).
    Progress is reported through tqdm.

    Returns a list with one adjusted vector per sample, in input order.
    """

    def to_adjusted_vector(encrypted_indexes):
        # 1 at every encrypted tag position, 0 elsewhere
        multi_hot = np.zeros(n_tags)
        for position in encrypted_indexes:
            multi_hot[position] = 1
        return engine.adjust_oneshot_vector(multi_hot)

    return [
        to_adjusted_vector(sample)
        for sample in tqdm(sample_list_encrypted, desc="processing samples")
    ]

def encode_query_encrypted(encrypted_indexes, n_tags):
    """Build the query vector for a list of encrypted tag indexes.

    A zero vector of length ``n_tags`` gets a 1 at every encrypted index, is
    passed through ``engine.adjust_oneshot_vector``, and the result is added to
    ``engine.M_encrypted_mean`` (``engine`` is a module-level object).

    Returns the sum of the stored mean and the adjusted query vector.
    """
    # 1 at every encrypted query-tag position, 0 elsewhere
    query_onehot = np.zeros(n_tags)
    for position in encrypted_indexes:
        query_onehot[position] = 1

    adjusted_query = engine.adjust_oneshot_vector(query_onehot)

    # the query is expressed as an offset from the mean of the encrypted matrix
    return engine.M_encrypted_mean + adjusted_query

# recommendation process

In [30]:
# client: translate the tags of every sample into encrypted (shuffled) indexes;
# encrypt_tags returns (plain, encrypted) and only the encrypted half is kept
sample_list_encrypted = [
    encrypt_tags(sample_tags, engine.permutation)[1]
    for sample_tags in sample_list
]
print(sample_list_encrypted[:5])

# information to be shared with the db
n_tags = len(engine.tag_list)
engine_pca = engine.pca

[[442, 297, 82, 88], [442, 115, 43, 364, 180, 223, 71, 171, 273, 232, 222, 297, 142, 102, 189, 209, 392, 242, 289, 376], [214, 393, 251, 256, 332, 364, 329, 189, 396, 298, 375, 260, 249, 171, 306, 130, 412, 232, 240, 61], [390, 315, 427, 249, 232, 252], [115, 442, 390, 106, 223, 133]]


In [31]:
# database
# install the pca object handed over by the client (in this notebook both sides
# use the same engine instance, so this assigns back the value read earlier)
engine.pca = engine_pca
# encode every encrypted sample as an adjusted vector
sample_vectors_encrypted = encode_samples_encrypted(sample_list_encrypted, n_tags)
# presumably builds a nearest-neighbour structure over the vectors with k=5 — confirm against simtag
nbrs = engine.compute_nbrs(sample_vectors_encrypted, k=5)

processing samples: 100%|██████████| 41895/41895 [00:18<00:00, 2274.61it/s]


In [35]:
# client: encrypt the tags of the query before handing it to the database
query_tag_list = ['Fantasy', 'Dark Fantasy']

indexes, encrypted_indexes = encrypt_tags(query_tag_list, engine.permutation)
encrypted_indexes

[86, 349]

In [36]:
# database
# build the query vector from the encrypted indexes sent by the client
query_vector = encode_query_encrypted(encrypted_indexes, n_tags)
# NOTE(review): the plaintext sample_list is passed on the database side here, and
# the search_results returned by this call are overwritten by the next cell —
# confirm whether soft_tag_filtering really needs the plaintext tag lists
indices, search_results = engine.soft_tag_filtering(nbrs, sample_list, query_vector)
indices

[36250, 32095, 23, 32019, 26902]

In [37]:
# client: look up the plaintext tag lists at the returned sample positions
search_results = [sample_list[position] for position in indices]
search_results

[['Strategy',
  'Turn-Based',
  'Fantasy',
  'Classic',
  'Turn-Based Strategy',
  'Dark Fantasy'],
 ['Action',
  'Indie',
  'Horror',
  'Fantasy',
  'Comedy',
  'Survival Horror',
  'Dark Fantasy'],
 ['Adventure',
  'Indie',
  'RPG',
  'RPGMaker',
  'Dark Fantasy',
  'Fantasy',
  'Magic',
  'Dark Comedy'],
 ['RPG', 'Fantasy'],
 ['Early Access',
  'RPG',
  'Indie',
  'Strategy',
  'Fantasy',
  'Hack and Slash',
  'Dark Fantasy',
  'Grand Strategy']]