Dataset: https://www.kaggle.com/datasets/fronkongames/steam-games-dataset (Steam Games Dataset, Kaggle)

In this example we use only the 50 most frequent tags, to test that covariate encoding still works when `n_tags < M.shape[0]`.

In [1]:
import os

# Move the working directory two levels up from the notebook's folder, i.e. to
# the root of this repo (presumably so that the `simtag` import and the
# relative `notebooks/...` data paths below resolve -- confirm).
# os.path.dirname is used instead of splitting on '\\': on POSIX systems the
# path contains no backslashes, so the old expression collapsed to
# os.chdir('') and raised.
# REPO_ROOT is computed only once per kernel, so re-running this cell does not
# climb two more directories each time.
if 'REPO_ROOT' not in globals():
    REPO_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
os.chdir(REPO_ROOT)

import pandas as pd
import numpy as np
from simtag.filter import simtag_filter

  from tqdm.autonotebook import tqdm, trange


In [2]:
# load the raw games table and discard every row that has a missing value
df = pd.read_parquet('notebooks/steam-games/games.parquet').dropna()

# 'Tags' and 'Genres' hold comma-separated strings; turn each into a list
for list_column in ('Tags', 'Genres'):
	df[list_column] = df[list_column].apply(lambda joined : joined.split(','))

# drop the columns that are not needed in the rest of this notebook
dropped_columns = ['game_vector', 'game_indices', 'Score', 'Recommendations']
df = df.drop(columns=dropped_columns)

# only pick the most frequent tags
N_TOP_TAGS = 50

# Flatten the per-game tag lists into one list with one entry per occurrence.
# A plain comprehension replaces the previous append-inside-a-comprehension,
# which built a throwaway nested list of None values purely for its side effect.
tags_list = [tag for game_tags in df['Tags'].tolist() for tag in game_tags]

# value_counts() sorts by descending frequency, so the first N_TOP_TAGS labels
# are the most common tags; .iloc makes the positional slice explicit.
valid_tags = pd.Series(tags_list).value_counts().iloc[:N_TOP_TAGS].index.tolist()

def filter_tags(valid_tags, tags):
	"""Keep only the entries of `tags` that are members of `valid_tags`.

	The order of `tags` and any duplicates in it are preserved. A new list is
	returned; neither argument is modified.
	"""
	return [candidate for candidate in tags if candidate in valid_tags]

def _keep_valid_tags(game_tags):
	"""Reduce one game's tag list to the tags contained in `valid_tags`."""
	return filter_tags(valid_tags, game_tags)

# restrict every game's tag list to the retained tags
df['Tags'] = df['Tags'].apply(_keep_valid_tags)

# extract raw lists
sample_list = df['Tags'].tolist()

In [3]:
# initiate recommender
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
# presumably has to match the embedding size of the model above -- TODO confirm
COVARIATE_VECTOR_LENGTH = 384

engine = simtag_filter(
    sample_list=sample_list,
    covariate_vector_length=COVARIATE_VECTOR_LENGTH,
    model_name=EMBEDDING_MODEL_NAME,
)



In [4]:
# If M has not been saved yet, compute it with the 'encoding' method.
# compute_M returns two values: M and df_M.
M, df_M = engine.compute_M(method='encoding')
# Optional: uncomment to store df_M on disk so that later runs can load it.
# df_M.to_parquet('notebooks/steam-games/M.parquet')

# Alternative to the compute_M call above: if the parquet file already exists,
# uncomment to load df_M from it instead.
# df_M = pd.read_parquet('notebooks/steam-games/M.parquet')
# Hand df_M to the engine; this runs whether df_M was computed or loaded.
engine.load_M(df_M)

100%|██████████| 50/50 [00:00<00:00, 52.92it/s]


In [5]:
import numpy as np
from tqdm import tqdm

def encode_samples(sample_list, vector_length=384):
	"""Encode every sample's tag list as a vector, using the global ``engine``.

	For each tag of a sample, the slice of the vector delimited by
	``engine.T_indexes[<position of the tag in engine.tag_list>]`` is set to 1.
	If ``engine.pca_vector_length`` is not None, the vector is then passed
	through ``engine.pca.transform``.

	NOTE(review): the later cells of this notebook call
	``engine.encode_samples``, not this function.

	Parameters
	----------
	sample_list : iterable of list
		One list of tags per sample. Every tag must occur in ``engine.tag_list``.
	vector_length : int, optional
		Length of the uncompressed vector. Defaults to 384, the value that was
		previously hard-coded; presumably it has to agree with the
		``covariate_vector_length`` the engine was created with -- TODO confirm.

	Returns
	-------
	list
		One vector per sample, in the order of ``sample_list``.

	Raises
	------
	ValueError
		If a sample contains a tag that is not in ``engine.tag_list``.
	"""
	# Build the tag -> position lookup once instead of scanning
	# engine.tag_list with .index() for every tag of every sample.
	# setdefault keeps the first position of a repeated tag, as .index() does.
	tag_positions = {}
	for position, tag in enumerate(engine.tag_list):
		tag_positions.setdefault(tag, position)

	def encode_sample(list_tags):
		"""Encode the tags of a single sample."""
		vector = np.zeros(vector_length)
		for tag in list_tags:
			if tag not in tag_positions:
				# same exception type that list.index() raised before
				raise ValueError(f'{tag!r} is not in list')
			T_range = engine.T_indexes[tag_positions[tag]]
			vector[T_range[0]:T_range[1]] = 1

		if engine.pca_vector_length is None:
			return vector
		# pca.transform is given a single-row 2-D array; take that row back out
		return engine.pca.transform(vector.reshape(1, -1))[0]

	return [
		encode_sample(sample)
		for sample in tqdm(sample_list, desc="processing samples")
	]

In [6]:
# If the encoded samples have not been saved yet, encode every sample's tag
# list with the engine.
sample_vectors = engine.encode_samples(sample_list)
# Optional: uncomment to save the encoded samples to a parquet file.
# samples_encoded = pd.DataFrame([sample_vectors], index=['vector']).T
# samples_encoded.to_parquet('samples_encoded_mini.parquet')

# Alternative: if the encoded samples were saved before, uncomment to load them.
# NOTE(review): this path points at the twitter-news notebook and does not match
# the file name used in the save snippet above -- looks copy-pasted; confirm
# before uncommenting.
# sample_vectors = pd.read_parquet('notebooks/twitter-news/samples_encoded.parquet').vector.tolist()
# Build the neighbour search object over the sample vectors.
# k=4 is presumably the number of neighbours returned per query -- TODO confirm.
nbrs = engine.compute_nbrs(sample_vectors, k=4)

# prepare search to input non-existing tags
engine.compute_nbrs_tags()

processing samples: 100%|██████████| 41895/41895 [00:01<00:00, 28160.71it/s]


In [7]:
# tags describing what we are searching for
query_tag_list = [
    'Action',
    'Casual',
    '2D',
    'Cute',
    'Mystery',
]

# perform search: encode the query (new tags allowed), then look up its neighbours
query_vector = engine.encode_query(
    list_tags=query_tag_list,
    allow_new_tags=True,
)
indices, search_results = engine.soft_tag_filtering(
    nbrs,
    sample_list,
    query_vector,
)
search_results

[['Indie', 'Action', '2D', 'Cute', 'Casual'],
 ['Casual', 'Action', 'Indie', '2D', 'Cute'],
 ['Action', 'Indie', 'Casual', 'Cute', '2D'],
 ['Action',
  'Casual',
  'Arcade',
  'Mystery',
  'Platformer',
  'Singleplayer',
  'Cute',
  '2D']]

In [8]:
# query tags, and the position of the search result to score
query_tag_list = [
    'Action',
    'Casual',
    '2D',
    'Cute',
    'Mystery',
]
result_index = 0

# semantic search (encode_query is called without allow_new_tags here)
query_vector = engine.encode_query(list_tags=query_tag_list)
soft_indices, soft_filter_results = engine.soft_tag_filtering(
    nbrs,
    sample_list,
    query_vector,
)
soft_result_tags = soft_filter_results[result_index]
soft_raw_scores, soft_mean_scores = engine.compute_neighbor_scores(
    soft_result_tags,
    query_tag_list,
    remove_max=False,
)

# traditional search
hard_indices, hard_filter_results = engine.hard_tag_filtering(
    sample_list,
    query_tag_list,
)
hard_result_tags = hard_filter_results[result_index]
hard_raw_scores, hard_mean_scores = engine.compute_neighbor_scores(
    hard_result_tags,
    query_tag_list,
    remove_max=False,
)

In [9]:
# scores of the selected semantic-search result
engine.show_results(
    query_tag_list,
    soft_raw_scores,
    soft_filter_results[result_index],
    visualization_type='raw',
    power=0.4,
    title=f'{query_tag_list}',
    visualize=True,
    return_html=False,
)
print()
# scores of the selected traditional-search result (note: power is 0.6 here, 0.4 above)
engine.show_results(
    query_tag_list,
    hard_raw_scores,
    hard_filter_results[result_index],
    visualization_type='raw',
    power=0.6,
    title=f'{query_tag_list}',
    visualize=True,
    return_html=False,
)


