In [1]:
import os

# Move the working directory one level up so that the repo-relative paths
# used further down (e.g. 'notebooks/files/...') resolve.
# os.path.dirname is used instead of splitting the path on '\\': on POSIX
# systems the cwd contains no backslashes, so the split/join expression
# produced an empty string and os.chdir('') failed.
# NOTE: not idempotent -- re-running this cell moves up one more level.
os.chdir(os.path.dirname(os.getcwd()))

import pandas as pd
import numpy as np
from simtag.filter import simtag_filter

# set up the library

### import data

In [2]:
# Load the raw table and drop every row that has a missing value in any
# column, so the string operations below only see real strings.
df = pd.read_parquet('notebooks/files/games.parquet').dropna()

# 'Tags' and 'Genres' hold comma-separated strings; split each cell into a
# list of labels with the vectorised string accessor rather than a
# per-element Python lambda.
df['Tags'] = df['Tags'].str.split(',')
df['Genres'] = df['Genres'].str.split(',')

# extract raw lists: one list of tag strings per remaining row, in row order
sample_list = df['Tags'].tolist()

### process data

In [3]:
# initiate recommender from the per-row tag lists
# (sample_list holds one list of tag strings per row of df)
recommender = simtag_filter(sample_list)

In [3]:
# Computing M is expensive (the recorded run of this cell took more than an
# hour), so it is only computed when no cached copy exists on disk; otherwise
# the cached parquet file is loaded instead.
# NOTE: the cache is not invalidated automatically -- delete the file to
# force a recomputation after the input data changes.
M_PATH = 'notebooks/files/M1.parquet'
if os.path.exists(M_PATH):
    recommender.M = pd.read_parquet(M_PATH)
else:
    recommender.compute_M()
    recommender.M.to_parquet(M_PATH)

# display the tag-by-tag matrix
recommender.M

Processing tags: 100%|██████████| 446/446 [1:18:52<00:00, 10.61s/it]


Unnamed: 0,Local Co-Op,Twin Stick Shooter,Silent Protagonist,Fantasy,Quick-Time Events,Foreign,Software Training,Competitive,Destruction,Trading,...,Music,Animation & Modeling,Runner,Mahjong,Logic,Artificial Intelligence,Pirates,Hero Shooter,Turn-Based,Underground
Local Co-Op,1.000000,0.067914,0.000646,0.033097,0.003688,0.001928,0.000000,0.042175,0.016940,0.002354,...,0.010276,0.000000,0.006455,0.0,0.008354,0.005068,0.006517,0.004714,0.012040,0.003621
Twin Stick Shooter,0.067914,1.000000,0.001572,0.006761,0.002786,0.000000,0.000000,0.010657,0.018112,0.000000,...,0.012966,0.000000,0.001799,0.0,0.000504,0.004695,0.001274,0.014085,0.000000,0.002674
Silent Protagonist,0.000646,0.001572,1.000000,0.002857,0.000000,0.005051,0.000000,0.001449,0.001613,0.000000,...,0.008794,0.000000,0.003008,0.0,0.006549,0.000000,0.000000,0.000000,0.002535,0.003311
Fantasy,0.033097,0.006761,0.002857,1.000000,0.004826,0.005515,0.000000,0.010073,0.006402,0.004958,...,0.009498,0.001587,0.010876,0.0,0.017552,0.006589,0.008365,0.004160,0.098415,0.007415
Quick-Time Events,0.003688,0.002786,0.000000,0.004826,1.000000,0.007143,0.000000,0.000000,0.004280,0.000000,...,0.004535,0.002457,0.006711,0.0,0.009975,0.004267,0.004762,0.011820,0.001203,0.005208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Artificial Intelligence,0.005068,0.004695,0.000000,0.006589,0.004267,0.004769,0.006329,0.012613,0.016393,0.010417,...,0.003247,0.002646,0.007326,0.0,0.019669,1.000000,0.001297,0.010390,0.005489,0.005464
Pirates,0.006517,0.001274,0.000000,0.008365,0.004762,0.000000,0.000000,0.002387,0.006536,0.027140,...,0.000000,0.000000,0.002457,0.0,0.002974,0.001297,1.000000,0.002028,0.006395,0.004444
Hero Shooter,0.004714,0.014085,0.000000,0.004160,0.011820,0.000000,0.000000,0.014388,0.014379,0.002012,...,0.004193,0.002088,0.009828,0.0,0.001183,0.010390,0.002028,1.000000,0.000576,0.008811
Turn-Based,0.012040,0.000000,0.002535,0.098415,0.001203,0.003785,0.000000,0.007253,0.002490,0.005797,...,0.000912,0.000583,0.000486,0.0,0.008609,0.005489,0.006395,0.000576,1.000000,0.002959


In [4]:
# if already existing, load M from the cached parquet file instead of
# recomputing it; this replaces whatever recommender.M currently holds
recommender.M = pd.read_parquet('notebooks/files/M1.parquet')

In [5]:
# prepare search
# encode every entry of sample_list with the recommender (the recorded
# progress bar shows one step per sample)
sample_vectors = recommender.encode_samples(sample_list)
# build the object that the search calls below receive as `nbrs`;
# presumably a nearest-neighbour structure with k=5 neighbours -- TODO confirm
# against the simtag library
nbrs = recommender.compute_nbrs(sample_vectors, k=5)

processing samples: 100%|██████████| 41895/41895 [00:11<00:00, 3645.94it/s]


# soft tag search

### combined covariate encoding

In [6]:
# unweighted query: every listed tag is passed to the encoder as-is
query_tag_list = ['Sci-fi', 'Open World']

# encode the query, then run the soft tag filter against the prepared samples
query_vector = recommender.encode_query(
    query_tag_list=query_tag_list,
    j=5,
)
search_results = recommender.soft_tag_filtering(nbrs, sample_list, query_vector)

# show the entry at index 1 of the results
search_results[1]

['Indie',
 'Adventure',
 'Space',
 'Sci-fi',
 'Open World',
 'Procedural Generation']

### weighted combined covariate encoding

In [7]:
# weighted query: maps each tag to the weight handed to the encoder
query_tag_dict = {'Shooter': 0.4, 'Open World': 0.6}

# encode the weighted query, then run the soft tag filter
query_vector = recommender.encode_query(
    query_tag_dict=query_tag_dict,
    j=5,
)
search_results = recommender.soft_tag_filtering(nbrs, sample_list, query_vector)

# show the entry at index 0 of the results
search_results[0]

['Action',
 'Indie',
 'Adventure',
 'Early Access',
 'FPS',
 'Open World',
 'Shooter']

# validation

In [150]:
import statistics

# query under test, and the index of the result to inspect in both searches
query_tag_list = ['Sci-fi', 'Shooter']
result_index = 1

# soft search: encode the query, filter, then score the selected result
query_vector = recommender.encode_query(
    query_tag_list=query_tag_list,
    negative_score=False,
    j=5,
)
soft_filter_results = recommender.soft_tag_filtering(nbrs, sample_list, query_vector)
soft_raw_scores, soft_mean_scores = recommender.compute_neighbor_scores(
    soft_filter_results[result_index],
    query_tag_list,
    remove_max=False,
)

# hard search: filter on the same tags, then score the selected result
hard_filter_results = recommender.hard_tag_filtering(sample_list, query_tag_list)
hard_raw_scores, hard_mean_scores = recommender.compute_neighbor_scores(
    hard_filter_results[result_index],
    query_tag_list,
    remove_max=False,
)

# report: selected result, raw scores and mean score for the soft search,
# a blank line, then the same three values for the hard search
print(soft_filter_results[result_index], soft_raw_scores, soft_mean_scores, sep='\n')
print()
print(hard_filter_results[result_index], hard_raw_scores, hard_mean_scores, sep='\n')

['Action', 'Indie', 'FPS', 'Shooter', 'Sci-fi', 'First-Person']
[[0.120492   0.07813045 0.10530639 0.16427862 1.         0.10879849]
 [0.20861239 0.08034726 0.33807627 1.         0.16427862 0.20374736]]
0.2976723211928548

['Early Access', 'VR', 'Mechs', 'Action', 'Team-Based', 'Multiplayer', 'FPS', 'Simulation', 'Indie', 'Futuristic', 'Space', 'Robots', 'Cyberpunk', 'Dystopian', 'Shooter', 'Destruction', 'Strategy', 'Sci-fi', 'Retro', 'Mars', 'VR Only']
[[0.07694013 0.04726585 0.04818092 0.120492   0.01815706 0.08083832
  0.10530639 0.06116302 0.07813045 0.19881576 0.29035069 0.10684161
  0.11162683 0.05435294 0.16427862 0.02056203 0.08311411 1.
  0.08624646 0.01754386 0.02098501]
 [0.08576682 0.06179458 0.02662192 0.20861239 0.04723552 0.13072923
  0.33807627 0.04548833 0.08034726 0.0717162  0.08867596 0.05253283
  0.05268574 0.02714255 1.         0.03387271 0.04182744 0.16427862
  0.10200053 0.0050424  0.03306288]]
0.1306833872035555


In [151]:
import html

from IPython.display import HTML

# adjust this value to control the rate of color change; for scores in
# [0, 1] an exponent below 1 lifts small scores so they remain visible
power = 0.6

# Render one line per query tag: every tag of the selected soft-search result
# is shown on a background whose green component grows with its score
# against that query tag.
for tag_index in range(len(query_tag_list)):
	data = list(zip(soft_raw_scores[tag_index], soft_filter_results[result_index]))
	tag = query_tag_list[tag_index]
	# tag text comes from the query / dataset and may contain characters such
	# as '&' or '<', so it is escaped before being embedded in the markup
	parts = [f"{html.escape(str(tag))}:<br>"]
	for intensity, word in data:
		scaled_intensity = intensity ** power
		r = int(20 * (1 - scaled_intensity))
		g = int(255 * scaled_intensity * 0.7)  # adjust the green component
		color = f"rgb({r},{g},0)"
		parts.append(f"<span style='background-color:{color}'>{html.escape(str(word))}</span> ")
	html_code = "".join(parts)

	display(HTML(html_code))

In [152]:
import html

from IPython.display import HTML

# adjust this value to control the rate of color change; for scores in
# [0, 1] an exponent below 1 lifts small scores so they remain visible
power = 0.6

# Render one line per query tag: every tag of the selected hard-search result
# is shown on a background whose green component grows with its score
# against that query tag.
for tag_index in range(len(query_tag_list)):
	data = list(zip(hard_raw_scores[tag_index], hard_filter_results[result_index]))
	tag = query_tag_list[tag_index]
	# tag text comes from the query / dataset and may contain characters such
	# as '&' or '<', so it is escaped before being embedded in the markup
	parts = [f"{html.escape(str(tag))}:<br>"]
	for intensity, word in data:
		scaled_intensity = intensity ** power
		r = int(20 * (1 - scaled_intensity))
		g = int(255 * scaled_intensity * 0.7)  # adjust the green component
		color = f"rgb({r},{g},0)"
		parts.append(f"<span style='background-color:{color}'>{html.escape(str(word))}</span> ")
	html_code = "".join(parts)

	display(HTML(html_code))

In [17]:
# mean score (element [1] of the pair returned by compute_neighbor_scores)
# for soft-search results 0..4, computed with remove_max=True
[
    recommender.compute_neighbor_scores(
        soft_filter_results[rank],
        query_tag_list,
        remove_max=True,
    )[1]
    for rank in range(5)
]

[array(0.2004249),
 array(0.19842347),
 array(0.19842347),
 array(0.17286856),
 array(0.17286856)]

In [16]:
# mean score (element [1] of the pair returned by compute_neighbor_scores)
# for hard-search results 0..4, computed with remove_max=True
[
    recommender.compute_neighbor_scores(
        hard_filter_results[rank],
        query_tag_list,
        remove_max=True,
    )[1]
    for rank in range(5)
]

[array(0.10270942),
 array(0.09802689),
 array(0.11805023),
 array(0.10857919),
 array(0.10065173)]