In [1]:
import os
import warnings

# move directory to the root of this repo
os.chdir('\\'.join(os.getcwd().split('\\')[:-2]))
warnings.simplefilter("ignore")

import pandas as pd
import numpy as np
from simtag.filter import simtag_filter
from sklearn.neighbors import NearestNeighbors




In [2]:
# import raw data
df = pd.read_parquet('notebooks/steam-games/games.parquet').dropna()
df['Tags'] = df['Tags'].apply(lambda x : x.split(','))
df['Genres'] = df['Genres'].apply(lambda x : x.split(','))
df = df.drop(['game_vector', 'game_indices', 'Score', 'Recommendations'], axis=1)

# extract raw lists
sample_list = df['Tags'].values.tolist()

In [3]:
# initiate engine
engine = simtag_filter(
    sample_list=sample_list,
    model_name='sentence-transformers/all-MiniLM-L6-v2'
)
tag2index, indexed_sample_list = engine.index_samples(sample_list)

In [4]:
# if not existing, compute M
M, df_M = engine.compute_M(method='encoding')
# df_M.to_parquet('notebooks/steam-games/M.parquet')

# if existing, load M
# df_M = pd.read_parquet('notebooks/steam-games/M.parquet')

100%|██████████| 446/446 [00:06<00:00, 68.31it/s]


In [5]:
engine.load_M(df_M, covariate_transformation='dot_product')

In [6]:
# prepare search
sample_vectors = engine.encode_samples(sample_list)
nbrs = engine.compute_nbrs(sample_vectors, k=50)

processing samples: 100%|██████████| 41895/41895 [00:06<00:00, 6365.11it/s]


### covariate tagging

In [25]:
from sentence_transformers.util import cos_sim
from collections import Counter
import random
import time
import textwrap

def montecarlo_tagging(str1, top_tags=100, technique='scrolling_window', min_window=2, max_window=7, mc1=1000, top_mc1=20, score_threshold=0.01, verbose=False):
	startTime = time.time()
	if verbose : print(textwrap.fill(str1, width=120))  # adjust the width to your liking

	base_vector = engine.model.encode(str1)
	nbrs_tags = NearestNeighbors(n_neighbors=top_tags, metric='cosine').fit(df_M['vector_tags'].values.tolist())
	distances, indices = nbrs_tags.kneighbors([base_vector])
	indices = indices[0].tolist()
	top_tags = [engine.tag_list[x] for x in indices]
	# print(top_tags)
	top_tags

	if technique == 'scrolling_window':
		mc = list()
		for j in range(min_window, max_window+1):
			for k in range(int(len(top_tags)/j)+1):
				ranges = [k*j, (k+1)*j]
				choice = top_tags[ranges[0]:ranges[1]]
				if len(choice) > 0:
					mc.append(choice)
		mc_encoded = engine.encode_samples(mc, quantize_samples=False, show_progress=False)
		nbrs_mc = NearestNeighbors(n_neighbors=top_mc1, metric='cosine').fit(mc_encoded)
		distances, indices = nbrs_mc.kneighbors([engine.model.encode(str1)])

	elif technique == 'monte_carlo':
		mc = [list(set(random.choices(top_tags, k=random.randint(min_window, max_window)))) for x in range(mc1)]
		mc_encoded = engine.encode_samples(mc, quantize_samples=False, show_progress=False)
		nbrs_mc = NearestNeighbors(n_neighbors=top_mc1, metric='cosine').fit(mc_encoded)
		distances, indices = nbrs_mc.kneighbors([engine.model.encode(str1)])

	results = [mc[x] for x in indices[0]]
	top_tags = [x for xs in results for x in xs] # in case we want to pass it to the next monte-carlo
	# print(results[0:10])

	tag_freq = Counter(tag for game_tags in results for tag in game_tags)
	tag_freq = dict(tag_freq)
	# final_values = sorted([[sum([tag_freq[k] for k in x]), x] for x in results])[::-1][0][1]
	# final_values

	top_m = sorted([[tag_freq[key], key] for index, key in enumerate(tag_freq)])[::-1]
	# print(top_m)
	top_m = [x[1] for x in top_m]
	len(top_m)

	accepted = list()
	#
	best_selection_score = 0
	for m in range(len(top_m)):
		best_tag = ''
		score = 0
		for k in top_m:
			if k not in accepted:
				iteration = accepted + [k]
				vector_iteration = engine.encode_samples([iteration], quantize_samples=False, show_progress=False)[0]
				base_vector = base_vector.astype(vector_iteration.dtype)  # cast m2 to match m1's dtype
				vector_score = cos_sim(base_vector, vector_iteration).tolist()[0][0]
				if vector_score > score:
					score = vector_score
					best_tag = k
				else:
					pass
		sequence_score = score
		# print(sequence_score, best_selection_score, accepted)
		accepted.append(best_tag)

		if sequence_score > best_selection_score and m > 0 and ((sequence_score-best_selection_score)/best_selection_score) > score_threshold:
			best_selection_score = sequence_score
		elif m == 0:
			best_selection_score = sequence_score
		else:
			# print(sequence_score, best_selection_score, 'BREAK', accepted)
			break
			# pass

	endTime = time.time()
	howMuchTime = endTime - startTime

	if verbose : print('\nEXECUTION_TIME', round(howMuchTime, 4))

	return accepted

accepted = montecarlo_tagging(df['About the game'][195], top_tags=100, technique='scrolling_window', min_window=5, max_window=7, score_threshold=0.01, verbose=True)
accepted

Light up the world! As Plug, you are charged with restoring the expansive Amp-Tree-System, and thwarting an enigmatic
intruder. Solve puzzles and explore in 1000 Amps! Key Features: Illuminate the darkness by lighting up whatever you
touch. Teleport into any un-occupied space with a simple mouse click. Explore the expansive and labyrinthine Amp-Tree-
System, finding new power ups strewn throughout. Tackle challenges in whatever order you like, thanks to the open world
design. Over 150 rooms to complete and explore. Save anywhere, at anytime.

EXECUTION_TIME 0.3299


['Electronic Music',
 'Open World',
 'Utilities',
 'Base-Building',
 'Escape Room',
 "Shoot 'Em Up",
 'Tutorial']