In [1]:
import os
import warnings

# Move the working directory to the root of this repo, i.e. two levels above
# the folder the notebook is started from; the relative
# 'notebooks/steam-games/...' paths used by later cells are resolved from there.
#
# os.path.dirname is used instead of splitting/joining on '\\' so the cell also
# works on POSIX systems (there the split found no separator, the join produced
# '' and os.chdir('') raised).
#
# The guard makes the cell idempotent: if the expected data folder is already
# reachable from the current directory (cell re-run, or kernel started at the
# repo root), the directory is left untouched instead of climbing two more levels.
if not os.path.isdir(os.path.join('notebooks', 'steam-games')):
    os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))

# silence every warning category for the rest of the session
warnings.simplefilter("ignore")

import pandas as pd
import numpy as np
from simtag.filter import simtag_filter
from sklearn.neighbors import NearestNeighbors

In [2]:
# read the raw game table and discard every row that has a missing value
df = pd.read_parquet('notebooks/steam-games/games.parquet')
df = df.dropna()

# 'Tags' and 'Genres' hold comma-separated text: split each cell into a list
for text_column in ('Tags', 'Genres'):
    df[text_column] = df[text_column].apply(lambda text: text.split(','))

# remove four columns from the frame
df = df.drop(columns=['game_vector', 'game_indices', 'Score', 'Recommendations'])

# extract the raw tag lists (one list per remaining row)
sample_list = df['Tags'].tolist()

In [3]:
# build the simtag engine from the per-row tag lists; the model is referenced
# by its sentence-transformers identifier
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
engine = simtag_filter(sample_list=sample_list, model_name=MODEL_NAME)

In [6]:
# Load the tag matrix frame from its parquet cache; when the cache file does
# not exist yet, compute it with the co-occurrence method and save it so the
# next run can load it directly. This replaces manually commenting and
# uncommenting the compute/load lines.
M_PATH = 'notebooks/steam-games/M.parquet'
if os.path.exists(M_PATH):
    df_M = pd.read_parquet(M_PATH)
else:
    M, df_M = engine.compute_M(method='co-occurrence')
    df_M.to_parquet(M_PATH)

# NOTE: the following cell loads df_M into the engine with
# covariate_transformation='dot_product'

In [7]:
# hand the matrix frame to the engine, selecting the 'dot_product'
# covariate transformation
engine.load_M(
    df_M,
    covariate_transformation='dot_product',
)

In [8]:
# display the engine's df_M attribute as this cell's output
# (the recorded output below shows a `tags` and a `vector_tags` column)
engine.df_M

Unnamed: 0,tags,vector_tags
0,1980s,"[1.0, 0.14484356894553882, 0.01661538461538461..."
1,1990's,"[0.14484356894553882, 1.0, 0.04522096608427543..."
2,2.5D,"[0.016615384615384615, 0.045220966084275435, 1..."
3,2D,"[0.039648773422648685, 0.05325286234135085, 0...."
4,2D Fighter,"[0.015748031496062992, 0.018890920170627667, 0..."
...,...,...
441,World War I,"[0.0, 0.00156128024980484, 0.00313807531380753..."
442,World War II,"[0.001583531274742676, 0.00426829268292683, 0...."
443,Wrestling,"[0.004784688995215311, 0.002461033634126333, 0..."
444,Zombies,"[0.007773205304069501, 0.012529365700861394, 0..."


In [6]:
# show the co-occurrence matrix as a labelled table:
# each entry of 'vector_tags' becomes one row of the frame
tag_names = engine.tag_list
cM = pd.DataFrame(df_M['vector_tags'].tolist())

# label rows with the full tag list and columns with as many leading tags
# as the frame has columns
cM.columns = tag_names[:cM.shape[1]]
cM.index = tag_names
cM

Unnamed: 0,1980s,1990's,2.5D,2D,2D Fighter,2D Platformer,360 Video,3D,3D Fighter,3D Platformer,...,Well-Written,Werewolves,Western,Wholesome,Word Game,World War I,World War II,Wrestling,Zombies,e-sports
1980s,1.000000,0.144844,0.016615,0.039649,0.015748,0.050382,0.0,0.021869,0.008160,0.012099,...,0.001235,0.001149,0.004073,0.000000,0.018817,0.000000,0.001584,0.004785,0.007773,0.003690
1990's,0.144844,1.000000,0.045221,0.053253,0.018891,0.063454,0.0,0.033869,0.006061,0.017914,...,0.000000,0.001599,0.007364,0.000797,0.016734,0.001561,0.004268,0.002461,0.012529,0.009615
2.5D,0.016615,0.045221,1.000000,0.026592,0.048212,0.037666,0.0,0.019259,0.009491,0.028558,...,0.000000,0.001078,0.006750,0.001074,0.002517,0.003138,0.009153,0.001115,0.010724,0.014159
2D,0.039649,0.053253,0.026592,1.000000,0.031077,0.150200,0.0,0.015679,0.001001,0.004579,...,0.000093,0.001675,0.004433,0.002795,0.019035,0.001578,0.005589,0.001306,0.023439,0.004943
2D Fighter,0.015748,0.018891,0.048212,0.031077,1.000000,0.040374,0.0,0.004314,0.029299,0.003217,...,0.000000,0.001767,0.004418,0.000000,0.001202,0.001678,0.003132,0.017078,0.006889,0.056604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
World War I,0.000000,0.001561,0.003138,0.001578,0.001678,0.000972,0.0,0.002617,0.002392,0.001351,...,0.000000,0.000000,0.003425,0.000000,0.000000,1.000000,0.055351,0.000000,0.001989,0.000000
World War II,0.001584,0.004268,0.009153,0.005589,0.003132,0.002068,0.0,0.009484,0.001279,0.001628,...,0.000000,0.000000,0.004587,0.000000,0.000000,0.055351,1.000000,0.000000,0.009693,0.001319
Wrestling,0.004785,0.002461,0.001115,0.001306,0.017078,0.000000,0.0,0.001982,0.031700,0.000704,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.003003
Zombies,0.007773,0.012529,0.010724,0.023439,0.006889,0.011128,0.0,0.043820,0.008197,0.009424,...,0.000000,0.010232,0.006940,0.000000,0.001146,0.001989,0.009693,0.000000,1.000000,0.001179


In [7]:
# prepare search: encode every sample, then build the k=5 neighbours object
# (presumably a nearest-neighbour index - confirm in simtag) that the searches
# below are run against
N_NEIGHBORS = 5
sample_vectors = engine.encode_samples(sample_list)
nbrs_covariate = engine.compute_nbrs(sample_vectors, k=N_NEIGHBORS)

processing samples: 100%|██████████| 41895/41895 [00:09<00:00, 4637.43it/s]


### covariate search

In [34]:
# NOTE(review): despite its name, query_tag_dict holds a plain list of tags
# here (it is passed as list_tags); the next query cell rebinds it to a dict
query_tag_dict = ['Shooter', 'Dark Fantasy', 'Sci-fi']

# encode the query from the tag list, then run the search
query_vector = engine.encode_query(
    list_tags=query_tag_dict,
    allow_new_tags=False,
    print_new_tags=True,
)
indices, search_results = engine.soft_tag_filtering(nbrs_covariate, sample_list, query_vector)

# print the first five results, one per line
for matched_tags in search_results[:5]:
    print(matched_tags)

['Action', 'FPS', 'Sci-fi', 'Shooter']
['Action', 'Third-Person Shooter', 'Sci-fi', 'Aliens', 'Space', 'Great Soundtrack', 'Shooter', 'Atmospheric', 'Futuristic']
['Action', 'FPS', 'Sci-fi', 'Shooter', 'First-Person', 'Singleplayer', 'Space', 'Difficult']
['Action', 'Shooter', 'Sci-fi', 'Classic', 'First-Person', 'FPS', 'Arcade']
['Action', 'FPS', 'Shooter', 'Singleplayer', 'First-Person', 'Arena Shooter', 'Futuristic', 'PvE', 'Robots', 'Sci-fi', 'Difficult']


In [33]:
# query given as a tag -> number mapping (presumably the weight of each tag
# in the query - confirm against simtag's encode_query)
query_tag_dict = {'Voxel': 0.8, 'Shooter': 0.2, 'Open World': 0.6}

# encode the query from the mapping, then run the search
query_vector = engine.encode_query(dict_tags=query_tag_dict)
indices, search_results = engine.soft_tag_filtering(
    nbrs_covariate,
    sample_list,
    query_vector,
)

# print the first five results, one per line
for matched_tags in search_results[:5]:
    print(matched_tags)

['Adventure', 'Action', 'Simulation', 'Open World', 'Survival', 'Voxel', 'Sci-fi', 'Early Access']
['Open World', 'Massively Multiplayer', 'Building', 'Space Sim', 'Simulation', 'Sandbox', 'Space', 'Sci-fi', 'Action', 'Early Access', 'FPS', 'Voxel', 'Crafting', 'Destruction', 'Programming', 'Exploration', 'Robots', 'Multiplayer', 'Open World Survival Craft', 'First-Person']
['Early Access', 'Adventure', 'Sandbox', 'MMORPG', 'Voxel', 'Crafting', 'Base-Building', 'Massively Multiplayer', 'Procedural Generation', 'Action RPG', 'FPS', 'Third-Person Shooter', 'Colorful', 'First-Person', 'Third Person', 'Open World', 'Character Customization', 'Combat', 'Inventory Management', 'PvE']
['Strategy', 'Action', 'Adventure', 'Simulation', 'Survival', 'Open World', 'Voxel', 'Sci-fi', 'FPS']
['Survival', 'Zombies', 'Voxel', 'Open World', 'Open World Survival Craft', 'Multiplayer', 'Post-apocalyptic', 'Base-Building', 'Online Co-Op', 'Exploration', 'Simulation', 'Sandbox', 'Building', 'Strategy', 'Ch