Data source: Steam Games Dataset on Kaggle — https://www.kaggle.com/datasets/fronkongames/steam-games-dataset

In [1]:
import os
import warnings

# Move the working directory two levels up, to the root of this repo.
# os.path.dirname uses the platform's own path separator, so this works on
# POSIX as well as on Windows. Splitting on a hard-coded '\\' leaves an empty
# path on POSIX, which os.chdir rejects.
os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))

# silence all warnings for the rest of the notebook
warnings.simplefilter("ignore")

import pandas as pd
from simtag.filter import simtag_filter

# setup the library

### import data

In [2]:
# Load the raw games table and discard every row that has a missing value.
# The comma-separated 'Tags' and 'Genres' strings are then split into lists,
# and finally the game_vector, game_indices, Score and Recommendations
# columns are dropped.
df = (
    pd.read_parquet('notebooks/steam-games/games.parquet')
    .dropna()
    .assign(
        Tags=lambda frame: frame['Tags'].apply(lambda tags: tags.split(',')),
        Genres=lambda frame: frame['Genres'].apply(lambda genres: genres.split(',')),
    )
    .drop(columns=['game_vector', 'game_indices', 'Score', 'Recommendations'])
)

# one list of tags per game, in the same row order as df
sample_list = df['Tags'].tolist()

### process data

In [3]:
# initiate recommender
# NOTE(review): 446 is hard-coded; presumably it is tied to the number of
# distinct tags in the dataset — confirm before reusing on other data.
COVARIATE_VECTOR_LENGTH = 446
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

engine = simtag_filter(
    sample_list=sample_list,
    model_name=EMBEDDING_MODEL_NAME,
    covariate_vector_length=COVARIATE_VECTOR_LENGTH,
)

In [4]:
# Location of the cached matrix M, stored as a parquet file.
M_PATH = 'notebooks/steam-games/M.parquet'

# Compute M only when no cached copy exists, so the notebook survives a fresh
# "Restart & Run All" without anyone having to (un)comment code by hand.
if not os.path.exists(M_PATH):
    _, df_M = engine.compute_M(method='co-occurrence')
    df_M.to_parquet(M_PATH)

# Always go through the cached file so both paths hand the engine the same data.
df_M = pd.read_parquet(M_PATH)
engine.load_M(df_M)

In [5]:
# visualize the co-occurrence matrix stored in the 'vector_tags' column of df_M:
# rows are labelled with the engine's tags, columns with as many leading tags
# as the stored vectors have entries
cM = pd.DataFrame(df_M['vector_tags'].tolist())
cM = (
    cM
    .set_axis(engine.tag_list, axis=0)
    .set_axis(engine.tag_list[:cM.shape[1]], axis=1)
)
cM

Unnamed: 0,1980s,1990's,2.5D,2D,2D Fighter,2D Platformer,360 Video,3D,3D Fighter,3D Platformer,...,Well-Written,Werewolves,Western,Wholesome,Word Game,World War I,World War II,Wrestling,Zombies,e-sports
1980s,1.000000,0.144844,0.016615,0.039649,0.015748,0.050382,0.0,0.021869,0.008160,0.012099,...,0.001235,0.001149,0.004073,0.000000,0.018817,0.000000,0.001584,0.004785,0.007773,0.003690
1990's,0.144844,1.000000,0.045221,0.053253,0.018891,0.063454,0.0,0.033869,0.006061,0.017914,...,0.000000,0.001599,0.007364,0.000797,0.016734,0.001561,0.004268,0.002461,0.012529,0.009615
2.5D,0.016615,0.045221,1.000000,0.026592,0.048212,0.037666,0.0,0.019259,0.009491,0.028558,...,0.000000,0.001078,0.006750,0.001074,0.002517,0.003138,0.009153,0.001115,0.010724,0.014159
2D,0.039649,0.053253,0.026592,1.000000,0.031077,0.150200,0.0,0.015679,0.001001,0.004579,...,0.000093,0.001675,0.004433,0.002795,0.019035,0.001578,0.005589,0.001306,0.023439,0.004943
2D Fighter,0.015748,0.018891,0.048212,0.031077,1.000000,0.040374,0.0,0.004314,0.029299,0.003217,...,0.000000,0.001767,0.004418,0.000000,0.001202,0.001678,0.003132,0.017078,0.006889,0.056604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
World War I,0.000000,0.001561,0.003138,0.001578,0.001678,0.000972,0.0,0.002617,0.002392,0.001351,...,0.000000,0.000000,0.003425,0.000000,0.000000,1.000000,0.055351,0.000000,0.001989,0.000000
World War II,0.001584,0.004268,0.009153,0.005589,0.003132,0.002068,0.0,0.009484,0.001279,0.001628,...,0.000000,0.000000,0.004587,0.000000,0.000000,0.055351,1.000000,0.000000,0.009693,0.001319
Wrestling,0.004785,0.002461,0.001115,0.001306,0.017078,0.000000,0.0,0.001982,0.031700,0.000704,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.003003
Zombies,0.007773,0.012529,0.010724,0.023439,0.006889,0.011128,0.0,0.043820,0.008197,0.009424,...,0.000000,0.010232,0.006940,0.000000,0.001146,0.001989,0.009693,0.000000,1.000000,0.001179


In [6]:
# prepare search: encode every game's tag list, then hand the resulting
# vectors to compute_nbrs with k=5
NUM_NEIGHBORS = 5
sample_vectors = engine.encode_samples(sample_list)
nbrs = engine.compute_nbrs(sample_vectors, k=NUM_NEIGHBORS)

processing samples: 100%|██████████| 41895/41895 [00:01<00:00, 28511.35it/s]


# semantic tag search

### naive

In [20]:
# plain list of query tags (no per-tag values are given in this variant)
query_tag_list = ['Shooter', 'Fantasy', 'Cartoon']

# perform search: encode the query, then run the soft filter over the samples
query_vector = engine.encode_query(list_tags=query_tag_list)
indices, search_results = engine.soft_tag_filtering(nbrs, sample_list, query_vector)

# show the first five results
print(search_results[:5])

[['Action', 'Adventure', 'VR', 'Casual', 'Sports', 'Fantasy', 'Shooter', 'First-Person', 'Cartoon', 'Archery'], ['Indie', 'Fantasy', 'Fighting', 'Split Screen', 'Cartoon'], ['Action', 'Adventure', 'First-Person', 'Tower Defense', 'Cartoon', 'Shooter', 'Combat', 'Indie', 'Singleplayer', 'Fantasy', 'FPS', '3D', 'Colorful', 'Linear', 'Story Rich', 'Gore', 'Violent'], ['RPG', 'Action', 'Shooter', '3D', 'Magic', 'Fantasy', 'Singleplayer', "Shoot 'Em Up"], ['Hentai', 'Mature', 'Sexual Content', 'NSFW', 'Nudity', 'Anime', 'Fantasy', 'Swordplay', 'Casual', 'Match 3', 'Idler', 'Singleplayer', 'Cute', 'Cartoon', 'Dating Sim', 'Cartoony', '2D Platformer', 'Puzzle', 'Shooter']]


### weighted

In [15]:
# each query tag is mapped to a number
# (presumably the tag's weight — confirm against encode_query)
query_tag_dict = {'Shooter': 0.9, 'Open World': 0.1}

# perform search: encode the query, then run the soft filter over the samples
query_vector = engine.encode_query(dict_tags=query_tag_dict)
indices, search_results = engine.soft_tag_filtering(nbrs, sample_list, query_vector)

# NOTE(review): the output saved with this cell lists only '1980s'-tagged games
# and none tagged 'Shooter' or 'Open World' — verify how encode_query handles
# dict_tags, or re-run the cell in order.
search_results[:5]

[['Racing', 'Indie', 'Arcade', 'Retro', '1980s'],
 ['Casual', '2D', 'Arcade', '1980s', 'Singleplayer'],
 ['Simulation', 'Submarine', 'Retro', 'Naval', 'Classic', '1980s'],
 ['Action', 'Adventure', 'Indie', '1980s', 'Stealth', 'Political'],
 ['Indie', 'Horror', 'First-Person', 'Mystery', '1980s', 'Supernatural']]

# validation

In [9]:
query_tag_list = ['Simulation', 'Exploration', 'Open World']
result_index = 0  # position of the search result that gets scored and visualized

# semantic search
query_vector = engine.encode_query(list_tags=query_tag_list)
soft_indices, soft_filter_results = engine.soft_tag_filtering(nbrs, sample_list, query_vector)
# NOTE(review): exp=1.3 is passed here but not for the traditional search
# below — confirm the two scorings are meant to differ.
soft_raw_scores, soft_mean_scores = engine.compute_neighbor_scores(
    soft_filter_results[result_index],
    query_tag_list,
    exp=1.3,
    remove_max=False,
)

# traditional search
hard_indices, hard_filter_results = engine.hard_tag_filtering(sample_list, query_tag_list)
hard_raw_scores, hard_mean_scores = engine.compute_neighbor_scores(
    hard_filter_results[result_index],
    query_tag_list,
    remove_max=False,
)

### lookup data

In [10]:
# semantic tag filtering: rows of df at the positions returned by soft_tag_filtering
df.iloc[soft_indices]

Unnamed: 0,index,Name,Release date,Estimated owners,Peak CCU,Price,About the game,Total_reviews,Genres,Tags
3220,5247,Escape: Sierra Leone,"Dec 5, 2016",0 - 20000,0,9.99,This game is currently a Beta in Early Access ...,46,"[Adventure, Simulation, Early Access]","[Adventure, Early Access, Simulation, Survival..."
21494,35033,Copoka,"Feb 15, 2017",20000 - 50000,0,4.99,Copoka is an open-world exploratory indie game...,231,"[Casual, Indie]","[Casual, Indie, Exploration, Atmospheric, Flig..."
16312,26506,Village businessman,"Sep 28, 2020",0 - 20000,0,7.99,"In Village businessman, you play as a person w...",2,[Simulation],"[Simulation, Open World, 3D, Third Person, Sin..."
10489,17026,Lunar Rover,"Jul 17, 2020",0 - 20000,0,3.99,Features Drive the Lunar Roving Vehicle used d...,6,[Simulation],"[Simulation, Open World, Space, Exploration, D..."
35586,58172,World of Motors,"Feb 16, 2022",0 - 20000,0,11.99,A game tailored to your requirements. buy cars...,16,"[Indie, Racing, Simulation]","[Simulation, Racing, Exploration, 3D, Driving,..."


In [11]:
# traditional tag filtering: rows of df at the positions returned by hard_tag_filtering
df.iloc[hard_indices]

Unnamed: 0,index,Name,Release date,Estimated owners,Peak CCU,Price,About the game,Total_reviews,Genres,Tags
13,22,Aerofly FS 2 Flight Simulator,"Nov 20, 2017",100000 - 200000,19,37.49,Aerofly FS 2 lets you explore the world of fly...,1898,"[Action, Indie, Racing, Simulation]","[Flight, Simulation, VR, Racing, Physics, Open..."
41,57,Forza Horizon 4,"Mar 9, 2021",2000000 - 5000000,7571,59.99,Dynamic seasons change everything at the world...,137634,[Racing],"[Racing, Open World, Driving, Multiplayer, Onl..."
60,96,Oxygen Not Included,"Jul 30, 2019",2000000 - 5000000,7507,24.99,In the space-colony simulation game Oxygen Not...,85916,"[Indie, Simulation]","[Colony Sim, Base-Building, Survival, Resource..."
145,223,Ships 2017,"Oct 19, 2016",20000 - 50000,0,14.99,Take control over 3 special purpose naval vess...,177,"[Indie, Simulation]","[Simulation, Indie, Free to Play, Adventure, S..."
382,617,JCB Pioneer: Mars,"Sep 1, 2017",0 - 20000,0,24.99,WHAT IS JCB PIONEER: MARS? JCB Pioneer: Mars d...,143,"[Action, Adventure, Indie, Simulation, Early A...","[Early Access, Simulation, Action, Adventure, ..."
...,...,...,...,...,...,...,...,...,...,...
41747,83359,廃集落探索,"Nov 17, 2023",0 - 20000,1,4.99,Abandoned Village Exploration This is a first-...,7,[Indie],"[Horror, Psychological, Dark, Walking Simulato..."
41752,83397,Under A New Sun,"Nov 13, 2023",0 - 20000,31,16.99,The plan was simple - the ship would wake you ...,41,"[Adventure, Indie, Simulation, Early Access]","[Open World Survival Craft, Survival, Open Wor..."
41815,83843,Sledders,"Dec 5, 2023",0 - 20000,306,24.99,Shred your sled in open-world backcountries. S...,293,"[Racing, Simulation, Sports, Early Access]","[Simulation, Snow, Offroad, Racing, Sports, Mo..."
41817,83846,Soulash 2,"Dec 4, 2023",0 - 20000,108,16.19,"In a realm where gods have perished, a new era...",58,"[Adventure, Indie, RPG, Strategy, Early Access]","[Early Access, RPG, Sandbox, Procedural Genera..."


### visualize flattened results

In [12]:
# display options shared by both calls, so the semantic and the traditional
# result are rendered with identical settings
mean_view_options = dict(
    visualization_type='mean',
    power=0.6,
    title=f'{query_tag_list}',
    visualize=True,
    return_html=False,
)

# semantic (soft) result first, traditional (hard) result second
engine.show_results(query_tag_list, soft_raw_scores, soft_filter_results[result_index], **mean_view_options)
engine.show_results(query_tag_list, hard_raw_scores, hard_filter_results[result_index], **mean_view_options)

In [13]:
# 'mean' view of the semantic result, this time with return_html=True so the
# generated markup is also returned as the cell's value
engine.show_results(
    query_tag_list,
    soft_raw_scores,
    soft_filter_results[result_index],
    visualization_type='mean',
    power=0.6,
    title=f'{query_tag_list}',
    visualize=True,
    return_html=True,
)

"['Simulation', 'Exploration', 'Open World']:<br><span style='background-color:rgb(107,7,0); color:white'>Adventure</span> <span style='background-color:rgb(99,8,0); color:white'>Early Access</span> <span style='background-color:rgb(127,5,0); color:white'>Simulation</span> <span style='background-color:rgb(100,8,0); color:white'>Survival</span> <span style='background-color:rgb(132,5,0); color:white'>Exploration</span> <span style='background-color:rgb(132,5,0); color:white'>Open World</span> <span style='background-color:rgb(106,8,0); color:white'>First-Person</span> <span style='background-color:rgb(96,9,0); color:white'>Realistic</span> "

### visualize granular results

In [14]:
# display options shared by both calls of the 'raw' view
raw_view_options = dict(
    visualization_type='raw',
    power=0.6,
    title=f'{query_tag_list}',
    visualize=True,
    return_html=False,
)

# semantic (soft) result, an empty line as separator, then the traditional (hard) result
engine.show_results(query_tag_list, soft_raw_scores, soft_filter_results[result_index], **raw_view_options)
print()
engine.show_results(query_tag_list, hard_raw_scores, hard_filter_results[result_index], **raw_view_options)


