In [144]:
import os

import polars as pl
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from website import Base
from website.models import Game, Genre, Theme, Rating

In [145]:
# The connection string is read from the DATABASE_URI environment variable so that
# real credentials never have to be saved inside the notebook. The fallback is the
# original local development database, so the cell still runs unchanged locally.
database_uri = os.environ.get('DATABASE_URI', 'mysql+pymysql://root:root@localhost:3306/db')
engine = create_engine(database_uri)
Base.metadata.bind = engine
DBSession = sessionmaker(bind=engine)
# NOTE(review): the session is never closed. Later cells read relationship
# attributes (genres, themes, keywords) off these Game objects, which presumably
# lazy-load through this session -- confirm before adding a close().
session = DBSession()
# Load every Game row up front; all later cells derive their frames from this list.
games = session.query(Game).all()

In [146]:
def extract_name(items):
    """Return the ``name`` attribute of every object in ``items``, in order."""
    names = []
    for item in items:
        names.append(item.name)
    return names

# One list per column: the game ids, then the names of each game's related
# genres / themes / keywords (one list of names per game).
data = {
    'id': [game.id for game in games],
    **{
        column: [extract_name(getattr(game, column)) for game in games]
        for column in ('genres', 'themes', 'keywords')
    },
}

features_df = pl.DataFrame(data)
features_df.head()

id,genres,themes,keywords
i64,list[str],list[str],list[str]
1,"[""Shooter"", ""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""thief"", … ""immersive""]"
2,"[""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""assassin"", … ""male protagonist""]"
3,"[""Shooter"", ""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""assassin"", … ""immersive""]"
4,"[""Shooter"", ""Adventure""]","[""Action"", ""Stealth"", ""Sandbox""]","[""thief"", ""steampunk"", … ""male protagonist""]"
5,"[""Role-playing (RPG)""]","[""Fantasy""]",[]


In [147]:
# Column-wise view of the games: id, display name and the list of similar game ids.
data = {
    column: [getattr(game, column) for game in games]
    for column in ('id', 'name', 'similar_games')
}

df = pl.DataFrame(data)
df.head()

id,name,similar_games
i64,str,list[i64]
1,"""Thief II: The …","[2, 3, … 105049]"
2,"""Thief: The Dar…","[1, 3, … 11118]"
3,"""Thief: Deadly …","[1, 2, … 19441]"
4,"""Thief""","[1, 533, … 19441]"
5,"""Baldur's Gate""","[19127, 19404, … 113910]"


In [148]:
# Turn each game's list of similar ids into one row per (game, similar game) pair,
# then give the exploded column a name that says what a single value is.
similar_games_df = (
    df.explode('similar_games')
      .rename({'similar_games': 'similar_game_id'})
)
similar_games_df

id,name,similar_game_id
i64,str,i64
1,"""Thief II: The …",2
1,"""Thief II: The …",3
1,"""Thief II: The …",4
1,"""Thief II: The …",471
1,"""Thief II: The …",9377
1,"""Thief II: The …",17379
1,"""Thief II: The …",36198
1,"""Thief II: The …",47823
1,"""Thief II: The …",55038
1,"""Thief II: The …",105049


In [149]:
# Attach the similar game's name by looking its id up in df. A left join keeps
# pairs whose similar game has no row in df; their name_right is null.
names_df = similar_games_df.join(
    df.select(['id', 'name']),
    left_on="similar_game_id",
    right_on="id",
    how="left",
)
names_df.head()

id,name,similar_game_id,name_right
i64,str,i64,str
1,"""Thief II: The …",2,"""Thief: The Dar…"
1,"""Thief II: The …",3,"""Thief: Deadly …"
1,"""Thief II: The …",4,"""Thief"""
1,"""Thief II: The …",471,"""Crysis 2"""
1,"""Thief II: The …",9377,"""Death to Spies…"


In [150]:
# Count nulls per column: shows how many pairs have no similar-game id at all and
# how many point at a similar game whose name could not be found by the join.
names_df.null_count()

id,name,similar_game_id,name_right
u32,u32,u32,u32
0,0,403,28797


In [151]:
# Keep only complete pairs: discard rows where the similar-game id is missing or
# where the join found no name for it.
result_df = names_df.drop_nulls(subset=["similar_game_id", "name_right"])

In [152]:
# Keep just the two id columns. Selecting them by name (instead of dropping
# "name" and "name_right") yields the same frame but lets this cell be re-run:
# dropping columns that are already gone would raise on the second execution.
result_df = result_df.select("id", "similar_game_id")
result_df.head(20)

id,similar_game_id
i64,i64
1,2
1,3
1,4
1,471
1,9377
1,17379
1,36198
1,47823
1,55038
1,105049


In [153]:
# Summary statistics (count, nulls, mean, quantiles, ...) for the remaining id pairs.
result_df.describe()

describe,id,similar_game_id
str,f64,f64
"""count""",177946.0,177946.0
"""null_count""",0.0,0.0
"""mean""",41461.406477,50716.342649
"""std""",50775.29402,39415.204884
"""min""",1.0,1.0
"""25%""",7467.0,19150.0
"""50%""",20182.0,35994.0
"""75%""",54791.0,96217.0
"""max""",281275.0,261154.0


In [154]:
# Show the first rows of the per-game feature frame again for reference.
features_df.head()

id,genres,themes,keywords
i64,list[str],list[str],list[str]
1,"[""Shooter"", ""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""thief"", … ""immersive""]"
2,"[""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""assassin"", … ""male protagonist""]"
3,"[""Shooter"", ""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""assassin"", … ""immersive""]"
4,"[""Shooter"", ""Adventure""]","[""Action"", ""Stealth"", ""Sandbox""]","[""thief"", ""steampunk"", … ""male protagonist""]"
5,"[""Role-playing (RPG)""]","[""Fantasy""]",[]


In [156]:
def clean_and_join(row):
    """Normalize a sequence of strings into one space-separated string.

    Every item is reduced to its alphabetic characters only (spaces, digits and
    punctuation are removed), the cleaned items are sorted, and the result is
    joined with single spaces. An item without any letters becomes an empty
    string and is still included in the join.
    """
    cleaned = []
    for item in row:
        letters = [char for char in item if char.isalpha()]
        cleaned.append("".join(letters))
    cleaned.sort()
    return " ".join(cleaned)

features = ["genres", "themes", "keywords"]

# Collapse each list column into one normalized string per game. The function is
# passed directly (no lambda wrapper) and the result dtype is stated explicitly so
# Polars does not have to infer it from the first returned values.
features_df = features_df.with_columns(
    pl.col(features).map_elements(clean_and_join, return_dtype=pl.Utf8)
)

# Concatenate the three strings into a single space-separated "features" column.
# An empty list yields an empty string, so the combined text can contain leading,
# trailing or doubled spaces.
features_df = features_df.with_columns(
    pl.concat_str(pl.col(features), separator=" ").alias("features")
)

# The per-category columns are no longer needed once "features" exists.
# NOTE: this cell overwrites features_df, so it can only be run once per kernel
# session without re-running the cell that builds features_df.
features_df = features_df.drop(features)

features_df.head()

id,features
i64,str
1,"""Adventure Shoo…"
2,"""Adventure Simu…"
3,"""Adventure Shoo…"
4,"""Adventure Shoo…"
5,"""RoleplayingRPG…"


In [157]:
# Spot-check one randomly sampled game; no seed is passed, so the row differs between runs.
features_df.sample(1).select("id", "features")

id,features
i64,str
24979,"""Adventure Plat…"


In [158]:
# Inspect the row at index 1 as a plain (id, features) tuple to read the full, untruncated string.
features_df.row(1)

(2,
 'Adventure Simulator Action Fantasy Stealth actionadventure assassin basketball betrayal blackjack bowandarrow bread crime darkness difficultylevel e ghosts immersive instantkill invisibility magic maleprotagonist medieval melee movingbodies murder pacifistplaythrough particlesystem pickpocketing plottwist polygonald potion realtimecombat reluctanthero retailgameswithsteamactivation shadowstealth singleplayeronly stealthkill steampunk swimming swordplay theft thief tombraiding voiceacting')

In [159]:
# Inspect the pair at index 1 as a plain (id, similar_game_id) tuple.
result_df.row(1)

(1, 3)

In [160]:
# Pull the feature string of the game with id 1 and print it in full.
similar = features_df.filter(pl.col("id") == 1).select("features")
feature_values = similar.get_column("features").to_list()
features = feature_values[0]
print(features)

Adventure Shooter Simulator Action Fantasy Stealth actionadventure blackjack bowandarrow crime darkness death difficultylevel ghosts immersive invisibility melee movingbodies pacifistplaythrough particlesystem pickpocketing polygonald potion realtimecombat shadowstealth singleplayeronly stealthkill steampunk swimming swordplay theft thief


In [161]:
def extract_features(id):
    """Return the ``features`` string of the game whose ``id`` column equals ``id``.

    The lookup runs against the module-level ``features_df``; if several rows
    match, the first one is used. Raises ``IndexError`` when no row matches.
    """
    matching_rows = features_df.filter(pl.col("id") == id)
    feature_values = matching_rows.get_column("features").to_list()
    return feature_values[0]

def find_dissimilar_game(similar_features):
    """Pick a random game that shares no feature token with ``similar_features``.

    Args:
        similar_features: space-separated feature tokens of the reference game,
            in the format of the ``features`` column of ``features_df``.

    Returns:
        A dict with the ``id`` and ``features`` of one randomly sampled row of
        the module-level ``features_df``. The row is drawn from the games that
        contain none of the tokens; if there is no such game, it is drawn from
        all games instead.

    Tokens are inserted into the regular expression unescaped, so they must not
    contain regex metacharacters (the strings built by ``clean_and_join`` contain
    letters only).
    """
    # split() without an argument discards the empty tokens caused by leading,
    # trailing or doubled spaces. An empty token would add an empty alternative
    # to the pattern, which matches every row and leaves no dissimilar game.
    tokens = similar_features.split()

    if tokens:
        # \b anchors make each token match only a whole token of the candidate,
        # so a very short token (e.g. a single letter) does not exclude every
        # game whose feature text merely contains that letter somewhere.
        pattern = r'\b(?:' + '|'.join(tokens) + r')\b'
        dissimilar_games = features_df.filter(
            ~features_df['features'].str.contains(pattern)
        )
    else:
        # Nothing to compare against: every game qualifies.
        dissimilar_games = features_df

    # Fall back to the full frame when every game shares at least one token.
    candidates = dissimilar_games if dissimilar_games.height > 0 else features_df
    dissimilar = candidates.sample(1).select("id", "features")
    return {
        'id': dissimilar['id'].item(),
        'features': dissimilar['features'].item()
    }

In [162]:
# Try the negative sampler on the feature string currently held in `features`.
find_dissimilar_game(features)

{'id': 21470,
 'features': 'Strategy TurnbasedstrategyTBS Sciencefiction Xexploreexpandexploitandexterminate '}

In [163]:
training_data = []

# Build the id -> features lookup once. Filtering features_df for every anchor and
# every positive would rescan the whole frame twice per pair.
# Assumes ids are unique in features_df (with duplicates the last row would win);
# an id missing from features_df raises KeyError.
features_by_id = dict(features_df.select("id", "features").iter_rows())

# One (anchor, positive, negative) triplet per similar-game pair. The negative is
# sampled relative to the positive game's features.
for row in result_df.iter_rows(named=True):
    anchor_id = row['id']
    positive_id = row['similar_game_id']
    positive_features = features_by_id[positive_id]

    training_data.append({
        'anchor': {'id': anchor_id, 'features': features_by_id[anchor_id]},
        'positive': {'id': positive_id, 'features': positive_features},
        'negative': find_dissimilar_game(positive_features)
    })
  

In [164]:
 
import json

# Serialize all triplets to a single JSON document in the working directory.
with open('training_data.json', 'w') as output_file:
    output_file.write(json.dumps(training_data))