In [1]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import polars as pl

from website import Base
from website.models import Game

In [2]:
database_uri = f'mysql+pymysql://root:root@localhost:3306/db'
engine = create_engine(database_uri)
Base.metadata.bind = engine
DBSession = sessionmaker(bind=engine)
session = DBSession()
games = session.query(Game).all()

In [3]:
def extract_name(items):
    return [x.name for x in items]

data = {
        'id': [g.id for g in games],
        'summary': [g.summary for g in games],
        'storyline': [g.storyline for g in games],
        'genres': [extract_name(g.genres) for g in games],
        'themes': [extract_name(g.themes) for g in games],
        'keywords': [extract_name(g.keywords) for g in games],
}

full_df = pl.DataFrame(data)
full_df.head()

id,summary,storyline,genres,themes,keywords
i64,str,str,list[str],list[str],list[str]
1,"""The ultimate t…","""The game begin…","[""Shooter"", ""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""thief"", … ""immersive""]"
2,"""Thief is a fir…","""None""","[""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""assassin"", … ""male protagonist""]"
3,"""In the third i…","""None""","[""Shooter"", ""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""assassin"", … ""immersive""]"
4,"""There is a ris…","""Garrett, the M…","[""Shooter"", ""Adventure""]","[""Action"", ""Stealth"", ""Sandbox""]","[""thief"", ""steampunk"", … ""male protagonist""]"
5,"""Baldur's Gate …","""Candlekeep is …","[""Role-playing (RPG)""]","[""Fantasy""]",[]


In [6]:
features_df = full_df.with_columns([
    pl.col("summary").fill_null("").str.replace("no info|none|No info|None", ""),
    pl.col("storyline").fill_null("").str.replace("no info|none|No info|None", "")
])
features_df = features_df.with_columns(
    pl.concat_str(
        [
            pl.col('summary'),
            pl.col('storyline')
        ],
        separator=" "
    ).alias("description")
)
features_df = features_df.drop(["summary", "storyline"])
features_df = features_df.with_columns([
    pl.when((pl.col('description').str.len_chars() == 1) | pl.col('description').str.starts_with("None None"))
    .then(None)
    .otherwise(pl.col('description'))
    .alias("description")
])
print("before")
print(features_df.null_count())
features_df = features_df.drop_nulls()
print("after")
print(features_df.null_count())

before
shape: (1, 5)
┌─────┬────────┬────────┬──────────┬─────────────┐
│ id  ┆ genres ┆ themes ┆ keywords ┆ description │
│ --- ┆ ---    ┆ ---    ┆ ---      ┆ ---         │
│ u32 ┆ u32    ┆ u32    ┆ u32      ┆ u32         │
╞═════╪════════╪════════╪══════════╪═════════════╡
│ 0   ┆ 0      ┆ 0      ┆ 0        ┆ 397         │
└─────┴────────┴────────┴──────────┴─────────────┘
after
shape: (1, 5)
┌─────┬────────┬────────┬──────────┬─────────────┐
│ id  ┆ genres ┆ themes ┆ keywords ┆ description │
│ --- ┆ ---    ┆ ---    ┆ ---      ┆ ---         │
│ u32 ┆ u32    ┆ u32    ┆ u32      ┆ u32         │
╞═════╪════════╪════════╪══════════╪═════════════╡
│ 0   ┆ 0      ┆ 0      ┆ 0        ┆ 0           │
└─────┴────────┴────────┴──────────┴─────────────┘


In [7]:
def clean_and_join(row):
    return " ".join(sorted(["".join(filter(str.isalpha, i)) for i in row]))

features = ["genres", "themes", "keywords"]

features_df = features_df.with_columns([
        pl.col(features).map_elements(lambda row: clean_and_join(row))
])

features_df = features_df.with_columns(
     pl.concat_str(
        pl.col(features), separator=" "
    ).alias("features")
)

features_df = features_df.drop(features)

features_df.head()

id,description,features
i64,str,str
1,"""The ultimate t…","""Adventure Shoo…"
2,"""Thief is a fir…","""Adventure Simu…"
3,"""In the third i…","""Adventure Shoo…"
4,"""There is a ris…","""Adventure Shoo…"
5,"""Baldur's Gate …","""RoleplayingRPG…"


In [8]:
data = {
        'id': [g.id for g in games],
        'name': [g.name for g in games],
        'similar_games': [g.similar_games for g in games]
}

df = pl.DataFrame(data)
df.head()

id,name,similar_games
i64,str,list[i64]
1,"""Thief II: The …","[2, 3, … 105049]"
2,"""Thief: The Dar…","[1, 3, … 11118]"
3,"""Thief: Deadly …","[1, 2, … 19441]"
4,"""Thief""","[1, 533, … 19441]"
5,"""Baldur's Gate""","[19127, 19404, … 113910]"


In [9]:
similar_games_df = df.explode('similar_games')
similar_games_df = similar_games_df.rename({'similar_games':'similar_game_id'})
similar_games_df

id,name,similar_game_id
i64,str,i64
1,"""Thief II: The …",2
1,"""Thief II: The …",3
1,"""Thief II: The …",4
1,"""Thief II: The …",471
1,"""Thief II: The …",9377
1,"""Thief II: The …",17379
1,"""Thief II: The …",36198
1,"""Thief II: The …",47823
1,"""Thief II: The …",55038
1,"""Thief II: The …",105049


In [10]:
names_df = similar_games_df.join(df[['id', 'name']], left_on="similar_game_id", right_on="id", how="left")
names_df.head()

id,name,similar_game_id,name_right
i64,str,i64,str
1,"""Thief II: The …",2,"""Thief: The Dar…"
1,"""Thief II: The …",3,"""Thief: Deadly …"
1,"""Thief II: The …",4,"""Thief"""
1,"""Thief II: The …",471,"""Crysis 2"""
1,"""Thief II: The …",9377,"""Death to Spies…"


In [11]:
names_df.null_count()

id,name,similar_game_id,name_right
u32,u32,u32,u32
0,0,403,28797


In [12]:
result_df = names_df.drop_nulls(["similar_game_id", "name_right"])
#result_df = result_df.group_by("name", maintain_order=True).agg(pl.col("name_right"))

In [13]:
result_df = result_df.drop(["name","name_right"])
result_df.head(20)

id,similar_game_id
i64,i64
1,2
1,3
1,4
1,471
1,9377
1,17379
1,36198
1,47823
1,55038
1,105049


In [14]:
result_df.describe()

describe,id,similar_game_id
str,f64,f64
"""count""",177946.0,177946.0
"""null_count""",0.0,0.0
"""mean""",41461.406477,50716.342649
"""std""",50775.29402,39415.204884
"""min""",1.0,1.0
"""25%""",7467.0,19150.0
"""50%""",20182.0,35994.0
"""75%""",54791.0,96217.0
"""max""",281275.0,261154.0


In [15]:
similar = features_df.filter(pl.col("id") == 1).select("features")
features = similar['features'].to_list()[0]
print(features)

Adventure Shooter Simulator Action Fantasy Stealth actionadventure blackjack bowandarrow crime darkness death difficultylevel ghosts immersive invisibility melee movingbodies pacifistplaythrough particlesystem pickpocketing polygonald potion realtimecombat shadowstealth singleplayeronly stealthkill steampunk swimming swordplay theft thief


In [37]:
def get_row(id):
    return features_df.filter(pl.col('id') == id)

def extract_features(id):
    similar = get_row(id)
    features = similar['features'].to_list()
    description = similar['description'].to_list()
    if len(features) < 1 or len(description) < 1: 
        return None
    return {
        'id': id,
        'features': features[0],
        'description': description[0]
    }


def find_dissimilar_game(similar_features):
    dissimilar_games = features_df.filter(
        ~features_df['features'].str.contains('|'.join(similar_features.split(" ")))
    )
    if dissimilar_games.select(pl.count()).item() < 1:
        return None
    else:
        dissimilar = dissimilar_games.sample(1).select("id", "features", "description")
    return {
        'id': dissimilar['id'].item(),
        'features': dissimilar['features'].item(),
        'description': dissimilar['description'].item()
    }

In [38]:
extract_features(2)

{'id': 2,
 'features': 'Adventure Simulator Action Fantasy Stealth actionadventure assassin basketball betrayal blackjack bowandarrow bread crime darkness difficultylevel e ghosts immersive instantkill invisibility magic maleprotagonist medieval melee movingbodies murder pacifistplaythrough particlesystem pickpocketing plottwist polygonald potion realtimecombat reluctanthero retailgameswithsteamactivation shadowstealth singleplayeronly stealthkill steampunk swimming swordplay theft thief tombraiding voiceacting',
 'description': 'Thief is a first-person stealth game that likes the dark. You sneak through the ruins of haunted cathedrals, subterranean ruins, and forbidding prisons, in a dark and sinister city - heavily inspired by Steampunk and the Dark Ages. Garrett finds an ally in the shadows, as he steals for money and uncovers the hidden agendas of allies and enemies. The story that unravels is one of deception and revenge. '}

In [39]:
find_dissimilar_game(features)

{'id': 5280,
 'features': 'Puzzle Strategy  chess highscore tacticalturnbasedcombat',
 'description': 'You can now test your Chess skills against the Wii, with the latest Touch Generations title from Wii. Wii chess is considered one of the most powerful chess computers. Wii chess uses loop express, a conversion of the loop chess engine which ranked 3rd in the 2007 World computer chess championship in Amsterdam. To improve your Chess skills Wii chess allows you record, review and replay previous games. So you can hone your chess skills, ready to battle any opponent. Wii chess comes with 10 difficulty levels from beginner to expert, and when you have finished going up against the Wii you can challenge players across Europe online, using the Nintendo WiFi connections from the comfort of your own home. '}

In [40]:
training_data = []
get_negative = True
count_positive = 0
count_negative = 0


for row in result_df.iter_rows(named=True):
    anchor_game = extract_features(row['id'])
    similar_game = extract_features(row['similar_game_id'])

    if anchor_game is None or similar_game is None:
        continue

    ## for pairs
    training_data.append({
        'anchor': anchor_game,
        'game': similar_game,
        'label': 1
    })
    count_positive += 1
    
    if get_negative:
        dissimilar_game = find_dissimilar_game(similar_game['features'])
        if dissimilar_game:
            training_data.append({
                'anchor': anchor_game,
                'game': dissimilar_game,
                'label': 0
            })
            count_negative += 1
    get_negative = not get_negative
    
    ## for triples
    # training_data.append({
    #     'anchor': anchor_game,
    #     'positive': similar_game,
    #     'negative': dissimilar_game
    # })
print(f'positive pairs: {count_positive}, negative pairs: {count_negative}')

positive pairs: 177113, negative pairs: 50573


In [41]:
from sklearn.model_selection import train_test_split

train_data, temp_data = train_test_split(training_data, test_size=0.8, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

data_to_file = {
    'train_data': train_data,
    'val_data': val_data,
    'test_data': test_data
}

In [43]:
import json

with open('training_data_pairs.json', 'w') as f:
    json.dump(data_to_file, f)