In [1]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import polars as pl

from website import Base
from website.models import Game

In [2]:
database_uri = f'mysql+pymysql://root:root@localhost:3306/db'
engine = create_engine(database_uri)
Base.metadata.bind = engine
DBSession = sessionmaker(bind=engine)
session = DBSession()
games = session.query(Game).all()

In [3]:
def extract_name(items):
    return [x.name for x in items]

data = {
        'id': [g.id for g in games],
        'summary': [g.summary for g in games],
        'storyline': [g.storyline for g in games],
        'genres': [extract_name(g.genres) for g in games],
        'themes': [extract_name(g.themes) for g in games],
        'keywords': [extract_name(g.keywords) for g in games],
}

full_df = pl.DataFrame(data)
full_df.head()

id,summary,storyline,genres,themes,keywords
i64,str,str,list[str],list[str],list[str]
1,"""The ultimate t…","""The game begin…","[""Shooter"", ""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""thief"", … ""immersive""]"
2,"""Thief is a fir…","""None""","[""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""assassin"", … ""male protagonist""]"
3,"""In the third i…","""None""","[""Shooter"", ""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""assassin"", … ""immersive""]"
4,"""There is a ris…","""Garrett, the M…","[""Shooter"", ""Adventure""]","[""Action"", ""Stealth"", ""Sandbox""]","[""thief"", ""steampunk"", … ""male protagonist""]"
5,"""Baldur's Gate …","""Candlekeep is …","[""Role-playing (RPG)""]","[""Fantasy""]",[]


In [4]:
full_df.describe()

describe,id,summary,storyline,genres,themes,keywords
str,f64,str,str,str,str,str
"""count""",21185.0,"""21185""","""21185""","""21185""","""21185""","""21185"""
"""null_count""",0.0,"""0""","""0""","""0""","""0""","""0"""
"""mean""",42296.599434,,,,,
"""std""",50837.188312,,,,,
"""min""",1.0,""""" I hope that …",""""" Visions—The …",,,
"""25%""",8039.0,,,,,
"""50%""",21116.0,,,,,
"""75%""",55474.0,,,,,
"""max""",281275.0,"""雪山上，木屋里，我们被这个世…","""﻿As the wizard…",,,


In [5]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')

[nltk_data] Downloading package stopwords to /home/aneta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/aneta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/aneta/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [6]:
features_df = full_df.with_columns([
    pl.col("summary").fill_null("").str.replace("no info|none|No info|None|game", ""),
    pl.col("storyline").fill_null("").str.replace("no info|none|No info|None|game", "")
])
features_df = features_df.with_columns(
    pl.concat_str(
        [
            pl.col('summary'),
            pl.col('storyline')
        ],
        separator=" "
    ).alias("description")
)
features_df = features_df.drop(["summary", "storyline"])

In [7]:
from nltk import word_tokenize
from nltk.corpus import stopwords, words
import string

stopwords = set(stopwords.words('english')) | set(string.punctuation)
english_words = set(words.words())

def remove_stopwords(row):
    temp = ''.join([char for char in row if not char.isdigit()])
    temp = [i for i in word_tokenize(temp.lower()) if i in english_words and i not in stopwords]
    temp = ' '.join([w for w in temp if len(w) > 3])
    return temp

remove_stopwords(
    'The ultimate thief is back! Tread softly as you make your way through 15 new complex, non-linear levels full of loot to steal and guards to outsmart. Improved enemy AI, new gadgets and a riveting story will draw you into the world of Thief II: The Metal Age, a place of powerful new technologies, fanatical religions and corruption. Amigo, lo quiero orwsjtgoistgnji. 雪山上，木屋里，我们被这个世界绑在了一起。我和她只能相爱，不能分离。这是自由的感情，还是被操纵的爱？多结局的清淡小短片《茸雪》，故事里有小茸和小雪，还有懂得何为爱情的你')

'ultimate thief back tread softly make complex full loot steal outsmart enemy riveting story draw world thief metal place powerful fanatical corruption'

In [8]:
features_df = features_df.with_columns([
    pl.col('description').map_elements(lambda row: remove_stopwords(row))
])
features_df.head()

id,genres,themes,keywords,description
i64,list[str],list[str],list[str],str
1,"[""Shooter"", ""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""thief"", … ""immersive""]","""ultimate thief…"
2,"[""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""assassin"", … ""male protagonist""]","""thief stealth …"
3,"[""Shooter"", ""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""assassin"", … ""immersive""]","""third thief se…"
4,"[""Shooter"", ""Adventure""]","[""Action"", ""Stealth"", ""Sandbox""]","[""thief"", ""steampunk"", … ""male protagonist""]","""rising tide fe…"
5,"[""Role-playing (RPG)""]","[""Fantasy""]",[],"""gate fantasy v…"


In [9]:
features_df = features_df.with_columns([
    pl.when((pl.col('description').str.len_chars() < 2) | pl.col('description').str.starts_with("None None"))
    .then(None)
    .otherwise(pl.col('description'))
    .alias("description")
])
print("before")
print(features_df.null_count())
features_df = features_df.drop_nulls()
print("after")
print(features_df.null_count())

before
shape: (1, 5)
┌─────┬────────┬────────┬──────────┬─────────────┐
│ id  ┆ genres ┆ themes ┆ keywords ┆ description │
│ --- ┆ ---    ┆ ---    ┆ ---      ┆ ---         │
│ u32 ┆ u32    ┆ u32    ┆ u32      ┆ u32         │
╞═════╪════════╪════════╪══════════╪═════════════╡
│ 0   ┆ 0      ┆ 0      ┆ 0        ┆ 410         │
└─────┴────────┴────────┴──────────┴─────────────┘
after
shape: (1, 5)
┌─────┬────────┬────────┬──────────┬─────────────┐
│ id  ┆ genres ┆ themes ┆ keywords ┆ description │
│ --- ┆ ---    ┆ ---    ┆ ---      ┆ ---         │
│ u32 ┆ u32    ┆ u32    ┆ u32      ┆ u32         │
╞═════╪════════╪════════╪══════════╪═════════════╡
│ 0   ┆ 0      ┆ 0      ┆ 0        ┆ 0           │
└─────┴────────┴────────┴──────────┴─────────────┘


In [10]:
features_df['description'].describe()

statistic,value
str,i64
"""count""",20775
"""null_count""",0
"""unique""",20709


In [11]:
temp_df = features_df.with_columns([
    pl.col("description").str.len_chars().alias("len")
])
temp_df.sort("len").head(20)

id,genres,themes,keywords,description,len
i64,list[str],list[str],list[str],str,u32
3747,"[""Point-and-click"", ""Adventure""]","[""Educational"", ""Kids""]","[""kid friendly"", ""scumm""]","""case""",4
19253,"[""Racing"", ""Sport""]",[],"[""extreme sports"", ""anti-gravity racing"", … ""e3 2004""]","""made""",4
32003,"[""Point-and-click"", ""Puzzle"", ""Adventure""]",[],"[""hidden object"", ""steam"", … ""steam cloud""]","""face""",4
52039,"[""Sport""]",[],"[""golf"", ""achievements"", … ""online multiplayer achievements""]","""golf""",4
69348,"[""Fighting"", ""Racing"", … ""Arcade""]","[""Party""]","[""animals"", ""minigames"", … ""competitve""]","""game""",4
74849,"[""Shooter"", ""Role-playing (RPG)"", ""Indie""]","[""Action""]","[""post-apocalyptic"", ""first person shooter"", ""mmorpg""]","""live""",4
80236,[],[],[],"""back""",4
97110,"[""Indie""]",[],[],"""hide""",4
123866,"[""Racing"", ""Indie""]",[],[],"""fall""",4
173072,"[""Platform"", ""Simulator"", ""Indie""]",[],"[""casual"", ""free-to-play"", ""lo-fi""]","""idle""",4


In [12]:
def clean_and_join(row):
    words = sorted(["".join(filter(str.isalpha, i)) for i in row])
    return ' '.join([w for w in words if len(w)>3])

features = ["genres", "themes", "keywords"]

features_df = features_df.with_columns([
        pl.col(features).map_elements(lambda row: clean_and_join(row))
])

features_df = features_df.with_columns(
     pl.concat_str(
        pl.col(features), separator=" "
    ).alias("features")
)

features_df = features_df.drop(features)

features_df.head()

id,description,features
i64,str,str
1,"""ultimate thief…","""Adventure Shoo…"
2,"""thief stealth …","""Adventure Simu…"
3,"""third thief se…","""Adventure Shoo…"
4,"""rising tide fe…","""Adventure Shoo…"
5,"""gate fantasy v…","""RoleplayingRPG…"


In [13]:
features_df = features_df.with_columns(pl.lit(0).alias('count'))

In [14]:
# features_df = features_df.with_columns(pl.when(pl.col('id') == 1)
#                         .then(pl.col('count') + 1)
#                         .otherwise(pl.col('count'))
#                         .alias("count"))
features_df.head()

id,description,features,count
i64,str,str,i32
1,"""ultimate thief…","""Adventure Shoo…",0
2,"""thief stealth …","""Adventure Simu…",0
3,"""third thief se…","""Adventure Shoo…",0
4,"""rising tide fe…","""Adventure Shoo…",0
5,"""gate fantasy v…","""RoleplayingRPG…",0


In [15]:
data = {
        'id': [g.id for g in games],
        'name': [g.name for g in games],
        'similar_games': [g.similar_games for g in games]
}

df = pl.DataFrame(data)
df.head()

id,name,similar_games
i64,str,list[i64]
1,"""Thief II: The …","[2, 3, … 105049]"
2,"""Thief: The Dar…","[1, 3, … 11118]"
3,"""Thief: Deadly …","[1, 2, … 19441]"
4,"""Thief""","[1, 533, … 19441]"
5,"""Baldur's Gate""","[19127, 19404, … 113910]"


In [16]:
similar_games_df = df.explode('similar_games')
similar_games_df = similar_games_df.rename({'similar_games':'similar_game_id'})
similar_games_df

id,name,similar_game_id
i64,str,i64
1,"""Thief II: The …",2
1,"""Thief II: The …",3
1,"""Thief II: The …",4
1,"""Thief II: The …",471
1,"""Thief II: The …",9377
1,"""Thief II: The …",17379
1,"""Thief II: The …",36198
1,"""Thief II: The …",47823
1,"""Thief II: The …",55038
1,"""Thief II: The …",105049


In [17]:
names_df = similar_games_df.join(df[['id', 'name']], left_on="similar_game_id", right_on="id", how="left")
names_df.head()

id,name,similar_game_id,name_right
i64,str,i64,str
1,"""Thief II: The …",2,"""Thief: The Dar…"
1,"""Thief II: The …",3,"""Thief: Deadly …"
1,"""Thief II: The …",4,"""Thief"""
1,"""Thief II: The …",471,"""Crysis 2"""
1,"""Thief II: The …",9377,"""Death to Spies…"


In [18]:
names_df.null_count()

id,name,similar_game_id,name_right
u32,u32,u32,u32
0,0,401,29208


In [19]:
result_df = names_df.drop_nulls(["similar_game_id", "name_right"])
#result_df = result_df.group_by("name", maintain_order=True).agg(pl.col("name_right"))

In [20]:
result_df = result_df.drop(["name","name_right"])
result_df.head(20)

id,similar_game_id
i64,i64
1,2
1,3
1,4
1,471
1,9377
1,17379
1,36198
1,47823
1,55038
1,105049


In [21]:
result_df.describe()

describe,id,similar_game_id
str,f64,f64
"""count""",179033.0,179033.0
"""null_count""",0.0,0.0
"""mean""",41747.056895,50772.110868
"""std""",51149.516891,39414.055439
"""min""",1.0,1.0
"""25%""",7493.0,19150.0
"""50%""",20250.0,35994.0
"""75%""",55027.0,96217.0
"""max""",281275.0,261154.0


In [22]:
similar = features_df.filter(pl.col("id") == 1).select("features")
features = similar['features'].to_list()[0]
print(features)

Adventure Shooter Simulator Action Fantasy Stealth actionadventure blackjack bowandarrow crime darkness death difficultylevel ghosts immersive invisibility melee movingbodies pacifistplaythrough particlesystem pickpocketing polygonald potion realtimecombat shadowstealth singleplayeronly stealthkill steampunk swimming swordplay theft thief


In [23]:
def get_row(id):
    return features_df.filter(pl.col('id') == id)

In [24]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

ids = features_df["id"]
indexes = {x: i for i, x in enumerate(ids)}

temp = features_df.select(pl.col(["id", "description"])).rows_by_key(key=["id"]).items()
features = [[x[0], x[1][0]] for x in temp]
count = TfidfVectorizer(stop_words='english')
matrix = count.fit_transform(([x[1] for x in features]))
cosine_sim = cosine_similarity(matrix, matrix)

def get_unrecommendations(game_id):
    idx = indexes[game_id]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=False)
    sim_scores = sim_scores[1:1001]
    results = [i[0] for i in sim_scores]
    results_df = features_df.filter(pl.col('id').is_in(results))
    return results_df.sort('count').head(50).sample(25)

print(get_unrecommendations(72))

shape: (25, 4)
┌─────┬───────────────────────────────────┬───────────────────────────────────┬───────┐
│ id  ┆ description                       ┆ features                          ┆ count │
│ --- ┆ ---                               ┆ ---                               ┆ ---   │
│ i64 ┆ str                               ┆ str                               ┆ i32   │
╞═════╪═══════════════════════════════════╪═══════════════════════════════════╪═══════╡
│ 168 ┆ star first computer star univers… ┆ Shooter Simulator Action Science… ┆ 0     │
│ 169 ┆ conflict neutral family business… ┆ Shooter Simulator Action Science… ┆ 0     │
│ 176 ┆ computer later entertainment ori… ┆ Racing Sport Action Scienceficti… ┆ 0     │
│ 31  ┆ farce ready trek time everybody … ┆ Adventure Pointandclick Comedy S… ┆ 0     │
│ …   ┆ …                                 ┆ …                                 ┆ …     │
│ 86  ┆ dune often considered first mode… ┆ RealTimeStrategyRTS Strategy Sci… ┆ 0     │
│ 96  ┆ electroni

In [25]:
def extract_features(id):
    similar = get_row(id)
    features = similar['features'].to_list()
    description = similar['description'].to_list()
    if len(features) < 1 or len(description) < 1: 
        return None
    return {
        'id': id,
        'features': features[0],
        'description': description[0]
    }

In [26]:
extract_features(2)

{'id': 2,
 'features': 'Adventure Simulator Action Fantasy Stealth actionadventure assassin basketball betrayal blackjack bowandarrow bread crime darkness difficultylevel ghosts immersive instantkill invisibility magic maleprotagonist medieval melee movingbodies murder pacifistplaythrough particlesystem pickpocketing plottwist polygonald potion realtimecombat reluctanthero retailgameswithsteamactivation shadowstealth singleplayeronly stealthkill steampunk swimming swordplay theft thief tombraiding voiceacting',
 'description': 'thief stealth dark sneak subterranean forbidding dark sinister city heavily inspired dark ally money hidden allies story deception revenge'}

In [28]:
training_data = []
get_negative = True
count_positive = 0
count_negative = 0


for row in result_df.iter_rows(named=True):
    anchor_game = extract_features(row['id'])
    similar_game = extract_features(row['similar_game_id'])

    if anchor_game is None or similar_game is None:
        continue

    training_data.append({
        'anchor': anchor_game,
        'game': similar_game,
        'label': 1
    })
    count_positive += 1

In [33]:
for row in features_df.iter_rows(named=True):
    anchor_game = extract_features(row['id'])
    dissimilar_games = get_unrecommendations(anchor_game['id'])
    for dissimilar_game in dissimilar_games.iter_rows(named=True):
        dissimilar_game = extract_features(dissimilar_game['id'])
        if anchor_game is None or dissimilar_game is None:
            continue
        
        training_data.append({
            'anchor': anchor_game,
            'game': dissimilar_game,
            'label': 0
        })
        features_df = features_df.with_columns(pl.when(pl.col('id') == dissimilar_game['id'])
                                                .then(pl.col('count') + 1)
                                                .otherwise(pl.col('count'))
                                                .alias("count"))
        count_negative += 1
    
    ## for triples
    # training_data.append({
    #     'anchor': anchor_game,
    #     'positive': similar_game,
    #     'negative': dissimilar_game
    # })

In [34]:
print(f'positive pairs: {count_positive}, negative pairs: {count_negative}')

positive pairs: 178065, negative pairs: 1038750


In [30]:
import json

with open('training_data_pairs_shuffled.json', 'w') as f:
    json.dump(training_data, f)

In [31]:
# from sklearn.model_selection import train_test_split
# 
# train_data, temp_data = train_test_split(training_data, test_size=0.8, random_state=42)
# val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
# 
# data_to_file = {
#     'train_data': train_data,
#     'val_data': val_data,
#     'test_data': test_data
# }

In [32]:
# import json
# 
# with open('training_data_pairs.json', 'w') as f:
#     json.dump(data_to_file, f)