Initial look into building similarities between boardgames

In [172]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
import boardgamegeek as bgg
import re
from tqdm import tqdm
import nltk
from datasketch import MinHash, MinHashLSH, MinHashLSHForest

In [157]:
# Fetch the nltk resources used later in this notebook for tokenizing
# descriptions and removing stop words.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/brentonmallen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brentonmallen/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Game List
Load the list of games scraped from BGG

In [7]:
# The game list file is looked up in the current working directory.
root_dir = os.getcwd()
gamelist_fname = 'game_urls.tsv'

In [10]:
# Tab-separated file without a header row: each line is a (title, url) pair.
gamelist_path = os.path.join(root_dir, gamelist_fname)
game_list = pd.read_csv(gamelist_path, sep='\t', names=['title', 'url'])

Extract a few bits for simplicity

In [35]:
# Matches a parenthesised four-digit year such as "(2017)" and captures the digits.
# The previous pattern r"(|)" was an empty alternation that only ever matched the
# empty string, so it could not extract anything.
# NOTE(review): assumes titles carry the year as "Title (YYYY)" — TODO confirm
# against the contents of game_urls.tsv.
year_pattern = re.compile(r"\((\d{4})\)")

In [136]:
# The numeric BGG id is the second-to-last '/'-separated segment of each URL.
game_list['id'] = game_list['url'].str.split('/').str[-2].astype(int)

# Game Data Test

In [25]:
# Single BGG API client reused for every fetch in this notebook.
client = bgg.BGGClient()

In [74]:
# Fetch one game by id to inspect what the API returns.
a = client.game(game_id=174430, choose=u'best-rank')

In [75]:
# Check which game the id resolved to.
a.name

'Gloomhaven'

In [89]:
# One-row payload for a trial DataFrame. Every field except the description is
# wrapped in a one-element list so that list-valued attributes (categories,
# mechanics, ...) land in a single cell instead of being spread across rows.
b = dict(
    description=a.description,
    categories=[a.categories],
    mechanics=[a.mechanics],
    families=[a.families],
    max_players=[a.max_players],
    expansion=[a.expansion],
    designers=[a.designers],
    publishers=[a.publishers],
    expansions=[a.expansions],
    year=[a.yearpublished],
    minplaytime=[a.minplaytime],
    maxplaytime=[a.maxplaytime],
    minage=[a.minage],
    stats=[a.stats],
    ranks=[a.ranks],
)
# b['desc'] = b.desc.apply(lambda x: str(x).replace("Description from the publisher:"
#     ...: , "").replace('\n', ' ').strip())

In [87]:
# Build a one-row frame from the trial payload to check the column layout.
trial = pd.DataFrame.from_dict(b)

In [88]:
# Look at the raw description text before any cleaning is applied.
trial['description'].values[0]

'Gloomhaven  is a game of Euro-inspired tactical combat in a persistent world of shifting motives. Players will take on the role of a wandering adventurer with their own special set of skills and their own reasons for traveling to this dark corner of the world. Players must work together out of necessity to clear out menacing dungeons and forgotten ruins. In the process, they will enhance their abilities with experience and loot, discover new locations to explore and plunder, and expand an ever-branching story fueled by the decisions they make.\n\nThis is a game with a persistent and changing world that is ideally played over many game sessions. After a scenario, players will make decisions on what to do, which will determine how the story continues, kind of like a “Choose Your Own Adventure” book. Playing through a scenario is a cooperative affair where players will fight against automated monsters using an innovative card system to determine the order of play and what a player does o

# Game Data

For simplicity, I'm limiting the number of games to the first 1000. If this runs fast enough, I'll go ahead and do them all.

In [223]:
def get_game_data(bgg_client, game_name=None, game_id=None, method=u'best-rank'):
    """Fetch a single game through ``bgg_client`` by name or by id.

    Exactly one of ``game_name`` / ``game_id`` must be supplied.

    Parameters
    ----------
    bgg_client
        Object exposing ``game(name=..., choose=...)`` and
        ``game(game_id=..., choose=...)``.
    game_name : str, optional
        Title to look up. An empty string counts as "not given".
    game_id : int, optional
        Id to look up. Compared against ``None`` rather than by truthiness,
        so an id of ``0`` is still treated as an explicit id.
    method : str
        Forwarded unchanged as the client's ``choose`` argument.

    Returns
    -------
    Whatever ``bgg_client.game`` returns for the requested game.

    Raises
    ------
    ValueError
        If both a name and an id are given, or neither is. ``ValueError`` is a
        subclass of ``Exception``, so existing ``except Exception`` handlers
        still catch it.
    """
    has_name = bool(game_name)
    has_id = game_id is not None

    if has_name and has_id:
        raise ValueError('Specify only a name or id')
    if has_name:
        return bgg_client.game(name=game_name, choose=method)
    if has_id:
        return bgg_client.game(game_id=game_id, choose=method)
    raise ValueError('No Game Specified')
        
        
def game_df(games):
    """Build a DataFrame with one row per game object.

    Parameters
    ----------
    games : iterable
        Game objects exposing the attributes read below (``name``,
        ``description``, ``categories``, ``mechanics``, ``families``,
        ``max_players``, ``expansion``, ``designers``, ``publishers``,
        ``expansions``, ``yearpublished``, ``minplaytime``, ``maxplaytime``,
        ``minage``, ``stats``, ``ranks``). Any iterable works, including a set.

    Returns
    -------
    pandas.DataFrame
        One row per game with a unique 0..n-1 index. List- and dict-valued
        attributes are stored whole in a single cell. An empty input yields an
        empty frame that still carries the expected columns.
    """
    columns = [
        'title',
        'description',
        'categories',
        'mechanics',
        'families',
        'max_players',
        'expansion',
        'designers',
        'publishers',
        'expansions',
        'year',
        'minplaytime',
        'maxplaytime',
        'minage',
        'stats',
        'ranks',
    ]
    # Collect plain records and build the frame once: concatenating inside the
    # loop copies the accumulated frame on every iteration and leaves every
    # row labelled with index 0.
    records = [
        {
            'title': g.name,
            'description': g.description,
            'categories': g.categories,
            'mechanics': g.mechanics,
            'families': g.families,
            'max_players': g.max_players,
            'expansion': g.expansion,
            'designers': g.designers,
            'publishers': g.publishers,
            'expansions': g.expansions,
            'year': g.yearpublished,
            'minplaytime': g.minplaytime,
            'maxplaytime': g.maxplaytime,
            'minage': g.minage,
            'stats': g.stats,
            'ranks': g.ranks,
        }
        for g in games
    ]
    return pd.DataFrame(records, columns=columns)

## Get game data

In [224]:
limit = 1000  # number of games, taken from the top of the list, to fetch
games = []    # successfully fetched game objects
failed = []   # ids whose fetch raised; used by the retry pass
for i in tqdm(game_list.head(limit).id.values, desc='fetching games'):
    try:
        games.append(get_game_data(client, game_id=i))
    except Exception:
        # Best-effort: remember the id and move on. Catching ``Exception``
        # instead of using a bare ``except`` lets Ctrl-C (KeyboardInterrupt)
        # still abort this long-running loop.
        failed.append(i)

In [225]:
# Number of ids whose first fetch raised.
len(failed)

231

For some reason some fetches fail, so I'll try one more time to grab the failed ones

In [226]:
retry_games = []   # games recovered on the second attempt
failed_again = []  # ids that raised on both attempts
for i in tqdm(failed, desc='retrying failed fetches'):
    try:
        retry_games.append(get_game_data(client, game_id=i))
    except Exception:
        # Best-effort: remember the id and move on. Catching ``Exception``
        # instead of using a bare ``except`` lets Ctrl-C (KeyboardInterrupt)
        # still abort the loop.
        failed_again.append(i)

In [227]:
# Number of ids that raised on both attempts.
len(failed_again)

49

## Combine data

Concat the retries

In [243]:
# Merge first-pass and retry results into a single frame.
# NOTE(review): going through set() presumably makes the row order arbitrary
# from run to run — confirm that ordering does not matter downstream.
game_data = game_df(set(games + retry_games))

In [245]:
# (rows, columns) of the combined frame.
game_data.shape

(951, 16)

## Process features and build hashtable

In [276]:
# Columns of game_data that are handed to process_tokens to build the
# similarity tokens.
SIMILARITY_FIELDS = [
    'description',
    'categories',
    'mechanics',
    'families',
    'designers'
]

# Passed as num_perm to every MinHash signature and to the LSH forest.
HASH_REZ = 128

In [247]:
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize

def process_description(desc):
    """Turn a raw game description into a list of unique, lower-cased tokens.

    The publisher boilerplate header and newlines are removed, the text is
    tokenized with nltk, and English stop words, punctuation, pure-digit
    tokens and one-character tokens are dropped.

    Parameters
    ----------
    desc : str or None
        Raw description text. Non-string values (``None``, NaN) are treated
        as a missing description.

    Returns
    -------
    list of str
        Unique tokens in arbitrary order; empty for a missing or blank
        description.
    """
    # A missing description has nothing to tokenize; without this guard the
    # ``.replace`` call below fails on None / NaN.
    if not isinstance(desc, str):
        return []

    # remove publisher description, newlines and strip
    processed_desc = (
        desc
        .replace("Description from the publisher:", "")
        .replace('\n', ' ')
        .strip()
    )
    if not processed_desc:
        return []

    # A set gives constant-time membership checks for every token.
    stop_words = set(stopwords.words('english')) | set(punctuation)

    # remove stopwords, punctuation and get word list
    words = (w.lower() for w in word_tokenize(processed_desc))
    return list({
        w for w in words
        if w not in stop_words and not w.isdigit() and len(w) > 1
    })


def process_tokens(row):
    """Collect all similarity tokens for one game row.

    Starts from the processed description tokens and appends, in order, the
    entries of the row's categories, mechanics, families and designers.
    """
    tokens = process_description(row['description'])
    for field in ('categories', 'mechanics', 'families', 'designers'):
        tokens += row[field]
    return tokens
    
    

In [248]:
# build token sets
# Each row of the similarity columns is passed directly to process_tokens.
game_data['_sim_tokens'] = game_data[SIMILARITY_FIELDS].apply(process_tokens, axis=1)

In [277]:
# compute hashes
def compute_sim_hash(tokens):
    """Return a MinHash signature (HASH_REZ permutations) built from ``tokens``.

    Each token is UTF-8 encoded and fed into the signature one at a time.
    """
    signature = MinHash(num_perm=HASH_REZ)
    encoded_tokens = (token.encode('utf-8') for token in tokens)
    for chunk in encoded_tokens:
        signature.update(chunk)
    return signature

In [278]:
# One MinHash signature per game, computed from its token list.
game_data['_sim_hash'] = game_data['_sim_tokens'].apply(compute_sim_hash)

In [286]:
# LSH forest for 'top-n' similarities
forest = MinHashLSHForest(num_perm=HASH_REZ)

# Titles serve as the forest keys. When add() raises ValueError for a title
# the game is only reported and skipped; any other error propagates.
for title, sim_hash in zip(game_data['title'], game_data['_sim_hash']):
    try:
        forest.add(title, sim_hash)
    except ValueError:
        print(f"{title} already added")
# Build the index once all keys have been added.
forest.index()

Cosmic Encounter already added
Citadels already added
Lord of the Rings: The Confrontation already added


### Pandemic Sanity Check

In [299]:
# Select the row(s) titled exactly 'Pandemic' to use as the query game.
pandemic_data = game_data[game_data.title == 'Pandemic']

In [300]:
# Query the forest with the first matching row's signature, asking for 10 results.
forest.query(pandemic_data._sim_hash.values[0], 10)

['Ginkgopolis',
 'Struggle of Empires',
 'One Deck Dungeon',
 'A House Divided',
 'Pandemic',
 'Fury of Dracula (second edition)',
 'Ticket to Ride: Märklin',
 'Breakout: Normandy',
 'Timeline: Historical Events',
 'Port Royal']

Hmmm, not what I expected