# Boardgame Similarities using Metadata
This notebook also includes the data gathering code

Initial look into building similarities between boardgames using only metadata (no description)

In [81]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
import numpy as np
import boardgamegeek as bgg
import re
from tqdm import tqdm
import nltk
from datasketch import MinHash, MinHashLSH, MinHashLSHForest
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/brentonmallen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brentonmallen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Game List
Load the list of games scrapped from BGG

In [3]:
root_dir = os.getcwd()
gamelist_fname = 'game_urls.tsv'

In [4]:
game_list = pd.read_csv(os.path.join(root_dir, gamelist_fname),
                        sep='\t',
                       names = ['title', 'url'])

Extract a few bits for simplicity

In [5]:
year_pattern = re.compile(r"(|)")

In [6]:
game_list['id'] = game_list['url'].apply(lambda x: x.split('/')[-2]).astype(int)

# Game Data Test

In [7]:
client = bgg.BGGClient()

In [8]:
a = client.game(game_id=174430, choose=u'best-rank')

In [9]:
a.name

'Gloomhaven'

In [10]:
b = {'description':a.description,
                       'categories': [a.categories],
                       'mechanics': [a.mechanics],
                       'families': [a.families],
                       'max_players': [a.max_players],
                       'expansion': [a.expansion],
                       'designers': [a.designers],
                       'publishers': [a.publishers],
                       'expansions': [a.expansions],
                       'year': [a.yearpublished],
                       'minplaytime': [a.minplaytime],
                       'maxplaytime': [a.maxplaytime],
                       'minage': [a.minage],
                       'stats': [a.stats],
                       'ranks': [a.ranks]
                      }
# b['desc'] = b.desc.apply(lambda x: str(x).replace("Description from the publisher:"
#     ...: , "").replace('\n', ' ').strip())

In [11]:
trial = pd.DataFrame.from_dict(b)

In [12]:
trial['description'].values[0]

'Gloomhaven  is a game of Euro-inspired tactical combat in a persistent world of shifting motives. Players will take on the role of a wandering adventurer with their own special set of skills and their own reasons for traveling to this dark corner of the world. Players must work together out of necessity to clear out menacing dungeons and forgotten ruins. In the process, they will enhance their abilities with experience and loot, discover new locations to explore and plunder, and expand an ever-branching story fueled by the decisions they make.\n\nThis is a game with a persistent and changing world that is ideally played over many game sessions. After a scenario, players will make decisions on what to do, which will determine how the story continues, kind of like a “Choose Your Own Adventure” book. Playing through a scenario is a cooperative affair where players will fight against automated monsters using an innovative card system to determine the order of play and what a player does o

# Game Data

For simplicity, I'm limiting the number of games to the first 1000.  If this runs fast enough, I'll go ahead and do them all

In [42]:
def get_game_data(bgg_client, game_name=None, game_id=None, method=u'best-rank'):
    if game_name and game_id:
        raise Exception('Specify only a name or id')
    elif game_name:
        game = bgg_client.game(name=game_name, choose=method)
        return game
    elif game_id:
        game = bgg_client.game(game_id=game_id, choose=method)
        return game
    else:
        raise Exception('No Game Specified')
        
        
def game_df(games):
    game_df = pd.DataFrame()
    for g in games:
        temp_df = pd.DataFrame.from_dict(
            {
            'title': g.name,
            'id': g.id,
            'description':g.description,
            'categories': [g.categories],
            'mechanics': [g.mechanics],
            'families': [g.families],
            'max_players': [g.max_players],
            'expansion': [g.expansion],
            'designers': [g.designers],
            'publishers': [g.publishers],
            'expansions': [g.expansions],
            'year': [g.yearpublished],
            'minplaytime': [g.minplaytime],
            'maxplaytime': [g.maxplaytime],
            'minage': [g.minage],
            'stats': [g.stats],
            'ranks': [g.ranks]
            }
        )
        game_df = pd.concat([game_df, temp_df])
    return game_df

## Get game data

In [14]:
limit = 5000
games = []
failed = []
for i in game_list.head(limit).id.values:
    try:
        games.append(get_game_data(client, game_id=i))
    except:
        failed.append(i)

In [15]:
len(failed)

1323

For some reason some fetches fail, so I'll try one more time to grab the failed ones

In [16]:
retry_games = []
failed_again = []
for i in failed:
    try:
        retry_games.append(get_game_data(client, game_id=i))
    except:
        failed_again.append(i)

In [17]:
len(failed_again)

367

In [18]:
last_try_games = []
left_behind = []
for i in failed_again:
    try:
        last_try_games.append(get_game_data(client, game_id=i))
    except:
        left_behind.append(i)

In [19]:
len(left_behind)

83

## Combine data

Concat the retries

In [43]:
game_data = game_df(set(games + retry_games + last_try_games))

The duplicates by name are due to release dates.  i.e. Cosmic Encounter first came out in the 70's.  We can deal with this later but for now we'll save off the entire dataset

In [55]:
game_data.title.value_counts().head(10)

Cosmic Encounter                        4
Samurai                                 3
Santorini                               2
Cry Havoc                               2
Barbarossa                              2
Lord of the Rings: The Confrontation    2
Axis & Allies                           2
Tales of the Arabian Nights             2
Cartagena                               2
Crimson Skies                           2
Name: title, dtype: int64

In [56]:
game_data[game_data.title=="Cosmic Encounter"]

Unnamed: 0,title,id,description,categories,mechanics,families,max_players,expansion,designers,publishers,expansions,year,minplaytime,maxplaytime,minage,stats,ranks
0,Cosmic Encounter,15,"By request of Fantasy Flight Games, Board Game...","[Bluffing, Negotiation, Science Fiction]","[Hand Management, Variable Player Powers]","[Admin: Better Description Needed!, Cosmic Enc...",6,False,"[Bill Eberle, Jack Kittredge, Bill Norton, Pet...","[Eon, Alga, ASS Altenburger Spielkarten, Desca...","[Thing (id: 4715), Thing (id: 4716), Thing (id...",1977,90,90,12,"{'usersrated': 3699, 'average': 6.92299, 'baye...","[BoardGameRank(id: 1, name: Board Game Rank, v..."
0,Cosmic Encounter,39463,From the Manufacturer\n\nBuild a galactic empi...,"[Bluffing, Negotiation, Science Fiction, Space...","[Hand Management, Partnerships, Take That, Var...",[Cosmic Encounter],5,False,"[Bill Eberle, Jack Kittredge, Bill Norton, Pet...","[Fantasy Flight Games, Arclight, Asterion Pres...","[Thing (id: 114276), Thing (id: 87507), Thing ...",2008,60,120,12,"{'usersrated': 22122, 'average': 7.58109, 'bay...","[BoardGameRank(id: 1, name: Board Game Rank, v..."
0,Cosmic Encounter,40531,"In Cosmic Encounter, you play the leader of a ...","[Bluffing, Card Game, Science Fiction]",[Variable Player Powers],[Cosmic Encounter],4,False,"[Bill Eberle, Jack Kittredge, Bill Norton, Pet...","[Avalon Hill Games, Inc.]",[],2000,60,60,0,"{'usersrated': 1167, 'average': 6.40128, 'baye...","[BoardGameRank(id: 1, name: Board Game Rank, v..."
0,Cosmic Encounter,40529,Players represent alien races that are seeking...,"[Bluffing, Card Game, Science Fiction]",[Variable Player Powers],[Cosmic Encounter],6,False,"[Bill Eberle, Jack Kittredge, Bill Norton, Pet...","[Hexagames (I), Mayfair Games]","[Thing (id: 207599), Thing (id: 2739)]",1991,60,60,0,"{'usersrated': 914, 'average': 7.18415, 'bayes...","[BoardGameRank(id: 1, name: Board Game Rank, v..."


#### Remove Expansions

In [57]:
game_data.expansion.value_counts()

False    4917
Name: expansion, dtype: int64

In [58]:
game_data = game_data[game_data.expansion == False]

In [59]:
game_data.shape

(4917, 17)

#### Save off the data

In [60]:
from sklearn.externals import joblib
joblib.dump(game_data, 'game_data-20180627.gz', compress=('gzip', 3))

['game_data-20180627.gz']

## Process features and build hashtable

In [61]:
SIMILARITY_FIELDS = [
    'description',
    'categories',
    'mechanics',
    'families',
    'designers'
]

HASH_REZ = 128

In [67]:
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize

def process_description(desc):
    stop_words = stopwords.words('english') + list(punctuation)
    # remove publisher description, newlines and strip
    processed_desc = (desc.
                      replace("Description from the publisher:", "").
                      replace('\n', ' ').
                      strip()
                     )
    # remove stopwords, punctuation and get word list
    words = word_tokenize(processed_desc)
    words = [w.lower() for w in words]  # lowercase all words
    words = list(set([w for w in words if w not in stop_words and not w.isdigit()
                      and len(w) > 1]))
    
    # stem words (convert words to their root form)
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(w) for w in words]
    return stemmed

def process_tokens(row):
    output_tokens = []
#     output_tokens.extend(process_description(row['description']))
    output_tokens.extend(row['categories'])
    output_tokens.extend(row['mechanics'])
    output_tokens.extend(row['families'])
    output_tokens.extend(row['designers'])
    return output_tokens
 

In [69]:
# build token sets
game_data['_sim_tokens'] = game_data[SIMILARITY_FIELDS].apply(lambda row: process_tokens(row), axis=1)

In [70]:
game_data._sim_tokens.head(1).tolist()

[['Sports',
  'Tile Placement',
  'Admin: Better Description Needed!',
  'Sports: Football / Soccer',
  'Shaun Derrick']]

In [71]:
# compute hashes
def compute_sim_hash(tokens):
    lshf_hash = MinHash(num_perm=HASH_REZ)
    for word in tokens:
        lshf_hash.update(word.encode('utf-8'))
    return lshf_hash

In [72]:
game_data['_sim_hash'] = game_data['_sim_tokens'].apply(lambda x: compute_sim_hash(x))

In [78]:
# LSH forest for 'top-n' similarities
forest = MinHashLSHForest(num_perm=HASH_REZ)

for ind, row in game_data.iterrows():
    try:
        forest.add(f"{row['title']} (id:{row['id']})", row['_sim_hash'])
    except ValueError:
        print(f"{row['title']} already added")
    except:
        raise
forest.index()

### Pandemic Sanity Check

In [79]:
pandemic_check = game_data[game_data.title == 'Pandemic']

In [80]:
forest.query(pandemic_check._sim_hash.values[0], 6)

['Pandemic Legacy: Season 1 (id:161936)',
 'Defenders of the Realm (id:65532)',
 'Pandemic (id:30549)',
 'Pandemic: The Cure (id:150658)',
 'Pandemic: Iberia (id:198928)',
 'Bloc by Bloc: The Insurrection Game (id:190247)']

### Random Sample Check

In [551]:
sample_data = game_data.sample(1)

In [552]:
sample_data.title

0    Thunderstone
Name: title, dtype: object

In [553]:
forest.query(sample_data._sim_hash.values[0], 6)

['T.I.M.E Stories',
 'Thunderstone',
 'Broom Service',
 'StarCraft: The Board Game',
 'Thunderstone: Dragonspire',
 'Thunderstone Quest']

If I remove the description, I get good results, but if I use description with just removing stop words, the results are poor.