# Boardgame Similarities using Description

In [288]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from functools import partial
import numpy as np
import boardgamegeek as bgg
import re
from tqdm import tqdm
import time
from datasketch import MinHash, MinHashLSH, MinHashLSHForest
from sklearn.externals import joblib
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.wordnet import ADJ

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/brentonmallen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brentonmallen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brentonmallen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [161]:
STOPWORDS = stopwords.words('english') + list(punctuation)
# add board game specific ones
boardgame_stopwords = [
    'board',
    'boardgame',
    'boardgames',
    'player',
    'play',
    'played',
    'rule',
    'rulebook',
    'expansion',
    'game',
    'point',
    'victory',
    'win',
    'piece',
    'turn',
    'round',
    'point',
    'score'
    
]

STOPWORDS.extend(boardgame_stopwords)

In [320]:
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
import re

PATTERN = re.compile("(|)")

def process_description(desc,
                        stop_words=stopwords.words('english'),
                        tokenize=True,
                        regex_pattern=None):
    
    lmtzr = WordNetLemmatizer()
    stmmr = SnowballStemmer('english', ignore_stopwords=True)
        
    # remove publisher description, newlines and strip
    processed_desc = (desc.
                      replace("Description from the publisher:", "").
                      replace('\n', ' ').
                      strip()
                     )
    processed_desc = re.sub(regex_pattern, "", processed_desc)
    # remove puctuation
    table = str.maketrans('', '', punctuation)
    processed_desc = [w.translate(table) for w in processed_desc]
    processed_desc = "".join(processed_desc).split(' ')
    # lower case everything
    processed_desc = [w.lower() for w in processed_desc]
    # lemmatize words
    processed_desc = [lmtzr.lemmatize(w) for w in processed_desc]
    # remove stop words
    processed_desc = [w for w in processed_desc if w not in stop_words and not w.isdigit()]
    # stem words
#     processed_desc = [stmmr.stem(w) for w in processed_desc]
    if tokenize:
        return list(set(processed_desc))
    else:
        return " ".join(processed_desc)
    
    
def process_tokens(row):
    output_tokens = []
    output_tokens.extend(process_description(row['description']))
    output_tokens.extend(row['categories'])
    output_tokens.extend(row['mechanics'])
    output_tokens.extend(row['families'])
    output_tokens.extend(row['designers'])
    return output_tokens

## Load Game Data

In [5]:
fname = 'game_data-20180627.gz'
game_data = joblib.load(fname)

## Process Description

In [321]:
process_partial = partial(process_description,
                          regex_pattern=PATTERN,
                          tokenize=False,
                          stop_words = STOPWORDS)

In [322]:
game_data['_desc'] = game_data['description'].apply(process_partial)

## Description Topic Extraction

In [323]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [324]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [325]:
n_features = 250
n_components = 20  # number of topics
n_top_words = 20
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=.01,
                                max_features=n_features,
                                analyzer='word',
                                stop_words=STOPWORDS,
                                ngram_range=(1,1))

In [326]:
t0 = time.time()
tf = tf_vectorizer.fit_transform(game_data['_desc'])
print("done in %0.3fs." % (time.time() - t0))

done in 0.637s.


In [327]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (tf.shape[0], n_features))

lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0
                                )

t0 = time.time()
lda.fit(tf)
print("done in %0.3fs." % (time.time() - t0))

Fitting LDA models with tf features, n_samples=4917 and n_features=250...
done in 9.796s.


In [328]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 20)


Topics in LDA model:
Topic #0: character king dragon location role take one ability new adventure power house special weapon gold set different includes series face
Topic #1: de treasure island room gold find secret must try help sea way ha order house around end collect course many
Topic #2: tile train placed line two place new one type placing set number level end ha also may four series bonus
Topic #3: war battle army unit system combat force command two side map one world scenario series also front event first ha
Topic #4: new set edition master word adventure time book box also includes youll need even feature original one make find hero
Topic #5: card deck hand one ha two draw take end set first start playing three also number new value face table
Topic #6: phase influence land action gain take end may power one control region next time choose year people use must many
Topic #7: hero monster dungeon animal level treasure one take ha different ability must new card two fight need

In [339]:
g = 'Pandemic'
game_data[game_data.title == g]['_desc'].tolist()

['pandemic several virulent disease broken simultaneously world diseasefighting specialist whose mission treat disease hotspot researching cure four plague get hand  depicts several major population center earth use four action travel city treat infected populace discover cure build research station deck card provides ability sprinkled throughout deck epidemic card accelerate intensify disease activity second separate deck card control normal spread infection  taking unique role within team must plan strategy mesh specialist strength order conquer disease example operation expert build research station needed find cure disease allow greater mobility city scientist need four card particular disease cure instead normal five—but disease spreading quickly time running one disease spread beyond recovery much time elapses lose cure four disease  edition pandemic includes two new characters—the contingency planner quarantine specialist—not available earlier edition  pandemic first pandemic se

In [343]:
a = lda.transform(tf_vectorizer.transform(game_data[game_data.title == g]['_desc']))


In [347]:
" ".join([tf_vectorizer.get_feature_names()[i] for i in a[0].argsort()[::-1]])

'adventure allows additional another around attempt also allow age available add attack along ability area air action across animal army'

In [348]:
a[0]

array([0.0009434 , 0.0009434 , 0.0009434 , 0.0009434 , 0.16457074,
       0.30235093, 0.0009434 , 0.0009434 , 0.0009434 , 0.24493383,
       0.0009434 , 0.0009434 , 0.0009434 , 0.10300961, 0.0009434 ,
       0.0009434 , 0.10216844, 0.0009434 , 0.06975891, 0.0009434 ])