# Boardgame Similarities using Description

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
import numpy as np
import boardgamegeek as bgg
import re
from tqdm import tqdm
import nltk
import time
from datasketch import MinHash, MinHashLSH, MinHashLSHForest
from nltk.stem.porter import PorterStemmer
from sklearn.externals import joblib

# Fetch the NLTK resources used by the text-processing helpers below:
# 'punkt' is the tokenizer model behind word_tokenize, 'stopwords' is the
# corpus read by stopwords.words('english'). Both calls hit the network the
# first time and are no-ops once the packages are cached locally.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/brentonmallen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brentonmallen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize

def process_description(desc):
    """Turn a raw game description into a list of unique, stemmed tokens.

    The "Description from the publisher:" boilerplate and newlines are
    removed, the text is tokenized and lowercased, and English stop words,
    punctuation, pure-digit tokens and single-character tokens are dropped.
    The surviving words are reduced to their Porter stem.

    Parameters
    ----------
    desc : str
        Raw description text. Any non-string value (e.g. ``None`` or the NaN
        of a missing DataFrame cell) yields an empty list.

    Returns
    -------
    list of str
        Unique stems, in order of first appearance in the text.
    """
    # A missing description has no tokens; without this guard the
    # ``desc.replace`` call below raises AttributeError on None/NaN.
    if not isinstance(desc, str):
        return []

    # A set gives O(1) membership tests; a list is scanned for every token.
    stop_words = set(stopwords.words('english')) | set(punctuation)
    # remove publisher description, newlines and strip
    processed_desc = (desc.
                      replace("Description from the publisher:", "").
                      replace('\n', ' ').
                      strip()
                     )
    # remove stopwords, punctuation and get word list
    words = (w.lower() for w in word_tokenize(processed_desc))
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is reproducible (a plain set orders strings by their hash).
    unique_words = dict.fromkeys(
        w for w in words
        if w not in stop_words and not w.isdigit() and len(w) > 1
    )

    # stem words (convert words to their root form)
    stemmer = PorterStemmer()
    # De-duplicate again AFTER stemming: distinct words can share one stem
    # (e.g. "games" and "game"), which would otherwise produce repeats.
    stems = dict.fromkeys(stemmer.stem(w) for w in unique_words)
    return list(stems)

def process_tokens(row):
    """Build the combined token list for one game record.

    The stemmed description tokens come first, followed by the entries of
    the record's 'categories', 'mechanics', 'families' and 'designers'
    fields, in that order.

    Parameters
    ----------
    row : mapping
        Record indexable by the keys 'description', 'categories',
        'mechanics', 'families' and 'designers'.

    Returns
    -------
    list
        Description stems followed by the metadata entries.
    """
    tokens = list(process_description(row['description']))
    for field in ('categories', 'mechanics', 'families', 'designers'):
        # in-place += on a list accepts any iterable, exactly like extend()
        tokens += row[field]
    return tokens

## Load Game Data

In [3]:
# Serialized game data snapshot, resolved relative to the notebook's
# working directory.
fname = 'game_data-20180627.gz'
# joblib.load unpickles the file, which can execute arbitrary code: only
# load files from a trusted source.
# NOTE(review): `joblib` is imported from sklearn.externals at the top of
# this notebook; that path was removed in scikit-learn 0.23, where the
# standalone `joblib` package must be imported instead.
# The result is used below like a pandas DataFrame (column indexing and
# .apply) - presumably one row per game; verify against the file's producer.
game_data = joblib.load(fname)

## Process Description

In [4]:
# Add a '_desc_tokens' column holding the stemmed token list produced by
# process_description for each game's raw description text.
# Note this writes into game_data in place.
game_data['_desc_tokens'] = game_data['description'].apply(process_description)

## Description Topic Extraction

In [5]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [6]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic in a fitted model.

    One line is printed per row of ``model.components_``, formatted as
    ``Topic #<index>: <word> <word> ...`` with the words ordered from the
    largest weight to the smallest. A single blank line is printed after
    the last topic.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``, an iterable of per-topic
        weight arrays that support ``argsort()``.
    feature_names : sequence of str
        Feature name for each column index of ``components_``.
    n_top_words : int
        Number of words to show per topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[j] for j in ranked]
        print("Topic #%d: " % topic_no + " ".join(top_terms))
    print()

In [7]:
# Tunable parameters for the topic-extraction cells below.
n_features = 100  # cap on the vocabulary size kept by the vectorizer
n_components = 20  # number of topics
n_top_words = 20  # words shown per topic in the printout
# Raw term-count vectorizer. LDA is fit on these counts further down.
#   max_df=0.95  -> ignore terms appearing in more than 95% of documents
#   min_df=2     -> ignore terms appearing in fewer than 2 documents
#   ngram_range  -> unigrams, bigrams and trigrams are all candidate features
# The vectorizer applies its own built-in English stop-word list; it does
# not use the stop-word/stemming logic in process_description.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                analyzer='word',
                                stop_words='english',
                                ngram_range=(1,3))

In [8]:
# Learn the vocabulary and build the document-term count matrix, reporting
# the wall-clock time taken.
# NOTE(review): this is fit on the raw 'description' text, not on the
# '_desc_tokens' column computed earlier - confirm that is intended.
t0 = time.time()
tf = tf_vectorizer.fit_transform(game_data['description'])
print("done in %0.3fs." % (time.time() - t0))

done in 5.802s.


In [9]:
# Report the problem size. n_features is the configured vocabulary cap,
# not tf.shape[1]; the two can differ if fewer terms survive filtering.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (tf.shape[0], n_features))

# Online (mini-batch) variational LDA with n_components topics.
# random_state is fixed so repeated runs give the same topics.
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

# Fit on the term-count matrix and report the wall-clock time taken.
t0 = time.time()
lda.fit(tf)
print("done in %0.3fs." % (time.time() - t0))

Fitting LDA models with tf features, n_samples=4917 and n_features=100...
done in 9.485s.


In [10]:
# Show the n_top_words highest-weighted vocabulary terms of each LDA topic.
print("\nTopics in LDA model:")
# Maps feature column index -> term string.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; on
# newer versions this call must become get_feature_names_out().
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: points victory victory points players game score player end different use win gain turns control place possible wins special available make
Topic #1: building build resources game players player use different turn used great points place game players gain board need like play best
Topic #2: space start new use order using takes players game need face possible win board points just choose different make used
Topic #3: players game player actions action money turn order end different new time make use available wins round place board cards
Topic #4: edition games rules second series includes new game available set number based great played make playing different play special possible
Topic #5: team game win try wins point make play takes player placed playing based use possible includes players used start rules
Topic #6: phase turn players number player game order end round gain turns place action ends choose attack based value resources rounds
Topic #7: c