# Goal: Scrape data off a boardgame geek wishlist

Data to collect:

1) Username

2) Game

3) User Game Rating

4) Avg Game Rating (to be used in the case the user has not provided a rating)


References:

https://github.com/ThaWeatherman/scrapers/tree/master/boardgamegeek

https://boardgamegeek.com/browse/boardgamemechanic

## Written functions

#### UserCollection(user) :
Returns collection pulled from user's BGG collection url. <br>
Input: Username (as in url)<br>
Output: pandas DataFrame with Username, Game, User Game Rating, and BGG Game Rating

#### RatingThreshold(collection) : 
Returns curated collection that keeps only games with User or BGG rating > 7 <br>
Input: pandas DataFrame from UserCollection function<br>
Output: pandas DataFrame

#### games2vec(user_collection, games_dict) :
Returns user collection as a vector where each index is represented by a game as determined by game_dict<br>
Input: pandas DataFrame of user collection cleaned by RatingThreshold, games dictionary<br>
Output: numpy array

#### bgg_dict() :
Establishes the game dictionary (loaded from a csv scraped from ranked games of BGG)
Output: Dictionary with games to index, Dictionary with index to game name

#### generate_training(collection, games_dict) :
Returns user collection as two matrices. Each row of predictor and predicted matrix corresponds to a boardgame from user's collection (input as a games2vec vector). The predictor vector has all other games from the collection. Predicted vector has only the game being predicted.<br>
Input: numpy array from games2vec function<br>
Output: numpy matrix predictor, numpy matrix predicted

# Run this code

1) Import all libraries used

2) Define all functions (as described above)

3) Create Training Matrix from all user collections

4) Split data to Training and CV datasets

### Libraries and Functions

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import numpy as np
import math

In [2]:
def UserCollection(user):
    # Import necessary libraries
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd
    
    # Set up pd dataframe
    col = ['User','Game','User rating', 'BGG rating']
    games = pd.DataFrame(columns = col)
    
    # Pull webpage
    url = "https://boardgamegeek.com/collection/user/" + user + "?own=1&subtype=boardgame&ff=1"
    r = requests.get(url)

    data = r.text

    soup = BeautifulSoup(data, "lxml")
    
    # Get all the "Objects" that contain each boardgame in the collection.
    #Use the fact that the boardgame collection is the only one with the attribute "id".
    #All other tables on the page have no additional attributes
    all_games = soup.find_all(lambda tag:tag.name=='tr' and len(tag.attrs) > 0)
    
    for i in range(0,len(all_games)):
        # Start with 1 boardgame "Object"
        current_game = all_games[i]
        
        # To return Game name:
        #t is a bs4 tag
        #collection rating is stored under td tag, class = collection_objectname , a
        t_name = current_game.find_all('td', {'class':'collection_objectname '})[0]
        name = t_name.a.contents[0]
                

        # To return (user) Collection Rating:
        #t is a bs4 tag
        #collection rating is stored under td tag, class = collection_rating , div class=ratingtext
        t_UserRating = current_game.find_all('td',{'class':'collection_rating '})[0]
        t_UserRating = t_UserRating.find_all('div',{'class':'ratingtext'})

        # It's possible the user does not provide a rating, so we should check for that
        if t_UserRating:
            t_UserRating = t_UserRating[0]
            UserRating = t_UserRating.contents[0]
        else:
            UserRating = 'N/A'
        
        
        # To return BGG Rating:
        #t is a bs4 tag
        #collection rating is stored under td tag, class = collection_bggrating
        t_BggRating = current_game.find_all('td',{'class':'collection_bggrating'})

        #Can't call the td attribute for some reason. Use 'text' to pull the text string
        rate_text = t_BggRating[0].text

        #The text string includes \n and \t spacing. Get just the rating number (a float)
        #Find first index of a numerical digit
        #This can be done using the regular expression library 're'
        import re
        start = re.search("\d",rate_text) #\d re flag for any decimal digit [0-9]
        
        # It's possible bgg does not provide a rating, so we should check        
        if start is None: 
            #Start is currently set as a NoneType if regex does not find a numerical character
            start = rate_text.find('N') #Use standard str.find to locate index for "N/A"
        else:  
            #Regex does return an output
            start = re.search("\d",rate_text).start()

        #Find the end of the rating string
        end = rate_text.find('\t',start)

        #Get the decimal rating
        BggRating = rate_text[start:end]
        
        
        # Final Result
        print('User: ' + user)
        print('Game name: ' + name)
        print('User rating: ' + UserRating)
        print('BGG rating: ' + BggRating)
        print('\n')
        
        games = games.append(pd.DataFrame([[user,name, UserRating, BggRating]], columns=col))
    
    # Reset the index of the pd dataframe since the rows get appended all with index=0
    games.reset_index(drop=True, inplace=True)
    
    # Export collection as a csv for later
    games.to_csv(user+'_raw.csv',sep='\t')
    
    return games

In [3]:
#Need to rewrite Rating Threshold to work with the csv files.
#Since the csv files load in N/A values as NaN, we just need to change each
#N/A to isnull() checks

def RatingThreshold(collection):
    
    print('Collection Size start: ' + str(collection.shape[0]))
    
    # First remove any games that have no User Rating and no BGG rating
    na_idx = collection[(collection['User rating'].isnull()) & (collection['BGG rating'].isnull())].index.tolist() #Grab indices of N/A User and BGG rating rows
    collection.drop(collection.index[na_idx], inplace=True)
    
    print('Collection Size after removing N/A ratings: ' + str(collection.shape[0]))
    
    # Remove any games with User rating < 7
    #Ratings are stored as strings to account for N/A rating
    #Convert column to numeric using pandas to_numeric function
    #Set errors to 'coerce' which turns non-numeric strings to NaN values, which will work for numeric comparisons

    #Keep if User rating == N/A (for BGG based comparison) and User rating >= 7
    collection = collection[(collection['User rating'].isnull()) | (pd.to_numeric(collection['User rating'], errors='coerce') >= 7)]
    
    print('Collection Size after removing low User ratings: ' + str(collection.shape[0]))
    
    #Reset index
    collection.reset_index(drop=True, inplace=True)
    
    # Remove any games with BGG rating < 7 (if there is no User Rating)
    #Similarly, convert BGG rating column to_numeric
    #Find row indices where User rating == N/A and BGG rating < 7
    #Drop relevant rows from DataFrame
    low_bgg_idx = collection[(collection['User rating'].isnull()) & (pd.to_numeric(collection['BGG rating'], errors='coerce') < 7)].index.tolist()
    collection.drop(collection.index[low_bgg_idx],inplace=True)
    
    collection.reset_index(drop=True, inplace=True)
    
    print('Collection Size after removing low BGG ratings (if no User rating available): ' + str(collection.shape[0]))
    
    return collection

In [4]:
def games2vec(user_collection, games_dict):
    '''Take the user collection (game titles series) and game title dictionary as input.
    Output a vector representation of the collection.
    All mismatched game titles will be ignored.'''
    
    # Use numpy to generate the vector
    import numpy as np
    
    # Set up unpopulated vector of 0's
    collection = np.zeros((1,len(games_dict)))
    
    for i in range(0,len(user_collection)):
        current_game = user_collection.iloc[i]
        
        if current_game in games_dict:
            current_idx = games_dict[current_game]
            collection[0, current_idx] = 1
    
    return collection

In [5]:
def bgg_dict(min_rank=-1):
    '''Create dictionary of all ranked games on BGG to a number; the number is the index for the game used in the data sets.
    Also returns a reverse dictionary of number to game for decoding.
    Returns: Games dictionary, Games_reverse dictionary
    '''
       
    import pandas as pd
    # Create dictionary of all games. From previous webscraping, we have a csv with all ranked games on BGG. We'll limit our recommendations to games on the list.
    games_list = pd.read_csv('bgg id output.csv')

    # Remove all NaN rows
    games_list.dropna(axis=0,how='any',inplace=True)
    games_list.reset_index(drop=True,inplace=True)

    # There are repeat titles in the list. Remove them.
    rep_games_idx = games_list[games_list['Game'].duplicated()].index.tolist() #Returns the indices of all repeat titles. This list does NOT include the first appearance
    games_list.drop(games_list.index[rep_games_idx],inplace=True)
    games_list.reset_index(drop=True,inplace=True)

    # Get just the titles
    games = games_list['Game']

    # Create empty dictionary
    games_idx = dict()

    # Establish minimum game rank to consider; default -1 considers all games
    if min_rank == -1:
        min_rank = len(games)
    elif min_rank >= len(games):
        min_rank = len(games)
    else:
        min_rank = min_rank+1
    
    # The dictionary games_idx will use Game Titles as the key and a number as the value
    for i in range(0,min_rank):
        game_title = games[i]
        games_idx[game_title] = i

        
    games_reverse = dict((y,x) for x,y in games_idx.items())
    
    return games_idx, games_reverse

In [6]:
def generate_training(collection, games_dict):
    """Takes a user's collection as input along with the games dictionary.
    Outputs a matrix where each row is the training vector (predictor) for a game
    and another matrix where each row is the expected vector (predicted)"""
    
    import numpy as np
    
    # Find all indices in the collection array with value=1
    in_collection = np.nonzero(collection)[0] #returns index of nonzeros (aka value=1) of collection array

    # Create predictor vectors and predicted vectors
    #Iterate through every game in the collection, indices given by in_collection
    #Set value at the current index to 0 for predictor vector
    #Set a predicted vector to have 1 at the current index

    # Create two empty np arrays. #Rows = #Games in collection. #Columns = #Game Titles
    predictor = np.empty([len(in_collection), len(games_dict)])
    predicted = np.zeros([len(in_collection), len(games_dict)])

    for i in range(0,len(in_collection)):
        # Index of current game
        current_game = in_collection[i]

    #     # Create copy of the collection
    #     curr_predictor = np.copy(collection)
    #    # Set current boardgame value to 0 in predictor
    #    curr_predictor[0,current_game] = 0

        # Set values in current predictor row to collection
        predictor[i,:] = collection
        # Set current boardgame value to 0 in predictor
        predictor[i,current_game] = 0

        # Set current boardgame in predicted vector to 1    
        predicted[i,current_game] = 1
    
    return predictor, predicted

### Create Training Matrix: Aggregate all collections

In [120]:
# Number of games we are considering (from top rated)
total = 5000

# Get the game_list
games_list = pd.read_csv('bgg id output.csv')
#Remove all NaN rows
games_list.dropna(axis=0,how='any',inplace=True)
games_list.reset_index(drop=True,inplace=True)
#There are repeat titles in the list. Remove them.
rep_games_idx = games_list[games_list['Game'].duplicated()].index.tolist() #Returns the indices of all repeat titles. This list does NOT include the first appearance
games_list.drop(games_list.index[rep_games_idx],inplace=True)
games_list.reset_index(drop=True,inplace=True)
#Reduce to # of games to consider
games_list = games_list.loc[:total,:]

# Get the playerlist
playerlist = pd.read_excel('playerlist.xlsx')
# Generate the bgg dictionary of indices to game
games, games_decode = bgg_dict(total)

In [None]:
# Have a list for removed users (those whose collections fall below 1 game after cleanup)
removed_users = []

for i in range(0,len(playerlist)):
    # Get the user file
    #print(str(ct) + ': ' + str(user))
    user = playerlist['Username'][i]
    file = str(user) + '_raw.csv'
    user_collection = pd.read_csv(file, sep='\t', encoding='ISO-8859-1')
    #Drop the column 'Unnamed: 0' that is used to number the rows
    user_collection.drop('Unnamed: 0', inplace=True, axis=1)
    
    # Apply RatingThreshold
    user_collection_cleaned = RatingThreshold(user_collection)
    
    # Check if user has more than 1 game after cleaning the collection
    if user_collection_cleaned.shape[0] == 1:
        print('Less than 1 game added')
        removed_users = removed_users + [user_collection_cleaned.loc[0,'User']]
    else:
        # Vectorize the collection
        user_vector = games2vec(user_collection_cleaned['Game'], games)
        #Add user number
        user_vector = np.append([[i]], user_vector, axis=1)
        #Create row as a dataframe
        user_row = pd.DataFrame(user_vector, columns=list(['User']) + list(games_list['Game']))
        
        # Append to existing aggregate_collections dataframe
        if i == 0:
            aggregate_collections = user_row
        else:
            aggregate_collections = pd.concat([aggregate_collections, user_row])

aggregate_collections.to_csv('Coded user collections.csv',encoding='utf-8')     

In [114]:
aggregate_collections.shape

(211, 5002)

In [119]:
removed_users

[]

### Create Training Matrix: Split user matrix to inputs and outputs

In [20]:
# Get the aggregated collections list
aggregate_collections = pd.read_csv('Coded user collections.csv')
aggregate_collections.drop(['Unnamed: 0'], inplace=True, axis=1)

# Number of games we are considering (from top rated)
total = 5000

# Get the game_list
games_list = pd.read_csv('bgg id output.csv')
#Remove all NaN rows
games_list.dropna(axis=0,how='any',inplace=True)
games_list.reset_index(drop=True,inplace=True)
#There are repeat titles in the list. Remove them.
rep_games_idx = games_list[games_list['Game'].duplicated()].index.tolist() #Returns the indices of all repeat titles. This list does NOT include the first appearance
games_list.drop(games_list.index[rep_games_idx],inplace=True)
games_list.reset_index(drop=True,inplace=True)
#Reduce to # of games to consider
games_list = games_list.loc[:total,:]

# Get the playerlist
playerlist = pd.read_excel('playerlist.xlsx')
# Generate the bgg dictionary of indices to game
games, games_decode = bgg_dict(total)

In [52]:
#Keep track of how many training rows we will have
total_training_rows = 0

#Iterate through all users and convert collections into input vectors and expected_output vectors
for i in range(0, aggregate_collections['User'].shape[0]):
    #Use generate_training to create the input and expected output vectors
    user_input, user_exp_output = generate_training(aggregate_collections.iloc[i,1:].as_matrix(), games)
    
    #Append vectors into a matrix
    if i == 0:
        training_data = user_input
        exp_output_data = user_exp_output
    else:
        training_data = np.concatenate((training_data, user_input),axis=0)
        exp_output_data = np.concatenate((exp_output_data, user_exp_output), axis=0)
    
    #Update number of rows generated
    total_training_rows = total_training_rows + user_input.shape[0]

In [71]:
# For storage, store input and output stacked vertically in a csv
saving = np.concatenate((training_data, exp_output_data), axis=0)
saving = pd.DataFrame(saving, columns=list(games_list['Game']))
saving.to_csv('Vectorized bgg data.csv', encoding='utf-8')

#### RUN THIS CODE BELOW TO LOAD AND SPLIT TRAINING DATA

In [7]:
# Number of games we are considering (from top rated)
total = 5000
# Generate the bgg dictionary of indices to game
games, games_decode = bgg_dict(total)

In [8]:
# Load vectorized data
vectorized = pd.read_csv('Vectorized bgg data.csv')
vectorized.drop('Unnamed: 0', axis=1, inplace=True) #Remove unnecessary first column
row_size = int(vectorized.shape[0]/2) #The actual total datapoints is half the number of rows

# Split the data into the input and expected_output vectors
input_data = vectorized.iloc[:row_size,:].as_matrix()
exp_output_data = vectorized.iloc[row_size:,:].as_matrix()

print(input_data.shape)
print(exp_output_data.shape)

(8549, 5001)
(8549, 5001)


In [9]:
# Divide dataset into Training and Test data
from sklearn.model_selection import train_test_split

Train_inputs, CV_inputs, Train_outputs, CV_outputs = train_test_split(input_data, exp_output_data, test_size=0.25, random_state=101)

print('Training inputs: ' + str(Train_inputs.shape))
print('Training outputs: ' + str(Train_outputs.shape))
print('CV inputs: ' + str(CV_inputs.shape))
print('CV outputs: ' + str(CV_outputs.shape))

Training inputs: (6411, 5001)
Training outputs: (6411, 5001)
CV inputs: (2138, 5001)
CV outputs: (2138, 5001)


#### Some extra analysis

In [10]:
# See how many of the 5000 games are actually in collections
#Sum all the columns (of expected_outputs) and find how many have 0 mentions
mentions = exp_output_data.sum(axis=0)

In [14]:
# Game that appears the most
print(games_decode[mentions.argmax()] +': '+ str(mentions.max()))

Codenames: 77.0


In [15]:
# Get indices of games that are in and games that never appear in the user collections
not_in_collections_idx = np.where(mentions==0)[0]
in_collections_idx = np.nonzero(mentions)[0]

In [16]:
not_in_collections_idx[0]

53

In [17]:
in_collections_idx[0]

0

In [18]:
len(not_in_collections_idx)

3168

In [19]:
len(in_collections_idx)

1833

Out of the top 5000 games, 3168 games are not in any of the user collections. Indices of the games are stored in array not_in_collections_idx.

# Black box MultiLayer Perceptron

Just try sklearn's MLP regressor...?

Inspired by: https://towardsdatascience.com/the-perils-of-predictive-policing-11928a9f1d60

http://scikit-learn.org/stable/modules/neural_networks_supervised.html

https://en.wikipedia.org/wiki/Limited-memory_BFGS



In [21]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(300), #1 hidden layer of 300 neurons
                    solver = 'lbfgs', #Limited memory BFGS
                    max_iter = 300, #Converge by 300 iterations or optimization stops
                    random_state=1) 

In [22]:
mlp.fit(Train_inputs,Train_outputs)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=300, learning_rate='constant',
       learning_rate_init=0.001, max_iter=300, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [23]:
# Evaluate training error
mlp.score(Train_inputs, Train_outputs)

0.81547340508501009

In [24]:
# Save the model
from sklearn.externals import joblib
filename = 'bgg_rec_mlp.joblib.pkl'
_ = joblib.dump(mlp, filename, compress=9)

#### For future use, we can just load the model

In [26]:
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
filename = 'bgg_rec_mlp.joblib.pkl'

In [27]:
loaded_mlp = joblib.load(filename)

In [28]:
loaded_mlp.score(Train_inputs, Train_outputs)

0.81547340508501009

#### Let's try checking the CV dataset

In [29]:
loaded_mlp.score(CV_inputs, CV_outputs)

0.059401309635173059

Looks like the mlp does decently well with the training data set (score = 0.81) whereas it does terribly with the CV data set (score = 0.06).

Let's see if there's an affect of different numbers of neurons in the hidden layer

In [37]:
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from IPython.display import clear_output

neurons = [100, 300, 500, 1000]

col = ['#Neurons', 'Training Acc', 'CV Acc']
#mlp_results = pd.DataFrame(columns=col)

for n in neurons:
    mlp = MLPClassifier(hidden_layer_sizes = (n),
                       solver = 'lbfgs',
                       max_iter = 200,
                       random_state = 1)
    
    mlp.fit(Train_inputs, Train_outputs)
    
    trained_acc = mlp.score(Train_inputs, Train_outputs)
    CV_acc = mlp.score(CV_inputs, CV_outputs)
    
    mlp_results = mlp_results.append(pd.DataFrame([[n, trained_acc, CV_acc]], columns=col))
    
    clear_output(wait = True)
    print(mlp_results)
    
    filename = 'bgg_mlp_' + str(n) + 'neurons.joblib.pkl'
    _ = joblib.dump(mlp, filename, compress=9)

In [38]:
mlp_results

Unnamed: 0,#Neurons,Training Acc,CV Acc
0,100,0.119482,0.00608
0,300,0.434254,0.018241
0,500,0.433006,0.019177
0,1000,0.67041,0.055192


Let's also adjust how many iterations we need to converge

In [39]:
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from IPython.display import clear_output

iter_ct = [100, 300, 500, 1000]

col = ['Iter', 'Training Acc', 'CV Acc']
mlp_results_iter = pd.DataFrame(columns=col)

for i in iter_ct:
    mlp = MLPClassifier(hidden_layer_sizes = (300),
                       solver = 'lbfgs',
                       max_iter = i,
                       random_state = 1)
    
    mlp.fit(Train_inputs, Train_outputs)
    
    trained_acc = mlp.score(Train_inputs, Train_outputs)
    CV_acc = mlp.score(CV_inputs, CV_outputs)
    
    mlp_results_iter = mlp_results_iter.append(pd.DataFrame([[i, trained_acc, CV_acc]], columns=col))
    
    clear_output(wait = True)
    print(mlp_results_iter)
    
    filename = 'bgg_mlp_' + str(i) + 'iter.joblib.pkl'
    _ = joblib.dump(mlp, filename, compress=9)

   Iter  Training Acc    CV Acc
0  1000      0.000000  0.000000
0  1000      0.813914  0.058934
0  1000      0.976447  0.063611
0  1000      0.999844  0.065014


There was a bug in the previous run so the Iter# is incorrect. Reset it properly.

In [41]:
mlp_results_iter['Iter'].loc[:] = [100, 300, 500, 1000]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [42]:
mlp_results_iter

Unnamed: 0,Iter,Training Acc,CV Acc
0,100,0.0,0.0
0,300,0.813914,0.058934
0,500,0.976447,0.063611
0,1000,0.999844,0.065014


It seems more iterations allow the training accuracy to converge and it takes at least 500 to get above 90% accuracy. However, the CV accuracy seems to converge with 300 iteration cap whereas increasing number of neurons can increase the CV accuracy.

Let's try a 1000 neuron and 1500 neuron combo with 1000 iteration-cap.

In [11]:
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from IPython.display import clear_output

neurons = [1000, 1500]

col = ['#Neurons', 'Training Acc', 'CV Acc']
mlp_results = pd.DataFrame(columns=col)

for n in neurons:
    mlp = MLPClassifier(hidden_layer_sizes = (n),
                       solver = 'lbfgs',
                       max_iter = 1000,
                       random_state = 1)
    
    mlp.fit(Train_inputs, Train_outputs)
    
    trained_acc = mlp.score(Train_inputs, Train_outputs)
    CV_acc = mlp.score(CV_inputs, CV_outputs)
    
    mlp_results = mlp_results.append(pd.DataFrame([[n, trained_acc, CV_acc]], columns=col))
    
    clear_output(wait = True)
    print(mlp_results)
    
    filename = 'bgg_mlp_' + str(n) + 'neurons_1000iter.joblib.pkl'
    _ = joblib.dump(mlp, filename, compress=9)

  #Neurons  Training Acc    CV Acc
0     1000      0.999844  0.080917
0     1500      0.000000  0.000000


In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from IPython.display import clear_output

neurons = [1500]

col = ['#Neurons', 'Training Acc', 'CV Acc']
mlp_results = pd.DataFrame(columns=col)

for n in neurons:
    mlp = MLPClassifier(hidden_layer_sizes = (1500),
                       solver = 'lbfgs',
                       max_iter = 1000,
                       random_state = 1)
    
    mlp.fit(Train_inputs, Train_outputs)
    
    trained_acc = mlp.score(Train_inputs, Train_outputs)
    CV_acc = mlp.score(CV_inputs, CV_outputs)
    
    mlp_results = mlp_results.append(pd.DataFrame([[n, trained_acc, CV_acc]], columns=col))
    
    #clear_output(wait = True)
    print(mlp_results)
    
    #filename = 'bgg_mlp_' + str(n) + 'neurons_1000iter.joblib.pkl'
    #_ = joblib.dump(mlp, filename, compress=9)

  #Neurons  Training Acc  CV Acc
0     1500           0.0     0.0


Looks like with the scikit learn mlp, 1000 neurons with 1000 iterations performs great on Training but still has very low CV accuracy. The model is essentially overfitting the Training data set.

Furthermore, it looks like above 1000 neurons in the hidden layer, the model falls apart.

To try and account for overfitting, we can either obtain more training examples (which is not trivial) or reduce the number of features (which is possible; there are 1833 games in the top 5000 games that don't even show up in any collection).

# Word2Vec and Tensorflow implementation

https://www.tensorflow.org/tutorials/word2vec

http://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/

https://stackoverflow.com/questions/37394970/tensorflow-word2vec-cbow-model

https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw

As in examples of word2vec implementation, we have created a "one-hot" vector where each unique integer value (index) is assigned to a specific boardgame. We have the dictionary required to encode and decode boardgame collections and the corresponding output. And we already have our dataset with both collection inputs and the expected outputs for training.

Word2vec only has 1 hidden layer: a word embedding matrix.

The output layer uses the softmax function

https://gist.github.com/discorev/b6a0900a52b62cd04f33

https://gist.github.com/yxtay/a94d971955d901c4690129580a4eafb9

http://adventuresinmachinelearning.com/python-tensorflow-tutorial/

In [11]:
import tensorflow as tf

# Python optimization variables
learning_rate = 0.5
epochs = 10
batch_size = 100

# Training data placeholders
#input x will be an unknown # of training examples encoded as a vectors of length 13955 (boardgames)
x = tf.placeholder(tf.float32, [None, 13955]) 
#output placeholder will be # of predictions encoded as vectors of length 13955 (boardgames)
y = tf.placeholder(tf.float32, [None, 13955])

#weights connecting input to hidden layer
# tf.random_normal will generate values from a mean=0 and stddev=input
W1 = tf.Variable(tf.random_normal([13955,300], stddev=0.03), name = 'W1') #300 Neuronal layer
b1 = tf.Variable(tf.random_normal([300]), name = 'b1') #First layer bias

#weights connecting hidden layer to output layer
W2 = tf.Variable(tf.random_normal([300,13955],stddev=0.03), name = 'W2') #Convert back to 13955 output
b2 = tf.Variable(tf.random_normal([13955]), name = 'b2') #Second layer bias


# Calculations for the hidden layer
hidden_out = tf.add(tf.matmul(x, W1),b1) #matrix multiply x and W1 weights, add b1 bias
hidden_out = tf.nn.relu(hidden_out) #rectified linear unit activation function: converts all negative values to 0

# Calculate output layer with softmax
y_ = tf.nn.softmax(tf.add(tf.matmul(hidden_out, W2), b2)) #apply softmax after matrix multiply hidden layer output and W2 weights, and adding b2 bias


# Cost function: Cross Entropy
y_clipped = tf.clip_by_value(y_, 1e-10, 0.9999999) #Limits output values between 1e-10 and 0.9999999; prevents log(0) operations
cross_entropy = -tf.reduce_mean(tf.reduce_sum(y * tf.log(y_clipped) 
                                              + (1-y) * tf.log(1 - y_clipped), axis=1))

# Optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(cross_entropy)
#optimizer will minimize the cross entropy cost function and use the learning rate we set (0.5) as learning rate alpha

# Initialization operator
init_op = tf.global_variables_initializer()

# Define an accuracy assessment operation
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_, 1)) #Checks that predicted output is same as predicted
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) #Changes boolean correct_prediction to float, then finds mean. 1 = highest accuracy


In [None]:
# Start tensorflow session

with tf.Session() as sess:
    # initialize the variables
    sess.run(init_op)
    total_batch = int(len(<training labels>) / batch_size) #Use data set
    
    for epoch in range(epochs):
        avg_cost = 0
        for i in range(total_batch):
            batch_x, batch_y = <training group.nextbatch(batch_size=batch_size) #Use data set
            _, c = sess.run(optimizer, cross_entropy],
                           feed_dict = {x: batch_x, y: batch_y})
            avg_cost += c / total_batch
        
        print("Epoch: ", (epoch + 1), "cost = ", "{:.3f}".format(avg_cost))
        
    print(sess.run(accuracy, feed_dict={x: <training group decoded>, y:<training group coded>})) #Use data set

## Implementing Tensorflow Model


https://towardsdatascience.com/learn-word2vec-by-implementing-it-in-tensorflow-45641adaf2ac

In [1]:
import tensorflow as tf
import numpy as np

In [12]:
print('Training inputs: ' + str(Train_inputs.shape))
print('Training outputs: ' + str(Train_outputs.shape))
print('CV inputs: ' + str(CV_inputs.shape))
print('CV outputs: ' + str(CV_outputs.shape))

Training inputs: (5984, 5001)
Training outputs: (5984, 5001)
CV inputs: (2565, 5001)
CV outputs: (2565, 5001)


In [17]:
games
len(games_decode)

5001

In [92]:
#NEW VERSION
#Create tensorflow variables

# Make placeholders for x_train and y_train (x = data points, y = expected labels)
#Create placeholder architecture. shape = (None, len(games)) generates a matrix with unknown number of rows and len(games) columns
#This is the same as the representation of our data in vector form, with each individual example taking a row

batch_size = Train_inputs.shape[0]
col_size = len(games)
num_sampled = 30 #Number of negative examples to sample
EMBEDDING_DIM = 150 #Hyperparameter to be adjusted; # neurons of the hidden layer


def gamegram():
    x = tf.placeholder(tf.float32, shape = [batch_size, None])
    y_label = tf.placeholder(tf.int32, shape=[batch_size, None])
    #val_data = tf.constant(val_data,dtype=tf.int32)

    with tf.variable_scope("gamegram") as scope:
        embeddings = tf.Variable(tf.random_uniform([col_size,
                                                   EMBEDDING_DIM],
                                                  -1.0, 1.0))
        batch_embeddings = tf.nn.embedding_lookup(embeddings, tf.cast(x,tf.int32))

        weights = tf.Variable(tf.truncated_normal([col_size,
                                                  EMBEDDING_DIM],
                                                  stddev = 1.0/math.sqrt(EMBEDDING_DIM)))
        biases = tf.Variable(tf.zeros(col_size))

        loss = tf.reduce_mean(tf.nn.nce_loss(weights = weights,
                                            biases = biases,
                                            labels = y_label,
                                            inputs = x,
                                            num_sampled = num_sampled,
                                            num_classes = col_size,
                                            num_true = col_size))

        norm = tf.sqrt(tf.reduce_mean(tf.square(embeddings), 1, keep_dims = True))

        normalized_embeddings = embeddings/norm
        
#        val_embeddings = tf.nn.embedding_lookup(normalized_embeddings, val_dataset)
#        similarity = tf.matmul(val_embeddings, normalized_embeddings, transpose_b=True)
    
        return x, y_label, normalized_embeddings, loss #, similarity

In [93]:
def run():
    n_iters = 100
    
    x, y_label, normalized_embeddings, loss = gamegram()
    #    inputs, labels, normalized_embeddings, loss, similarity = gamegram()
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        sess.run(init)
        
        average_loss = 0.0
        
        # train for n_iter iterations
        #Recall: x = Train_inputs, y_label = Train_outputs
        for _ in range(n_iters):
            feed_dict = {x: Train_inputs, y_label: Train_outputs}
            _, loss_val = sess.run([optimizer, loss], feed_dict)
            average_loss += loss_val

            if step %1000 == 0:
                if step > 0:
                    average_loss /= 1000
                print('loss at iter', step, ':', average_loss)
                average_loss = 0        

        final_embeddings = normalized_embedding.eval()
        return final_embeddings
                    
#         for step, batch_data in enumerate(train_data): #need to change
#            # inputs, labels = batch_data #need to change
#            # feed_dict = {batch_inputs: inputs, batch_labels: labels}
            
#             _, loss_val = session.run([optimizer, loss], feed_dict)
#             average_loss += loss_val
            
#             if step %1000 == 0:
#                 if step > 0:
#                     average_loss /= 1000
#                 print('loss at iter', step, ':', average_loss)
#                 average_loss = 0

In [None]:
final_embeddings = run()

#visualize_embeddings(final_embeddings, games_decode)

In [28]:
#Create tensorflow variables

# Make placeholders for x_train and y_train (x = data points, y = expected labels)
#Create placeholder architecture. shape = (None, len(games)) generates a matrix with unknown number of rows and len(games) columns
#This is the same as the representation of our data in vector form, with each individual example taking a row

x = tf.placeholder(tf.int32, shape = (None, len(games)))
y_label = tf.placeholder(tf.int32, shape=(None, len(games)))


# Hidden layer calculation
EMBEDDING_DIM = 700 #Hyperparameter to be adjusted; # neurons of the hidden layer

W1 = tf.Variable(tf.random_normal([len(games), EMBEDDING_DIM])) #Weights

b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM])) #bias

hidden_representation = tf.add(tf.matmul(x,W1), b1) # x * W1 + b

# Output layer calculation
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, len(games)])) #Weights

b2 = tf.Variable(tf.random_normal([len(games)]))

prediction = tf.nn.softmax(tf.add( tf.matmul(hidden_representation,W2), b2))
#Apply softmax: Converts to normalized probability

In [29]:
W1.shape

TensorShape([Dimension(501), Dimension(700)])

In [12]:
sess = tf.Session()

init = tf.global_variables_initializer()

sess.run(init) #make sure you do this!

#cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))
cross_entropy_loss = tf.reduce_mean(tf.reduce_sum(y_label * tf.log(prediction + 1e-10) + (1-y_label) * tf.log(1 - prediction + 1e-10), axis=1))
sess.run(cross_entropy_loss, feed_dict={x: Train_inputs, y_label:Train_outputs})

#Added +1e-10 to softmax predictions to try and handle NaN cases. It seems super small values get cut out; happens with extremely sparse matrices.
#https://stackoverflow.com/questions/39583752/nan-from-sparse-softmax-cross-entropy-with-logits-in-tensorflow


-44.917133

In [30]:
# Train the model

sess = tf.Session()

init = tf.global_variables_initializer()

sess.run(init) #make sure you do this!

# Define loss function: Cross Entropy Loss function
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction + 1e-10), reduction_indices=[1]))
# Define training step
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)

n_iters = 100

# train for n_iter iterations
#Recall: x = Train_inputs, y_label = Train_outputs
for _ in range(n_iters):
    sess.run(train_step, feed_dict={x: Train_inputs, y_label: Train_outputs})
    print('loss is : ', sess.run(cross_entropy_loss, feed_dict={x: Train_inputs, y_label: Train_outputs}))    


loss is :  22.961
loss is :  22.9602
loss is :  22.9594
loss is :  22.9587
loss is :  22.958
loss is :  22.9575
loss is :  22.9572
loss is :  22.9571
loss is :  22.957
loss is :  22.9569
loss is :  22.9568
loss is :  22.9567
loss is :  22.9566
loss is :  22.9565
loss is :  22.9564
loss is :  22.9561
loss is :  22.9557
loss is :  22.9552
loss is :  22.9546
loss is :  22.9541
loss is :  22.9536
loss is :  22.9528
loss is :  22.9518
loss is :  22.9506
loss is :  22.9494
loss is :  22.9482
loss is :  22.9472
loss is :  22.9463
loss is :  22.9456
loss is :  22.9452
loss is :  22.9451
loss is :  22.9451
loss is :  22.9451
loss is :  22.9451
loss is :  22.9451
loss is :  22.9451
loss is :  22.945
loss is :  22.945
loss is :  22.945
loss is :  22.945
loss is :  22.945
loss is :  22.945
loss is :  22.945
loss is :  22.945
loss is :  22.945
loss is :  22.945
loss is :  22.945
loss is :  22.945
loss is :  22.945
loss is :  22.945
loss is :  22.9448
loss is :  22.9438
loss is :  22.9418
loss is : 

With all games in the vector and 700 hidden neurons, loss is about 23
With top 5000 games in the vector and 700 hidden neurons, loss is about 23.0141
With top 500 games in the vector and 700 hidden neurons, loss is about 22.9393

In [31]:
games

{'1775: Rebellion': 220,
 '1830: Railways & Robber Barons': 148,
 '1960: The Making of the President': 140,
 '1989: Dawn of Freedom': 337,
 '51st State: Master Set': 256,
 '7 Wonders': 37,
 '7 Wonders Duel': 8,
 'A Feast for Odin': 35,
 'A Few Acres of Snow': 207,
 'A Game of Thrones (first edition)': 281,
 'A Game of Thrones: The Board Game (Second Edition)': 73,
 'A Game of Thrones: The Card Game (Second Edition)': 254,
 'Above and Below': 126,
 'Abyss': 301,
 'Acquire': 185,
 'Advanced Squad Leader': 171,
 'Advanced Squad Leader: Starter Kit #1': 360,
 "Aeon's End": 375,
 'Age of Empires III: The Age of Discovery': 116,
 'Age of Industry': 397,
 'Age of Steam': 94,
 'Agricola': 14,
 'Agricola (revised edition)': 291,
 'Agricola: All Creatures Big and Small': 153,
 'Airlines Europe': 267,
 'Alchemists': 74,
 'Alhambra': 381,
 'Alien Frontiers': 132,
 'Amerigo': 250,
 'Among the Stars': 378,
 'Amun-Re': 235,
 'Anachrony': 139,
 'Android: Netrunner': 31,
 'Antiquity': 227,
 'AquaSphere

In [49]:
games['Spyfall']

318

In [59]:
me=pd.DataFrame(data=['7 Wonders','Pandemic','Codenames','Dixit', 'Pandemic Legacy: Season 1','One Night Ultimate Werewolf','The Resistance','Coup',
'Galaxy Trucker','Space Alert','Cosmic Encounter','Spyfall'], columns=['Games'])

In [61]:
my_collection = games2vec(me['Games'],games)

# STOP


## Original Code Drafts

Below are the original code drafts that led to development of the above 5 functions

In [1]:
# Import Requests package
import requests

# Pull BGG API

bgg = requests.get("https://www.boardgamegeek.com/xmlapi2/")
print(bgg.status_code)

# Status code:
#     200: everything went ok
#     301: server redirecting to a new endpoint
#     401: server does not recognize user as authenticated
#     400: bad request
#     403: access is forbidden
#     404: resource wasn't found on server

200


In [2]:
#Using XLMAPI original

# There are currently two XML API URLs of relevance:

# http://boardgamegeek.com/xmlapi/search
# http://boardgamegeek.com/xmlapi/game/[gameid]

# HAVE TO USE XMLAPI , not XMLAPI2

## Webscraping

In [3]:
# # Use Scrapy, installed via pip install scrapy 
# import scrapy

In [4]:
# import BGG_csvsetting

# # Build python subclass from the scrapy class Spider which will crawl through URls
# class BGGSpider(scrapy.Spider):
#     name = "bgg_spider"
#     home_url = 'https://www.boardgamegeek.com'
#     user = 'TomVasel'
#     start_urls = [home_url+'/collection/user/'+user+'?own=1&subtype=boardgame&ff=1']
#     print("Starting with: " + user)
    
#     custom_settings = {
#         'DOWNLOAD_DELAY': 2,  # 2 second delay
# #        'ITEM_PIPELINES': {'__main__.CSVExportPipeline': 1},
#         'FEED_EXPORTERS' : {'csv' : 'BGG_csvsetting.BGGCsvItemExporter'},
#         'FEED_FORMAT' : 'csv',
#         'CSV_DELIMITER' : ';',
#         'FEED_URI' : 'bgg userlist output.csv',
#         'FEED_EXPORT_FIELDS': ['User', 'BGG Rank','GameID']}
    
#     def parse(self, response):
#         # Define the selector (pattern) to find the relevant element on the page
#         # BGG starting URL shows a table of 100 games; the html elements are in the next cell        
#         gamelist_ID = response.xpath('//tr[@id = "row_"]//td[contains(@class,"collection_objectname")]//a[contains(@href, "/boardgame/")]/@href').extract()
#         gamelist_rank = response.xpath('//tr[@id = "row_"]//td[contains(@class,"collection_rank")]//a/@name').extract()
#         gamelist_name = response.xpath('//tr[@id = "row_"]//td[contains(@class,"collection_objectname")]//a[contains(@href, "/boardgame/")]/text()').extract()

#         #print(gamelist_name)
#         #print(gamelist_ID)
#         allIDs = {}

# #        for game in zip(gamelist_name, gamelist_ID):
#         if len(gamelist_rank) > 0:
#             for game in zip(gamelist_name[0:len(gamelist_rank)], gamelist_rank, gamelist_ID[0:len(gamelist_rank)]):    
#                 yield {
#                     'Game':game[0],
#                     'BGG Rank':game[1],
#                     'GameID':game[2].split("/")[2]
#                 }

#             # Go to next page
#             next_page = response.xpath('//a[contains(@title, "next page")]/@href').extract_first()
#             if next_page:
#                 home_url = 'https://www.boardgamegeek.com'
#                 print(["Going to: "+ home_url + next_page])
#                 yield scrapy.Request(
#                     response.urljoin([home_url+next_page][0]),
#                     callback = self.parse
#             )

## Using Beautiful Soup

In [5]:
# Import necessary libraries
from bs4 import BeautifulSoup
import requests

In [6]:
# Use beautifulsoup to pull the webpage as lxml format
user = ''
url = "https://boardgamegeek.com/collection/user/" + user + "?own=1&subtype=boardgame&ff=1"
r = requests.get(url)

data = r.text

soup = BeautifulSoup(data, "lxml")

In [7]:
# Get all the "Objects" that contain each boardgame in the collection.
#Use the fact that the boardgame collection is the only one with the attribute "id".
#All other tables on the page have no additional attributes
all_games = soup.find_all(lambda tag:tag.name=='tr' and len(tag.attrs) > 0)

In [8]:
# Start with 1 boardgame "Object"
#Pull out: Game, User Rating, and Avg Rating
current_game = all_games[13]

In [9]:
# To return Game name:
#t is a bs4 tag
#collection rating is stored under td tag, class = collection_objectname , a
t_name = current_game.find_all('td', {'class':'collection_objectname '})[0]
name = t_name.a.contents[0]

In [10]:
# To return (user) Collection Rating:
#t is a bs4 tag
#collection rating is stored under td tag, class = collection_rating , div class=ratingtext
t_UserRating = current_game.find_all('td',{'class':'collection_rating '})[0]
t_UserRating = t_UserRating.find_all('div',{'class':'ratingtext'})

# It's possible the user does not provide a rating, so we should check for that
if t_UserRating:
    t_UserRating = t_UserRating[0]
    UserRating = t_UserRating.contents[0]
else:
    UserRating = 'N/A'


In [11]:
# To return BGG Rating:
#t is a bs4 tag
#collection rating is stored under td tag, class = collection_bggrating
t_BggRating = current_game.find_all('td',{'class':'collection_bggrating'})

#Can't call the td attribute for some reason. Use 'text' to pull the text string
rate_text = t_BggRating[0].text

#The text string includes \n and \t spacing. Get just the rating number (a float)
#Find first index of a numerical digit
#This can be done using the regular expression library 're'
import re
start = re.search("\d",rate_text) #\d re flag for any decimal digit [0-9]

# It's possible bgg does not provide a rating, so we should check
if start is None: #Start is currently set as a NoneType if regex does not find a numerical character
    start = rate_text.find('N') #Use standard str.find to locate index for "N/A"
else:  #Regex does return an output
    start = re.search("\d",rate_text).start()

#Find the end of the rating string
end = rate_text.find('\t',start)

#Get the decimal rating
BggRating = rate_text[start:end]

In [None]:
# Final Result
print('User: ' + user)
print('Game name: ' + name)
print('User rating: ' + UserRating)
print('BGG rating: ' + BggRating)

# Setup up a function

Input is the username


Outputs user, game name, user rating, bgg rating

In [13]:
def UserCollection(user):
    # Import necessary libraries
    from bs4 import BeautifulSoup
    import requests
    
    # Pull webpage
    url = "https://boardgamegeek.com/collection/user/" + user + "?own=1&subtype=boardgame&ff=1"
    r = requests.get(url)

    data = r.text

    soup = BeautifulSoup(data, "lxml")
    
    # Get all the "Objects" that contain each boardgame in the collection.
    #Use the fact that the boardgame collection is the only one with the attribute "id".
    #All other tables on the page have no additional attributes
    all_games = soup.find_all(lambda tag:tag.name=='tr' and len(tag.attrs) > 0)
    
    for i in range(0,len(all_games)):
        # Start with 1 boardgame "Object"
        current_game = all_games[i]
        
        # To return Game name:
        #t is a bs4 tag
        #collection rating is stored under td tag, class = collection_objectname , a
        t_name = current_game.find_all('td', {'class':'collection_objectname '})[0]
        name = t_name.a.contents[0]
                

        # To return (user) Collection Rating:
        #t is a bs4 tag
        #collection rating is stored under td tag, class = collection_rating , div class=ratingtext
        t_UserRating = current_game.find_all('td',{'class':'collection_rating '})[0]
        t_UserRating = t_UserRating.find_all('div',{'class':'ratingtext'})

        # It's possible the user does not provide a rating, so we should check for that
        if t_UserRating:
            t_UserRating = t_UserRating[0]
            UserRating = t_UserRating.contents[0]
        else:
            UserRating = 'N/A'
        
        
        # To return BGG Rating:
        #t is a bs4 tag
        #collection rating is stored under td tag, class = collection_bggrating
        t_BggRating = current_game.find_all('td',{'class':'collection_bggrating'})

        #Can't call the td attribute for some reason. Use 'text' to pull the text string
        rate_text = t_BggRating[0].text

        #The text string includes \n and \t spacing. Get just the rating number (a float)
        #Find first index of a numerical digit
        #This can be done using the regular expression library 're'
        import re
        start = re.search("\d",rate_text) #\d re flag for any decimal digit [0-9]
        
        # It's possible bgg does not provide a rating, so we should check        
        if start is None: 
            #Start is currently set as a NoneType if regex does not find a numerical character
            start = rate_text.find('N') #Use standard str.find to locate index for "N/A"
        else:  
            #Regex does return an output
            start = re.search("\d",rate_text).start()

        #Find the end of the rating string
        end = rate_text.find('\t',start)

        #Get the decimal rating
        BggRating = rate_text[start:end]
        
        
        # Final Result
        print('User: ' + user)
        print('Game name: ' + name)
        print('User rating: ' + UserRating)
        print('BGG rating: ' + BggRating)
        print('\n')
    

One problem to work on: The table on BGG uses a built in "next page", so the url doesn't change when moving from games 1-300 and 301-600. This is problematic if a user has more than 300 boardgames.

Might have to use scrapy to move between tables...

## Setting up pandas dataframe

In [15]:
# Import pandas
import pandas as pd

In [16]:
# Set up a new dataframe with desired column headings:
#User
#Game
#User rating
#BGG rating
col = ['User','Game','User rating', 'BGG rating']
games = pd.DataFrame(columns = col)
games

Unnamed: 0,User,Game,User rating,BGG rating


In [None]:
# Try appending an example game
#Game values are passed as a list (in a list) with same col headings
#Append takes the new temporary DataFrame and appends it to the game DataFrame
games = games.append(pd.DataFrame([['TomVasel','Super Motherload','8','6.829']], columns=col))
games

In [None]:
# Try appending a second example game
games = games.append(pd.DataFrame([['TomVasel','Survive: Escape From Atlantis!','7.5','7.172']], columns=col))
games

In [19]:
# Reset the indices to 0 to n-1
#drop will prevent the old indices from becoming a new column
#inplace will make the changes to the games DataFrame
games.reset_index(drop=True,inplace=True)

In [None]:
games.head()

## Add dataframe structure to the function

In [21]:
def UserCollection(user):
    # Import necessary libraries
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd
    
    # Set up pd dataframe
    col = ['User','Game','User rating', 'BGG rating']
    games = pd.DataFrame(columns = col)
    
    # Pull webpage
    url = "https://boardgamegeek.com/collection/user/" + user + "?own=1&subtype=boardgame&ff=1"
    r = requests.get(url)

    data = r.text

    soup = BeautifulSoup(data, "lxml")
    
    # Get all the "Objects" that contain each boardgame in the collection.
    #Use the fact that the boardgame collection is the only one with the attribute "id".
    #All other tables on the page have no additional attributes
    all_games = soup.find_all(lambda tag:tag.name=='tr' and len(tag.attrs) > 0)
    
    for i in range(0,len(all_games)):
        # Start with 1 boardgame "Object"
        current_game = all_games[i]
        
        # To return Game name:
        #t is a bs4 tag
        #collection rating is stored under td tag, class = collection_objectname , a
        t_name = current_game.find_all('td', {'class':'collection_objectname '})[0]
        name = t_name.a.contents[0]
                

        # To return (user) Collection Rating:
        #t is a bs4 tag
        #collection rating is stored under td tag, class = collection_rating , div class=ratingtext
        t_UserRating = current_game.find_all('td',{'class':'collection_rating '})[0]
        t_UserRating = t_UserRating.find_all('div',{'class':'ratingtext'})

        # It's possible the user does not provide a rating, so we should check for that
        if t_UserRating:
            t_UserRating = t_UserRating[0]
            UserRating = t_UserRating.contents[0]
        else:
            UserRating = 'N/A'
        
        
        # To return BGG Rating:
        #t is a bs4 tag
        #collection rating is stored under td tag, class = collection_bggrating
        t_BggRating = current_game.find_all('td',{'class':'collection_bggrating'})

        #Can't call the td attribute for some reason. Use 'text' to pull the text string
        rate_text = t_BggRating[0].text

        #The text string includes \n and \t spacing. Get just the rating number (a float)
        #Find first index of a numerical digit
        #This can be done using the regular expression library 're'
        import re
        start = re.search("\d",rate_text) #\d re flag for any decimal digit [0-9]
        
        # It's possible bgg does not provide a rating, so we should check        
        if start is None: 
            #Start is currently set as a NoneType if regex does not find a numerical character
            start = rate_text.find('N') #Use standard str.find to locate index for "N/A"
        else:  
            #Regex does return an output
            start = re.search("\d",rate_text).start()

        #Find the end of the rating string
        end = rate_text.find('\t',start)

        #Get the decimal rating
        BggRating = rate_text[start:end]
        
        
        # Final Result
        print('User: ' + user)
        print('Game name: ' + name)
        print('User rating: ' + UserRating)
        print('BGG rating: ' + BggRating)
        print('\n')
        
        games = games.append(pd.DataFrame([[user,name, UserRating, BggRating]], columns=col))
    
    # Reset the index of the pd dataframe since the rows get appended all with index=0
    games.reset_index(drop=True, inplace=True)
    
    # Export collection as a csv for later
    games.to_csv(user+'_raw.csv',sep='\t')
    
    return games

# Let's clean up the DataSet

Only keep games that have a user rating > 7. If there is no user rating, keep the game if the avg BGG rating > 7.

In [24]:
# First remove any games that have no User Rating and no BGG rating
na_idx = TomVasel[(TomVasel['User rating'] == 'N/A') & (TomVasel['BGG rating'] == 'N/A')].index.tolist() #Grab indices of N/A User and BGG rating rows
TomVasel.drop(TomVasel.index[na_idx], inplace=True)

In [None]:
TomVasel

In [26]:
# Remove any games with User rating < 7
#Ratings are stored as strings to account for N/A rating
#Convert column to numeric using pandas to_numeric function
#Set errors to 'coerce' which turns non-numeric strings to NaN values, which will work for numeric comparisons

#Keep if User rating == N/A (for BGG based comparison) and User rating >= 7
TomVasel = TomVasel[(TomVasel['User rating'] == 'N/A') | (pd.to_numeric(TomVasel['User rating'], errors='coerce') >= 7)]

In [None]:
TomVasel

In [28]:
TomVasel.reset_index(drop=True, inplace=True)

In [None]:
TomVasel

In [30]:
# Remove any games with BGG rating < 7 (if there is no User Rating)
#Similarly, convert BGG rating column to_numeric
#Find row indices where User rating == N/A and BGG rating < 7
#Drop relevant rows from DataFrame
low_bgg_idx = TomVasel[(TomVasel['User rating'] == 'N/A') & (pd.to_numeric(TomVasel['BGG rating'], errors='coerce') < 7)].index.tolist()
TomVasel.drop(TomVasel.index[low_bgg_idx],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
# Final list are games that have either User or BGG rating with relevant rating >= 7
TomVasel

# Make the cleanup function

In [102]:
def RatingThreshold(collection):
    
    print('Collection Size start: ' + str(collection.shape[0]))
    
    # First remove any games that have no User Rating and no BGG rating
    na_idx = collection[(collection['User rating'] == 'N/A') & (collection['BGG rating'] == 'N/A')].index.tolist() #Grab indices of N/A User and BGG rating rows
    collection.drop(collection.index[na_idx], inplace=True)
    
    print('Collection Size after removing N/A ratings: ' + str(collection.shape[0]))
    
    # Remove any games with User rating < 7
    #Ratings are stored as strings to account for N/A rating
    #Convert column to numeric using pandas to_numeric function
    #Set errors to 'coerce' which turns non-numeric strings to NaN values, which will work for numeric comparisons

    #Keep if User rating == N/A (for BGG based comparison) and User rating >= 7
    collection = collection[(collection['User rating'] == 'N/A') | (pd.to_numeric(collection['User rating'], errors='coerce') >= 7)]
    
    print('Collection Size after removing low User ratings: ' + str(collection.shape[0]))
    
    # Remove any games with BGG rating < 7 (if there is no User Rating)
    #Similarly, convert BGG rating column to_numeric
    #Find row indices where User rating == N/A and BGG rating < 7
    #Drop relevant rows from DataFrame
    low_bgg_idx = collection[(collection['User rating'] == 'N/A') & (pd.to_numeric(collection['BGG rating'], errors='coerce') < 7)].index.tolist()
    collection.drop(collection.index[low_bgg_idx],inplace=True)
    
    collection.reset_index(drop=True, inplace=True)
    
    print('Collection Size after removing low BGG ratings (if no User rating available): ' + str(collection.shape[0]))
    
    return collection

In [None]:
TomVasel2 = UserCollection('TomVasel')

In [34]:
TomVasel2.shape

(300, 4)

In [None]:
TomVasel2 = RatingThreshold(TomVasel2)

In [None]:
TomVasel2

# Create "games2vec"

1) Create dictionary for entire BGG collection (ranked). Each game corresponds to an index in the vector.

2) For any collection, produce a collection vector (start with zeros)

3) Populate collection vector based on the input collection

# Create Game Dictionary

Pairs each boardgame title to an index for the final vector

In [37]:
# Create dictionary of all games. From previous webscraping, we have a csv with all ranked games on BGG. We'll limit our recommendations to games on the list.
games_list = pd.read_csv('bgg id output.csv')

In [38]:
# Remove all NaN rows
games_list.dropna(axis=0,how='any',inplace=True)
games_list.reset_index(drop=True,inplace=True)

In [39]:
# There are repeat titles in the list. Remove them.
rep_games_idx = games_list[games_list['Game'].duplicated()].index.tolist() #Returns the indices of all repeat titles. This list does NOT include the first appearance
games_list.drop(games_list.index[rep_games_idx],inplace=True)
games_list.reset_index(drop=True,inplace=True)

In [40]:
games_list

Unnamed: 0,Game,BGG Rank,GameID
0,Pandemic Legacy: Season 1,1.0,161936.0
1,Through the Ages: A New Story of Civilization,2.0,182028.0
2,Twilight Struggle,3.0,12333.0
3,Gloomhaven,4.0,174430.0
4,Star Wars: Rebellion,5.0,187645.0
5,Terra Mystica,6.0,120677.0
6,Terraforming Mars,7.0,167791.0
7,Scythe,8.0,169786.0
8,7 Wonders Duel,9.0,173346.0
9,The Castles of Burgundy,10.0,84876.0


In [41]:
games = games_list['Game']

In [42]:
games

0                               Pandemic Legacy: Season 1
1           Through the Ages: A New Story of Civilization
2                                       Twilight Struggle
3                                              Gloomhaven
4                                    Star Wars: Rebellion
5                                           Terra Mystica
6                                       Terraforming Mars
7                                                  Scythe
8                                          7 Wonders Duel
9                                 The Castles of Burgundy
10                              Caverna: The Cave Farmers
11                                            Puerto Rico
12                       War of the Ring (Second Edition)
13                                    Great Western Trail
14                                               Agricola
15                                 Mage Knight Board Game
16                                             Blood Rage
17            

In [43]:
# Create empty dictionary
games_idx = dict()

In [44]:
# The dictionary games_idx will use Game Titles as the key and a number as the value
for i in range(0,len(games)):
    game_title = games[i]
    games_idx[game_title] = i


In [45]:
games_idx

{'Pandemic Legacy: Season 1': 0,
 'Through the Ages: A New Story of Civilization': 1,
 'Twilight Struggle': 2,
 'Gloomhaven': 3,
 'Star Wars: Rebellion': 4,
 'Terra Mystica': 5,
 'Terraforming Mars': 6,
 'Scythe': 7,
 '7 Wonders Duel': 8,
 'The Castles of Burgundy': 9,
 'Caverna: The Cave Farmers': 10,
 'Puerto Rico': 11,
 'War of the Ring (Second Edition)': 12,
 'Great Western Trail': 13,
 'Agricola': 14,
 'Mage Knight Board Game': 15,
 'Blood Rage': 16,
 'Through the Ages: A Story of Civilization': 17,
 'Star Wars: Imperial Assault': 18,
 'Arkham Horror: The Card Game': 19,
 'Mansions of Madness: Second Edition': 20,
 'Mechs vs. Minions': 21,
 'Power Grid': 22,
 'Eclipse': 23,
 'Food Chain Magnate': 24,
 'Orléans': 25,
 'Robinson Crusoe: Adventures on the Cursed Island': 26,
 'Viticulture Essential Edition': 27,
 'Brass: Lancashire': 28,
 'Le Havre': 29,
 'Concordia': 30,
 'Android: Netrunner': 31,
 "Tzolk'in: The Mayan Calendar": 32,
 'T.I.M.E Stories': 33,
 'Codenames': 34,
 'A Fea

In [46]:
# Example showing it works. Tic-Tac-Toe is the last game in the list (rank 14221; index = 14220)
print("Tic-Tac-Toe index: " + str(games_idx['Tic-Tac-Toe']))

Tic-Tac-Toe index: 13954


In [47]:
len(games_idx)

13955

In [48]:
len(games)

13955

# Collection Vector

1) Generate an empty vector of zeros

2) Iterate through collection and populate the collection vector. 0 = not owned, 1 = owned

In [49]:
# Use numpy to generate the vector
import numpy as np

In [50]:
collection = np.zeros((1,len(games_idx)))

In [None]:
# Use TomVasel2 as example
TomVasel2

In [52]:
# Only need the game title column
TomVaselCollection = TomVasel2['Game']

In [53]:
# Iterate through the collection. For each game, set the collection vector value to 1
#Account for possibility that the game in the collection does not match a game in the Bgg ranked list. If so, skip it.
for i in range(0,len(TomVaselCollection)):
    current_game = TomVaselCollection.iloc[i]
    
    if current_game in games_idx:
        current_idx = games_idx[current_game]
        collection[0,current_idx] = 1
    

In [54]:
collection

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [55]:
games_idx['10 Days in Africa']

1511

In [56]:
collection[0,1511]

1.0

# Games2Vec function

In [57]:
def games2vec(user_collection, games_dict):
    '''Take the user collection (game titles series) and game title dictionary as input.
    Output a vector representation of the collection.
    All mismatched game titles will be ignored.'''
    
    # Use numpy to generate the vector
    import numpy as np
    
    # Set up unpopulated vector of 0's
    collection = np.zeros((1,len(games_dict)))
    
    for i in range(0,len(user_collection)):
        current_game = user_collection.iloc[i]
        
        if current_game in games_dict:
            current_idx = games_dict[current_game]
            collection[0, current_idx] = 1
    
    return collection

In [58]:
test = games2vec(TomVasel2['Game'],games_idx)

In [59]:
(test == collection).all() #Validate function output same as specific instant output

True

# Games Dictionary function

Just to have it so we don't have to hardcode it every time

In [60]:
def bgg_dict():
    import pandas as pd
    # Create dictionary of all games. From previous webscraping, we have a csv with all ranked games on BGG. We'll limit our recommendations to games on the list.
    games_list = pd.read_csv('bgg id output.csv')

    # Remove all NaN rows
    games_list.dropna(axis=0,how='any',inplace=True)
    games_list.reset_index(drop=True,inplace=True)

    # There are repeat titles in the list. Remove them.
    rep_games_idx = games_list[games_list['Game'].duplicated()].index.tolist() #Returns the indices of all repeat titles. This list does NOT include the first appearance
    games_list.drop(games_list.index[rep_games_idx],inplace=True)
    games_list.reset_index(drop=True,inplace=True)

    # Get just the titles
    games = games_list['Game']

    # Create empty dictionary
    games_idx = dict()

    # The dictionary games_idx will use Game Titles as the key and a number as the value
    for i in range(0,len(games)):
        game_title = games[i]
        games_idx[game_title] = i

    return games_idx

In [61]:
games_dict = bgg_dict()

# Create all predictor vectors and predicted vectors

Iterate through the collection vector and return all possible vectors. All other games in the collection will predict the existance of the current game.

For example, if a collection has [A, B, C, D], then [A,B,C] predicts [D], [B,C,D] predicts [A], etc.

In [62]:
collection

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [63]:
# Find all indices in the collection array with value=1
in_collection = np.nonzero(collection)[1] #returns index of nonzeros (aka value=1) of collection array

In [64]:
len(in_collection)

255

In [65]:
in_collection

array([   10,    11,    16,    18,    22,    25,    26,    29,    37,
          41,    42,    43,    44,    48,    49,    51,    57,    62,
          64,    66,    67,    71,    72,    74,    78,    80,    87,
          93,    99,   105,   108,   111,   114,   115,   117,   120,
         127,   132,   134,   141,   153,   159,   161,   165,   166,
         172,   173,   178,   193,   195,   200,   203,   206,   209,
         213,   232,   235,   237,   239,   242,   244,   245,   257,
         267,   270,   271,   275,   276,   286,   289,   298,   301,
         302,   303,   304,   309,   318,   320,   330,   333,   337,
         338,   340,   343,   347,   354,   358,   359,   372,   378,
         386,   403,   412,   440,   448,   452,   465,   466,   467,
         472,   478,   485,   508,   510,   530,   537,   566,   567,
         571,   580,   585,   590,   595,   604,   609,   610,   624,
         679,   681,   689,   693,   701,   706,   718,   723,   748,
         749,   757,

In [66]:
# Create predictor vectors and predicted vectors
#Iterate through every game in the collection, indices given by in_collection
#Set value at the current index to 0 for predictor vector
#Set a predicted vector to have 1 at the current index

# Create two empty np arrays. #Rows = #Games in collection. #Columns = #Game Titles
predictor = np.empty([len(in_collection), len(games_idx)])
predicted = np.zeros([len(in_collection), len(games_idx)])

for i in range(0,len(in_collection)):
    # Index of current game
    current_game = in_collection[i]
    
#     # Create copy of the collection
#     curr_predictor = np.copy(collection)
#    # Set current boardgame value to 0 in predictor
#    curr_predictor[0,current_game] = 0
    
    # Set values in current predictor row to collection
    predictor[i,:] = collection
    # Set current boardgame value to 0 in predictor
    predictor[i,current_game] = 0
    
    # Set current boardgame in predicted vector to 1    
    predicted[i,current_game] = 1

In [67]:
np.nonzero(predictor[3,:])[0]

array([   10,    11,    16,    22,    25,    26,    29,    37,    41,
          42,    43,    44,    48,    49,    51,    57,    62,    64,
          66,    67,    71,    72,    74,    78,    80,    87,    93,
          99,   105,   108,   111,   114,   115,   117,   120,   127,
         132,   134,   141,   153,   159,   161,   165,   166,   172,
         173,   178,   193,   195,   200,   203,   206,   209,   213,
         232,   235,   237,   239,   242,   244,   245,   257,   267,
         270,   271,   275,   276,   286,   289,   298,   301,   302,
         303,   304,   309,   318,   320,   330,   333,   337,   338,
         340,   343,   347,   354,   358,   359,   372,   378,   386,
         403,   412,   440,   448,   452,   465,   466,   467,   472,
         478,   485,   508,   510,   530,   537,   566,   567,   571,
         580,   585,   590,   595,   604,   609,   610,   624,   679,
         681,   689,   693,   701,   706,   718,   723,   748,   749,
         757,   763,

In [68]:
np.nonzero(predicted[3,:])

(array([18], dtype=int64),)

# Predictor and Predicted Array functions

In [69]:
def generate_training(collection, games_dict):
    """Takes a user's collection as input along with the games dictionary.
    Outputs a matrix where each row is the training vector (predictor) for a game
    and another matrix where each row is the expected vector (predicted)"""
    
    import numpy as np
    
    # Find all indices in the collection array with value=1
    in_collection = np.nonzero(collection)[1] #returns index of nonzeros (aka value=1) of collection array

    # Create predictor vectors and predicted vectors
    #Iterate through every game in the collection, indices given by in_collection
    #Set value at the current index to 0 for predictor vector
    #Set a predicted vector to have 1 at the current index

    # Create two empty np arrays. #Rows = #Games in collection. #Columns = #Game Titles
    predictor = np.empty([len(in_collection), len(games_dict)])
    predicted = np.zeros([len(in_collection), len(games_dict)])

    for i in range(0,len(in_collection)):
        # Index of current game
        current_game = in_collection[i]

    #     # Create copy of the collection
    #     curr_predictor = np.copy(collection)
    #    # Set current boardgame value to 0 in predictor
    #    curr_predictor[0,current_game] = 0

        # Set values in current predictor row to collection
        predictor[i,:] = collection
        # Set current boardgame value to 0 in predictor
        predictor[i,current_game] = 0

        # Set current boardgame in predicted vector to 1    
        predicted[i,current_game] = 1
    
    return predictor, predicted

In [70]:
# Test the function against manual code outputs
a, b = generate_training(collection, games_idx)

In [71]:
np.nonzero(a[3,:])[0]

array([   10,    11,    16,    22,    25,    26,    29,    37,    41,
          42,    43,    44,    48,    49,    51,    57,    62,    64,
          66,    67,    71,    72,    74,    78,    80,    87,    93,
          99,   105,   108,   111,   114,   115,   117,   120,   127,
         132,   134,   141,   153,   159,   161,   165,   166,   172,
         173,   178,   193,   195,   200,   203,   206,   209,   213,
         232,   235,   237,   239,   242,   244,   245,   257,   267,
         270,   271,   275,   276,   286,   289,   298,   301,   302,
         303,   304,   309,   318,   320,   330,   333,   337,   338,
         340,   343,   347,   354,   358,   359,   372,   378,   386,
         403,   412,   440,   448,   452,   465,   466,   467,   472,
         478,   485,   508,   510,   530,   537,   566,   567,   571,
         580,   585,   590,   595,   604,   609,   610,   624,   679,
         681,   689,   693,   701,   706,   718,   723,   748,   749,
         757,   763,

In [72]:
np.nonzero(b[3,:])

(array([18], dtype=int64),)

# Get some Collections

In [34]:
#UserCollection('')

# Convert Collections to single dataset

At this point, collected 211 users. Listed under playerlist.xlsx

1) Take list, iterate through every player

A) Clean up player collection with RatingThreshold(collection)

B) Convert to vector with games2vec(user_collection, bgg_dict). Use bgg_dict() to generate dictionary.

C) Create all the training samples using generate_training(collection, bgg_dict).

2) Store full training set as a loadable matrix.

In [18]:
#Need to rewrite Rating Threshold to work with the csv files.
#Since the csv files load in N/A values as NaN, we just need to change each
#N/A to isnull() checks

def RatingThreshold(collection):
    
    print('Collection Size start: ' + str(collection.shape[0]))
    
    # First remove any games that have no User Rating and no BGG rating
    na_idx = collection[(collection['User rating'].isnull()) & (collection['BGG rating'].isnull())].index.tolist() #Grab indices of N/A User and BGG rating rows
    collection.drop(collection.index[na_idx], inplace=True)
    
    print('Collection Size after removing N/A ratings: ' + str(collection.shape[0]))
    
    # Remove any games with User rating < 7
    #Ratings are stored as strings to account for N/A rating
    #Convert column to numeric using pandas to_numeric function
    #Set errors to 'coerce' which turns non-numeric strings to NaN values, which will work for numeric comparisons

    #Keep if User rating == N/A (for BGG based comparison) and User rating >= 7
    collection = collection[(collection['User rating'].isnull()) | (pd.to_numeric(collection['User rating'], errors='coerce') >= 7)]
    
    print('Collection Size after removing low User ratings: ' + str(collection.shape[0]))
    
    #Reset index
    collection.reset_index(drop=True, inplace=True)
    
    # Remove any games with BGG rating < 7 (if there is no User Rating)
    #Similarly, convert BGG rating column to_numeric
    #Find row indices where User rating == N/A and BGG rating < 7
    #Drop relevant rows from DataFrame
    low_bgg_idx = collection[(collection['User rating'].isnull()) & (pd.to_numeric(collection['BGG rating'], errors='coerce') < 7)].index.tolist()
    collection.drop(collection.index[low_bgg_idx],inplace=True)
    
    collection.reset_index(drop=True, inplace=True)
    
    print('Collection Size after removing low BGG ratings (if no User rating available): ' + str(collection.shape[0]))
    
    return collection

In [19]:
playerlist = pd.read_excel('playerlist.xlsx')

In [None]:
playerlist['Username'].iloc[-1]

In [21]:
games = bgg_dict()

In [22]:
len(games)

2

In [None]:
games = bgg_dict()[0]
#training_data = np.empty([1,len(games)])
#exp_output_data = np.empty([1,len(games)])
training_row = 0
ct = 1

print(training_row)

for user in playerlist['Username']:
    # Get the user file
    print(str(ct) + ': ' + str(user))
    file = str(user) + '_raw.csv'
    user_collection = pd.read_csv(file, sep='\t', encoding='ISO-8859-1')
    
    # Drop the column 'Unnamed: 0' that is used to number the rows
    user_collection.drop('Unnamed: 0', inplace=True, axis=1)
    
    # Apply RatingThreshold
    user_collection_cleaned = RatingThreshold(user_collection)
    
    
     # Check if user has more than 1 game after cleaning the collection
    if user_collection_cleaned.shape[0] == 1:
        print('Less than 1 game added')
    else:
        # Vectorize the collection
        user_vector = games2vec(user_collection_cleaned['Game'], games)

        # Generate training set
        user_input, user_exp_output = generate_training(user_vector,games)

        if ct == 1:
            training_data = user_input
            exp_output_data = user_exp_output
        else:
            # Append user's data to the master list
            training_data = np.concatenate((training_data, user_input),axis=0)
            exp_output_data = np.concatenate((exp_output_data, user_exp_output), axis=0)

        training_row = training_row + user_input.shape[0]
        print(user_input.shape[0])
        print(training_row)
        print(training_data.shape[0])

    ct = ct + 1
    print('\n')

In [28]:
print(exp_output_data.shape)
print(training_data.shape)
print(training_row)
print(len(games))

# Expect 8798 training samples from 211 user collections
# Bgg dictionary consists of 13955 ranked games

(8798, 13955)
(8798, 13955)
8798
13955


In [97]:
exp_output_data[0][10]



1.0

In [31]:
#First training sample expected output is game #10
print(np.where(exp_output_data[0] != 0)[0][0])

#This is the equivalent of Caverna: The Cave Farmers
games_decode = bgg_dict()[1]
print(games_decode[10])

10
Caverna: The Cave Farmers
