In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [53]:
# Load the cleaned ratings table. The preview below shows the columns
# boardgame_id, ratings and user_id, with one rating per row.
ratings = pd.read_csv('../data/ratings_cleaned.csv')

In [54]:
# Peek at the first rows to check the columns and value types.
ratings.head()

Unnamed: 0,boardgame_id,ratings,user_id
0,30549,10.0,341
1,822,10.0,341
2,13,10.0,341
3,68448,10.0,341
4,36218,10.0,341


## Create sparse User-Item-Matrix

In [55]:
# Sparse user-item matrix: row = user_id, column = boardgame_id, value = rating.
# The ids are used directly as positions, so each axis is as long as its
# largest id plus one.
# NOTE(review): csr_matrix sums values that share the same (row, col), so this
# assumes every user rated every game at most once - confirm in the cleaning step.
user_item = csr_matrix((ratings['ratings'], (ratings['user_id'], ratings['boardgame_id'])))

In [56]:
# shape = (max user_id + 1, max boardgame_id + 1), because ids are used as
# positions. The row count equals the number of users only if user ids are
# contiguous from 0; the column count is NOT the number of distinct games.
user_item.shape

(219703, 291458)

## Create Model

In [57]:
# Unsupervised nearest-neighbour search with cosine distance, so users are
# compared by the direction of their rating vectors rather than their length.
model = NearestNeighbors(metric='cosine')

In [58]:
# Fit on the sparse matrix; every row (user) becomes a candidate neighbour.
model.fit(user_item)

NearestNeighbors(metric='cosine')

In [59]:
# Persist the fitted model so it can be reloaded without refitting.
# NOTE: only unpickle this file from a trusted location - pickle.load can
# execute arbitrary code.
with open('../models/small_model.pickle', 'wb') as file:
    pickle.dump(model, file)

## Create User Vectors

### a. of an existing user

In [60]:
# Lookup table between user names and the numeric user_id used to filter
# `ratings` in the cells below.
users = pd.read_csv('../data/users.csv')
users.head(20)

Unnamed: 0,user_name,num_ratings,user_id
0,Doel,415,0
1,RonCarlson01,373,1
2,arsior,348,2
3,squaredeh,333,3
4,vincivader,327,4
5,Nap16,327,5
6,TomVasel,324,6
7,Hessu68,323,7
8,sinkon,317,8
9,JasonSaastad,317,9


In [61]:
# Resolve the numeric id for one user name. The trailing [0] takes the first
# match and raises IndexError when the name does not occur in `users`.
user_id = users[users['user_name']=='Ser0']['user_id'].tolist()[0]
user_id

50751

In [62]:
# All rating rows that belong to the selected user.
user = ratings[ratings['user_id']==user_id]
user

Unnamed: 0,boardgame_id,ratings,user_id
4916109,30549,5.0,50751
4916110,68448,8.0,50751
4916111,36218,6.0,50751
4916112,178900,7.0,50751
4916113,167791,9.0,50751
4916114,31260,6.0,50751
4916115,173346,8.0,50751
4916116,40692,7.0,50751
4916117,169786,7.0,50751
4916118,266192,9.0,50751


In [63]:
# Ids of the games this user has already rated; used further down to drop
# them from the neighbour-based suggestions.
user_boardgames = user['boardgame_id']

In [64]:
# One slot per possible boardgame_id (ids are used directly as positions), so
# the vector needs max(boardgame_id) + 1 entries to line up with the columns
# of `user_item`. Note that `vector_length` holds the largest id, not the length.
vector_length = ratings['boardgame_id'].max()
# Float zeros: ratings can be fractional (e.g. 6.5, 7.5). An integer array, as
# produced by np.repeat(0, ...), silently truncates them when they are assigned.
vector = np.zeros(vector_length + 1, dtype=float)

In [65]:
# Write each rating at the position given by its boardgame_id; all other
# positions keep their initial 0 ("not rated"). NumPy casts the assigned
# values to `vector`'s dtype.
vector[user['boardgame_id']] = user['ratings']

In [66]:
# Sanity check: the length should equal the number of columns of `user_item`.
vector.shape

(291458,)

## Find Neighbors

In [67]:
# Query the 10 rows of `user_item` closest to this rating vector. kneighbors
# expects a 2-D input, hence the one-element list. The returned indices are
# row positions in `user_item`, i.e. user ids, because the matrix was built
# with user_id as the row index.
distances, neighbor_ids = model.kneighbors([vector], n_neighbors=10)

In [68]:
# Neighbour row indices (= user ids), nearest first. In the run shown the
# first entry is the query user itself.
neighbor_ids

array([[ 50751,  71055,  46642, 181141,  15336,  77888,  51998,  10679,
         48627, 141415]])

In [69]:
# Cosine distances in the same order as `neighbor_ids`; 0 means the vectors
# point in exactly the same direction.
distances

array([[0.        , 0.47471421, 0.48195749, 0.48262405, 0.48475917,
        0.48910315, 0.49341273, 0.49505708, 0.49839764, 0.51460169]])

In [71]:
# Select every rating made by the neighbours. [1:] drops the first hit on the
# assumption that it is the query user itself (distance 0).
# NOTE(review): that only holds when the queried vector is a row of the fitted
# matrix and no other user ties at distance 0 - confirm before reusing this
# for vectors of users that are not in `user_item`.
neighbor_filter = ratings['user_id'].isin(neighbor_ids[0][1:])
ratings[neighbor_filter]

Unnamed: 0,boardgame_id,ratings,user_id
4072670,30549,6.5,48627
4072671,822,7.5,48627
4072672,13,8.0,48627
4072673,68448,8.0,48627
4072674,36218,7.5,48627
...,...,...,...
6852967,45315,5.0,71055
6852968,199042,9.0,71055
6852969,197376,5.0,71055
6852970,191189,8.0,71055


In [72]:
# Mean rating per game across the neighbours, best-rated first. Selecting the
# 'ratings' column before aggregating avoids computing (and then discarding)
# the mean of every other column, such as user_id.
neighbor_taste = (
    ratings[neighbor_filter]
    .groupby('boardgame_id')['ratings']
    .mean()
    .sort_values(ascending=False)
)
neighbor_taste

boardgame_id
132531    10.0
177736    10.0
185343    10.0
97207     10.0
209010    10.0
          ... 
2223       3.5
2453       3.0
1294       2.0
1406       1.0
2921       1.0
Name: ratings, Length: 183, dtype: float64

In [73]:
# Boolean mask over `neighbor_taste`: True where the game is NOT among the
# ones the user has rated. Despite its name, the mask keeps the unplayed games.
played_filter = ~neighbor_taste.index.isin(user_boardgames)
played_filter

array([ True,  True,  True,  True,  True, False,  True, False, False,
       False,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True,  True, False, False,  True, False, False, False,
        True, False,  True,  True, False, False,  True, False, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True, False,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True, False, False, False,  True,  True, False,  True, False,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True, False,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,

In [74]:
# Candidate suggestions: ids of games the user has not rated, still ordered
# by the neighbours' mean rating (highest first).
neighbor_taste[played_filter].index

Int64Index([132531, 177736, 185343,  97207, 209010, 161936,  96848, 229853,
             84876, 205896,
            ...
                50,  27588, 128882, 188834,   1927,   2223,   2453,   1294,
              1406,   2921],
           dtype='int64', name='boardgame_id', length=139)

## Find Top Categories, Mechanics etc.

In [140]:
# Basic game metadata; the preview below shows rank, name, id, links and
# num_voters.
boardgames = pd.read_csv('../data/boardgames.csv')
boardgames.shape

(45000, 5)

In [141]:
# Peek at the first rows of the basic game table.
boardgames.head()

Unnamed: 0,rank,name,id,links,num_voters
0,104.0,Pandemic,30549,/boardgame/30549/pandemic,104891
1,184.0,Carcassonne,822,/boardgame/822/carcassonne,104530
2,408.0,Catan,13,/boardgame/13/catan,103960
3,71.0,7 Wonders,68448,/boardgame/68448/7-wonders,86475
4,101.0,Dominion,36218,/boardgame/36218/dominion,79469


In [143]:
# Extended game metadata (categories, mechanics, family, ...). Per the preview
# below this file has no 'id' column; it is added from `boardgames` in a
# following cell.
boardgames_ext = pd.read_csv('../data/boardgames_extend_backup.csv')
boardgames_ext.shape

(45000, 11)

In [145]:
# Peek at the first rows of the extended table.
boardgames_ext.head()

Unnamed: 0,rank,name,links,num_voters,categories,mechanics,family,expansions,integrations,designers,publishers
0,104.0,Pandemic,/boardgame/30549/pandemic,104891,,,,,,,
1,184.0,Carcassonne,/boardgame/822/carcassonne,104530,,,,,,,
2,408.0,Catan,/boardgame/13/catan,103960,,,,,,,
3,71.0,7 Wonders,/boardgame/68448/7-wonders,86475,,,,,,,
4,101.0,Dominion,/boardgame/36218/dominion,79469,,,,,,,


In [146]:
# Copy the game ids over from `boardgames`. The assignment aligns on the row
# index of the two frames.
# NOTE(review): this assumes both CSVs list the games in the same order -
# confirm, e.g. by comparing the 'name' columns.
boardgames_ext['id'] = boardgames['id']

In [147]:
# Check that the new 'id' column lines up with the game names.
boardgames_ext.head()

Unnamed: 0,rank,name,links,num_voters,categories,mechanics,family,expansions,integrations,designers,publishers,id
0,104.0,Pandemic,/boardgame/30549/pandemic,104891,,,,,,,,30549
1,184.0,Carcassonne,/boardgame/822/carcassonne,104530,,,,,,,,822
2,408.0,Catan,/boardgame/13/catan,103960,,,,,,,,13
3,71.0,7 Wonders,/boardgame/68448/7-wonders,86475,,,,,,,,68448
4,101.0,Dominion,/boardgame/36218/dominion,79469,,,,,,,,36218


In [148]:
# Index the extended table by game id so games can be selected with .loc[ids].
# NOTE: this cell is not re-runnable as written - after the first run 'id' is
# the index and no longer a column, so running it again raises KeyError.
boardgames_ext = boardgames_ext.set_index('id')

In [149]:
# Frequency of each raw 'categories' string. Combinations such as
# 'Card Game, Number' are counted as written, not split into single labels.
boardgames_ext['categories'].value_counts()

Abstract Strategy                                                                                           4
Card Game                                                                                                   4
Economic                                                                                                    3
Adventure, Exploration, Fantasy, Fighting, Miniatures                                                       3
Card Game, Number                                                                                           3
                                                                                                           ..
Fantasy, Puzzle                                                                                             1
Deduction, Fighting, Nautical, Real-time                                                                    1
Card Game, City Building, Economic                                                                          1
Dice, Econ

In [156]:
def values_to_list(df, column_name, sep=', '):
    """Return the sorted, de-duplicated labels found in a delimited text column.

    Every non-null cell of ``df[column_name]`` is split on ``sep`` and the
    pieces from all rows are merged into one alphabetically sorted list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to scan.
    column_name : str
        Name of a column whose non-null cells are strings such as
        ``'Card Game, Fantasy'``.
    sep : str, default ', '
        Delimiter between the labels inside one cell.

    Returns
    -------
    list of str
        Unique labels in ascending order; empty when every cell is null.

    Notes
    -----
    A label that itself contains ``sep`` is broken into several pieces.
    NOTE(review): the mechanics encoding shown later in this notebook has a
    column named 'and Pool Building', which looks like such a fragment -
    confirm against the raw data and choose an unambiguous separator if so.
    """
    # A set de-duplicates while collecting, which avoids rebuilding a growing
    # list for every row.
    labels = set()
    for cell in df[column_name].dropna():
        labels.update(cell.split(sep))
    return sorted(labels)
        

In [243]:
def ohe_user_boardgames(user_name, column, weight=True):
    """Encode the games one user rated as one indicator column per label.

    Looks the user up in the notebook-level ``users`` frame, takes that user's
    rows from ``ratings``, selects the matching games from ``boardgames_ext``
    and turns the ', '-separated labels in ``column`` into one column per
    label.

    NOTE: this notebook defines a function with the same name again in a later
    cell; whichever definition was executed last is the one in effect.

    Parameters
    ----------
    user_name : str
        Value to match against ``users['user_name']``.
    column : str
        Label column of ``boardgames_ext`` to encode, e.g. ``'categories'``
        or ``'mechanics'``.
    weight : bool, default True
        If true, the cells for a game's labels hold the user's rating of that
        game; otherwise they hold 1.

    Returns
    -------
    pandas.DataFrame
        One row per rated game that has a non-null ``column`` (indexed by
        boardgame id) and one column per label in sorted order; 0 marks a
        label the game does not carry.

    Raises
    ------
    IndexError
        If ``user_name`` does not occur in ``users``.
    """
    matches = users.loc[users['user_name'] == user_name, 'user_id']
    if matches.empty:
        # Same exception type as the former `.tolist()[0]`, but it names the
        # value that could not be found.
        raise IndexError('no user named {!r} in users'.format(user_name))
    user_id = matches.iloc[0]

    user_ratings = ratings[ratings['user_id'] == user_id].set_index('boardgame_id')
    user_boardgames = boardgames_ext.loc[user_ratings.index]
    user_boardgames = user_boardgames[user_boardgames[column].notna()]

    # Build the vocabulary from the requested column (not a fixed one), so the
    # function works for any label column.
    labels = values_to_list(user_boardgames, column)
    # Label -> column position, so each lookup is a dict access instead of a
    # linear list search.
    position = {label: slot for slot, label in enumerate(labels)}

    games_ohe = {}
    for game_id, cell in user_boardgames[column].items():
        # The fill value is the same for every label of a game: look it up once.
        fill = user_ratings.loc[game_id, 'ratings'] if weight else 1
        game_vector = [0] * len(labels)
        for label in cell.split(', '):
            game_vector[position[label]] = fill
        games_ohe[game_id] = game_vector

    # The dict is keyed by game id, so DataFrame() puts games in columns;
    # transpose to get one row per game.
    df = pd.DataFrame(games_ohe).transpose()
    df.columns = labels
    return df

In [245]:
# Unweighted (0/1) category encoding for a single user.
ohe_user_boardgames('Ser0', 'categories', weight=False)

Unnamed: 0,Abstract Strategy,Adventure,Age of Reason,Ancient,Animals,Card Game,Children's Game,City Building,Civilization,Comic Book / Strip,...,Puzzle,Racing,Real-time,Science Fiction,Sports,Territory Building,Transportation,Wargame,Word Game,Zombies
2655,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
162886,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
320,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
237182,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
143884,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
205059,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
244521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
122522,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
50381,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199792,0,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [241]:
# Rank the user's categories: `weight` defaults to True, so each column sums
# the user's ratings of the games carrying that category; highest total first.
users_top_categories = ohe_user_boardgames('Ser0', 'categories').sum().sort_values(ascending=False)

In [242]:
# Show the ranked category totals.
users_top_categories

Fantasy                     78.0
Card Game                   56.0
Adventure                   34.0
Animals                     30.0
Fighting                    30.0
City Building               23.0
Humor                       23.0
Abstract Strategy           22.0
Science Fiction             22.0
Medieval                    22.0
Dice                        20.0
Party Game                  20.0
Miniatures                  19.0
Horror                      19.0
Industry / Manufacturing    18.0
Economic                    17.0
Transportation              17.0
Puzzle                      17.0
Mythology                   15.0
Nautical                    15.0
Ancient                     15.0
Environmental               15.0
Murder/Mystery              15.0
Territory Building          14.0
Real-time                   14.0
Exploration                 14.0
Expansion for Base-game     13.0
Deduction                   12.0
Children's Game              9.0
Sports                       9.0
Educationa

In [257]:
def ohe_user_boardgames(user_name, column, weight=True):
    """Encode the games one user rated as one indicator column per label.

    Looks the user up in the notebook-level ``users`` frame, takes that user's
    rows from ``ratings``, selects the matching games from ``boardgames_ext``
    and turns the ', '-separated labels in ``column`` into one column per
    label.

    NOTE: this replaces the definition of the same name made in an earlier
    cell of this notebook.

    Parameters
    ----------
    user_name : str
        Value to match against ``users['user_name']``.
    column : str
        Label column of ``boardgames_ext`` to encode, e.g. ``'categories'``
        or ``'mechanics'``.
    weight : bool, default True
        If true, the cells for a game's labels hold the user's rating of that
        game; otherwise they hold 1.

    Returns
    -------
    pandas.DataFrame
        One row per rated game that has a non-null ``column`` (indexed by
        boardgame id) and one column per label in sorted order; 0 marks a
        label the game does not carry.

    Raises
    ------
    IndexError
        If ``user_name`` does not occur in ``users``.
    """
    matches = users.loc[users['user_name'] == user_name, 'user_id']
    if matches.empty:
        # Same exception type as the former `.tolist()[0]`, but it names the
        # value that could not be found.
        raise IndexError('no user named {!r} in users'.format(user_name))
    user_id = matches.iloc[0]

    user_ratings = ratings[ratings['user_id'] == user_id].set_index('boardgame_id')
    user_boardgames = boardgames_ext.loc[user_ratings.index]
    user_boardgames = user_boardgames[user_boardgames[column].notna()]

    labels = values_to_list(user_boardgames, column)
    # Label -> column position, so each lookup is a dict access instead of a
    # linear list search.
    position = {label: slot for slot, label in enumerate(labels)}

    games_ohe = {}
    for game_id, cell in user_boardgames[column].items():
        # The fill value is the same for every label of a game: look it up once.
        fill = user_ratings.loc[game_id, 'ratings'] if weight else 1
        game_vector = [0] * len(labels)
        for label in cell.split(', '):
            game_vector[position[label]] = fill
        games_ohe[game_id] = game_vector

    # The dict is keyed by game id, so DataFrame() puts games in columns;
    # transpose to get one row per game.
    df = pd.DataFrame(games_ohe).transpose()
    df.columns = labels
    return df

In [258]:
# Rating-weighted mechanics encoding for the same user: each non-zero cell
# holds the user's rating of that game.
ohe_user_boardgames('Ser0', 'mechanics', weight=True)

Unnamed: 0,Action Points,Action Queue,Action Retrieval,Area Majority / Influence,Area Movement,Bag,Bias,Bingo,Campaign / Battle Card Driven,Card Drafting,...,Tech Trees / Tech Tracks,Three Dimensional Movement,Tile Placement,Traitor Game,Turn Order: Progressive,Turn Order: Stat-Based,Variable Player Powers,Variable Set-up,Worker Placement,and Pool Building
2655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162886,0.0,0.0,7.0,7.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0
320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
237182,0.0,6.0,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6.0,6.0,0.0,0.0
143884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
205059,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.0,0.0,0.0,9.0,0.0,0.0,0.0
244521,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,9.0
122522,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0
50381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,10.0,0.0


In [264]:
# Names of the five mechanics with the highest rating-weighted totals for
# this user (column sums of the encoding, sorted descending, first five kept).
users_top_mechanics = ohe_user_boardgames('Ser0', 'mechanics').sum().sort_values(ascending=False)[:5].index.tolist()
users_top_mechanics

['Hand Management',
 'Card Drafting',
 'Solo / Solitaire Game',
 'Dice Rolling',
 'Variable Set-up']