In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [2]:
ratings = pd.read_csv('../data/ratings_cleaned.csv')

In [3]:
ratings.head()

Unnamed: 0,boardgame_id,ratings,user_id
0,30549,10.0,214
1,822,10.0,214
2,13,10.0,214
3,68448,10.0,214
4,36218,10.0,214


## Create sparse User-Item-Matrix

In [4]:
user_item = csr_matrix((ratings['ratings'], (ratings['user_id'], ratings['boardgame_id'])))

In [5]:
#shape = (number of users, max boardgame_id)
user_item.shape

(265863, 340910)

## Create Model

In [6]:
model = NearestNeighbors(metric='cosine')

In [7]:
model.fit(user_item)

NearestNeighbors(metric='cosine')

In [8]:
with open('../models/knn_model_cosine.pickle', 'wb') as file:
    pickle.dump(model, file)

## Create User Vectors

### a. of an existing user

In [9]:
users = pd.read_csv('../data/users.csv')
users.head(20)

Unnamed: 0,user_name,num_ratings,user_id
0,Walt Mulder,8675,0
1,Doel,8599,1
2,TomVasel,7052,2
3,warta,6902,3
4,jmdsplotter,6210,4
5,leffe dubbel,6131,5
6,Nap16,6097,6
7,Hessu68,5689,7
8,loopoocat,5663,8
9,oldgoat3769967,5299,9


In [10]:
user_id = users[users['user_name']=='Ser0']['user_id'].tolist()[0]
user_id

40701

In [11]:
user = ratings[ratings['user_id']==user_id]
user

Unnamed: 0,boardgame_id,ratings,user_id
13495904,30549,5.0,40701
13495905,68448,8.0,40701
13495906,36218,6.0,40701
13495907,178900,7.0,40701
13495908,167791,9.0,40701
...,...,...,...
13496026,303734,5.0,40701
13496027,42636,6.0,40701
13496028,165984,6.0,40701
13496029,135213,9.0,40701


In [12]:
user_boardgames = user['boardgame_id']

In [13]:
vector_length = ratings['boardgame_id'].max()
vector = np.repeat(0, vector_length+1)

In [14]:
vector[user['boardgame_id']] = user['ratings']

In [15]:
vector.shape

(340910,)

## Find Neighbors

In [16]:
distances, neighbor_ids = model.kneighbors([vector], n_neighbors=10)

In [17]:
neighbor_ids

array([[ 40701,  88695,  73514, 221205,  74607, 163111,  78724, 127072,
        123549, 161858]])

In [18]:
distances

array([[0.        , 0.67720592, 0.71442184, 0.72092662, 0.72340518,
        0.72377673, 0.72456408, 0.72643922, 0.72915679, 0.72969398]])

In [19]:
neighbor_filter = ratings['user_id'].isin(neighbor_ids[0][1:])
ratings[neighbor_filter]

Unnamed: 0,boardgame_id,ratings,user_id
6004886,30549,8.0,73514
6004887,822,8.0,73514
6004888,13,6.5,73514
6004889,68448,6.0,73514
6004890,36218,7.5,73514
...,...,...,...
19160361,281259,9.0,161858
19160362,251247,9.0,161858
19160363,184267,9.0,161858
19160364,283155,9.0,161858


In [20]:
neighbor_taste = ratings[neighbor_filter].groupby('boardgame_id').mean()['ratings'].sort_values(ascending=False)
neighbor_taste

boardgame_id
188       10.0
463       10.0
246900    10.0
3076      10.0
242277    10.0
          ... 
320        5.0
271264     5.0
180263     5.0
223779     3.0
2921       1.0
Name: ratings, Length: 220, dtype: float64

In [21]:
played_filter = ~neighbor_taste.index.isin(user_boardgames)
played_filter

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True, False,  True, False,
        True, False, False, False,  True,  True,  True, False,  True,
        True, False,  True,  True, False, False, False, False, False,
        True, False,  True, False,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True, False, False,
        True, False, False,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True, False,  True, False,  True, False,  True, False, False,
        True,  True,  True,  True,  True,  True,  True, False,  True,
       False,  True, False,  True, False,  True,  True,  True, False,
       False,  True,

In [22]:
neighbor_taste[played_filter].index

Int64Index([   188,    463, 246900,   3076, 242277, 269385, 201808, 184267,
            126163, 232414,
            ...
              2719,  40398, 234877, 260300,   2223, 205896, 271264, 180263,
            223779,   2921],
           dtype='int64', name='boardgame_id', length=169)

## Find Top Categories, Mechanics etc.

In [140]:
boardgames = pd.read_csv('../data/boardgames.csv')
boardgames.shape

(45000, 5)

In [141]:
boardgames.head()

Unnamed: 0,rank,name,id,links,num_voters
0,104.0,Pandemic,30549,/boardgame/30549/pandemic,104891
1,184.0,Carcassonne,822,/boardgame/822/carcassonne,104530
2,408.0,Catan,13,/boardgame/13/catan,103960
3,71.0,7 Wonders,68448,/boardgame/68448/7-wonders,86475
4,101.0,Dominion,36218,/boardgame/36218/dominion,79469


In [23]:
boardgames_ext = pd.read_csv('../data/boardgames_extend_backup.csv')
boardgames_ext.shape

(45000, 12)

In [24]:
boardgames_ext.head()

Unnamed: 0,id,rank,name,links,num_voters,categories,mechanics,family,expansions,integrations,designers,publishers
0,30549,104.0,Pandemic,/boardgame/30549/pandemic,104891,Medical,"Action Points, Cooperative Game, Hand Manageme...","Components: Map (Global Scale), Components: Mu...",Pandemic: Gen Con 2016 Promos – Z-Force Team M...,,Matt Leacock,"Z-Man Games, Inc., (Unknown), Albi, Asmodee, A..."
1,822,184.0,Carcassonne,/boardgame/822/carcassonne,104530,"City Building, Medieval, Territory Building","Area Majority / Influence, Map Addition, Tile ...","Cities: Carcassonne (France), Components: Meep...","20 Jahre Darmstadt Spielt, Apothecaries (fan e...",Carcassonne: Wheel of Fortune,Klaus-Jürgen Wrede,"Hans im Glück, 999 Games, Albi, Bard Centrum G..."
2,13,408.0,Catan,/boardgame/13/catan,103960,"Economic, Negotiation","Dice Rolling, Hexagon Grid, Income, Modular Bo...","Animals: Sheep, Components: Hexagonal Tiles, C...","20 Jahre Darmstadt Spielt, Brettspiel Adventsk...",,Klaus Teuber,"KOSMOS, 999 Games, Albi, Astrel Games, Bergsal..."
3,68448,71.0,7 Wonders,/boardgame/68448/7-wonders,86475,"Ancient, Card Game, City Building, Civilizatio...","Drafting, Hand Management, Set Collection, Sim...","Digital Implementations: Board Game Arena, Gam...","7 Wonders: Armada, 7 Wonders: Babel, 7 Wonders...",,Antoine Bauza,"Repos Production, ADC Blackfire Entertainment,..."
4,36218,101.0,Dominion,/boardgame/36218/dominion,79469,"Card Game, Medieval","Deck, Bag, and Pool Building, Delayed Purchase...","Crowdfunding: Wspieram, Game: Dominion, Misc: ...","Ancient Times (fan expansion for Dominion), An...",Dominion: Intrigue,Donald X. Vaccarino,"Rio Grande Games, 999 Games, Albi, Bard Centru..."


In [146]:
boardgames_ext['id'] = boardgames['id']

In [147]:
boardgames_ext.head()

Unnamed: 0,rank,name,links,num_voters,categories,mechanics,family,expansions,integrations,designers,publishers,id
0,104.0,Pandemic,/boardgame/30549/pandemic,104891,,,,,,,,30549
1,184.0,Carcassonne,/boardgame/822/carcassonne,104530,,,,,,,,822
2,408.0,Catan,/boardgame/13/catan,103960,,,,,,,,13
3,71.0,7 Wonders,/boardgame/68448/7-wonders,86475,,,,,,,,68448
4,101.0,Dominion,/boardgame/36218/dominion,79469,,,,,,,,36218


In [148]:
boardgames_ext = boardgames_ext.set_index('id')

In [149]:
boardgames_ext['categories'].value_counts()

Abstract Strategy                                                                                           4
Card Game                                                                                                   4
Economic                                                                                                    3
Adventure, Exploration, Fantasy, Fighting, Miniatures                                                       3
Card Game, Number                                                                                           3
                                                                                                           ..
Fantasy, Puzzle                                                                                             1
Deduction, Fighting, Nautical, Real-time                                                                    1
Card Game, City Building, Economic                                                                          1
Dice, Econ

In [156]:
def values_to_list(df, column_name):
    categories = []
    for i in df[df[column_name].notna()].iterrows():
        categories = categories + i[1][column_name].split(', ')
    categories = list(dict.fromkeys(categories))
    categories.sort()
    return categories
        

In [276]:
def ohe_user_boardgames(user_name, column, weight=True):
    games_ohe={}
    user_id = users[users['user_name']==user_name]['user_id'].tolist()[0]
    user_ratings = ratings[ratings['user_id']==user_id].set_index('boardgame_id')
    user_boardgames = boardgames_ext.loc[user_ratings.index]
    user_boardgames = user_boardgames[user_boardgames[column].notna()]
    user_categories = values_to_list(user_boardgames, column)    
    for i in user_boardgames.iterrows():
        game_vector = [0]*len(user_categories)
        for c in i[1][column].split(', '):
            index = user_categories.index(c)
            if weight == True:
                game_vector[index]=1 * user_ratings.loc[i[0]]['ratings']
            else: 
                game_vector[index]=1
        games_ohe[i[0]] = game_vector
    df = pd.DataFrame(games_ohe)
    df = df.transpose()
    df.columns = user_categories
    return df

In [277]:
for row in ohe_user_boardgames('Ser0', 'mechanics', weight=False).iterrows():
    print(row[1]['name'],row[1]['mechanics'])

Hive Enclosure, Grid Movement, Hexagon Grid, Pieces as Map, Slide/Push, Three Dimensional Movement, Tile Placement
Spirit Island Action Retrieval, Area Majority / Influence, Campaign / Battle Card Driven, Cooperative Game, Events, Hand Management, Modular Board, Set Collection, Simultaneous Action Selection, Solo / Solitaire Game, Variable Player Powers
Scrabble End Game Bonuses, Hand Management, Square Grid, Tile Placement
Root Action Queue, Action Retrieval, Area Majority / Influence, Area Movement, Dice Rolling, Hand Management, Point to Point Movement, Race, Variable Player Powers, Variable Set-up
Machi Koro Dice Rolling, Random Production
Mansions of Madness: Second Edition Area Movement, Cooperative Game, Dice Rolling, Hand Management, Modular Board, Role Playing, Scenario / Mission / Campaign Game, Solo / Solitaire Game, Stat Check Resolution, Team-Based Game, Traitor Game, Variable Player Powers
The Quacks of Quedlinburg Catch the Leader, Deck, Bag, and Pool Building, Dice Roll

In [241]:
users_top_categories = ohe_user_boardgames('Ser0', 'categories').sum().sort_values(ascending=False)

In [242]:
users_top_categories

Fantasy                     78.0
Card Game                   56.0
Adventure                   34.0
Animals                     30.0
Fighting                    30.0
City Building               23.0
Humor                       23.0
Abstract Strategy           22.0
Science Fiction             22.0
Medieval                    22.0
Dice                        20.0
Party Game                  20.0
Miniatures                  19.0
Horror                      19.0
Industry / Manufacturing    18.0
Economic                    17.0
Transportation              17.0
Puzzle                      17.0
Mythology                   15.0
Nautical                    15.0
Ancient                     15.0
Environmental               15.0
Murder/Mystery              15.0
Territory Building          14.0
Real-time                   14.0
Exploration                 14.0
Expansion for Base-game     13.0
Deduction                   12.0
Children's Game              9.0
Sports                       9.0
Educationa

In [257]:
def ohe_user_boardgames(user_name, column, weight=True):
    games_ohe={}
    user_id = users[users['user_name']==user_name]['user_id'].tolist()[0]
    user_ratings = ratings[ratings['user_id']==user_id].set_index('boardgame_id')
    user_boardgames = boardgames_ext.loc[user_ratings.index]
    user_boardgames = user_boardgames[user_boardgames[column].notna()]
    user_categories = values_to_list(user_boardgames, column)    
    for i in user_boardgames.iterrows():
        game_vector = [0]*len(user_categories)
        for c in i[1][column].split(', '):
            index = user_categories.index(c)
            if weight == True:
                game_vector[index]=1 * user_ratings.loc[i[0]]['ratings']
            else: 
                game_vector[index]=1
        games_ohe[i[0]] = game_vector
    df = pd.DataFrame(games_ohe)
    df = df.transpose()
    df.columns = user_categories
    return df

In [266]:
sero = ohe_user_boardgames('Ser0', 'mechanics', weight=True)['Hand Management']

In [267]:
sero

2655       0.0
162886     7.0
320        6.0
237182     6.0
143884     0.0
205059     9.0
244521     0.0
122522     4.0
50381      5.0
199792    10.0
172225     7.0
224517    10.0
21790      7.0
218603     0.0
209778     0.0
147151     0.0
160477     7.0
194594     0.0
239188     0.0
51811      7.0
180974     0.0
167355     0.0
66690      6.0
217372     9.0
63268      0.0
91536      0.0
263918     0.0
45315      7.0
247763     8.0
Name: Hand Management, dtype: float64

In [268]:
sero['boardgame_name'] = boardgames_ext.loc[sero.index]['name']

In [269]:
sero

2655                                                              0
162886                                                            7
320                                                               6
237182                                                            6
143884                                                            0
205059                                                            9
244521                                                            0
122522                                                            4
50381                                                             5
199792                                                           10
172225                                                            7
224517                                                           10
21790                                                             7
218603                                                            0
209778                                          

In [264]:
users_top_mechanics = ohe_user_boardgames('Ser0', 'mechanics').sum().sort_values(ascending=False)[:5].index.tolist()
users_top_mechanics

['Hand Management',
 'Card Drafting',
 'Solo / Solitaire Game',
 'Dice Rolling',
 'Variable Set-up']