In [1]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# Data science
import pandas as pd
import numpy as np

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# ML Helpers/Metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve

# Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the 'games' dataset, then preview its first rows ordered alphabetically by name.
df_games = pd.read_csv("data/games.csv")
(
    df_games
    .sort_values("name", ignore_index=True)  # ascending order is the default
    .head()
)

Unnamed: 0,id,type,name,yearpublished,minplayers,maxplayers,playingtime,minplaytime,maxplaytime,minage,users_rated,average_rating,bayes_average_rating,total_owners,total_traders,total_wanters,total_wishers,total_comments,total_weights,average_weight
0,42039,boardgame,,2004.0,2.0,4.0,40.0,40.0,40.0,10.0,10,6.0,0.0,11,0,1,5,1,0,0.0
1,182991,boardgame,,1979.0,2.0,4.0,60.0,45.0,60.0,5.0,0,0.0,0.0,0,0,0,0,0,0,0.0
2,107551,boardgame,,2010.0,2.0,2.0,60.0,60.0,60.0,12.0,3,7.0,0.0,5,0,2,2,1,0,0.0
3,87749,boardgame,,2006.0,2.0,2.0,60.0,60.0,60.0,7.0,1,6.0,0.0,0,0,1,0,0,0,0.0
4,180080,boardgame,,0.0,2.0,6.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1,0,0,0,0,0,0.0


In [3]:
# Snapshot of the 'games' dataframe: index range, per-column non-null counts, and dtypes.
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81312 entries, 0 to 81311
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    81312 non-null  int64  
 1   type                  81312 non-null  object 
 2   name                  81271 non-null  object 
 3   yearpublished         81309 non-null  float64
 4   minplayers            81309 non-null  float64
 5   maxplayers            81309 non-null  float64
 6   playingtime           81309 non-null  float64
 7   minplaytime           81309 non-null  float64
 8   maxplaytime           81309 non-null  float64
 9   minage                81309 non-null  float64
 10  users_rated           81312 non-null  int64  
 11  average_rating        81312 non-null  float64
 12  bayes_average_rating  81312 non-null  float64
 13  total_owners          81312 non-null  int64  
 14  total_traders         81312 non-null  int64  
 15  total_wanters      

In [4]:
# Show the 'games' column names and (rows, columns) shape before any columns are
# dropped, so the later deletions can be confirmed against this baseline.
print(df_games.columns, df_games.shape, sep="\n")

Index(['id', 'type', 'name', 'yearpublished', 'minplayers', 'maxplayers',
       'playingtime', 'minplaytime', 'maxplaytime', 'minage', 'users_rated',
       'average_rating', 'bayes_average_rating', 'total_owners',
       'total_traders', 'total_wanters', 'total_wishers', 'total_comments',
       'total_weights', 'average_weight'],
      dtype='object')
(81312, 20)


In [5]:
# Drop the non-beneficial columns.
# errors="ignore" keeps this cell idempotent: without it, re-running the cell after
# the columns are already gone raises KeyError.
games_cols_to_drop = ["total_traders", "total_wanters", "total_wishers"]
df_games = df_games.drop(columns=games_cols_to_drop, errors="ignore")

# Deletion confirmation: none of the dropped columns may remain.
assert not set(games_cols_to_drop) & set(df_games.columns)
df_games.shape

(81312, 17)

In [6]:
# Drop fully duplicated rows, keeping the first occurrence and renumbering the index.
# The result is reassigned instead of using inplace=True: this matches the
# reassignment style used for the column drop and avoids mutating a frame in place.
df_games = df_games.drop_duplicates(keep="first", ignore_index=True)
df_games.shape

(79463, 17)

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the 'games' dataframe.
df_games.describe()

Unnamed: 0,id,yearpublished,minplayers,maxplayers,playingtime,minplaytime,maxplaytime,minage,users_rated,average_rating,bayes_average_rating,total_owners,total_comments,total_weights,average_weight
count,79463.0,79460.0,79460.0,79460.0,79460.0,79460.0,79460.0,79460.0,79463.0,79463.0,79463.0,79463.0,79463.0,79463.0,79463.0
mean,72614.516907,1802.337465,1.989995,5.651359,50.29664,48.231626,50.29664,6.90185,101.474925,4.146446,1.036639,182.068661,32.36495,10.440004,0.872706
std,58830.246442,593.904255,0.936143,56.717431,344.224007,335.562391,344.224007,5.044081,841.30443,3.058123,2.220574,1134.929841,210.832116,84.798331,1.159645
min,1.0,-3500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,21594.5,1984.0,2.0,2.0,5.0,5.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,43454.0,2003.0,2.0,4.0,30.0,30.0,30.0,8.0,2.0,5.16667,0.0,6.0,1.0,0.0,0.0
75%,129041.5,2010.0,2.0,6.0,60.0,60.0,60.0,12.0,14.0,6.64973,0.0,43.0,7.0,2.0,1.7778
max,184451.0,2018.0,99.0,11299.0,60120.0,60120.0,60120.0,120.0,53680.0,10.0,8.22186,73188.0,11798.0,5996.0,5.0


In [8]:
# Load the 'top 5000' dataset, then preview its first rows ordered alphabetically by name.
df_top5000 = pd.read_csv("data/top-5000-20190206.csv")
(
    df_top5000
    .sort_values("names", ignore_index=True)  # ascending order is the default
    .head()
)

Unnamed: 0,rank,bgg_url,game_id,names,min_players,max_players,avg_time,min_time,max_time,year,...,age,mechanic,owned,category,designer,publisher,weight,expands,reimplements,num_fans
0,2745,https://boardgamegeek.com/boardgame/153999/and...,153999,"...and then, we held hands.",2,2,45,30,45,2015,...,12,"Cooperative Play, Hand Management, Point to Po...",3503,"Card Game, Print & Play","David Chircop, Yannick Massa",LudiCreations,1.74,,,113
1,4406,https://boardgamegeek.com/boardgame/853/und-ts...,853,...und tschüss!,4,6,30,30,30,1997,...,10,Simultaneous Action Selection,387,Card Game,Martin Wallace,"Artra Design, Ltd.",1.3,,,3
2,1764,https://boardgamegeek.com/boardgame/7865/10-da...,7865,10 Days in Africa,2,4,30,20,30,2003,...,10,"Hand Management, Route/Network Building, Tile ...",2087,"Educational, Travel","Alan R. Moon, Aaron Weissblum",Beautiful Africa,1.3879,,,22
3,1975,https://boardgamegeek.com/boardgame/22398/10-d...,22398,10 Days in Asia,2,4,25,25,25,2007,...,10,"Hand Management, Route/Network Building, Tile ...",1303,"Educational, Travel","Alan R. Moon, Aaron Weissblum",Out of the Box Publishing,1.4286,,,15
4,1740,https://boardgamegeek.com/boardgame/5867/10-da...,5867,10 Days in Europe,2,4,30,30,30,2002,...,10,"Hand Management, Route/Network Building",2048,"Educational, Travel","Grafik Studio Krüger, Alan R. Moon, Aaron Weis...",Out of the Box Publishing,1.3452,,,24


In [9]:
# Snapshot of the 'top 5000' dataframe: index range, per-column non-null counts, and dtypes.
df_top5000.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   rank          5000 non-null   int64  
 1   bgg_url       5000 non-null   object 
 2   game_id       5000 non-null   int64  
 3   names         5000 non-null   object 
 4   min_players   5000 non-null   int64  
 5   max_players   5000 non-null   int64  
 6   avg_time      5000 non-null   int64  
 7   min_time      5000 non-null   int64  
 8   max_time      5000 non-null   int64  
 9   year          5000 non-null   int64  
 10  avg_rating    5000 non-null   float64
 11  geek_rating   5000 non-null   float64
 12  num_votes     5000 non-null   int64  
 13  image_url     5000 non-null   object 
 14  thumb_url     5000 non-null   object 
 15  age           5000 non-null   int64  
 16  mechanic      5000 non-null   object 
 17  owned         5000 non-null   int64  
 18  category      5000 non-null 

In [10]:
# Show the 'top 5000' column names and (rows, columns) shape before any columns are
# dropped, so the later deletions can be confirmed against this baseline.
print(df_top5000.columns, df_top5000.shape, sep="\n")

Index(['rank', 'bgg_url', 'game_id', 'names', 'min_players', 'max_players',
       'avg_time', 'min_time', 'max_time', 'year', 'avg_rating', 'geek_rating',
       'num_votes', 'image_url', 'thumb_url', 'age', 'mechanic', 'owned',
       'category', 'designer', 'publisher', 'weight', 'expands',
       'reimplements', 'num_fans'],
      dtype='object')
(5000, 25)


In [11]:
# Drop the non-beneficial columns.
# errors="ignore" keeps this cell idempotent: without it, re-running the cell after
# the columns are already gone raises KeyError.
top5000_cols_to_drop = ["bgg_url", "geek_rating", "num_votes"]
df_top5000 = df_top5000.drop(columns=top5000_cols_to_drop, errors="ignore")

# Deletion confirmation: none of the dropped columns may remain.
assert not set(top5000_cols_to_drop) & set(df_top5000.columns)
df_top5000.shape

(5000, 22)

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the 'top 5000' dataframe.
df_top5000.describe()

Unnamed: 0,rank,game_id,min_players,max_players,avg_time,min_time,max_time,year,avg_rating,age,owned,weight,expands,reimplements,num_fans
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,3.0,833.0,5000.0
mean,2500.5,99061.15,2.0022,5.2012,112.7774,77.2004,112.631,1998.8028,6.999493,10.5036,3414.0784,2.351609,18009.666667,45541.690276,137.9822
std,1443.520003,82798.023416,0.687815,7.891006,453.828138,193.172975,453.852836,141.747928,0.558673,3.093267,7101.823261,0.801568,15009.028094,59862.362096,353.641359
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-3000.0,5.79723,0.0,55.0,0.0,712.0,11.0,0.0
25%,1250.75,13883.75,2.0,4.0,30.0,30.0,30.0,2004.0,6.57854,8.0,725.0,1.748475,13220.5,1927.0,20.0
50%,2500.5,94607.0,2.0,4.0,60.0,45.0,60.0,2011.0,6.94087,12.0,1356.0,2.29025,25729.0,16986.0,44.0
75%,3750.25,172515.25,2.0,6.0,106.25,90.0,100.0,2015.0,7.363548,12.0,3104.0,2.9,26658.5,70323.0,112.0
max,5000.0,269210.0,8.0,200.0,22500.0,6000.0,22500.0,2019.0,9.13515,18.0,121071.0,4.8841,27588.0,268098.0,5863.0


In [13]:
# For every non-numeric column of the 'games' dataframe, print the column name,
# its number of distinct values, and the frequency of each value.
cat_cols_games = df_games.select_dtypes(exclude=[np.number]).columns

for col in cat_cols_games:
    # One print per column; the trailing blank line separates consecutive columns.
    print(
        col,
        df_games[col].nunique(),
        df_games[col].value_counts(),
        sep="\n",
        end="\n\n",
    )

type
2
boardgame             68985
boardgameexpansion    10478
Name: type, dtype: int64

name
76035
                      16
                      15
Grand Prix            11
Arena                 11
Waterloo               9
                      ..
Hab acht!              1
Challenge Sudoku       1
Hamertjesspel          1
Felsberger Nuggets     1
Bingo Animal Kids      1
Name: name, Length: 76035, dtype: int64



In [14]:
# For every non-numeric column of the 'top 5000' dataframe, print the column name,
# its number of distinct values, and the frequency of each value.
cat_cols_top5000 = df_top5000.select_dtypes(exclude=[np.number]).columns

for col in cat_cols_top5000:
    # One print per column; the trailing blank line separates consecutive columns.
    print(
        col,
        df_top5000[col].nunique(),
        df_top5000[col].value_counts(),
        sep="\n",
        end="\n\n",
    )

names
4974
Cosmic Encounter                        4
Samurai                                 3
Wizard                                  2
Barbarossa                              2
Lord of the Rings: The Confrontation    2
                                       ..
The Fury of Dracula                     1
Stop Thief!                             1
Dungeon Command: Curse of Undeath       1
Deckscape: Test Time                    1
Time Barons                             1
Name: names, Length: 4974, dtype: int64

image_url
5000
https://cf.geekdo-images.com/original/img/lDN358RgcYvQfYYN6Oy2TXpifyM=/0x0/pic2437871.jpg    1
https://cf.geekdo-images.com/original/img/jKOPojAUPrX7fHmkj6F0Nxj97M0=/0x0/pic3999734.jpg    1
https://cf.geekdo-images.com/original/img/0_kz-cMn28Tghu4WDy2ezdXBumc=/0x0/pic78647.jpg      1
https://cf.geekdo-images.com/original/img/Ss1Aw3ivEwsSVgEAGbjYvpJLncU=/0x0/pic1356415.jpg    1
https://cf.geekdo-images.com/original/img/SpkV_Xe92K0c6KxkpAlSC9L5J9Q=/0x0/pic4096767.jpg  

In [None]:
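# TODO: merge the dataframes. The commented-out snippet below is only a template: its
# names (df, team_sub, teamID, yearID) are not defined in the cells shown here and must
# be replaced with this notebook's frames and key columns. A full "outer" join may be
# needed instead of "inner".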
# df = pd.merge(df, team_sub, on=["teamID", "yearID"], how="inner")
# print(df.shape)
# df.head()