# Game Classification Model

#### execute in same directory as 'game_data.csv'

In [1]:
import os, math
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn import metrics

# needs to be reset if a model is saved
# os.chdir(working_dir_path)
working_dir = os.getcwd()
game_data = pd.read_csv('game_data.csv')
game_data.head() # See the first 5 rows to check data import

Unnamed: 0,id,type,name,year,minplayers,maxplayers,playingtime,minplaytime,maxplaytime,minage,...,bay_rating,owners,traders,wanters,wishers,total_comments,total_weights,complexity,categories,mechanics
0,13,boardgame,Catan,1995,3,4,120,60,120,10,...,7.00456,141505,1792,463,5218,17610,7137,2.3277,"['Economic', 'Negotiation']","['Dice Rolling', 'Hexagon Grid', 'Income', 'Mo..."
1,822,boardgame,Carcassonne,2000,2,5,45,30,45,7,...,7.31303,137009,1577,539,6164,17506,7239,1.9171,"['City Building', 'Medieval', 'Territory Build...","['Area Majority / Influence', 'Map Addition', ..."
2,30549,boardgame,Pandemic,2008,2,4,45,45,45,8,...,7.52214,141355,2157,650,8444,15545,5180,2.4154,['Medical'],"['Action Points', 'Cooperative Game', 'Hand Ma..."
3,68448,boardgame,7 Wonders,2010,2,7,30,30,30,10,...,7.66507,103879,1342,1042,10713,13112,4360,2.3323,"['Ancient', 'Card Game', 'City Building', 'Civ...","['Card Drafting', 'Drafting', 'Hand Management..."
4,36218,boardgame,Dominion,2008,2,4,30,30,30,13,...,7.52473,96360,1887,629,7478,12876,4820,2.3591,"['Card Game', 'Medieval']","['Deck / Bag / Pool Building', 'Delayed Purcha..."


## clean and filter data

#### selective filtering for removing extremes, retain 'game_data' in case any reference to complete set is needed

In [2]:
bgg_games = game_data[game_data['maxplayers'] <= 30]
bgg_games = bgg_games[bgg_games['minplaytime'] <= 180]
bgg_games = bgg_games[bgg_games['maxplaytime'] <= 720]
bgg_games = bgg_games[bgg_games['minage'] <= 21]
bgg_games = bgg_games[bgg_games['playingtime'] >= 10]
bgg_games = bgg_games[bgg_games['maxplayers'] >= bgg_games['minplayers']]

#### first selection of columns potentially relevant to 'categories'

In [3]:
dtc_test = bgg_games[['type', 'year', 'minplayers', 'maxplayers', 'playingtime',
       'minplaytime', 'maxplaytime', 'minage', 'users_rated', 'avg_rating',
       'bay_rating', 'owners', 'traders', 'wanters', 'wishers',
       'total_comments', 'total_weights', 'complexity', 'categories',
       'mechanics']]

#### convert strings of multiple values into lists

In [4]:
dtc_test['categories'] = dtc_test['categories'].apply(lambda x: x.strip('][').split(', ') )
dtc_test['mechanics'] = dtc_test['mechanics'].apply(lambda x: x.strip('][').split(', ') )

#### count number of mechanics and categories for each game, make new columns

In [5]:
dtc_test['num_mechs'] = dtc_test.apply(lambda row: len(row['mechanics']), axis=1)
dtc_test['num_cats'] = dtc_test.apply(lambda row: len(row['categories']), axis=1)
dtc_test['rating_diff'] = dtc_test.avg_rating - dtc_test.bay_rating
dtc_test['player_diff'] = dtc_test.maxplayers - dtc_test.minplayers

In [6]:
def split_data_frame_list(df, target_column, output_type=str):
    ''' 
    Accepts a column with list values and splits into several rows.

    df: dataframe to split
    target_column: the column containing the values to split
    output_type: type of all outputs
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    row_accumulator = []

    def split_list_to_rows(row):
        split_row = row[target_column]
        if isinstance(split_row, list):
          for s in split_row:
              new_row = row.to_dict()
              new_row[target_column] = output_type(s)
              row_accumulator.append(new_row)
        else:
          new_row = row.to_dict()
          new_row[target_column] = output_type(split_row)
          row_accumulator.append(new_row)
  
    df.apply(split_list_to_rows, axis=1)
    new_df = pd.DataFrame(row_accumulator)
  
    return new_df

#### split 'mechanics' lists into multiple rows for decision tree training

In [7]:
dtc_test = split_data_frame_list(dtc_test, 'mechanics')

#### Before filtering by year for model, split and test to remove old mechanics/categories that are no longer relevant 

In [8]:

# year by which to remove uniques to prevent models from being incompatible
# -math.inf will include all games in dataframe
break_year = 2000 #-math.inf

old_mechs = list(dtc_test[dtc_test['year'] < break_year].mechanics.unique())
new_mechs = list(dtc_test[dtc_test['year'] >= break_year].mechanics.unique())
unique_old_mechs = list(set(old_mechs).difference(new_mechs))

# remove rows with irrelevant mechanics
dtc_test = dtc_test[~dtc_test.mechanics.isin(unique_old_mechs)]

#### second selection of columns to further filter for decision tree model

In [9]:
# columns to use
desired_cols = ['type', 'year', 'minplayers','maxplayers', 'playingtime', 'minplaytime', 'maxplaytime', 'avg_rating', 'bay_rating', 'complexity', 'categories', 'mechanics', 'num_mechs', 'num_cats', 'minage', 'rating_diff', 'player_diff']

# columns available
#all_cols = ['type', 'year', 'minplayers', 'maxplayers', 'playingtime', 'minplaytime', 'maxplaytime', 'minage', 'users_rated', 'avg_rating', 'bay_rating', 'owners', 'traders', 'wanters', 'wishers', 'total_comments', 'total_weights', 'complexity', 'categories', 'mechanics', 'num_mechs', 'num_cats', 'minage', 'rating_diff', 'player_diff']

# make dummies and attach to frame for tree model, leave categories alone
total_frame = dtc_test[desired_cols]
mech_dummies = pd.get_dummies(total_frame['mechanics'], prefix='mech', drop_first=True)
total_frame = pd.concat([total_frame, mech_dummies], axis=1)
type_dummies = pd.get_dummies(total_frame['type'], prefix='type', drop_first=True)
total_frame = pd.concat([total_frame, type_dummies], axis=1)

In [10]:
# preserve 'total frame' for testing with all years later
tree_frame = total_frame[total_frame['year'] >= break_year]

#### filter out categorical columns for tree fitting

In [11]:
# list of just desired features, now including one-hot cols and remove categorical cols
features = list(tree_frame.columns)

# remove categorical cols
features.remove('mechanics')
features.remove('type')

## Divide the data set
#### split data into training portions

In [12]:
# filtered features with one-hot fixes for categorical columns
X = tree_frame[features]

# target variable
y = tree_frame[['categories']]

# Split method, 0.3 == 30% of data saved for testing, choosen randomly from set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### split training data so model can "learn" different categories separately
#### keep testing data intact for checking accuracy

In [13]:
# split training data by category
X_train = split_data_frame_list(X_train, 'categories')
y_train = split_data_frame_list(y_train, 'categories')

# remove target column from X_data
# typically done before, but was needed for splitting training data
X_train.drop('categories', inplace=True, axis=1)
X_test.drop('categories', inplace=True, axis=1)

## Train the model 
#### adjust variables prior to loop for hyperparameter adjustments

In [14]:
from copy import deepcopy

best_accuracy = 0
models_to_compare = 3
tree_depth = 20
for j in range(models_to_compare):
    # Decision Tree classifer object
    dtc = DecisionTreeClassifier(criterion="entropy", splitter='best', max_depth=tree_depth)

    # Train Decision Tree Classifer
    dtc = dtc.fit(X_train,y_train)

    # predictions by model for y
    y_pred = dtc.predict(X_test)

    # custom accuracy check
    correct = 0
    y_targets = y_test["categories"].tolist() 
    for i in range(len(y_pred)):
        if y_pred[i] in y_targets[i]:
            correct += 1

    accuracy = correct / len(y_pred)
    if accuracy > best_accuracy:
        best_dtc = deepcopy(dtc)
        best_accuracy = accuracy
        print("best accuracy: ", accuracy)

print('done!')

best accuracy:  0.8910792883268241
best accuracy:  0.891513235385283
done!


#### Save model created in loop above

In [18]:
from joblib import dump, load

# swtich to model directory
model_dir = working_dir + "\models"
os.chdir(model_dir)

# create and save file
# [model type]_[deatils]_[accuracy]
joblib_file = "dtc_test_8752.joblib"  
dump(best_dtc, joblib_file)

['dtc_test_8752.joblib']

## load and test model

In [19]:
# swtich to model directory and load
model_dir = working_dir + "\models"
os.chdir(model_dir)

# name of model in folder
joblib_file = "dtc_test_8752.joblib"  
joblib_model = load(joblib_file)

#### get new set not limited by year (if desired for 'complete' check)

In [20]:
# before year trimmed
X = total_frame[features]
y = total_frame[['categories']]

#### split data into new set to compare against

In [21]:
# percent of data to use for test
pod = 50

# split into new set to test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(pod/100), random_state=1)

# then remove category column
X_test.drop('categories', inplace=True, axis=1)

#### test loaded model's accuracy

In [22]:
y_pred = joblib_model.predict(X_test)

correct = 0
y_targets = y_test["categories"].tolist() 
for i in range(len(y_pred)):
    if y_pred[i] in y_targets[i]:
        correct += 1

print("Accuracy:", correct / len(y_pred))

Accuracy: 0.9615815233037971
