# Game Classification Model

In [21]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn import metrics

# Work from the project folder so the relative CSV name below resolves.
project_dir = r"C:\Users\Zack\Desktop\work\OSU\406 - p2 - learning\jupyter practice\final models"
os.chdir(project_dir)

# Load the raw game table (pass names=col_names if the column names are not in the CSV).
game_data = pd.read_csv(filepath_or_buffer='game_data.csv')
# game_data.head()  # see the first 5 rows

### clean and filter data

In [22]:
# Alternative filters tried earlier, kept for reference:
# bgg_games = game_data[game_data['type'] == 'boardgame'] # no expansions
# bgg_games = bgg_games[bgg_games['year'] > 1980]
# bgg_games = game_data[game_data['year'] >= 1980]

# A game is kept only when it satisfies every bound below at once.
within_bounds = (
    (game_data['maxplayers'] <= 30)
    & (game_data['minplaytime'] <= 180)  # 120 - 90th percentile
    & (game_data['maxplaytime'] <= 720)
    & (game_data['minage'] <= 21)
    & (game_data['playingtime'] >= 10)
)
bgg_games = game_data[within_bounds]

In [23]:
# cell for data exploration
# bgg_games.columns

### select columns potentially relevant to categories

In [24]:
# Columns carried forward into feature engineering.
candidate_cols = [
    'type', 'year', 'minplayers', 'maxplayers', 'playingtime',
    'minplaytime', 'maxplaytime', 'minage', 'users_rated', 'avg_rating',
    'bay_rating', 'owners', 'traders', 'wanters', 'wishers',
    'total_comments', 'total_weights', 'complexity', 'categories',
    'mechanics',
]
dtc_test = bgg_games[candidate_cols]

# Discard rows whose player range is inverted (max below min).
valid_player_range = dtc_test['maxplayers'] >= dtc_test['minplayers']
dtc_test = dtc_test[valid_player_range]

In [25]:
# for player diff potential and playtime 

#### convert mechanics and categories into lists with values

In [26]:
# Turn the bracketed text in each list-like column into a Python list of strings.
# Only the surrounding brackets are stripped; each element keeps whatever quoting
# it had in the raw text.
# NOTE(review): a raw value of '[]' becomes [''] (one empty string), not an empty list.
for list_col in ('categories', 'mechanics'):
    dtc_test[list_col] = dtc_test[list_col].apply(lambda raw: raw.strip('][').split(', '))

#### count number of mechanics and categories for each game, make new columns

In [27]:
# How many mechanics / categories each game lists.
# Taking len() column-wise avoids building a full row Series per game just to
# read a single cell, and stays well-defined when the frame has no rows.
dtc_test['num_mechs'] = dtc_test['mechanics'].apply(len)
dtc_test['num_cats'] = dtc_test['categories'].apply(len)

# Gap between the plain average rating and the 'bay_rating' column.
dtc_test['rating_diff'] = dtc_test['avg_rating'] - dtc_test['bay_rating']
# Width of the supported player-count range.
dtc_test['player_diff'] = dtc_test['maxplayers'] - dtc_test['minplayers']

In [28]:
# explore data
# dtc_test.head()

In [29]:
# data exploration
# dtc_test['playingtime'].value_counts(dropna=False)

In [30]:
# data exploration
# dtc_test.loc[dtc_test.playingtime < 10, 'playingtime'].count()

# dtc_test['time_diff'] = dtc_test.maxplaytime - dtc_test.minplaytime

In [31]:
def split_data_frame_list(df, target_column, output_type=str):
    '''
    Expand a column holding list values into one row per list element.

    df: dataframe to split (left unmodified)
    target_column: name of the column whose values should be split
    output_type: callable applied to every emitted value of the target column
    returns: a new dataframe with a fresh 0..n-1 index and the same columns
    as df. A row whose target value is a list yields one output row per
    element, with the other columns duplicated; a row whose target value is
    not a list yields exactly one output row. A row with an empty list yields
    no output rows.

    Rows are walked explicitly instead of via df.apply(axis=1) with a
    side-effecting callback: older pandas releases invoked the callback twice
    on the first row, which duplicated that row's records in the output.
    '''
    row_accumulator = []

    for record in df.to_dict(orient='records'):
        cell = record[target_column]
        # treat a scalar as a one-element list so both cases share one path
        values = cell if isinstance(cell, list) else [cell]
        for value in values:
            new_row = dict(record)
            new_row[target_column] = output_type(value)
            row_accumulator.append(new_row)

    # passing the columns keeps the schema even when no rows were produced
    return pd.DataFrame(row_accumulator, columns=df.columns)

# not needed?!?
# def dupe_data_frame_list(df, target_column, output_type=list):
#     ''' 
#     duplicate rows for each value in a list instead (for 'y_test' only)
#     '''
#     row_accumulator = []

#     def dupe_list_to_rows(row):
#         split_row = row[target_column]
#         if isinstance(split_row, list):
#           for s in split_row:
#               new_row = row.to_dict()
#               new_row[target_column] = output_type(split_row)
#               row_accumulator.append(new_row)
#         else:
#           new_row = row.to_dict()
#           new_row[target_column] = output_type(split_row)
#           row_accumulator.append(new_row)
  
#     df.apply(dupe_list_to_rows, axis=1)
#     new_df = pd.DataFrame(row_accumulator)
  
#     return new_df

#### split lists into multiple rows for decision tree

In [32]:
# TESTING: after data split for train/test portions?
# dtc_test = split_data_frame_list(dtc_test, 'categories')
dtc_test = split_data_frame_list(dtc_test, 'mechanics')

#### Before splitting for the model, split and test to find old mechanics/categories that are no longer relevant

In [42]:
import math
# Cut-off year used to remove mechanics unique to older games, so that models
# stay compatible with each other.
# Negative infinity is below every year, so no row compares as older than the cut-off.
break_year = float('-inf')

True

In [34]:
# Distinct mechanics seen strictly before the cut-off year vs. from it onwards.
game_years = dtc_test['year']
old_mechs = list(dtc_test.loc[game_years < break_year, 'mechanics'].unique())
new_mechs = list(dtc_test.loc[game_years >= break_year, 'mechanics'].unique())

# Mechanics that occur only in the older slice.
unique_old_mechs = list(set(old_mechs).difference(new_mechs))

# remove rows with irrelevant mechanics
is_obsolete = dtc_test['mechanics'].isin(unique_old_mechs)
dtc_test = dtc_test[~is_obsolete]

# Same idea for categories (not enabled):
# old_cats = list(temp_frame[temp_frame['year'] < 1980].categories.unique())
# new_cats = list(temp_frame[temp_frame['year'] >= 1980].categories.unique())
# unique_old_cats = list(set(old_cats).difference(new_cats))

#### exploration

In [35]:
# # exploration
# print(unique_old_mechs)
# print(len(dtc_test))
# new_df = dtc_test[dtc_test.mechanics.isin(unique_old_mechs)]
# print(len(new_df))
# new_df.head(10)
# one_mech = dtc_test[dtc_test['num_mechs'] == 1]
# unique_mech = one_mech[one_mech['mechanics'] == "'Physical Removal'"]
# unique_mech.head()
# print(len(unique_old_cats))
# what_war = temp_frame[temp_frame['categories'] == "'Korean War'"]
# what_years = list(what_war['year'].unique())
# print(what_years)
# print(len(list(temp_frame[temp_frame['year'] < 1970].categories.unique())))

#### get all desired cols and apply one-hot fix to categorical features

In [36]:
# for filtering cols
desired_cols = [
    'type', 'year', 'minplayers', 'maxplayers', 'playingtime', 'minplaytime',
    'maxplaytime', 'avg_rating', 'bay_rating', 'complexity', 'categories',
    'mechanics', 'num_mechs', 'num_cats', 'minage', 'rating_diff', 'player_diff',
]
#all_cols = ['type', 'year', 'minplayers', 'maxplayers', 'playingtime', 'minplaytime', 'maxplaytime', 'minage', 'users_rated', 'avg_rating', 'bay_rating', 'owners', 'traders', 'wanters', 'wishers', 'total_comments', 'total_weights', 'complexity', 'categories', 'mechanics', 'num_mechs', 'num_cats', 'minage', 'rating_diff', 'player_diff']

# One-hot encode the two categorical inputs; 'categories' is left alone because
# it is the prediction target. drop_first=True omits the first level of each.
base_frame = dtc_test[desired_cols]
mech_dummies = pd.get_dummies(base_frame['mechanics'], prefix='mech', drop_first=True)
type_dummies = pd.get_dummies(base_frame['type'], prefix='type', drop_first=True)

# Column order: original columns, then mechanic dummies, then type dummies.
total_frame = pd.concat([base_frame, mech_dummies, type_dummies], axis=1)

In [37]:
# far more accurate with just recent years...
# Keep only rows at or after the cut-off year.
recent_enough = total_frame['year'] >= break_year
tree_frame = total_frame.loc[recent_enough]

In [38]:
# Row counts before and after the year filter, one per line.
print(len(total_frame), len(tree_frame), sep='\n')

62521
62521


#### filter out categorical columns for tree fitting

In [401]:
# Start from every column of the tree frame, which now includes the one-hot columns.
features = list(tree_frame.columns)

# Remove the raw categorical columns that the one-hot columns replace.
# 'categories' is deliberately still in the list at this point.
for raw_categorical in ('mechanics', 'type'):
    features.remove(raw_categorical)
# features.remove('categories')

# for col in desired_cols:
#     features.remove(col)
# print(features)

### Divide the data set
#### split data into training portions

In [402]:
# Feature matrix: filtered features with one-hot columns for the categoricals.
# 'categories' is kept for now because the training rows are split on it later.
X = tree_frame.loc[:, features]

# Target variable, kept as a one-column frame.
y = tree_frame.loc[:, ['categories']]

# Hold out 30% of the rows for testing, chosen randomly with a fixed seed.
splits = train_test_split(X, y, test_size=0.3, random_state=7)
X_train, X_test, y_train, y_test = splits

### split X_train, X_test, and y_train so model can "learn" different categories separately
### keep y_test.categories intact for y_prediction comparisons

In [403]:
# Expand the training features and training target by category, one row per category.
# The test target is not expanded.
X_train = split_data_frame_list(df=X_train, target_column='categories')
# X_test = split_data_frame_list(X_test, 'categories')
y_train = split_data_frame_list(df=y_train, target_column='categories')

# then remove the categorical column from both feature frames, in place
for feature_frame in (X_train, X_test):
    feature_frame.drop(columns='categories', inplace=True)

# dupe instead of split for accuracy tests
# y_test = dupe_data_frame_list(y_test, 'categories')

In [404]:
# exploration
# print(len(X.columns))
# # features = list(X_train.columns)
# # features.remove('categories')
# print(len(X_train.columns))
# X_train.drop('categories', inplace=True, axis=1)
# X_test.drop('categories', inplace=True, axis=1)
# Progress marker: reaching this line means the cell ran to its end.
print("done!")


done!


### Train the model 

In [411]:
from copy import deepcopy

# Repeatedly fit a depth-20 entropy tree and keep the one that scores best on
# the held-out rows.
# NOTE(review): the winner is picked using X_test / y_test, so the printed
# accuracy is an optimistic estimate; a separate validation split would avoid that.

# Loop-invariant: the acceptable categories for every test row.
y_targets = y_test["categories"].tolist()

best_acc = 0
for j in range(1, 500):
    # Decision Tree classifier object. Seeding with the loop counter makes each
    # candidate tree, and therefore the selected one, reproducible across runs.
    dtc = DecisionTreeClassifier(criterion="entropy", splitter='best', max_depth=20, random_state=j)

    # Train Decision Tree Classifier
    dtc = dtc.fit(X_train, y_train)

    # predictions by model for y
    y_pred = dtc.predict(X_test)

    # custom accuracy check - NON-SPLIT DATA:
    # a prediction is correct when it is one of that row's categories
    correct = sum(1 for pred, targets in zip(y_pred, y_targets) if pred in targets)

    accuracy = correct / len(y_pred)
    if accuracy > best_acc:
        # A brand-new estimator is created on every iteration, so holding a
        # reference is enough; copying the fitted tree only costs extra memory.
        best_dtc = dtc
        best_acc = accuracy
        # print("depth: ", 20+j)
        print("best acc: ", accuracy)

print('done!')

best acc:  0.8771396576547752
best acc:  0.8780461792779822
best acc:  0.8787927264970938


MemoryError: could not allocate 3670016 bytes

### SAVE MODEL!! (and test accuracy of accuracy rating)

In [370]:
from joblib import dump, load

# switch to model directory
models_dir = r"C:\Users\Zack\Desktop\work\OSU\406 - p2 - learning\jupyter practice\final models\models"
os.chdir(models_dir)

# create and save file (written relative to the directory selected above)
joblib_file = "dtc_true_8766.joblib"
dump(value=best_dtc, filename=joblib_file)

['dtc_true_8766.joblib']

#### load and test model

In [371]:
# switch to model directory
os.chdir(r"C:\Users\Zack\Desktop\work\OSU\406 - p2 - learning\jupyter practice\final models\models")
# Read the saved tree back from disk.
joblib_model = load(joblib_file)

y_pred = joblib_model.predict(X_test)

# custom accuracy check: a prediction counts when it appears in that row's categories
y_targets = y_test["categories"].tolist()
correct = sum(1 for pred, targets in zip(y_pred, y_targets) if pred in targets)

print("Accuracy:", correct / len(y_pred))

Accuracy: 0.876553084839759


#### test with different data

In [379]:
# Rebuild X / y from the frame before the year trim.
# NOTE(review): total_frame and tree_frame had the same length in the recorded
# run, so this differently-seeded split presumably contains rows the saved
# model was trained on; confirm before reading the score as held-out.
X = total_frame.loc[:, features]
y = total_frame.loc[:, ['categories']]

# Split to new sets (same 30% hold-out, different seed)
splits = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = splits

In [380]:
# new split/dupes (not enabled, so the training frames keep one row per sampled row)
# X_train = split_data_frame_list(X_train, 'categories')
# X_test = split_data_frame_list(X_test, 'categories')
# y_train = split_data_frame_list(y_train, 'categories')

# then remove the categorical column from both feature frames, in place
for feature_frame in (X_train, X_test):
    feature_frame.drop(columns='categories', inplace=True)

# dupe instead of split for accuracy tests
# y_test = dupe_data_frame_list(y_test, 'categories')

In [381]:
# new predictions from the reloaded model on the new test rows
y_pred = joblib_model.predict(X_test)

# custom accuracy check: a prediction counts when it appears in that row's categories
y_targets = y_test["categories"].tolist()
correct = sum(1 for pred, targets in zip(y_pred, y_targets) if pred in targets)

print("Accuracy:", correct / len(y_pred))

Accuracy: 0.9592152263155089


### visualize training depths

In [None]:


max_depth = []
acc_gini = []
acc_entropy = []

# Baseline to beat. In notebook order `correct` and `y_pred` still hold the
# results of joblib_model's predictions from the preceding evaluation, so the
# baseline model has to be that same model; otherwise a different tree would be
# saved under this score whenever no depth below improves on it.
best_acc = correct / len(y_pred)
best_dtc = joblib_model

# NOTE(review): X_train / y_train are whatever the most recent split left
# behind; if the row-splitting step was skipped, y_train's values are still
# lists. Confirm the labels are scalar before running this sweep.
y_targets = y_test["categories"].tolist()
for i in range(1, 36):
    # Same procedure for both criteria; entropy runs first, then gini.
    for criterion, history in (('entropy', acc_entropy), ('gini', acc_gini)):
        dtree = DecisionTreeClassifier(criterion=criterion, splitter='best', max_depth=i)
        dtree.fit(X_train, y_train)
        y_pred = dtree.predict(X_test)

        # a prediction counts when it appears in that row's categories
        correct = sum(1 for pred, targets in zip(y_pred, y_targets) if pred in targets)

        accuracy = correct / len(y_pred)
        history.append(accuracy)

        if accuracy > best_acc:
            # dtree is rebuilt for every fit, so a plain reference is safe
            best_dtc = dtree
            best_acc = accuracy
            print("best acc: ", accuracy)

    # track depth for values
    max_depth.append(i)


# data frame with tracked values to graph
df = pd.DataFrame({'acc_gini':pd.Series(acc_gini),
'acc_entropy':pd.Series(acc_entropy),
'max_depth':pd.Series(max_depth)})

In [None]:
# Save the best model found so far under a new file name, relative to the
# current working directory.
joblib_file = "dtc_8936.joblib"
dump(value=best_dtc, filename=joblib_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# graph folder
os.chdir(r"C:\Users\Zack\Desktop\work\OSU\406 - p2 - learning\jupyter practice\final models\graphs")

#size
sns.set(rc={'figure.figsize': (6, 6)})

print(max_depth)
# visualizing changes in parameters; each curve is labeled so the saved
# figure shows which line belongs to which criterion
sns.lineplot(x='max_depth', y='acc_gini', data=df, label='gini')
sns.lineplot(x='max_depth', y='acc_entropy', data=df, label='entropy')
plt.xlabel('max depth')
plt.ylabel('accuracy')
plt.legend(title='criterion')
# plt.xlim(1,30)
plt.savefig('gini vs entropy', bbox_inches = 'tight')
plt.show()

### Visualize training tree model

In [None]:
from sklearn.tree import export_graphviz
from six import StringIO  
from IPython.display import Image  
import pydotplus

# dot_data = StringIO()
# # number of unique values in target col
# class_names = list(tree_frame.categories.unique())

# # use trained decision tree model, feature columns, and classes in target col
# export_graphviz(dtc, out_file = dot_data, filled=True, rounded=True, special_characters=True,
#                 feature_names = features,
#                 class_names = class_names)

# # creates image and then displays in Jupyter
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# graph.write_png('game_classes.png')
# Image(graph.create_png())