# Score Prediction Regression Model

In [23]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures

# NOTE(review): absolute, machine-specific path — the notebook only runs where this
# folder exists; consider a configurable data directory.
os.chdir(r"C:\Users\Zack\Desktop\work\OSU\406 - p2 - learning\jupyter practice\final models")
# Read the game table from the working directory set above.
game_data = pd.read_csv('game_data.csv') # already filtered by num_reviews >= 30
# game_data.head() # See the first 5 rows

### clean and filter data

In [24]:
# Row filters on the raw table, combined into a single boolean mask.
# (Restricting to type == 'boardgame', i.e. no expansions, is currently disabled.)
keep_rows = (
    (game_data['year'] > 1980)
    & (game_data['maxplayers'] <= 30)
    & (game_data['minplaytime'] <= 180)  # 120 - 90th percentile
    & (game_data['maxplaytime'] <= 720)
    & (game_data['minage'] <= 21)
    & (game_data['playingtime'] >= 10)
)
bgg_games = game_data[keep_rows]

In [25]:
# cell for data exploration
# bgg_games.columns

### select columns potentially relevant to rating (before community interaction)

In [26]:
# Columns carried forward for modelling: game attributes, both rating columns,
# and the mechanics/categories columns.
selected_columns = [
    'type', 'minplayers', 'maxplayers', 'playingtime',
    'minplaytime', 'maxplaytime', 'minage', 'avg_rating', 'mechanics',
    'bay_rating', 'total_comments', 'total_weights', 'complexity', 'categories',
]
dtc_test = bgg_games[selected_columns]

In [27]:
# Keep only rows with a valid player range (max >= min); needed for the
# player pool size.
valid_player_range = dtc_test['maxplayers'].ge(dtc_test['minplayers'])
dtc_test = dtc_test[valid_player_range]

#### convert mechanics and categories into lists with values

In [28]:
def _parse_list_string(text):
    '''Split a bracketed, comma-separated string such as "[a, b]" into ['a', 'b'].

    Note: "[]" yields [''] (one empty-string element), not an empty list.
    '''
    return text.strip('][').split(', ')


# Build the parsed columns with assign() so they land on a fresh frame; writing into
# dtc_test in place raises SettingWithCopyWarning because dtc_test is a filtered slice.
# assumes both columns hold strings with no missing values — TODO confirm
dtc_test = dtc_test.assign(
    categories=dtc_test['categories'].apply(_parse_list_string),
    mechanics=dtc_test['mechanics'].apply(_parse_list_string),
)

#### count number of mechanics and categories for each game, make new columns

In [29]:
# Derived numeric features, computed column-wise instead of with a Python-level
# iterrows() loop. assign() returns a new frame, which also avoids writing into a
# filtered slice in place (SettingWithCopyWarning).
dtc_test = dtc_test.assign(
    num_mechs=dtc_test['mechanics'].map(len),    # number of mechanics per game
    num_cats=dtc_test['categories'].map(len),    # number of categories per game
    player_diff=dtc_test['maxplayers'] - dtc_test['minplayers'],  # player pool size
)

In [30]:
# explore data: summary statistics of playing time after the row filters
dtc_test['playingtime'].describe()

count    20905.000000
mean        64.891701
std         54.955134
min         10.000000
25%         30.000000
50%         45.000000
75%         90.000000
max        720.000000
Name: playingtime, dtype: float64

In [31]:
# data exploration: frequency of each distinct playing time (missing values counted too)
dtc_test['playingtime'].value_counts(dropna=False)

30     3722
60     3703
45     2489
120    2237
90     2227
       ... 
23        1
38        1
165       1
68        1
95        1
Name: playingtime, Length: 65, dtype: int64

In [32]:
# data exploration: count rows with playing time below 10; the earlier
# playingtime >= 10 filter should leave none
dtc_test.loc[dtc_test.playingtime < 10, 'playingtime'].count()

# dtc_test['time_diff'] = dtc_test.maxplaytime - dtc_test.minplaytime

0

In [33]:
def split_data_frame_list(df, target_column, output_type=str):
    '''
    Expand a column holding list values into one row per list element.

    df: dataframe to split
    target_column: name of the column containing the values to split
    output_type: callable applied to every value written to target_column
    returns: a new dataframe (fresh 0..n-1 index) in which each element of a list in
    target_column gets its own row; the values in the other columns are duplicated
    across those rows. A non-list value is kept as a single row (converted with
    output_type). A row whose list is empty contributes no rows. If no rows are
    produced, an empty frame with df's columns is returned.
    '''
    row_accumulator = []

    # Iterate explicitly rather than calling df.apply(axis=1) for its side effects:
    # on an empty frame, apply() may invoke the function once with a placeholder
    # row to infer the result shape, which would inject a bogus row.
    for _, row in df.iterrows():
        cell = row[target_column]
        values = cell if isinstance(cell, list) else [cell]
        base = row.to_dict()  # convert the source row once, not once per element
        for value in values:
            # a fresh dict per output row; target_column keeps its original position
            row_accumulator.append({**base, target_column: output_type(value)})

    if not row_accumulator:
        # keep the schema so later column selection still works on an empty result
        return pd.DataFrame(columns=df.columns)

    return pd.DataFrame(row_accumulator)

#### split lists into multiple rows for regression model

In [34]:
# Explode categories first, then mechanics: each game ends up with one row per
# (category, mechanic) combination.
for list_column in ('categories', 'mechanics'):
    dtc_test = split_data_frame_list(dtc_test, list_column)

In [35]:
# print(dtc_test.dtypes)

#### get all desired cols and apply one-hot fix to categorical features

In [36]:
# columns kept for the model frame
desired_cols = [
    'type', 'minplayers', 'maxplayers', 'playingtime', 'minplaytime', 'maxplaytime',
    'minage', 'avg_rating', 'bay_rating', 'complexity', 'categories', 'mechanics',
    'num_mechs', 'num_cats', 'player_diff',
]

# One-hot encode the categorical columns: get_dummies replaces each of them with
# indicator columns, and drop_first=True omits the first level of each.
model_frame = pd.get_dummies(dtc_test[desired_cols], drop_first=True)

#### filter out categorical columns for tree fitting

In [37]:
# Candidate feature names: every model_frame column (one-hot columns included)
# except the two rating columns, which are targets rather than inputs.
features = model_frame.columns.tolist()
for target_col in ('avg_rating', 'bay_rating'):
    features.remove(target_col)

### Divide the data set
#### split data into training portions

In [38]:
# Feature matrix: the one-hot encoded features minus the "Expansion for Base-game"
# category column (duplicate of "is expansion").
# drop() returns a new frame; dropping in place on a column slice of model_frame
# triggers SettingWithCopyWarning.
X = model_frame[features].drop(columns=["categories_'Expansion for Base-game'"])

# target variable: the plain average rating (avg_rating, not the Bayesian rating)
y = model_frame[['avg_rating']]

# Hold out only 0.01% of the rows: this split feeds feature selection, which is
# meant to see essentially the whole data set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0001, random_state=1)
print('done!')

done!


### feature selection

In [39]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from functools import partial

# f_regression is univariate - scores each feature by its linear correlation with the target
# mutual_info_regression also scores one feature at a time, but captures non-linear dependence

def feature_selection(X_train, y_train, m=0, n='all'):
    '''Fit a SelectKBest feature selector on the training data.

    IN: X_train = training features
        y_train = training target (1-D, or a single-column frame/array)
        m = index of the scoring method (int): 0 -> f_regression,
            1 -> mutual_info_regression (seeded with random_state=0)
        n = number of top features to select, or 'all'
    OUT: the fitted SelectKBest object (nothing is transformed here; use its
         get_support() / transform() afterwards)'''
    # partial pins the seed so the mutual-information scores are repeatable
    mutual_info = partial(mutual_info_regression, random_state=0)

    # scoring functions to choose from, indexed by m
    methods = [f_regression, mutual_info]

    # The scoring functions expect a 1-D target; flatten a single-column
    # frame/array here so sklearn does not have to coerce a column vector.
    y = np.asarray(y_train)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    # select the n best features according to the chosen scoring function
    fs = SelectKBest(score_func=methods[m], k=n)
    # learn relationship from training data
    fs.fit(X_train, y)
    return fs

In [40]:
# Univariate feature selection: method 0 (f_regression), keeping the 15 best-scoring features.
# The commented-out exploration further down uses an arbitrary F-value cutoff of 100 instead.
fs = feature_selection(X_train, y_train, 0, 15)

# BAD IDEA - don't turn a continuous value (rating float) into a 'multiclass' representation with integers!
# X_train_fs_mut, X_test_fs_mut, fs_mut = feature_selection(X_train, y_train, X_test, 2)

print('done!')

done!


In [41]:
# number of candidate feature columns
print(X.shape[1])

278


In [42]:
# Boolean mask from the fitted selector marking the features it kept,
# then the matching column names of X.
feature_mask = fs.get_support()
top_features = X.columns[feature_mask]
# print(top_features)

### reset test data for mutual information filtering

# NOPE!

In [43]:
# X = model_frame[top_features]

# # still the same
# y = model_frame[['avg_rating']]

# # 0 - find correlations with all data available
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0001, random_state=2)
# print("done!")

In [55]:
# new feature selection model: method 1 (mutual information), keeping 15 features
# NOTE(review): the recorded run of this cell ended in a MemoryError, so fs_mut was not
# created; its only later uses are commented out.
fs_mut = feature_selection(X_train, y_train, 1, 15)

MemoryError: Unable to allocate 356. KiB for an array with shape (45581,) and data type int64

In [45]:
# feature_mask = fs_mut.get_support()
# top_features = X.columns[feature_mask]
# print(top_features)

## reset test data with new features that were selected

In [46]:
# Rebuild X from the selected feature columns only.
X = model_frame[top_features]

# still the same
y = model_frame[['avg_rating']]

# Split method, 0.3 == 30% of data saved for testing, chosen randomly from set
# NOTE(review): model_frame holds one row per (category, mechanic) pair of a game, so rows
# of the same game (same avg_rating) can land in both train and test here, and the features
# were selected using nearly all rows. Test scores are likely optimistic — consider
# splitting by game.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

### Exploration of stats for feature values

In [47]:
# get features only with a p-value < 0.05 (indicates it has an effect on target variable)
# scores = []
# big_scores = []

# for i in range(len(fs.pvalues_)):
#     if fs.pvalues_[i] < 0.05:
#         # tuple of feature number and feature value
#         scores.append((i, fs.scores_[i]))
#         if fs.scores_[i] >= 100:
#             big_scores.append((i, fs.scores_[i]))
#     # same check for mutual values
#     # if fs_mut.pvalues_[i] < 0.05:
#     #     # tuple of feature number and feature value
#     #     mut_scores.append((i, fs_mut.scores_[i]))
# print(len(scores))
# print(len(big_scores)) # 68 features

In [48]:

# # remove ouliers to prep for getting standard deviation, then feature trimming
# score_nums = sorted(list(list(zip(*big_scores))[1]))
# # quartile splits of data, and range of difference
# q1, q3 = np.percentile(score_nums, [25, 75])
# iqr = q3 - q1
# # bounds to trim outliers
# low_bound = q1 - (1.5 * iqr)
# up_bound = q3 + (1.5 * iqr)


In [49]:
# import statistics as stats
# import bisect

# # get indices of where to trim outliers
# idx_left = bisect.bisect_right(score_nums, low_bound)
# idx_right = bisect.bisect_left(score_nums, up_bound)

# # trim outliers and get math
# timmed_nums = score_nums[idx_left:idx_right]
# scores_std = stats.stdev(timmed_nums)
# scores_mean = stats.mean(timmed_nums)
# print(scores_std, " ", scores_mean)

In [50]:
# # scores for the features
# scores = []
# for i in range(len(fs.scores_)):
# 	# print('Feature %d: %f' % (i, fs.scores_[i]))
# 	scores.append((i, fs.scores_[i]))

# sort by feature value
# scores.sort(key = lambda x: x[1])
# plot the scores
# plt.bar([i for i in range(len(fs.scores_))], fs.scores_)
# plt.show()

### Train the model 

In [51]:
# number of training rows
print(X_train.shape[0])

137767


In [52]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from copy import deepcopy
import math


# trackers kept for a (currently disabled) hyper-parameter sweep
best_r2 = 0
best_mse = math.inf

# Random forest regressor; the seed is fixed so runs are repeatable.
rfr = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42)

# y_train is a single-column frame; pass a 1-D array so sklearn does not warn
# about (and silently reshape) a column-vector target.
rfr.fit(X_train, np.ravel(y_train))

# predictions by model for y
y_pred_F = rfr.predict(X_test)

# accuracy check, lower is better
mse_F = mean_squared_error(y_test, y_pred_F)

# The coefficient of determination: 1 is perfect prediction
r2_F = r2_score(y_test, y_pred_F)
print('r2 Forest: ', r2_F)

print('done!')

r2 Forest:  0.9411821686719545
done!


### save the model and check that the reloaded copy reproduces the test metrics

In [53]:
from joblib import dump, load

# switch to model directory
# NOTE(review): absolute, machine-specific path.
os.chdir(r"C:\Users\Zack\Desktop\work\OSU\406 - p2 - learning\jupyter practice\final models\models")

# create and save file
# NOTE(review): the r2/mse digits in this file name are hard-coded and differ from the
# metrics printed by the recorded run (r2 ~0.9412, MSE ~0.0509) — confirm the name is intended.
joblib_file = "rfr_r29424_mse0498.joblib"  
dump(rfr, joblib_file)

# test load: read the model back from disk
joblib_model = load(joblib_file)

# predict with the reloaded model on the same held-out rows
y_pred = joblib_model.predict(X_test)

# accuracy check
mse = mean_squared_error(y_test, y_pred)
print('MSE: ', mse)

r2 = r2_score(y_test, y_pred)
print('r2: ', r2)

MSE:  0.050877176437359596
r2:  0.9411821686719545


In [54]:
# check specific instances for accuracy: actual vs predicted rating for
# the test row at position 170
print(y_test.iloc[170])
print(y_pred[170])

avg_rating    8.22286
Name: 42866, dtype: float64
8.218860365220493


### visualize training depths

In [None]:


# Sweep decision-tree depth 1..35 for both split criteria, recording accuracy per depth
# and keeping a copy of the best-scoring tree.
# NOTE(review): this cell has no execution count and uses names that are not defined
# anywhere in the visible notebook (`correct` and `dtc` below, `DecisionTreeClassifier`),
# and y_test is built with only an 'avg_rating' column, so y_test["categories"] would
# fail. It looks carried over from a classification notebook — confirm, port or remove.
max_depth = []
acc_gini = []
acc_entropy = []

# baseline to beat: accuracy and model from an earlier run (see note above)
best_acc = correct / len(y_pred)
best_dtc = dtc

y_targets = y_test["categories"].tolist() 
for i in range(1,36):
    # testing entropy
    dtree = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=i)
    dtree.fit(X_train, y_train)
    y_pred = dtree.predict(X_test)
    correct = 0
    for j in range(len(y_pred)):
        # a prediction counts as correct when it is contained in the row's target value
        if y_pred[j] in y_targets[j]:
            correct += 1

    accuracy = correct / len(y_pred)
    acc_entropy.append(accuracy)
    if accuracy > best_acc:
        best_dtc = deepcopy(dtree)
        best_acc = accuracy
        print("best acc: ", accuracy)

    # testing gini
    dtree = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=i)
    dtree.fit(X_train, y_train)
    y_pred = dtree.predict(X_test)
    correct = 0
    for j in range(len(y_pred)):
        if y_pred[j] in y_targets[j]:
            correct += 1

    accuracy = correct / len(y_pred)
    acc_gini.append(accuracy)

    if accuracy > best_acc:
        best_dtc = deepcopy(dtree)
        best_acc = accuracy
        print("best acc: ", accuracy)

    # track depth for values
    max_depth.append(i)


# data frame with tracked values to graph
df = pd.DataFrame({'acc_gini':pd.Series(acc_gini), 
'acc_entropy':pd.Series(acc_entropy),
'max_depth':pd.Series(max_depth)})

In [None]:
# Save the best tree kept by the depth sweep into the current working directory.
# NOTE(review): the digits in the file name are hard-coded; presumably an accuracy
# value — confirm they match best_acc.
joblib_file = "dtc_8936.joblib"  
dump(best_dtc, joblib_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# graph folder
os.chdir(r"C:\Users\Zack\Desktop\work\OSU\406 - p2 - learning\jupyter practice\final models\graphs")

# figure size
sns.set(rc={'figure.figsize': (6, 6)})

print(max_depth)
# visualizing changes in parameters: one line per split criterion; label= gives each
# line a legend entry so the two curves can be told apart
sns.lineplot(x='max_depth', y='acc_gini', data=df, label='gini')
sns.lineplot(x='max_depth', y='acc_entropy', data=df, label='entropy')
plt.xlabel('max depth')
plt.ylabel('accuracy')
plt.legend()
# plt.xlim(1,30)
# no extension given, so matplotlib writes its default image format
plt.savefig('gini vs entropy', bbox_inches = 'tight')
plt.show()

### Visualize training tree model

In [None]:
from sklearn.tree import export_graphviz
from six import StringIO  
from IPython.display import Image  
import pydotplus

# dot_data = StringIO()
# # number of unique values in target col
# class_names = list(model_frame.categories.unique())

# # use trained decision tree model, feature columns, and clases in target col
# export_graphviz(dtc, out_file = dot_data, filled=True, rounded=True, special_characters=True,
#                 feature_names = features,
#                 class_names = class_names)

# # creates image and then displays in Jupyter
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# graph.write_png('game_classes.png')
# Image(graph.create_png())