In [146]:
import os
import webbrowser
from datetime import date
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# sklearn.externals.joblib was removed in scikit-learn 0.23; prefer the
# standalone joblib package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

#Read the raw Steam game table from disk into the working DataFrame `df`
dataset_path = "data//steam_game_dataset.csv"
df = pd.read_csv(dataset_path)

In [147]:
#Owner counts above this value are treated as outliers and removed
MAX_OWNERS = 4000000

#Flag every row that must not take part in the analysis:
# - free products and products that are not games
# - outlier owner counts (> MAX_OWNERS)
# - products with 0 owners
#Each comparison is False for a missing value, so rows with missing data are
#not flagged by any of these conditions.
rows_to_drop = (
    (df.IsFree == True)
    | (df.GenreIsNonGame == True)
    | (df.SteamSpyOwners > MAX_OWNERS)
    | (df.SteamSpyOwners == 0)
)

#Remove all flagged rows in a single pass (dropped by index label)
df = df.drop(df[rows_to_drop].index)

#Method for calculating age from date
def calculate_age(releaseDate, today=None):
    """Return the number of whole days between ``releaseDate`` and ``today``.

    Parameters
    ----------
    releaseDate : pandas.Timestamp
        Release date, e.g. an element of a column produced by
        ``pd.to_datetime``. It is subtracted from ``today``.
    today : pandas.Timestamp, optional
        Reference moment. When omitted, ``pd.to_datetime('today')`` is
        evaluated at call time; pass a fixed value to get reproducible ages.

    Returns
    -------
    The ``days`` attribute of ``today - releaseDate``.
    """
    if today is None:
        today = pd.to_datetime('today')
    age = today - releaseDate
    return age.days

#Parse the release dates, derive each game's age in days from them, then
#remove the raw date column so that only the derived 'Age' column is kept.
df['ReleaseDate'] = pd.to_datetime(df['ReleaseDate'])
df['Age'] = df['ReleaseDate'].apply(calculate_age)
#axis is passed by keyword: the positional form drop(label, 1) was deprecated
#in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop('ReleaseDate', axis=1)

print(df.shape)

(9227, 78)


In [148]:
#Model inputs: all columns where feature importance > 0.75%
selected_columns = [
    'Age', 'RequiredAge', 'Metacritic', 'MovieCount', 'PackageCount',
    'ScreenshotCount', 'AchievementCount', 'ControllerSupport',
    'PlatformLinux', 'PlatformMac', 'CategoryMultiplayer', 'CategoryCoop',
    'PriceInitial',
]
df_features = pd.DataFrame(df, columns=selected_columns)

#Regression target (owner counts) and feature matrix as plain arrays
Y = df['SteamSpyOwners'].values
X = df_features.values

print(df_features.shape)
df_features.head(10)

(9227, 13)


Unnamed: 0,Age,RequiredAge,Metacritic,MovieCount,PackageCount,ScreenshotCount,AchievementCount,ControllerSupport,PlatformLinux,PlatformMac,CategoryMultiplayer,CategoryCoop,PriceInitial
28,4978,0,69,1,1,5,0,False,False,False,True,False,9.99
29,4825,0,81,6,2,37,44,False,True,True,True,False,9.99
30,3668,17,72,0,4,39,285,False,True,True,True,True,19.99
31,4768,0,75,1,1,5,0,False,False,False,False,False,9.99
32,4768,0,75,1,1,5,0,False,False,False,False,False,9.99
33,4768,0,75,1,1,5,0,False,False,False,False,False,9.99
34,5068,0,84,2,4,5,0,False,True,True,False,False,9.99
35,4663,0,75,1,2,4,0,False,True,True,False,False,9.99
36,4626,0,84,0,3,5,22,False,True,True,True,False,9.99
37,3905,0,76,7,3,16,12,False,False,True,True,False,9.99


In [156]:
#Hold out 30% of the rows as a test set; the remaining 70% is used for training
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=7
)

#Hyperparameters of the gradient boosting regressor
gbr_params = dict(
    n_estimators=1000,    #number of boosting stages (trees) to fit
    learning_rate=0.1,    #shrinks the contribution of each tree
    max_depth=6,          #maximum depth of each individual tree
    min_samples_leaf=50,  #minimum number of training samples required in a leaf
    max_features='sqrt',  #consider sqrt(n_features) features when looking for a split
    loss='huber',         #loss function that is optimised while fitting
    random_state=7,       #fixed seed so that the fit is repeatable
)

#Fit the regression model on the training split
model = ensemble.GradientBoostingRegressor(**gbr_params)
model.fit(X_train, Y_train)

#Persist the fitted model to a file so it can be reloaded elsewhere
joblib.dump(model, 'models//trained_steam_sale_count_classifier3.pk1')

['models//trained_steam_sale_count_classifier3.pk1']

In [157]:
#Predict once per split and reuse the predictions for both metrics
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

#Mean absolute error on the training set and on the test set.
#NOTE: `mse` holds the mean ABSOLUTE error, not a squared error; the variable
#name is kept unchanged in case other cells read it.
mse = mean_absolute_error(Y_train, train_predictions)
print("Training Set Mean Absolute Error: %.4f" % mse)

mse = mean_absolute_error(Y_test, test_predictions)
print("Test Set Mean Absolute Error: %.4f" % mse)

#R^2 score (coefficient of determination) on the training set and on the test
#set; unlike the error above, a higher value is better.
r2s = r2_score(Y_train, train_predictions)
print("Training Set r2 score: %.4f" % r2s)

r2s = r2_score(Y_test, test_predictions)
print("Test Set r2 score: %.4f" % r2s)

Training Set Mean Absolute Error: 46121.7536
Test Set Mean Absolute Error: 94853.5749
Training Set r2 score: 0.7490
Test Set r2 score: 0.4467


In [151]:
# Names of the model inputs; index i labels the i-th importance score below
feature_labels = np.array([
    'Age', 'RequiredAge', 'Metacritic', 'MovieCount', 'PackageCount',
    'ScreenshotCount', 'AchievementCount', 'ControllerSupport',
    'PlatformLinux', 'PlatformMac', 'CategoryMultiplayer', 'CategoryCoop',
    'PriceInitial',
])

# Reload the fitted model from the file it was saved to with joblib.dump
model = joblib.load('models//trained_steam_sale_count_classifier3.pk1')

# Per-feature importance scores reported by the fitted model
importance = model.feature_importances_

# Feature positions sorted by ascending importance
feature_indexes_by_importance = np.argsort(importance)

# Print one "label - percentage" line per feature, least important first
ranked_labels = feature_labels[feature_indexes_by_importance]
ranked_scores = importance[feature_indexes_by_importance]
for label, score in zip(ranked_labels, ranked_scores):
    print("{} - {:.2f}%".format(label, (score * 100.0)))

CategoryCoop - 0.79%
PlatformLinux - 1.20%
ControllerSupport - 1.23%
CategoryMultiplayer - 1.36%
PlatformMac - 1.99%
MovieCount - 4.41%
RequiredAge - 5.29%
ScreenshotCount - 5.30%
PriceInitial - 7.49%
AchievementCount - 9.76%
PackageCount - 10.48%
Metacritic - 24.72%
Age - 25.97%


In [152]:
#Discard every row whose Metacritic value is 0
#(presumably games without a Metacritic score - confirm against the data set)
rows_without_score = df.index[df.Metacritic == 0]
df = df.drop(rows_without_score)

#Model inputs: all columns where feature importance > 0.75%
selected_columns2 = [
    'Age', 'RequiredAge', 'Metacritic', 'MovieCount', 'PackageCount',
    'ScreenshotCount', 'AchievementCount', 'ControllerSupport',
    'PlatformLinux', 'PlatformMac', 'CategoryMultiplayer', 'CategoryCoop',
    'PriceInitial',
]
df_features2 = pd.DataFrame(df, columns=selected_columns2)

#Regression target (owner counts) and feature matrix as plain arrays
Y2 = df['SteamSpyOwners'].values
X2 = df_features2.values

print(df_features2.shape)
df_features2.head(10)

(2073, 13)


Unnamed: 0,Age,RequiredAge,Metacritic,MovieCount,PackageCount,ScreenshotCount,AchievementCount,ControllerSupport,PlatformLinux,PlatformMac,CategoryMultiplayer,CategoryCoop,PriceInitial
28,4978,0,69,1,1,5,0,False,False,False,True,False,9.99
29,4825,0,81,6,2,37,44,False,True,True,True,False,9.99
30,3668,17,72,0,4,39,285,False,True,True,True,True,19.99
31,4768,0,75,1,1,5,0,False,False,False,False,False,9.99
32,4768,0,75,1,1,5,0,False,False,False,False,False,9.99
33,4768,0,75,1,1,5,0,False,False,False,False,False,9.99
34,5068,0,84,2,4,5,0,False,True,True,False,False,9.99
35,4663,0,75,1,2,4,0,False,True,True,False,False,9.99
36,4626,0,84,0,3,5,22,False,True,True,True,False,9.99
37,3905,0,76,7,3,16,12,False,False,True,True,False,9.99


In [153]:
#Hold out 30% of the rows as a test set; the remaining 70% is used for training
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(
    X2, Y2, test_size=0.3, random_state=7
)

#Hyperparameters of the gradient boosting regressor
gbr_params2 = dict(
    n_estimators=1000,    #number of boosting stages (trees) to fit
    learning_rate=0.1,    #shrinks the contribution of each tree
    max_depth=6,          #maximum depth of each individual tree
    min_samples_leaf=50,  #minimum number of training samples required in a leaf
    max_features='sqrt',  #consider sqrt(n_features) features when looking for a split
    loss='huber',         #loss function that is optimised while fitting
    random_state=7,       #fixed seed so that the fit is repeatable
)

#Fit the regression model on the training split
model = ensemble.GradientBoostingRegressor(**gbr_params2)
model.fit(X_train2, Y_train2)

#Persist the fitted model to a file so it can be reloaded elsewhere
joblib.dump(model, 'models//trained_steam_sale_count_classifier4.pk1')

['models//trained_steam_sale_count_classifier4.pk1']

In [154]:
#Predict once per split and reuse the predictions for both metrics
train_predictions2 = model.predict(X_train2)
test_predictions2 = model.predict(X_test2)

#Mean absolute error on the training set and on the test set.
#NOTE: `mse` holds the mean ABSOLUTE error, not a squared error; the variable
#name is kept unchanged in case other cells read it.
mse = mean_absolute_error(Y_train2, train_predictions2)
print("Training Set Mean Absolute Error: %.4f" % mse)

mse = mean_absolute_error(Y_test2, test_predictions2)
print("Test Set Mean Absolute Error: %.4f" % mse)

#R^2 score (coefficient of determination) on the training set and on the test
#set; unlike the error above, a higher value is better.
r2s = r2_score(Y_train2, train_predictions2)
print("Training Set r2 score: %.4f" % r2s)

r2s = r2_score(Y_test2, test_predictions2)
print("Test Set r2 score: %.4f" % r2s)

Training Set Mean Absolute Error: 135668.5598
Test Set Mean Absolute Error: 256122.6769
Training Set r2 score: 0.7289
Test Set r2 score: 0.4550


In [155]:
# Names of the model inputs; index i labels the i-th importance score below
feature_labels = np.array([
    'Age', 'RequiredAge', 'Metacritic', 'MovieCount', 'PackageCount',
    'ScreenshotCount', 'AchievementCount', 'ControllerSupport',
    'PlatformLinux', 'PlatformMac', 'CategoryMultiplayer', 'CategoryCoop',
    'PriceInitial',
])

# Reload the fitted model from the file it was saved to with joblib.dump
model = joblib.load('models//trained_steam_sale_count_classifier4.pk1')

# Per-feature importance scores reported by the fitted model
importance = model.feature_importances_

# Feature positions sorted by ascending importance
feature_indexes_by_importance = np.argsort(importance)

# Print one "label - percentage" line per feature, least important first
ranked_labels = feature_labels[feature_indexes_by_importance]
ranked_scores = importance[feature_indexes_by_importance]
for label, score in zip(ranked_labels, ranked_scores):
    print("{} - {:.2f}%".format(label, (score * 100.0)))

CategoryMultiplayer - 1.05%
CategoryCoop - 1.14%
PlatformLinux - 1.17%
ControllerSupport - 1.36%
PlatformMac - 2.63%
ScreenshotCount - 4.89%
MovieCount - 5.32%
PackageCount - 6.49%
PriceInitial - 6.89%
RequiredAge - 9.30%
AchievementCount - 11.99%
Age - 21.72%
Metacritic - 26.06%
