In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV

#Load the data set
df = pd.read_csv("data//steam_game_dataset.csv")

#Remove the fields from the data set that we don't want to include in our model
df_features = pd.DataFrame(df,columns=['Age','RequiredAge',
                                      'Metacritic', 'MovieCount','PackageCount',
                                      'ScreenshotCount', 'AchievementCount',
                                      'ControllerSupport', 'PlatformLinux', 'PlatformMac',
                                      'CategoryMultiplayer', 'CategoryCoop', 'PriceInitial'])
df_features.fillna(0)
df.fillna(0)

#Create the X and Y arrays
X = df_features.values
Y = df['SteamSpyOwners'].values

In [18]:
#Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=7)

#Fit Regression model
model = ensemble.GradientBoostingRegressor()

#Parameters we want to try
param_grid = {
    'n_estimators': [500, 1000, 3000], #how many decision trees to build
    'learning_rate': [0.1, 0.05, 0.02, 0.01], #how much decision trees influence overall prediction
    'max_depth': [4, 6],
    'min_samples_leaf': [3, 5, 9, 17],
    'max_features': [1.0, 0.3, 0.1],
    'loss': ['ls', 'lad', 'huber']
}

#Define the grid search we want to run. Run it with eight cpus in parallel.
gs_cv = GridSearchCV(model, param_grid, n_jobs=-1, verbose=100)

#Run the grid search - on only the training data!
gs_cv.fit(X_train, Y_train)

#Print the parameters that gave us the best result!
print(gs_cv.best_params_)

Fitting 3 folds for each of 864 candidates, totalling 2592 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Pickling array (shape=(8743, 13), dtype=object).
Pickling array (shape=(8743,), dtype=int64).
Pickling array (shape=(5828,), dtype=int32).
Pickling array (shape=(2915,), dtype=int32).
Pickling array (shape=(8743, 13), dtype=object).
Pickling array (shape=(8743,), dtype=int64).
Pickling array (shape=(5829,), dtype=int32).
Pickling array (shape=(2914,), dtype=int32).
Pickling array (shape=(8743, 13), dtype=object).
Pickling array (shape=(8743,), dtype=int64).
Pickling array (shape=(5829,), dtype=int32).
Pickling array (shape=(2914,), dtype=int32).
Pickling array (shape=(8743, 13), dtype=object).
Pickling array (shape=(8743,), dtype=int64).
Pickling array (shape=(5828,), dtype=int32).
Pickling array (shape=(2915,), dtype=int32).
Pickling array (shape=(8743, 13), dtype=object).
Pickling array (shape=(8743,), dtype=int64).
Pickling array (shape=(5829,),

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
# Find the error rate on the training set using the best parameters
mse = mean_absolute_error(Y_train, gs_cv.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

# Find the error rate on the test set using the best parameters
mse = mean_absolute_error(Y_test, gs_cv.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)

# Find the error rate on the test set using the best parameters
r2s = r2_score(Y_test, gs_cv.predict(X_test))
print("Test Set R2 Score: %.4f" % r2s)

# Find the error rate on the test set using the best parameters
r2s = r2_score(Y_test, gs_cv.predict(X_test))
print("Test Set R2 Score: %.4f" % r2s)