# Imports

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import pandas as pd 
import helper

pd.set_option("max_columns", 200)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Load database tables into Pandas dataframe

Using our helper script with db credentials/connection pre-defined

In [2]:
raw_df = helper.get_all().set_index('appid')
raw_df.head(2)

Unnamed: 0_level_0,required_age,supported_languages,developers,publishers,categories,genres,achievements,linux,mac,windows,price,coming_soon,date,positive,negative,owners,average_forever,average_2weeks,median_forever,median_2weeks,ccu,name
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10,0,"['English', 'French', 'German', 'Italian', 'Sp...",['Valve'],['Valve'],"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",['Action'],0,1,1,1,999,False,2000-11-01,183324,4765,"10,000,000 .. 20,000,000",10560,633,185,1007,13403,Counter-Strike
20,0,"['English', 'French', 'German', 'Italian', 'Sp...",['Valve'],['Valve'],"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",['Action'],0,1,1,1,499,False,1999-04-01,5204,861,"2,000,000 .. 5,000,000",245,0,16,0,54,Team Fortress Classic


# Drop columns we won't be predicting on to shrink dataframe

In [3]:
dropcolumns = ['required_age','supported_languages','developers','publishers','achievements','linux','mac','windows','date','owners','average_forever','average_2weeks','median_forever','median_2weeks','ccu','name']
trimmed_df = raw_df.drop(columns = dropcolumns)
trimmed_df.head(2)

Unnamed: 0_level_0,categories,genres,price,coming_soon,positive,negative
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10,"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",['Action'],999,False,183324,4765
20,"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",['Action'],499,False,5204,861


## Filter out any rows 'coming soon'

In [4]:
out_now_df = trimmed_df[~trimmed_df['coming_soon']].drop(columns='coming_soon')
out_now_df.head(2)

Unnamed: 0_level_0,categories,genres,price,positive,negative
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",['Action'],999,183324,4765
20,"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",['Action'],499,5204,861


## Filter out rows with not enough data to make an interesting prediction

Filter on at least 100 total reviews

In [5]:
total_reviews = out_now_df['positive'] + out_now_df['negative']

filter_reviews_df = out_now_df[total_reviews > 100]
filter_reviews_df.head(2)

Unnamed: 0_level_0,categories,genres,price,positive,negative
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",['Action'],999,183324,4765
20,"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",['Action'],499,5204,861


## Define values for 'good' or 'bad' user review ratios and make categorical bins from the result

Investigate what the distribution of review ratios is

In [6]:
ratios = out_now_df['positive'].div(out_now_df['negative'])
print(ratios.describe())
quantiles = ratios.quantile([0.1, 0.25, 0.50, 0.75]).get_values()
print(quantiles)

count    3.274300e+04
mean              inf
std               NaN
min      0.000000e+00
25%      1.500000e+00
50%      3.394366e+00
75%      8.666667e+00
max               inf
dtype: float64
[0.66666667 1.5        3.3943662  8.66666667]


Create bins for the quantiles above

In [7]:
def bin_ratio(ratio):
    if ratio < quantiles[0]:
        return f'LT10%'
    if ratio < quantiles[1]:
        return f'LT25%'
    if ratio < quantiles[2]:
        return f'LT50%'
    if ratio < quantiles[3]: 
        return f'LT75%'
    else:
        return f'GT75%'

## Apply the binning function and drop the original review count columns

In [8]:
binned_df = out_now_df.drop(columns = ['positive', 'negative'])
binned_df['quantiles'] = ratios.apply(bin_ratio)
binned_df.head(2)

Unnamed: 0_level_0,categories,genres,price,quantiles
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",['Action'],999,GT75%
20,"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",['Action'],499,LT75%


# Convert categorical columns into binary dummy columns


Pivot a column with lists as values into a series of columns for each distinct value in any of the contained lists.

The resulting columns will have a '1' value if the value was in that rows list of values, making them appropriately dummied values for later ML processing

In [9]:
def explodelist(column_name, alias, dataframe):
    df = dataframe[[column_name]]
    test = df.join(pd.Series(df[column_name].apply(eval).apply(pd.Series).stack().reset_index(1, drop=True),
                        name=alias))
    dummies_testdf = pd.get_dummies(test.drop(column_name, axis=1),
    columns=[alias]).groupby(test.index).sum()
    return dummies_testdf.join(dataframe).drop(columns= column_name)

Apply the newly created function to columns containing lists and dummy the remaining categorical columns

In [10]:
list_cols = [('categories', 'category'), ('genres', 'genre')]
dummy_columns = ['quantiles']

dummy_df = binned_df
for col, alias in list_cols:
    dummy_df = explodelist(col, alias, dummy_df)

dummy_df = pd.get_dummies(dummy_df, columns=dummy_columns, dtype=float)
dummy_df.shape

(32889, 77)

## Split X and Y data

In [11]:
y_columns = [x for x in dummy_df.columns if x.startswith('quantiles')]
y_columns

['quantiles_GT75%',
 'quantiles_LT10%',
 'quantiles_LT25%',
 'quantiles_LT50%',
 'quantiles_LT75%']

In [12]:
X = dummy_df.drop(columns = y_columns)
Y = dummy_df[y_columns]
print(Y.columns)
X.head(2)

Index(['quantiles_GT75%', 'quantiles_LT10%', 'quantiles_LT25%',
       'quantiles_LT50%', 'quantiles_LT75%'],
      dtype='object')


Unnamed: 0_level_0,genre_Accounting,genre_Action,genre_Adventure,genre_Animation & Modeling,genre_Audio Production,genre_Casual,genre_Design & Illustration,genre_Early Access,genre_Education,genre_Free to Play,genre_Game Development,genre_Gore,genre_Gratuito p/ Jogar,genre_Indie,genre_Massively Multiplayer,genre_Movie,genre_Multijogador Massivo,genre_Nudity,genre_Photo Editing,genre_RPG,genre_Racing,genre_Sexual Content,genre_Simulation,genre_Software Training,genre_Sports,genre_Strategy,genre_Utilities,genre_Video Production,genre_Violent,genre_Web Publishing,category_Captions available,category_Co-op,category_Commentary available,category_Compras em aplicativo,category_Cross-Platform Multiplayer,category_Full controller support,category_In-App Purchases,category_Includes Source SDK,category_Includes level editor,category_JxJ,category_JxJ online,category_LAN Co-op,category_LAN PvP,category_MMO,category_Mods,category_Mods (require HL2),category_Multi-player,category_Multijogador,category_Multijogador entre plataformas,category_Online Co-op,category_Online PvP,category_Partial Controller Support,category_PvP,category_Remote Play Together,category_Remote Play on Phone,category_Remote Play on TV,category_Remote Play on Tablet,category_Shared/Split Screen,category_Shared/Split Screen Co-op,category_Shared/Split Screen PvP,category_Single-player,category_Stats,category_Steam Achievements,category_Steam Cloud,category_Steam Leaderboards,category_Steam Trading Cards,category_Steam Turn Notifications,category_Steam Workshop,category_SteamVR Collectibles,category_VR Support,category_Valve Anti-Cheat enabled,price
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1
10,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,999
20,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,499


## Apply StandardScaler to any numeric columns

First split out training and testing data

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 50)

## Use a ColumnTransformer so we don't have to pass the dummy columns into the scaler

In [14]:
scale_cols = ['price']

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

ct = ColumnTransformer([
        ('ct', StandardScaler(), scale_cols)
     ], remainder='passthrough')

X_train_scaled = ct.fit_transform(X_train)
X_test_scaled = ct.fit_transform(X_test)

X_train_scaled

array([[-0.61803171,  0.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.80849515,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.3309009 ,  0.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.58330452,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.04377008,  0.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.62620181,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

## Define Neural Network architecture

In [15]:
nn2 = tf.keras.models.Sequential()

nn2.add(tf.keras.layers.Dense(units=50, activation="relu",input_dim=X_train_scaled.shape[1]))

nn2.add(tf.keras.layers.Dense(units=40, activation="relu"))

nn2.add(tf.keras.layers.Dense(units=30, activation="relu"))

nn2.add(tf.keras.layers.Dense(units=20, activation="relu"))

nn2.add(tf.keras.layers.Dropout(0.3))

nn2.add(tf.keras.layers.Dense(units=len(y_columns), activation="sigmoid"))

nn2.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                3650      
_________________________________________________________________
dense_1 (Dense)              (None, 40)                2040      
_________________________________________________________________
dense_2 (Dense)              (None, 30)                1230      
_________________________________________________________________
dense_3 (Dense)              (None, 20)                620       
_________________________________________________________________
dropout (Dropout)            (None, 20)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 105       
Tota

In [16]:
nn2.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

fit_model = nn2.fit(X_train_scaled, y_train, epochs=10)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Print out accuracy and loss based on test data

In [17]:
model_loss, model_accuracy = nn2.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

8223/8223 - 0s - loss: 0.4656 - acc: 0.8004
Loss: 0.4655855799314243, Accuracy: 0.8003901243209839
