# Steam - Game Recommender
![steam](https://logosdownload.com/logo/Steam-Icon-logo-512.png)

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
# TensorFlow 1.x compatibility API: the model cells below rely on placeholders and tf.Session.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 
import os

# Print the full path of every file found under the Kaggle input directory.
kaggle_input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in kaggle_input_paths:
    print(input_path)

## Data Loading and Exploration

In [None]:
# The CSV has no header row: read its first four columns under explicit names.
steam_columns = ['userid', 'game', 'behavior', 'hoursplayed']
steam_raw = pd.read_csv(
    "../input/steam-video-games/steam-200k.csv",
    usecols=[0, 1, 2, 3],
    names=steam_columns,
)
steam_raw.head()
# Quick check for any missing value anywhere in the frame.
steam_raw.isna().to_numpy().any()
# User ids are identifiers, not quantities: store them as strings so they
# are excluded from numeric summaries such as describe().
steam_raw['userid'] = steam_raw['userid'].astype(str)
steam_raw.describe()

In [None]:
# Aggregate only the numeric 'hoursplayed' column. 'userid' and 'behavior' are
# strings; recent pandas versions no longer drop such columns silently, so
# mean() would raise and sum() would concatenate strings.
hours_by_game = steam_raw.groupby('game')[['hoursplayed']]

# Games ranked by average hours per record, then by total hours.
hours_by_game.mean().sort_values(by="hoursplayed", ascending=False).head()
hours_by_game.sum().sort_values(by="hoursplayed", ascending=False).head()

**Eastside Hockey Manager has the highest average hours played, while Dota 2 has the most total hours played.**

In [None]:
# Number of distinct games and distinct users (a missing value, if any, counts once).
steam_raw['game'].nunique(dropna=False)
steam_raw['userid'].nunique(dropna=False)

**There are 5155 unique games and 12393 unique players in the dataset.**

In [None]:
# Distinct users per game, keeping the five games with the most users.
players_per_game = steam_raw.groupby('game')['userid'].nunique()
gb = players_per_game.sort_values(ascending=False).head()

ax = gb.plot.bar(
    title='Number of players for Most Popular Games',
    ylabel='No. of players',
    xlabel='Game',
    figsize=(6, 5),
)

# Annotate each bar with its player count.
ax.bar_label(ax.containers[0])

## Feature Engineering and Metrics
**We assume that a user who plays a game for more than 40 hours enjoys that game.<br>
Thus, we define a binary column "like" that is 1 if the user enjoys the game and 0 otherwise.**

In [None]:
# Work on a copy so the raw frame stays untouched.
steam_df = steam_raw.copy()
# like = 1 when the record shows more than 40 hours, otherwise 0.
steam_df['like'] = (steam_df['hoursplayed'] > 40).astype(int)
steam_df['like'].value_counts()
steam_df.head()

# Number of likes per game, most-liked first.
likes_per_game = steam_df['like'].eq(1).groupby(steam_df['game']).sum()
bg = likes_per_game.sort_values(ascending=False)
bg.head()
gb.head()
# Grouped bar chart (log scale) of player count vs. like count for the games
# present in both series.
gbbg = pd.merge(gb, bg, on='game')
gbc = gbbg.plot.bar(logy=True)



**From the graph, Half-Life 2 Lost Coast has one of the highest numbers of unique players, 981 (purchased and played), but none of them played the game for more than 40 hours.<br>
Now, let's find the users who purchased a game but didn't play it at all.<br>
For these records we reassign hoursplayed to 0 instead of 1,<br>
change the behavior to play, and finally drop the rows whose behavior is still purchase.<br>
<br>
This leaves a dataframe that contains only play records; for games that were purchased but never played, hoursplayed is 0.**

In [None]:
# How many (user, game) pairs have exactly one record, out of all pairs.
records_per_pair = steam_df.groupby(['userid', 'game'])['behavior'].size()
single_record_pairs = records_per_pair[records_per_pair == 1]
len(single_record_pairs)
len(records_per_pair)

# Rows belonging to a pair with fewer than two records are treated as
# "purchased but never played": their hours become 0.
lone_record_mask = steam_df.groupby(['userid', 'game'])['behavior'].transform('size').lt(2)
steam_df.loc[lone_record_mask, 'hoursplayed'] = 0
steam_df[steam_df['hoursplayed'].eq(0)]

# Relabel every zero-hour row as a 'play' record ...
steam_df.loc[steam_df['hoursplayed'].eq(0), 'behavior'] = 'play'

steam_df[steam_df['hoursplayed'].eq(0)]
# ... then drop what is still labelled 'purchase'. Order matters: the relabel
# above must happen before this filter, or the zero-hour rows would be lost.
steam_df = steam_df[steam_df['behavior'].ne('purchase')]

**There are 57904 games purchased that have not been played yet.<br>
Next, we define the metrics to calculate a simple recommendation based on popularity and what other players like.**

In [None]:
# Create a new dataframe to store metrics
# Per-game metrics: total likes and mean hours played.
metrics_df = steam_df.groupby('game', as_index=False).agg(
    **{
        'Sum Likes': ('like', 'sum'),
        'Avg Hours Played': ('hoursplayed', 'mean'),
    }
)
metrics_df[metrics_df['game'].eq("Dota 2")]  # Check Dota 2

# C: mean of the per-game average hours played.
c = metrics_df['Avg Hours Played'].mean()
print("Average hours played across all games is ", round(c, 2), sep="")

# m: minimum number of likes required, set to the 95th percentile of Sum Likes.
m = metrics_df['Sum Likes'].quantile(0.95)
print("Minimum number of likes for a game is ", m, sep="")

**Here the cut-off for the minimum number of likes is 5, which means that at least 5 users must have played the game for more than 40 hours. <br> 
If a game has fewer than 5 likes, we won't recommend it to others.<br>
Now, we can filter the dataframe down to the games that meet this minimum number of likes.**

In [None]:
metrics_df.shape
# Keep only games with at least m likes. Take an explicit copy so that adding
# columns to the filtered frame later does not raise SettingWithCopyWarning.
metrics_df = metrics_df.loc[metrics_df['Sum Likes'] >= m].copy()
metrics_df.shape
metrics_df.head()

## Simple Recommender
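**Next, we create a score for each game.<br>
The score is a weighted average of the game's own average hours played and the average hours played across all games, weighted by the game's likes relative to the minimum number of likes:**

$$\text{score} = \frac{l}{l+m}\,a + \frac{m}{l+m}\,C$$

**where $l$ is the game's Sum Likes, $a$ is its Avg Hours Played, $m$ is the minimum number of likes, and $C$ is the mean of Avg Hours Played across all games.**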

In [None]:
def weighted_rating(df, m=m, C=c):
    """Blend a game's average hours played with the global average C.

    df -- a row or frame exposing 'Sum Likes' and 'Avg Hours Played'.
    m  -- minimum-likes threshold; controls how strongly the score is pulled towards C.
    C  -- average of 'Avg Hours Played' across games.

    Returns (l / (l + m)) * a + (m / (l + m)) * C, where l is 'Sum Likes'
    and a is 'Avg Hours Played'.
    """
    likes = df['Sum Likes']
    avg_hours = df['Avg Hours Played']
    total_weight = likes + m
    own_share = likes / total_weight
    prior_share = m / total_weight
    return (own_share * avg_hours) + (prior_share * C)

# weighted_rating only uses column arithmetic, so call it once on the whole
# frame instead of row by row with apply(axis=1).
metrics_df['score'] = weighted_rating(metrics_df)
metrics_df.head()

In [None]:
# Fifteen highest-scoring games.
metrics_df.sort_values('score', ascending=False).head(15)

**Using the Simple Recommender score, the top games are** 
1. Football Manager, 
2. CSGO, 
3. and Dota 2.<br>
**This yields the most popular games/games that are well-liked by others.**<br>

## Restricted Boltzmann Machine
**Develop an RBM, a stochastic artificial neural network, to generate recommendations.**

In [None]:
# Overview of the frame fed to the RBM: contents, distinct games, distinct users, row count.
steam_df
steam_df['game'].nunique(dropna=False)
steam_df['userid'].nunique(dropna=False)
steam_df.shape[0]

In [None]:
# Give each distinct game a integer position (index_col) that will serve as
# its slot in the per-user training vectors.
games_df = pd.DataFrame({'game': steam_df['game'].unique()})
games_df = games_df.assign(index_col=games_df.index)
games_df

# Attach each record's game position to the play records.
steam_df = pd.merge(steam_df, games_df, on='game')
steam_df.head()

# Spread and centre of the hours-played values.
steam_df['hoursplayed'].std()
steam_df['hoursplayed'].mean()

In [None]:
usergroup = steam_df.groupby('userid')
usergroup.head()

# Maximum number of users turned into training vectors.
amountOfUsedUsers = 1000

train_list = []

# Build one vector per user, in group order, holding the hours played for
# every game (0 where the user has no record for that game).
for user_count, (userID, curUser) in enumerate(usergroup):
    # Check the limit BEFORE appending so that exactly amountOfUsedUsers users
    # are collected (checking afterwards collected one user too many).
    if user_count >= amountOfUsedUsers:
        break
    temp = [0] * len(games_df)
    # Column-wise iteration avoids the per-row Series construction of iterrows().
    for game_position, hours in zip(curUser['index_col'], curUser['hoursplayed']):
        temp[game_position] = hours
    train_list.append(temp)


In [None]:
# Setting the models Parameters
hiddenUnits = 50
visibleUnits = len(steam_raw['game'].unique())
vb = tf.placeholder(tf.float32, [visibleUnits])  
hb = tf.placeholder(tf.float32, [hiddenUnits]) 
W = tf.placeholder(tf.float32, [visibleUnits, hiddenUnits]) 

# Phase 1: Input Processing
v0 = tf.placeholder("float", [None, visibleUnits])
_h0 = tf.nn.sigmoid(tf.matmul(v0, W) + hb)  
h0 = tf.nn.relu(tf.sign(_h0 - tf.random_uniform(tf.shape(_h0)))) 

# Phase 2: Reconstruction
_v1 = tf.nn.sigmoid(tf.matmul(h0, tf.transpose(W)) + vb) 
v1 = tf.nn.relu(tf.sign(_v1 - tf.random_uniform(tf.shape(_v1))))
h1 = tf.nn.sigmoid(tf.matmul(v1, W) + hb)

# Learning rate
alpha = 1

# Create the gradients
w_pos_grad = tf.matmul(tf.transpose(v0), h0)
w_neg_grad = tf.matmul(tf.transpose(v1), h1)

# Calculate the Contrastive Divergence to maximize
CD = (w_pos_grad - w_neg_grad) / tf.to_float(tf.shape(v0)[0])

# Create methods to update the weights and biases
update_w = W + alpha * CD
update_vb = vb + alpha * tf.reduce_mean(v0 - v1, 0)
update_hb = hb + alpha * tf.reduce_mean(h0 - h1, 0)

# Set the error function, here we use Mean Absolute Error Function
err = v0 - v1
err_sum = tf.reduce_mean(err*err)

err_sum

In [None]:
# Current and previous values of the weights and biases, all starting at zero.
cur_w = np.zeros([visibleUnits, hiddenUnits], np.float32)
cur_vb = np.zeros([visibleUnits], np.float32)
cur_hb = np.zeros([hiddenUnits], np.float32)

prv_w = np.zeros([visibleUnits, hiddenUnits], np.float32)
prv_vb = np.zeros([visibleUnits], np.float32)
prv_hb = np.zeros([hiddenUnits], np.float32)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

epochs = 30
batchsize = 150
errors = []
for epoch in range(epochs):
    # Step through every batch, including a final partial one. Zipping two
    # ranges, as before, silently skipped the trailing users.
    for start in range(0, len(train_list), batchsize):
        batch = train_list[start:start + batchsize]
        # Fetch all three updates in ONE run so they are computed from the same
        # random Gibbs sample; separate runs redraw the samples each time and
        # triple the work.
        cur_w, cur_vb, cur_hb = sess.run(
            [update_w, update_vb, update_hb],
            feed_dict={v0: batch, W: prv_w, vb: prv_vb, hb: prv_hb},
        )
        prv_w, prv_vb, prv_hb = cur_w, cur_vb, cur_hb
    # Reconstruction error over the full training set after each epoch.
    errors.append(sess.run(err_sum, feed_dict={v0: train_list, W: cur_w, vb: cur_vb, hb: cur_hb}))
    print(errors[-1])
plt.plot(errors)
plt.ylabel('Error')
plt.xlabel('Epoch')
plt.show()

In [None]:
# Position (in group order) of the user we generate recommendations for.
input_user_position = 150
inputUser = [train_list[input_user_position]]

# Deterministic forward pass and reconstruction (probabilities, no sampling).
hh0 = tf.nn.sigmoid(tf.matmul(v0, W) + hb)
vv1 = tf.nn.sigmoid(tf.matmul(hh0, tf.transpose(W)) + vb)
feed = sess.run(hh0, feed_dict={v0: inputUser, W: prv_w, hb: prv_hb})
rec = sess.run(vv1, feed_dict={hh0: feed, W: prv_w, vb: prv_vb})

# List the 10 most recommended games for our mock user.
# Work on a copy so the shared game index (games_df) is not modified.
inputuser_games = games_df.copy()
inputuser_games["Recommendation Score"] = rec[0]
inputuser_games.sort_values(["Recommendation Score"], ascending=False).head(10)

# train_list follows the iteration order of `usergroup`, so the matching user
# id is the one whose group number equals the chosen position. Row 150 of
# steam_df, used before, belongs to an unrelated user.
userid = steam_df.loc[usergroup.ngroup() == input_user_position, 'userid'].iloc[0]

# Find the games this input user has played
muser_df = steam_df.loc[(steam_df['userid'] == userid) & (steam_df['hoursplayed'] > 0)]
muser_df

In [None]:
# Left-join the scored games with the user's played games; rows flagged
# 'left_only' by the merge indicator have no matching play record.
df_all = pd.merge(inputuser_games, muser_df, how='left', indicator=True)
unplayed_games = df_all.loc[df_all['_merge'].eq('left_only')]

# Top 5 recommended games that the input user has not played
unplayed_scores = unplayed_games[['game', 'Recommendation Score']]
unplayed_scores.sort_values(by='Recommendation Score', ascending=False).head(5)

**The top 5 recommended games for this user are Football Manager 2014, Arma 3, Euro Truck Simulator 2, H1Z1 and Torchlight II.**