In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import gzip
from collections import defaultdict
import math
import sklearn
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import random

2023-12-04 11:41:48.313718: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-04 11:41:48.313745: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-04 11:41:48.313763: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-04 11:41:48.318510: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [3]:
game_meta = []
users = []

In [4]:
for d in readGz("steam_games.json.gz"):
    game_meta.append(d)
for d in readGz("australian_users_items.json.gz"):
    #remove all user without purchase
    if len(d['items']) > 0:
        users.append(d)

In [5]:
def price_to_text(price):
    if type(price) == str:
        return "Free to Play"
    price = float(price)
    if price<10:
        return "low-price"
    if price<35:
        return "medium-price"
    return "high-price"

In [6]:
all_genre_tags = defaultdict(int)
game_tags = defaultdict(set)
for d in game_meta:
    keys = d.keys()
    if ('id' not in keys):
        #print(d)
        if (d['url'] == 'http://store.steampowered.com/app/200260'):
            g = 200260
        else:
            continue
    else:
        g = d['id']
    if 'price' not in keys:
        game_tags[g].add('Free to Play')
    else:
        game_tags[g].add(price_to_text(d['price']))
    if ('tags' in keys):      
        for tag in d['tags']:
            game_tags[g].add(tag)
    if ('genres' in keys):
        for genre in d['genres']:
            game_tags[g].add(genre)
    # if ('specs' in keys):
    #     for spec in d['specs']:
    #         game_tags[g].add(spec)
    for tag in game_tags[g]:
        all_genre_tags[tag] += 1

In [7]:
game_purchased_count = defaultdict(int)
for d in users:
    items = d['items']
    for item in items:
        if (item['playtime_forever'] > 60):
            game_purchased_count[item['item_id']] += 1

In [8]:
game_to_keep = set()
for g in game_purchased_count:
    if (game_purchased_count[g]>100):
        game_to_keep.add(g)

In [9]:
sorted_tags = sorted(list(all_genre_tags.keys()), key = lambda x: all_genre_tags[x], reverse = True)

In [10]:
def process_data(all_users, tag_to_keep):
    A_i = defaultdict(list)
    tags_kept = set(sorted_tags[:tag_to_keep])
    tags_kept_l = sorted_tags[:tag_to_keep]
    rare_bonus = {k:(lambda x: x*1.0/tag_to_keep*0.6-0.3 if x>tag_to_keep/2 else 0)(i) for i,k in enumerate(tags_kept)}
    games_no_tags = []
    for g in game_to_keep:
        tags = game_tags[g]
        tag_list = list(game_tags[g])
        new_tags = tags_kept.intersection(tags)
        if (len(new_tags) == 0):
            #print("no tag!!")
            games_no_tags.append(g)
            continue
        else:
            for tag in tags_kept_l:
                if tag not in tags:
                    A_i[g].append(0)
                else:
                    #using gamma transformation to transform ranking of tag to 0-1
                    relevance = 1-(tag_list.index(tag)*1.0/len(game_tags[g]))**3
                    #apply rare bonus
                    relevance += rare_bonus[tag]
                    if (relevance>1):
                        relevance = 1.0
                    A_i[g].append(relevance)
    games_w_tags = list(A_i.keys())
    games_w_tags_set = set(A_i.keys())
    interactions = []
    users_kept = []
    for u in all_users:
        num_recorded = 0
        for item in u['items']:
            if item['item_id'] in games_w_tags_set:
                num_recorded += 1
        if num_recorded>=10:
            users_kept.append(u)
    a = 0
    above_zero = lambda x: x if x > 0 else 0
    for u in tqdm(users_kept):
        for item in u['items']:
            if item['item_id'] in games_w_tags_set:
                if (item['playtime_forever'] > 120):
                    interactions.append((a, games_w_tags.index(item['item_id']), above_zero(np.log2(item['playtime_forever']/60))))
        a += 1
    A_i_matrix = np.zeros((tag_to_keep, len(games_w_tags)))
    for a,g in enumerate(games_w_tags):
        for b,relevance in enumerate(A_i[g]):
            A_i_matrix[b,a] = relevance
    return A_i_matrix, interactions, users_kept, games_w_tags

In [11]:
A_i_matrix_30, interactions_30, users_kept_30, games_w_tags_30 = process_data(users, 30)

  0%|          | 0/54667 [00:00<?, ?it/s]

In [12]:
average_hours = float(sum([d[2] for d in interactions_30])*1.0/len(interactions_30))

In [13]:
class tagMF(tf.keras.Model):
    def __init__(self, F, lamb, A_i, mu, U, I):
        super(tagMF, self).__init__()
        self.alpha = tf.Variable(mu)
        self.P = tf.Variable(tf.random.normal([U,F],stddev=0.001)) #User factors, R^(U, F)
        self.Theta_i = tf.Variable(tf.random.normal([len(A_i),F],stddev=0.001)) #tag to factors, R^(T, F)
        self.A_i = tf.constant(A_i) #Item-tag relevance, R^(I, T)
        self.betaU = tf.Variable(tf.random.normal([U],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([I],stddev=0.001))
        self.lamb = lamb
        self.F = F

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p_u = tf.reshape(self.P[u], [1, -1])
        p = tf.linalg.matmul(p_u, tf.transpose(self.Theta_i))
        p = tf.squeeze(p)
        final_result = tf.reduce_sum(tf.multiply(p, self.A_i[:,i]))
        print(final_result)
        return self.alpha + self.betaU[u] + self.betaI[i] + final_result
        
    def predict_given_user(self, u, i):
        p_u = tf.reshape(tf.constant(u), [1, -1])
        p = tf.linalg.matmul(p_u, self.Theta_i)
        p = tf.squeeze(p)
        Theta_Ai = tf.linalg.matmul(tf.transpose(self.Theta_i), tf.reshape(self.A_i[:,i], [-1, 1]))
        Theta_Ai = tf.squeeze(Theta_Ai)
        final_result = tf.reduce_sum(tf.multiply(p, Theta_Ai))
        return final_result
        
    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.P**2) + tf.reduce_sum(self.Theta_i**2) + tf.reduce_sum(self.betaU**2) + tf.reduce_sum(self.betaI**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        p_u = tf.transpose(tf.nn.embedding_lookup(self.P, u))
        A = tf.gather(self.A_i, indices=i, axis=1)
        i_factors = tf.linalg.matmul(tf.transpose(self.Theta_i), A)
        pred = tf.reduce_sum(tf.multiply(p_u, i_factors),0)
        return self.alpha + beta_u + beta_i + pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        squared_diff = tf.square(pred - r)
        mse_loss = tf.reduce_mean(squared_diff)
        
        return mse_loss

In [14]:
def trainingStep(model, interactions):
    Nsamples = 50000
    reg = 0
    with tf.GradientTape() as tape:
        sampleU,sampleI,sampleR = zip(*random.sample(interactions, Nsamples))
        loss = model(sampleU,sampleI,sampleR)
        reg = model.reg()
        loss += reg
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()-reg

In [15]:
def MSE(y, ypred):
            return sum((yt - yp) ** 2 for yt, yp in zip(y, ypred)) / len(y)

In [17]:
tag_num = 30
lamb = 0.0001
random.seed(116)
train_interaction, test_interaction = train_test_split(interactions_30, train_size=0.98, random_state=116)
model = tagMF(30,lamb,A_i_matrix_30.astype("float32"), average_hours, len(users_kept_30), len(games_w_tags_30))
optimizer = tf.keras.optimizers.Adam(0.05)
for i in tqdm(range(101)):
    obj = trainingStep(model, train_interaction)
    if (i%20 == 0):
        sampleU,sampleI,sampleR =zip(*test_interaction)
        predR = model.predictSample(sampleU,sampleI)
        mse_vali = MSE(sampleR,predR)
        print(obj)
        print(f"MSE:{mse_vali}")

2023-12-04 11:43:19.971987: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-04 11:43:19.976245: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-04 11:43:19.976400: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

  0%|          | 0/101 [00:00<?, ?it/s]

2023-12-04 11:43:28.234419: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x1df24420 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-12-04 11:43:28.234435: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
2023-12-04 11:43:28.237374: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-12-04 11:43:28.246265: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8600
2023-12-04 11:43:28.299913: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


tf.Tensor(3.6794248, shape=(), dtype=float32)
MSE:3.6381757259368896
tf.Tensor(2.593016, shape=(), dtype=float32)
MSE:2.7658345699310303
tf.Tensor(2.4180946, shape=(), dtype=float32)
MSE:2.702401638031006
tf.Tensor(2.3359957, shape=(), dtype=float32)
MSE:2.6372506618499756
tf.Tensor(2.3230011, shape=(), dtype=float32)
MSE:2.626047372817993
tf.Tensor(2.2549117, shape=(), dtype=float32)
MSE:2.6174163818359375


In [18]:
sorted_tags[:30]

['low-price',
 'Indie',
 'Action',
 'Adventure',
 'Casual',
 'Simulation',
 'Strategy',
 'RPG',
 'medium-price',
 'Free to Play',
 'Singleplayer',
 'Multiplayer',
 'Great Soundtrack',
 'Puzzle',
 'Early Access',
 '2D',
 'Atmospheric',
 'VR',
 'Sports',
 'Platformer',
 'Story Rich',
 'Sci-fi',
 'Fantasy',
 'Horror',
 'Open World',
 'Difficult',
 'Anime',
 'Massively Multiplayer',
 'Pixel Graphics',
 'Co-op']

In [19]:
#select from tags above, replace index with index of selected tags in sorted_tags
Selected_tags = np.zeros(30).astype("float32")
Selected_tags[5] = 0.85
Selected_tags[6] = 0.95
Selected_tags[21] = 1.0

In [None]:
result = []
for i in range(len(games_w_tags_30)):
    result.append(model.predict_given_user(Selected_tags, i).numpy())

In [None]:
result_sorted = [(x,i) for i,x in enumerate(result)]
recommendation = sorted(result_sorted, reverse = True)[:20]
for rec in recommendation:
    print(f"https://store.steampowered.com/app/{games_w_tags_30[rec[1]]}   {rec[0]}")