In [187]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model
import tarfile 
from implicit import bpr
import tensorflow as tf

In [188]:
def readJSON(path):
    """Yield ``(userID, gameID, record)`` tuples from a gzipped text file.

    The first line of the file is read and discarded; every following line
    is evaluated as a Python literal that must provide the keys ``'userID'``
    and ``'gameID'``.

    Parameters
    ----------
    path : str
        Location of the gzip-compressed file.

    Yields
    ------
    tuple
        ``(u, g, d)`` where ``d`` is the full record parsed from the line.
    """
    # The context manager closes the handle when the generator is exhausted
    # or garbage-collected; previously the file was left open.
    with gzip.open(path, 'rt') as f:
        f.readline()  # first line is skipped (presumably a header -- TODO confirm)
        for l in f:
            # SECURITY: eval() executes arbitrary code found in the file.
            # Only use this on trusted data; consider ast.literal_eval.
            d = eval(l)
            u = d['userID']
            g = d['gameID']
            yield u,g,d

In [189]:
# Materialise every (user, game, record) tuple from the training file.
dataset = list(readJSON("train.json.gz"))

In [190]:
# Lookup structures filled in a single pass over the training interactions.
userIDs, itemIDs = {}, {}          # raw id -> dense integer index
sample = []                        # (user, game, 1) triples
interactions = []
allItems = set()
itemsPerUser = defaultdict(set)
usersPerItem = defaultdict(set)
itemCount = defaultdict(int)
total = 0

for u, g, _ in dataset:
    sample.append((u, g, 1))
    itemsPerUser[u].add(g)
    usersPerItem[g].add(u)
    allItems.add(g)
    itemCount[g] += 1
    total += 1
    # setdefault only assigns a fresh index the first time an id is seen.
    userIDs.setdefault(u, len(userIDs))
    itemIDs.setdefault(g, len(itemIDs))

# Games ordered from most to least interactions.
isPopular = []
mostPopular = sorted(((itemCount[x], x) for x in itemCount), reverse=True)

# Collect top games until they account for more than half of all interactions.
count = 0
for ic, i in mostPopular:
    count += ic
    isPopular.append(i)
    if count > total / 2:
        break

nUsers, nItems = len(userIDs), len(itemIDs)

In [191]:
# Game ids as a list so random.choice can index into them.
items = list(itemIDs)

In [192]:
class BPRbatch(tf.keras.Model):
    """Latent-factor model scored as ``betaI[i] + gammaU[u] . gammaI[i]``.

    ``call`` returns the negative mean log-sigmoid of the score difference
    between two items for the same user. Parameter shapes are taken from the
    module-level ``userIDs`` and ``itemIDs`` dictionaries at construction
    time.

    Parameters
    ----------
    K : int
        Number of latent dimensions per user and per item.
    lamb : float
        Coefficient applied to the L2 regularizer returned by ``reg``.
    """

    def __init__(self, K, lamb):
        super().__init__()
        # Parameters start as small random values (stddev 0.001).
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    def predict(self, u, i):
        """Return the score for a single (user index, item index) pair."""
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    def reg(self):
        """Return ``lamb`` times the summed L2 loss of all parameters."""
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaU) +\
                            tf.nn.l2_loss(self.gammaI))

    def score(self, sampleU, sampleI):
        """Return scores for parallel sequences of user and item indices."""
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        """Return the mean pairwise loss for items I scored against items J."""
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        # log_sigmoid is the numerically stable form of log(sigmoid(x));
        # the naive composition underflows to -inf for large negative margins.
        return -tf.reduce_mean(tf.math.log_sigmoid(x_ui - x_uj))

In [193]:
# Adam optimizer shared by every training step; 0.1 is the learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)



In [194]:
# Five latent dimensions, L2 coefficient 1e-5.
modelBPR = BPRbatch(K=5, lamb=0.00001)

In [195]:
def trainingStepBPR(model, interaction, Nsamples=50000):
    """Run one optimizer step on a freshly sampled batch of triples.

    Each triple pairs a user and an item drawn from ``interaction`` with an
    item drawn uniformly from ``items`` that is not in ``itemsPerUser`` for
    that user.

    Parameters
    ----------
    model : BPRbatch
        Model whose trainable variables are updated in place.
    interaction : sequence of (user, item, label) tuples
        Pool from which positive pairs are drawn.
    Nsamples : int, optional
        Number of triples in the batch (default 50000).

    Returns
    -------
    The value of ``loss.numpy()`` for this batch: pairwise loss plus
    regularizer.

    Raises
    ------
    IndexError
        If ``interaction`` is empty, or a sampled user has no item left to
        use as a negative.
    """
    nCandidates = len(items)
    sampleU, sampleI, sampleJ = [], [], []
    for _ in range(Nsamples):
        u, i, _label = random.choice(interaction)  # positive sample
        seen = itemsPerUser[u]
        if len(seen) >= nCandidates:
            # No negative exists; without this guard the loop below would spin forever.
            raise IndexError("no negative item available for user %r" % (u,))
        # Rejection sampling: uniform over unseen items without building
        # the full set difference for every draw.
        j = random.choice(items)  # negative sample
        while j in seen:
            j = random.choice(items)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleJ.append(itemIDs[j])

    # Only the forward pass needs to be recorded on the tape.
    with tf.GradientTape() as tape:
        loss = model(sampleU,sampleI,sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [196]:
# 200 training steps, reporting the objective after every tenth one.
for i in range(200):
    obj = trainingStepBPR(modelBPR, sample)
    if (i + 1) % 10 == 0:
        print(f"iteration {i + 1}, objective = {obj!s}")

iteration 10, objective = 0.5108999
iteration 20, objective = 0.48332793
iteration 30, objective = 0.47399753
iteration 40, objective = 0.46034557
iteration 50, objective = 0.45629472
iteration 60, objective = 0.45243314
iteration 70, objective = 0.44801155
iteration 80, objective = 0.44625986
iteration 90, objective = 0.44294336
iteration 100, objective = 0.4446679
iteration 110, objective = 0.4437706
iteration 120, objective = 0.44518203
iteration 130, objective = 0.43935502
iteration 140, objective = 0.44386712
iteration 150, objective = 0.44133848
iteration 160, objective = 0.44336075
iteration 170, objective = 0.43982622
iteration 180, objective = 0.43985367
iteration 190, objective = 0.4400969
iteration 200, objective = 0.44245327


In [197]:
# Write a 0/1 prediction for every (user, game) pair in pairs_Played.csv.
# Pairs known to the model are labelled by thresholding the model score at 0.45;
# any other pair is labelled by membership in the popular-games list.
popularGames = set(isPopular)  # set membership instead of scanning the list per row
index = 0
# Context managers close both files even if a row fails to parse.
with open("pairs_Played.csv") as pairs_file, \
        open("predictions_Played.csv", 'w') as predictions_csv:
    for l in pairs_file:
        if l.startswith("userID"):
            # Header row is copied through unchanged.
            predictions_csv.write(l)
            continue
        u,g = l.strip().split(',')
        if u in userIDs and g in itemIDs:
            pred = modelBPR.predict(userIDs[u], itemIDs[g]).numpy() > 0.45
        else:
            pred = g in popularGames
        predictions_csv.write(u + ',' + g + ',' + str(int(pred)) + '\n')
        index += 1

In [379]:
# Group training records by user and by game, and initialise the bias terms
# (alpha, betaU, betaI) from simple means of 'hours_transformed'.
userHours = defaultdict(list)
itemHours = defaultdict(list)
rating = {}
globalAverage = 0

for u,g,d in dataset:
    h = d['hours_transformed']
    userHours[u].append(d)
    itemHours[g].append(d)
    globalAverage += h

# The loop above accumulates a sum; divide by the record count so the value
# really is the average (previously alpha was set to the raw total).
if dataset:
    globalAverage /= len(dataset)

betaU = {}
betaI = {}

for u in userHours:
    betaU[u] = numpy.mean([d['hours_transformed'] for d in userHours[u]])
for g in itemHours:
    betaI[g] = numpy.mean([d['hours_transformed'] for d in itemHours[g]])

alpha = globalAverage

In [382]:
def iterate(lamb):
    """Run one round of updates for ``alpha``, ``betaU`` and ``betaI``.

    ``alpha`` becomes the mean residual of ``hours_transformed`` after
    subtracting the current user and game terms. Each user term, then each
    game term, becomes the sum of its residuals divided by
    ``lamb + number of records``.

    Parameters
    ----------
    lamb : float
        Regularization strength added to each denominator of the bias updates.

    Side effects
    ------------
    Rebinds the module-level ``alpha`` and mutates ``betaU`` and ``betaI``
    in place. Does nothing when ``dataset`` is empty.
    """
    # Without this declaration the assignment below creates a local variable
    # and the module-level alpha used for prediction is never updated.
    global alpha
    if not dataset:
        return
    residual = 0
    for u,g,d in dataset:
        residual += (d['hours_transformed'] - (betaU[u] + betaI[g]))
    alpha = residual / len(dataset)
    for u, ds in userHours.items():
        residual = 0
        for d in ds:
            g = d["gameID"]
            residual += (d['hours_transformed'] - (alpha + betaI[g]))
        betaU[u] = residual / (lamb + len(ds))
    for g, ds in itemHours.items():
        residual = 0
        for d in ds:
            u = d["userID"]
            # Uses the user terms refreshed in the loop just above.
            residual += (d['hours_transformed'] - (alpha + betaU[u]))
        betaI[g] = residual / (lamb + len(ds))

In [389]:
# Repeat the bias updates 200 times with regularization strength 5.
for i in range(200):
    iterate(lamb=5)

In [390]:
def feat(u,g):
    """Return ``alpha`` plus the user and game terms for the pair ``(u, g)``.

    A user missing from ``betaU`` or a game missing from ``betaI``
    contributes 0.
    """
    user_term = betaU.get(u, 0)
    game_term = betaI.get(g, 0)
    return alpha + user_term + game_term

In [391]:
# Write one predicted value per (user, game) pair in pairs_Hours.csv.
# Context managers close both files even if a row fails to parse.
with open("pairs_Hours.csv") as pairs_file, \
        open("predictions_Hours.csv", 'w') as predictions_csv:
    for l in pairs_file:
        if l.startswith("userID"):
            # Header row is copied through unchanged.
            predictions_csv.write(l)
            continue
        u,g = l.strip().split(',')
        # feat() is the predictor defined in this notebook; the earlier
        # tp.predict referred to a name that is not defined anywhere here.
        predictions_csv.write(u + ',' + g + ',' + str(feat(u,g)) + '\n')
        index += 1  # counter initialised in the Played prediction cell