In [1]:
import gzip
import numpy as np
import pandas as pd
import scipy.sparse as sp

In [2]:
def readData(path):
    """Read user/game/playtime records from a gzip-compressed text file.

    The first line of the file is read and discarded. Every remaining
    non-blank line must be a Python dict literal containing the keys
    'userID', 'gameID' and 'hours_transformed'.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip file; it is opened in text mode.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns 'userId', 'gameId' and 'hours'
        (the columns are present even when the file holds no records).

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a plain Python literal.
    KeyError
        If a record lacks one of the required keys.
    """
    # Function-scope import keeps this cell self-contained.
    import ast

    records = []
    # The context manager guarantees the gzip handle is closed, even on error.
    with gzip.open(path, 'rt') as f:
        f.readline()  # first line is skipped, as in the original loader
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            # literal_eval only accepts literals, so a crafted line in the
            # data file cannot execute arbitrary code the way eval() would.
            d = ast.literal_eval(line)
            records.append({
                'userId': d['userID'],
                'gameId': d['gameID'],
                'hours': d['hours_transformed'],
            })
    return pd.DataFrame(records, columns=['userId', 'gameId', 'hours'])

In [3]:
# Load the training records into a DataFrame with columns userId, gameId, hours.
data = readData("train.json.gz")

In [4]:
# Drop incomplete rows and take an explicit copy so the column assignments
# below write to an independent frame.
data = data.dropna().copy()

# Save the raw identifiers exactly once. The guards keep this cell idempotent:
# on a re-run `userId`/`gameId` already hold integer codes and must not
# overwrite the saved originals.
if 'originalUserId' not in data.columns:
    data['originalUserId'] = data['userId']
if 'originalGameId' not in data.columns:
    data['originalGameId'] = data['gameId']

# Replace the ids with integer category codes, always derived from the saved
# originals so repeated runs yield the same encoding.
data['userId'] = data['originalUserId'].astype("category").cat.codes
data['gameId'] = data['originalGameId'].astype("category").cat.codes

# Lookup table between encoded and original ids; the encoded ids are stored
# as strings here.
itemLookup = (
    data[['userId', 'gameId', 'originalUserId', 'originalGameId']]
    .drop_duplicates()
    .astype({'userId': str, 'gameId': str})
)

In [5]:
# Sorted distinct encoded ids for each axis (np.unique returns them sorted).
users = list(np.unique(data['userId']))
games = list(np.unique(data['gameId']))
# Playtime values in row order.
hours = data['hours'].tolist()

In [6]:
# Row (user) and column (game) coordinates for the sparse matrix built from
# them. Coordinates are positions, so keep them as integers instead of
# round-tripping the category codes through float.
rows = data.userId.astype(np.int64)
cols = data.gameId.astype(np.int64)

In [7]:
from implicit.nearest_neighbours import bm25_weight

# Assemble the user x game matrix of playtime values from coordinate triplets.
matrixShape = (len(users), len(games))
rawInteractions = sp.csr_matrix((hours, (rows, cols)), shape=matrixShape)

# Apply BM25 weighting to the entries and keep the result in CSR format.
dataSparse = bm25_weight(rawInteractions, K1=1, B=0.8).tocsr()

In [8]:
from implicit.lmf import LogisticMatrixFactorization

# Hyper-parameters gathered in one mapping so they are easy to scan and tweak.
lmfParams = {
    'factors': 50,
    'learning_rate': 1.0,
    'regularization': 10.0,
    'neg_prop': 100,
}
model = LogisticMatrixFactorization(**lmfParams)
# Train on the weighted user x game matrix.
model.fit(dataSparse)

  0%|          | 0/30 [00:00<?, ?it/s]

In [9]:
# Number of top-ranked games requested per user; a pair is predicted as
# played (1) when its game appears in that user's list.
TOP_N = 600

# Read every (user, game) pair to score. Dedicated names are used here so the
# `users` / `games` lists that define the interaction-matrix shape are not
# overwritten by this cell.
headerLines = []
pairUsers = []
pairGames = []
with open("pairs_Played.csv") as pairsFile:
    for l in pairsFile:
        if l.startswith("userID"):
            headerLines.append(l)  # copied verbatim to the output file
            continue
        user, game = l.strip().split(',')
        pairUsers.append(user)
        pairGames.append(game)

# Original user id -> encoded row index. Built from whole columns instead of
# iterrows(), which is far slower on a large lookup table.
transformedUserIdLookup = dict(zip(
    itemLookup['originalUserId'].astype(str),
    itemLookup['userId'].astype(int),
))

# Users missing from the lookup get the sentinel -1. That index still selects
# a (wrong) row in the recommend() call below, so those results are ignored
# when the predictions are written.
usersTrans = [transformedUserIdLookup.get(user, -1) for user in pairUsers]
notFoundUserCount = usersTrans.count(-1)
print(str(notFoundUserCount) + ' Users Not Found')

allRecommendations, _ = model.recommend(
    usersTrans,
    dataSparse[usersTrans],
    N = TOP_N,
    filter_already_liked_items = True,
)

print('Recommend Done')

# Encoded game id (as a string) -> original game id, then one set of
# recommended original game ids per scored pair.
originalGameIdLookup = dict(zip(
    itemLookup['gameId'].astype(str),
    itemLookup['originalGameId'],
))
recSets = [set(originalGameIdLookup[str(r)] for r in rec) for rec in allRecommendations]

# Write the output only after every computation has succeeded; the context
# manager closes the file even if a write fails.
with open('predictions_Played.csv', 'w') as predictions:
    predictions.writelines(headerLines)
    for user, u, g, rec in zip(pairUsers, usersTrans, pairGames, recSets):
        pred = 1 if (u != -1 and g in rec) else 0
        predictions.write(user + ',' + g + ',' + str(pred) + '\n')

2 Users Not Found
Recommend Done


In [10]:
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering, Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy

In [11]:
# Re-read the training records; `data` is rebound to a fresh DataFrame with the
# columns produced by readData (userId, gameId, hours), replacing the frame
# that earlier cells modified.
data = readData("train.json.gz")

In [12]:
# Rating scale passed to Surprise for the playtime values.
reader = Reader(rating_scale=(0, 10))
# load_from_df reads the frame positionally as (user, item, rating), so select
# the columns explicitly. The isinstance guard keeps the cell re-runnable:
# after the first run `data` is already a Surprise Dataset and must not be
# passed back into load_from_df.
if isinstance(data, pd.DataFrame):
    data = Dataset.load_from_df(data[['userId', 'gameId', 'hours']], reader)

In [13]:
# Candidate Surprise algorithms, each with its default settings.
candidateAlgorithms = [
    SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(),
    KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(),
    BaselineOnly(), CoClustering(),
]

benchmark = []
for algorithm in candidateAlgorithms:
    # 5-fold cross-validation, averaged over the folds for every reported column.
    cvScores = cross_validate(algorithm, data, measures=['MSE'], cv = 5, verbose=False)
    meanScores = pd.DataFrame.from_dict(cvScores).mean(axis=0)
    # Label the row with the algorithm's class name.
    nameEntry = pd.Series([type(algorithm).__name__], index=['Algorithm'])
    benchmark.append(pd.concat([meanScores, nameEntry]))

# One row per algorithm, best (lowest) mean test MSE first.
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_mse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Don

Unnamed: 0_level_0,test_mse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,3.08073,0.109447,0.249424
KNNBaseline,3.174553,0.976283,2.129861
SVD,3.277039,0.769895,0.227221
KNNBasic,3.343875,0.851399,1.914416
SlopeOne,3.395117,0.20894,0.539351
KNNWithMeans,3.431356,0.898679,2.047828
KNNWithZScore,3.459373,0.989679,2.050098
SVDpp,3.64781,2.326558,0.687022
CoClustering,3.681952,1.008229,0.187731
NMF,3.928221,1.139842,0.172154


In [14]:
# Baseline settings: ALS with 5 epochs and separate user/item regularisation.
bslOptions = dict(method='als', n_epochs=5, reg_u=12, reg_i=5)
model = BaselineOnly(bsl_options=bslOptions)
# 5-fold cross-validated MSE for this configuration; the result dict is the
# cell's displayed output.
cross_validate(model, data, measures=['MSE'], cv=5, verbose=False)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_mse': array([3.0899807 , 3.02378583, 3.02483092, 3.04125602, 3.08441461]),
 'fit_time': (0.06663990020751953,
  0.07636404037475586,
  0.08256196975708008,
  0.07338309288024902,
  0.08058977127075195),
 'test_time': (0.03319382667541504,
  0.3156452178955078,
  0.31651806831359863,
  0.31752896308898926,
  0.318742036819458)}

In [18]:
# Build a trainset from the Surprise dataset via build_full_trainset(); it is
# the input to the final model fit.
trainingData = data.build_full_trainset()

In [19]:
# Create a fresh BaselineOnly model with the same bslOptions used for
# cross-validation and fit it on trainingData.
model = BaselineOnly(bsl_options = bslOptions)
model.fit(trainingData)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x2b72f5f10>

In [20]:
# Write one playtime estimate per (user, game) pair listed in pairs_Hours.csv.
# Both files are opened with context managers so they are closed even if a
# prediction raises; the pairs file is opened first so a missing input does
# not leave behind a truncated output file.
with open("pairs_Hours.csv") as pairsFile, open("predictions_Hours.csv", 'w') as predictions:
    for l in pairsFile:
        if l.startswith("userID"):
            # Header line is copied through unchanged.
            predictions.write(l)
            continue
        u, g = l.strip().split(',')
        # `.est` is the estimate carried by the prediction object.
        predictedPlayTime = model.predict(u, g).est
        predictions.write(u + ',' + g + ',' + str(predictedPlayTime) + '\n')