In [32]:
import gzip
import math
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from collections import defaultdict
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
import random
from scipy import sparse
import numpy as np

In [7]:
def readGz(path):
    for l in gzip.open(path, 'rt', encoding='cp437', errors='ignore'):
        yield eval(l)

In [8]:
dataset = []
for l in readGz("australian_users_items.json.gz"):
    dataset.append(l)

In [9]:
pairs = []
userID, itemID = {}, {}
for d in dataset:
    if len(d['items']) < 3: continue
    if d['steam_id'] not in userID: userID[d['steam_id']] = len(userID)
    for i in d['items']:
        if i['item_id'] not in itemID: itemID[i['item_id']] = len(itemID)
        pairs.append((d['steam_id'], i['item_id'], i['playtime_forever']))

In [10]:
random.shuffle(pairs)

In [11]:
X = sparse.lil_matrix((len(pairs), len(userID)+len(itemID)))
for i in range(len(pairs)):
    user = userID[pairs[i][0]]
    item = itemID[pairs[i][1]]
    X[i, user] = 1
    X[i, len(userID) + item] = 1

In [12]:
y = np.array([1 if d[2] > 2 else -1 for d in pairs])

In [13]:
n = len(pairs)
X_train = X[:round(n*.9)]
X_validation = X[round(n*.7):round(n*.9)]
X_test = X[round(n*.9):]

y_train = y[:round(n*.9)]
y_validation = y[round(n*.7):round(n*.9)]
y_test = y[round(n*.9):]

In [19]:
pairs_df = pd.DataFrame(pairs)
pairs_df.columns = ['user', 'item', 'hours']

In [29]:
pairs_df['played'] = (pairs_df['hours'] > 2).astype(int)

In [28]:
reader = Reader(rating_scale = (0, 1))

In [31]:
data = Dataset.load_from_df(pairs_df[['user', 'item', 'played']], reader)

In [20]:
algo = SVD()

In [33]:
cross_validate(algo, data, measures=['MAE'], cv=5, verbose=True)

Evaluating MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.2932  0.2928  0.2925  0.2937  0.2929  0.2930  0.0004  
Fit time          203.48  198.77  201.98  213.54  214.24  206.40  6.30    
Test time         5.31    5.15    5.72    5.98    8.49    6.13    1.22    


{'test_mae': array([0.293224  , 0.29284497, 0.29249923, 0.29369377, 0.29290446]),
 'fit_time': (203.47556686401367,
  198.7730541229248,
  201.98195791244507,
  213.54193329811096,
  214.23575019836426),
 'test_time': (5.312453031539917,
  5.153742790222168,
  5.716975450515747,
  5.980062007904053,
  8.494498491287231)}

In [21]:
pairs_df

Unnamed: 0,user,item,hours
0,76561198040028503,204240,216
1,76561198041648531,9880,2842
2,76561198012453955,55230,368
3,76561198045633202,8190,2377
4,76561198039593988,208090,379
...,...,...,...
5146356,76561198068492230,9420,3158
5146357,76561198058478694,242720,13
5146358,76561198059920635,253710,1
5146359,76561198043784371,232770,0
