# CSE 258: Assignment 1
### Benjamin Xia

### Setup

In [27]:
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn import preprocessing
from sklearn.decomposition import LatentDirichletAllocation
from sklearn import feature_extraction
from sklearn.model_selection import cross_validate

from rankfm.rankfm import RankFM
from fastFM import als, sgd

import random
from collections import defaultdict
from tqdm import tqdm
import gzip

import os
import pickle
import copy

RANDOM_SEED = 0
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

test = False

### Preprocessing

#### Preprocess user/item ID's, compensation, early_access, and time

In [38]:
user_oe = preprocessing.OrdinalEncoder(dtype=np.int32, min_frequency=5, handle_unknown='use_encoded_value', unknown_value=6710)
item_oe = preprocessing.OrdinalEncoder(dtype=np.int32, min_frequency=5)

itemset = set() # Set of all unique users
userset = set() # Set of all unique items
U = defaultdict(set)
I = defaultdict(set)
time_played = defaultdict(dict)
item_mean_hr = defaultdict()
user_mean_hr = defaultdict()
ft = ['early_access', 'compensation'] # features unavailable/cannot be approximated in inference
def read_json(path):
    f: gzip.GzipFile = gzip.open(path)
    f.readline()
    for line in f:
        entry = eval(line)
        yield entry

# Encode userID and itemID as integers
def process_data():
    global itemset, userset, U, I, user_mean_hr, item_mean_hr
    data = []
    for entry in read_json('train.json.gz'):
        data.append(entry)
        time_played[entry['userID']][entry['gameID']] = entry['hours_transformed']

    df: pd.DataFrame = pd.DataFrame(data)
    del data
    itemset = set(df['gameID'].unique())
    userset = set(df['userID'].unique())

    U = dict(df.groupby('gameID')['userID'].unique())
    I = dict(df.groupby('userID')['gameID'].unique())
    U = { g : set(U[g]) for g in U }
    I = { u : set(I[u]) for u in I }

    df['userIDX'] = user_oe.fit_transform(df[['userID']])
    df['itemIDX'] = item_oe.fit_transform(df[['gameID']])
    df.rename({'gameID' : 'itemID'}, axis=1, inplace=True)

    df.drop(labels=['hours', 'user_id', 'date'], axis=1, inplace=True)

    # Get features that won't be available
    df.fillna(value=0, axis=1, inplace=True)
    df['compensation'] = df['compensation'].map(lambda x : x if x == 0 else 1)
    df[['early_access', 'compensation']] = df[['early_access', 'compensation']].astype(np.int32)

    time_label = df['hours_transformed']
    item_mean_hr = dict(df.groupby('itemID')['hours_transformed'].mean())
    user_mean_hr = dict(df.groupby('userID')['hours_transformed'].mean())
    return df, time_label

df, time_label = process_data()
user_mean_ft = df.groupby('userIDX')[ft].mean()
item_mean_ft = df.groupby('itemIDX')[ft].mean()
df.drop(labels=ft + ['hours_transformed', 'found_funny'], axis=1, inplace=True)

In [4]:
ustoi = dict(df.groupby('userID')['userIDX'].unique().apply(lambda x: x[0]))
istoi = dict(df.groupby('itemID')['itemIDX'].unique().apply(lambda x: x[0]))

#### Preprocess user text and convert to descriptors

In [5]:
def get_text_embedding():
    if not os.path.isfile('./text_embed.npy'): # Generate new descriptors for each review using pretrained transformer
        dftext = df.groupby('itemIDX')['text'].apply(' '.join).reset_index()
        counter = feature_extraction.text.CountVectorizer(min_df=0.05, max_df=0.5, stop_words='english', max_features=2000, ngram_range=(1, 2))
        wordcount = counter.fit_transform(dftext['text'])
        LDA = LatentDirichletAllocation(n_components=20, random_state=RANDOM_SEED)
        text_embed = LDA.fit_transform(wordcount)
        np.save('text_embed.npy', text_embed)
    else: # Text descriptors already computed
        text_embed = np.load('./text_embed.npy')

    return text_embed

text_embed = get_text_embedding()
# text_embed = text_embed / np.linalg.norm(text_embed, axis=1)[...,None]

df.drop('text', axis=1, inplace=True)


In [6]:
text_embed = np.concatenate((np.arange(0, len(text_embed))[:,  None], text_embed, item_mean_ft.to_numpy()), axis=1)

In [7]:
df_train = df.iloc[:150000]
df_time_train_label = time_label[:150000]
df_valid = df.iloc[150000:]
df_time_valid_label = time_label[150000:]

### Played Predictions

In [8]:
played_model = RankFM(factors=5,
               loss='warp',
               max_samples=300,
               learning_exponent=0.25,
               learning_schedule='invscaling')

In [9]:
# Construct a new validation set w/ negative pairs
neg_pairs = []
for review in df_valid.iterrows():
    review = review[1]
    sample = random.sample(itemset.difference(I[review['userID']]), k=1)[0]
    neg_pairs.append([review['userIDX'], istoi[sample]])
pos_pairs = df_valid[['userIDX', 'itemIDX']].to_numpy()
neg_pairs = np.array(neg_pairs)

def played_validate(model):
    pos_scores = model.predict(pos_pairs)
    neg_scores = model.predict(neg_pairs)
    acc = (np.mean(pos_scores >= 0) + np.mean(neg_scores < 0)) / 2
    print(f'Validation %: {acc * 100}')
    return acc

In [24]:
# # Validation stuff - determine factor dimensions
# from sklearn.model_selection import KFold

# if not test:
#     kf = KFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

#     for k in [1, 2, 3, 4, 5, 6, 10, 20]:
#         played_model = RankFM(factors=k,
#                     loss='warp',
#                     max_samples=300,
#                     learning_exponent=0.25,
#                     learning_schedule='invscaling')
#         fold_accs = []
#         for i, (train, test) in enumerate(kf.split(df[['userIDX', 'itemIDX']])):
#             played_model.fit(df.iloc[train][['userIDX', 'itemIDX']], item_features=text_embed, epochs=20, verbose=False)
#             neg_pairs = []
#             for review in df.iloc[test].iterrows():
#                 review = review[1]
#                 sample = random.sample(itemset.difference(I[review['userID']]), k=1)[0]
#                 neg_pairs.append([review['userIDX'], istoi[sample]])
#             pos_pairs = df.iloc[test][['userIDX', 'itemIDX']].to_numpy()
#             neg_pairs = np.array(neg_pairs)
#             pos_scores = played_model.predict(pos_pairs)
#             neg_scores = played_model.predict(neg_pairs)
#             acc = (np.mean(pos_scores >= 0) + np.mean(neg_scores < 0)) / 2
#             fold_accs.append(acc)
#             print(f'Validation %: {acc * 100}')
#         print(f'k: {k} = {np.mean(fold_accs)}')

In [25]:
# # Determine training epochs
# kf = KFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)
# if not test:
#     accs = np.zeros((10, 50))
#     for j, (train, test) in enumerate(kf.split(df[['userIDX', 'itemIDX']])):
#         played_model = RankFM(factors=5,
#                 loss='warp',
#                 max_samples=300,
#                 learning_exponent=0.25,
#                 learning_schedule='invscaling')
#         for i in range(50):
#             played_model.fit_partial(df.iloc[train][['userIDX', 'itemIDX']], item_features=text_embed, epochs=4, verbose=False)
#             neg_pairs = []
#             for review in df.iloc[test].iterrows():
#                 review = review[1]
#                 sample = random.sample(itemset.difference(I[review['userID']]), k=1)[0]
#                 neg_pairs.append([review['userIDX'], istoi[sample]])
#             pos_pairs = df.iloc[test][['userIDX', 'itemIDX']].to_numpy()
#             neg_pairs = np.array(neg_pairs)
#             pos_scores = played_model.predict(pos_pairs)
#             neg_scores = played_model.predict(neg_pairs)
#             acc = (np.mean(pos_scores >= 0) + np.mean(neg_scores < 0)) / 2
#             print(f'Validation %: {acc * 100}')
#             accs[j, i] = acc

#     print(accs)

factors=10 ~.71.5
factors=5  ~.72

In [26]:
train = False
save = False
played_model = RankFM(factors=5,
        loss='warp',
        max_samples=300,
        learning_exponent=0.25,
        learning_schedule='invscaling')
if train == True:
    best_model = None
    best_acc = 0
    for i in range(50):
        # switch fit_partial's dataframe to df_train for testing, "df" for actual predictions
        if test == True:
            played_model.fit_partial(df[['userIDX', 'itemIDX']], item_features=text_embed, epochs=4, verbose=False)
        else:
            played_model.fit_partial(df_time_train_label[['userIDX', 'itemIDX']], item_features=text_embed, epochs=4, verbose=False)
        acc = played_validate(played_model)
        if acc > best_acc:
            best_model = copy.deepcopy(played_model)
            best_acc = acc
    if save == True:
        model_file = open('rankfm.obj', 'wb')
        pickle.dump(best_model, model_file)
        model_file.close()
else:
    model_file = open('rankfm.obj', 'rb')
    best_model = pickle.load(model_file)
    played_model = best_model
    model_file.close()

#### Make and write predictions

In [None]:
# test = pd.read_csv('./pairs_Played.csv')
# testpred = test.copy()
# test['itemID'] = test['gameID']
# # Map unseen entries to default user (this user is already grouped with other users due to their few # of reviews in training set)
# test['userID'] = test['userID'].map(lambda x: x if x in userset else 'u03473346')
# test['userIDX'] = user_oe.transform(test[['userID']])
# test['itemIDX'] = item_oe.transform(test[['gameID']])
# test.drop(columns=['gameID', 'prediction'], inplace=True)
# scores = best_model.predict(test[['userIDX', 'itemIDX']])
# testpred = pd.read_csv('./pairs_Played.csv')
# testpred['prediction'] = (scores >= np.median(scores)).astype(np.int32)
# testpred.to_csv('./predictions_Played.csv', index=False)

### Time Prediction

In [15]:
from sklearn.model_selection import cross_validate

(6698, 10)

In [16]:
def convert_df(df: pd.DataFrame):
    datum = np.zeros((len(df), 10 + 10 + 22))
    for i, (idx, row) in enumerate(df.iterrows()):
        user = row['userIDX']
        item = row['itemIDX']
        datum[i, :10] = played_model.v_u[user]
        datum[i, 10:20] = played_model.v_i[item]
        datum[i, 20:] = text_embed[item, 1:]
    return datum
time_train = convert_df(df_train)
time_valid = convert_df(df_valid)

#### Collaborative Filtering with played prediction latent factors (this sucks)

In [72]:
def lr_sim(item_i, item_j):
    lr_item_i = best_model.v_i[item_i]
    lr_item_j = best_model.v_i[item_j]
    return np.dot(lr_item_i, lr_item_j) / (np.linalg.norm(lr_item_i) * np.linalg.norm(lr_item_j))

def jaccard_sim(item_i, item_j):
    s1 = U[item_i]
    s2 = U[item_j]
    return len(s1.intersection(s2)) / len(s1.union(s2))

def cf_predict(user_id, user_idx, item_id, item_idx):
    sim_sum = 0 # Sum of similarity scores (besides current)
    output = 0
    for item_j in time_played[user_id]:
        if item_j == item_id:
            continue
        sim = lr_sim(item_idx, istoi[item_j])
        # sim = jaccard_sim(item_j, item_id)
        score = sim * (time_played[user_id][item_j] - item_mean_hr[item_j])
        output += score
        sim_sum += np.abs(sim)
    if sim_sum == 0:
        return item_mean_hr[item_id]
    output /= sim_sum
    output += item_mean_hr[item_id]
    return output

# preds = np.zeros((len(df_train)))
# for i in range(len(df_train)):
#     row = df_train.iloc[i]
#     preds[i] = cf_predict(row['userID'], row['userIDX'], row['itemID'], row['itemIDX'])
# print(np.mean((preds - df_time_train_label)**2))
# preds = np.zeros((len(df_time_valid_label)))
# for i in range(len(df_valid)):
#     row = df_valid.iloc[i]
#     preds[i] = cf_predict(row['userID'], row['userIDX'], row['itemID'], row['itemIDX'])
# print(np.mean((preds - df_time_valid_label)**2))

3.1271098722319834


#### XGBoost with played predictioin latent factors (this sucks)

In [91]:
import xgboost
from sklearn import ensemble
# time_model = xgboost.XGBRegressor(n_estimators=100, reg_alpha=1, gamma=5, max_depth=5)
# # time_model = ensemble.RandomForestRegressor(n_estimators=10, max_depth=10, max_features='sqrt', n_jobs=-1)
# time_model.fit(time_train, df_time_train_label)

# train_preds = time_model.predict(time_train)
# # train_preds[train_preds < 0] = 0
# # train_preds[train_preds > 14] = 14
# print(np.mean((train_preds - df_time_train_label)**2))
# valid_preds = time_model.predict(time_valid)
# # valid_preds[valid_preds < 0] = 0
# # valid_preds[valid_preds > 14] = 14
# MSE = np.mean((valid_preds - df_time_valid_label)**2)
# print(MSE)

2.9195459794186234
3.1357034540118587


#### FastFM,(this sucks but not as much, with or without features)

In [94]:
# time_model = als.FMRegression(n_iter=1,
#                               rank=3,
#                               random_state=RANDOM_SEED,
#                               l2_reg=0.01)

# df_train.head()
# def convert_sparse_df(df: pd.DataFrame):
#     datum = sparse.lil_matrix((len(df), len(itemset) + len(userset) + 22))
#     for i, (idx, row) in enumerate(df.iterrows()):
#         user = row['userIDX']
#         item = row['itemIDX']
#         datum[i, user] = 1
#         datum[i, len(userset) + item] = 1
#         datum[i, len(userset) + len(itemset):] = text_embed[item, 1:]
#     return datum

# time_train = convert_sparse_df(df_train)
# time_valid = convert_sparse_df(df_valid)
# time_model = als.FMRegression(n_iter=1000,
#                               rank=0,
#                               init_stdev=0.001,
#                               random_state=RANDOM_SEED,
#                               l2_reg_w=5,
#                               l2_reg_V=2)

# time_model.fit(time_train, df_time_train_label)
# train_preds = time_model.predict(time_train)
# # train_preds[train_preds < 0] = 0
# # train_preds[train_preds > 14] = 14
# print(np.mean((train_preds - df_time_train_label)**2))
# valid_preds = time_model.predict(time_valid)
# # valid_preds[valid_preds < 0] = 0
# # valid_preds[valid_preds > 14] = 14
# MSE = np.mean((valid_preds - df_time_valid_label)**2)
# print(MSE)

2.7433401715394616
3.05446116469164


In [106]:
beta_u = np.zeros(len(I))
beta_i = np.zeros(len(U))
alpha = time_label.mean()
u_cnts = df['userIDX'].value_counts()
i_cnts = df['itemIDX'].value_counts()
def closed_form(lamb, trainset, trainlabel):
    global alpha
    global beta_u
    global beta_i
    trainset = trainset.copy()
    user_indices = trainset['userIDX'].tolist()
    item_indices = trainset['itemIDX'].tolist()
    trainset['label'] = trainlabel
    labelsum = trainset.groupby('userIDX')['label'].sum().tolist()
    usersum = trainset.groupby('userIDX')['itemIDX'].tolist()
    alpha = np.mean(trainlabel.to_numpy() - beta_u[user_indices] - beta_i[item_indices])
    for u in range(len(beta_u)):
        beta_u[u] = trainsum[u] - (alpha * u_cnts[u]) - beta_i[]
    # beta_u[user_indices] = trainlabel[]
    # for i in range(len(trainset)):
    #     row = trainset.iloc[i]
    #     user = row['userIDX']
    #     item = row['itemIDX']
    #     new_beta_u[user] += (trainlabel[i] - alpha - beta_i[item]) / (lamb  + u_cnts[user])
    # beta_u = new_beta_u

    # for i in range(len(trainset)):
    #     row = trainset.iloc[i]
    #     user = row['userIDX']
    #     item = row['itemIDX']
    #     new_beta_i[item] += (trainlabel[i] - alpha - beta_u[user]) / (lamb + i_cnts[item])
    # beta_i = new_beta_i

for i in tqdm(range(300)):
    closed_form(5, df_train, df_time_train_label)



IndentationError: expected an indented block (363181243.py, line 32)

In [109]:
df_train.groupby('userIDX')['itemIDX']

userIDX  itemIDX
0        21         1
         169        1
         292        1
         710        1
         974        1
                   ..
6697     1765       1
         1782       1
         1944       1
         2183       1
         2242       1
Name: itemIDX, Length: 149999, dtype: int64

#### Make and write predictions

In [None]:
test = pd.read_csv('./pairs_Hours.csv')
testpred = test.copy()
test['itemID'] = test['gameID']
# Map unseen entries to default user (this user is already grouped with other users due to their few # of reviews in training set)
test['userID'] = test['userID'].map(lambda x: x if x in userset else 'u03473346')
test['userIDX'] = user_oe.transform(test[['userID']])
test['itemIDX'] = item_oe.transform(test[['gameID']])
test.drop(columns=['gameID', 'prediction'], inplace=True)

time_test = convert_df(test)
preds = time_model.predict(time_test)

testpred = pd.read_csv('./pairs_Hours.csv')
testpred['prediction'] = preds
testpred.to_csv('./predictions_Hours.csv', index=False)