# CSE 258: Assignment 1
### Benjamin Xia

### Setup

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn import preprocessing
from sklearn.decomposition import LatentDirichletAllocation
from sklearn import feature_extraction
from sklearn.model_selection import KFold

from rankfm.rankfm import RankFM
from fastFM import als, sgd

import random
from collections import defaultdict
from tqdm import tqdm
import gzip

import os
import pickle
import copy

RANDOM_SEED = 0
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

test = False

### Preprocessing

#### Preprocess user/item ID's, compensation, early_access, and time

In [2]:
user_oe = preprocessing.OrdinalEncoder(dtype=np.int32, min_frequency=5, handle_unknown='use_encoded_value', unknown_value=6710)
item_oe = preprocessing.OrdinalEncoder(dtype=np.int32, min_frequency=5)

itemset = set() # Set of all unique users
userset = set() # Set of all unique items
U = defaultdict(set) # Users that played item i
I = defaultdict(set) # Items played by uer u
time_played = defaultdict(dict)

ft = ['early_access', 'compensation'] # features unavailable/cannot be approximated in inference
def read_json(path):
    f: gzip.GzipFile = gzip.open(path)
    f.readline()
    for line in f:
        entry = eval(line)
        yield entry

# Encode userID and itemID as integers
def process_data():
    global itemset, userset, U, I
    data = []
    for entry in read_json('train.json.gz'):
        data.append(entry)
        time_played[entry['userID']][entry['gameID']] = entry['hours_transformed']

    df: pd.DataFrame = pd.DataFrame(data)
    del data

    itemset = set(df['gameID'].unique())
    userset = set(df['userID'].unique())

    U = dict(df.groupby('gameID')['userID'].unique())
    I = dict(df.groupby('userID')['gameID'].unique())
    U = { g : set(U[g]) for g in U }
    I = { u : set(I[u]) for u in I }

    df['userIDX'] = user_oe.fit_transform(df[['userID']])
    df['itemIDX'] = item_oe.fit_transform(df[['gameID']])
    df.rename({'gameID' : 'itemID'}, axis=1, inplace=True)

    df.drop(labels=['hours', 'user_id', 'date'], axis=1, inplace=True)

    # Get features that won't be available
    df.fillna(value=0, axis=1, inplace=True)
    df['compensation'] = df['compensation'].map(lambda x : x if x == 0 else 1)
    df[['early_access', 'compensation']] = df[['early_access', 'compensation']].astype(np.int32)

    return df

df = process_data()
user_mean_ft = df.groupby('userIDX')[ft].mean()
item_mean_ft = df.groupby('itemIDX')[ft].mean()
df.drop(labels=ft + ['found_funny'], axis=1, inplace=True)

In [3]:
ustoi = dict(df.groupby('userID')['userIDX'].unique().apply(lambda x: x[0]))
istoi = dict(df.groupby('itemID')['itemIDX'].unique().apply(lambda x: x[0]))

#### Preprocess user text and convert to descriptors

In [4]:
def get_text_embedding():
    if not os.path.isfile('./text_embed.npy'): # Generate new descriptors for each review using pretrained transformer
        dftext = df.groupby('itemIDX')['text'].apply(' '.join).reset_index()
        counter = feature_extraction.text.CountVectorizer(min_df=0.05, max_df=0.5, stop_words='english', max_features=2000, ngram_range=(1, 2))
        wordcount = counter.fit_transform(dftext['text'])
        LDA = LatentDirichletAllocation(n_components=20, random_state=RANDOM_SEED)
        text_embed = LDA.fit_transform(wordcount)
        np.save('text_embed.npy', text_embed)
    else: # Text descriptors already computed
        text_embed = np.load('./text_embed.npy')

    return text_embed

item_features = get_text_embedding()
# text_embed = text_embed / np.linalg.norm(text_embed, axis=1)[...,None]

df.drop('text', axis=1, inplace=True)


In [5]:
item_features = np.concatenate((np.arange(0, len(item_features))[:,  None], item_features, item_mean_ft.to_numpy()), axis=1)

In [6]:
df_train = df.iloc[:150000]
df_valid = df.iloc[150000:]

### Played Predictions

In [7]:
# Construct a new validation set w/ negative pairs
def gen_validation(df_valid):
    neg_pairs = []
    for review in df_valid.iterrows():
        review = review[1]
        sample = random.sample(itemset.difference(I[review['userID']]), k=1)[0]
        neg_pairs.append([review['userIDX'], istoi[sample]])
    pos_pairs = df_valid[['userIDX', 'itemIDX']].to_numpy()
    neg_pairs = np.array(neg_pairs)
    return pos_pairs, neg_pairs

pos_pairs, neg_pairs = gen_validation(df_valid)

def played_validate(model):
    pos_scores = model.predict(pos_pairs)
    neg_scores = model.predict(neg_pairs)
    acc = (np.mean(pos_scores >= 0) + np.mean(neg_scores < 0)) / 2
    print(f'Validation %: {acc * 100}')
    return acc

In [8]:
train = True
save = False
played_model = RankFM(factors=5, # Hyperparameters tuned from cross-validation
        loss='warp',
        max_samples=300,
        learning_exponent=0.25,
        learning_schedule='invscaling')
if train == True:
    best_model = None
    best_acc = 0
    for i in range(50):
        # switch fit_partial's dataframe to df_train for testing, "df" for actual predictions
        if test == True: # Train on entire dataset
            played_model.fit_partial(df[['userIDX', 'itemIDX']], item_features=item_features, epochs=4, verbose=False)
        else:            # Train only on training set
            played_model.fit_partial(df_train[['userIDX', 'itemIDX']], item_features=item_features, epochs=4, verbose=False)
        acc = played_validate(played_model)
        if acc > best_acc:
            best_model = copy.deepcopy(played_model)
            best_acc = acc
    if save == True:
        model_file = open('rankfm.obj', 'wb')
        pickle.dump(best_model, model_file)
        model_file.close()
else:
    model_file = open('rankfm.obj', 'rb')
    best_model = pickle.load(model_file)
    played_model = best_model
    model_file.close()

Validation %: 69.91679667186688
Validation %: 70.3168126725069
Validation %: 71.06284251370055
Validation %: 71.50286011440457
Validation %: 71.59686387455497
Validation %: 71.26285051402056
Validation %: 71.41685667426697
Validation %: 71.48485939437577
Validation %: 71.45885835433418
Validation %: 71.51486059442378
Validation %: 71.52886115444618
Validation %: 71.7528701148046
Validation %: 72.06288251530061
Validation %: 71.7948717948718


KeyboardInterrupt: 

#### Make and write predictions

In [None]:
# test = pd.read_csv('./pairs_Played.csv')
# testpred = test.copy()
# test['itemID'] = test['gameID']
# # Map unseen entries to default user (this user is already grouped with other users due to their few # of reviews in training set)
# test['userID'] = test['userID'].map(lambda x: x if x in userset else 'u03473346')
# test['userIDX'] = user_oe.transform(test[['userID']])
# test['itemIDX'] = item_oe.transform(test[['gameID']])
# test.drop(columns=['gameID', 'prediction'], inplace=True)
# scores = best_model.predict(test[['userIDX', 'itemIDX']])
# testpred = pd.read_csv('./pairs_Played.csv')
# testpred['prediction'] = (scores >= np.median(scores)).astype(np.int32)
# testpred.to_csv('./predictions_Played.csv', index=False)

### Time Prediction

In [9]:
# alpha = 0
# beta_u = np.zeros(len(df['userIDX'].unique()))
# beta_i = np.zeros(len(df['itemIDX'].unique()))
user_oe2 = preprocessing.OrdinalEncoder(dtype=np.int32)
item_oe2 = preprocessing.OrdinalEncoder(dtype=np.int32)
df['userIDX2'] = user_oe2.fit_transform(df[['userID']])
df['itemIDX2'] = item_oe2.fit_transform(df[['itemID']])
u_cnt = dict(df.iloc[:150000].groupby('userIDX2')['itemIDX2'].count())
i_cnt = dict(df.iloc[:150000].groupby('itemIDX2')['userIDX2'].count())
beta_u = np.zeros(len(I))
beta_i = np.zeros(len(U))
alpha = df['hours_transformed'].mean()
def closed_form(lamb, trainset):
    global alpha
    global beta_u
    global beta_i

    new_beta_u = np.zeros_like(beta_u)
    new_beta_i = np.zeros_like(beta_i)
    new_alpha = 0

    for i in range(len(trainset)):
        new_alpha += (trainset[i, -1] - beta_u[trainset[i, 0]] - beta_i[trainset[i, 1]]) / len(trainset)
    alpha = new_alpha
    for i in range(len(trainset)):
        new_beta_u[trainset[i, 0]] += (trainset[i, -1] - alpha - beta_i[trainset[i, 1]]) / (lamb + u_cnt[trainset[i, 0]])
    beta_u = new_beta_u
    for i in range(len(trainset)):
        new_beta_i[trainset[i, 1]] += (trainset[i, -1] - alpha - beta_u[trainset[i, 0]]) / (lamb + i_cnt[trainset[i, 1]])
    beta_i = new_beta_i

def validate(validset, alpha, beta_u, beta_i):
    label = validset['hours_transformed'].to_numpy()
    validset = validset[['userIDX2', 'itemIDX2']].to_numpy()
    preds = alpha + beta_u[validset[:, 0].astype(np.int32)] + beta_i[validset[:, 1].astype(np.int32)]
    return np.mean((preds - label)**2)

In [10]:
for i in range(100):
    closed_form(5, df.iloc[:150000][['userIDX2', 'itemIDX2', 'hours_transformed']].to_numpy())

In [134]:
u_cnt = dict(df.iloc[:150000].groupby('userIDX2')['itemIDX2'].count())
i_cnt = dict(df.iloc[:150000].groupby('itemIDX2')['userIDX2'].count())
closed_form(5, df.iloc[:150000], 0, np.zeros(len(df['userIDX2'].unique())),  np.zeros(len(df['itemIDX2'].unique())), u_cnt, i_cnt)

(3.7153235425111957,
 array([ 0.18589349, -0.56707517,  1.09466634, ...,  0.46196124,
         0.80780987,  0.97348675]),
 array([-0.43888858, -0.5976366 , -1.22902702, ..., -0.04556356,
        -1.46630672,  0.56756049]))

In [142]:

splitter = KFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)
for it, (train, test) in enumerate(splitter.split(df)):
    alpha = 0
    beta_u = np.zeros(len(df['userIDX2'].unique()))
    beta_i = np.zeros(len(df['itemIDX2'].unique()))
    u_cnt = dict(df.iloc[train].groupby('userIDX2')['itemIDX2'].nunique())
    i_cnt = dict(df.iloc[train].groupby('itemIDX2')['userIDX2'].nunique())
    for i in range(500):
        alpha, beta_u, beta_i = closed_form(5, df.iloc[train], alpha, beta_u, beta_i, u_cnt, i_cnt)
        if i % 100 == 0:
            print(f'lamb = {5}\tfold = {it}\tMSE = {validate(df.iloc[test], alpha, beta_u, beta_i)}')

TypeError: closed_form() takes 2 positional arguments but 7 were given

In [125]:
splitter = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
for i in range(10): # Lambda
    for train, test in splitter.split(df):
        alpha = 0
        beta_u = np.zeros(len(df['userIDX'].unique()))
        beta_i = np.zeros(len(df['itemIDX'].unique()))
        for j in range(10):
            alpha, beta_u, beta_i = closed_form(i, df.iloc[train], alpha, beta_u, beta_i)
            print(f'lamb = {i}\tit = {j}\tMSE = {validate(df.iloc[test], alpha, beta_u, beta_i)}')
        print("----")

TypeError: closed_form() missing 2 required positional arguments: 'u_cnt' and 'i_cnt'

In [126]:
alpha

0

#### Make and write predictions

In [None]:
test = pd.read_csv('./pairs_Hours.csv')
testpred = test.copy()
test['itemID'] = test['gameID']
# Map unseen entries to default user (this user is already grouped with other users due to their few # of reviews in training set)
test['userID'] = test['userID'].map(lambda x: x if x in userset else 'u03473346')
test['userIDX'] = user_oe.transform(test[['userID']])
test['itemIDX'] = item_oe.transform(test[['gameID']])
test.drop(columns=['gameID', 'prediction'], inplace=True)

time_test = convert_df(test)
preds = time_model.predict(time_test)

testpred = pd.read_csv('./pairs_Hours.csv')
testpred['prediction'] = preds
testpred.to_csv('./predictions_Hours.csv', index=False)