In [18]:
""" 
Todo
1. UPL code
2. SVD
3. NDCG per UPL
4. Readme
"""

' \nTodo\n1. UPL code\n2. SVD\n3. NDCG per UPL\n4. Readme\n'

In [37]:
import os
import itertools

import pandas as pd
import numpy as np

from numpy.random import default_rng
from scipy.sparse import csr_matrix
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split

In [20]:
def get_upl(item_group, upl_size, reference_column, rng):
    """
        Flag a random subset of one user's rated items as that user's preference list (UPL).

        A UPL of size n means that at most n of the user's rated items are kept. The items are
        drawn at random through the supplied generator, so a seeded generator reproduces the same
        selection on every run. A user with fewer than n rated items keeps all of them.

        The input frame is never modified: a copy is flagged and returned. The item ids are
        permuted on a separate array, so the item/rating pairing of the rows is left intact.

        Parameters:
            item_group          : rows belonging to a single user (dataframe)
            upl_size            : desired upl size (integer)
            reference_column    : item column name (string)
            rng                 : random generator (numpy random generator)

        Return
            flagged             : copy of item_group with an extra integer column 'upl';
                                  1 marks a row selected for the upl, 0 marks a row left out (dataframe)
    """

    # work on a copy so the caller's frame (or the groupby's internal slice) is left untouched
    flagged = item_group.copy()

    # 'upl' is the flag container: 0 = not selected, 1 = selected
    flagged['upl'] = 0

    # nothing to pick from
    if len(flagged) == 0:
        return flagged

    # permutation() returns a shuffled copy; shuffling the column's own buffer in place could
    # reorder the item ids inside the frame (or fail on a read-only array)
    item_ids = flagged[reference_column].to_numpy(copy=True)
    shuffled_ids = rng.permutation(item_ids)

    # the first upl_size ids of the shuffled order are the selected upl items
    is_selected = flagged[reference_column].isin(shuffled_ids[:upl_size])
    flagged.loc[is_selected, 'upl'] = 1
    return flagged

In [21]:
# directory that holds the dataset files, relative to the working directory
# (expanduser only rewrites a leading '~', so this relative path is returned unchanged)
dataset_dir = os.path.expanduser('Datasets/ml-100k/')

In [22]:
# upl sizes to evaluate: 10, 20, ..., 50 rated items per user
upl_sizes = list(range(10, 51, 10))

In [23]:
# fixed seed so that every run draws the same pseudo random numbers
random_state = 123
# generator built from that seed
rng = default_rng(seed=random_state)

In [24]:
# surprise Reader used when building a Dataset from the rating dataframe; ratings range from 1 to 5
reader = Reader(rating_scale=(1, 5))

In [25]:
# fraction of the ratings that goes to the testset
test_size = 0.2

In [26]:
# load the rating dataset on pandas dataframe
# u.data of MovieLens 100k is tab separated (only u.item / u.user use '|'); reading it with
# sep='|' would put each whole line into 'user id' and leave the other columns as NaN
df_rating = pd.read_csv(
    os.path.join(dataset_dir, 'u.data'),
    sep='\t',
    names=['user id', 'item id', 'rating', 'timestamp'],
)
# the timestamp is not used by the experiment
df_rating = df_rating[['user id', 'item id', 'rating']]

In [None]:
# for each upl size, generate upl and do training testing to model
for upl_id, upl_size in enumerate(upl_sizes):
    # generate the upl user by user.
    # The groups are iterated explicitly instead of going through groupby.apply: each group then
    # always carries every column (including 'user id'), the result keeps the flat row index of
    # df_rating, and get_upl is called exactly once per user so the shared rng is consumed in a
    # fixed order.
    df_generated_upl = pd.concat(
        get_upl(user_ratings, upl_size, 'item id', rng)
        for _user, user_ratings in df_rating.groupby("user id")
    )

    # select only rows where the upl flag is set to 1 as the selected upl items
    df_generated_upl = df_generated_upl[df_generated_upl['upl'] == 1]

    # convert the dataset to dataset object from surprise library
    data = Dataset.load_from_df(df_generated_upl[['user id', 'item id', 'rating']], reader)

    # split to training set and testing set
    trainset, testset = train_test_split(data, test_size=test_size, random_state=random_state)

    # convert testset (list of (user id, item id, rating) tuples) to an integer numpy array,
    # one row per test rating
    testset = np.array(list(map(list, testset))).astype(np.int32)

    # largest user id and item id that occur in the testset; they are used as the number of
    # users / items (assumes ids are consecutive integers starting at 1 - TODO confirm).
    # Converted to plain ints so that the size arithmetic below is not done in int32.
    num_user = int(testset[:, 0].max())
    num_item = int(testset[:, 1].max())

    # init the svd model
    svd = SVD(random_state=random_state)

    # train the model with the trainset
    svd.fit(trainset)

    # predict the rating of every (user, item) pair.
    # Each row of arr_predicted_ratings is [user index, item index, estimated rating], where
    # index = id - 1 (hence the + 1 when asking the model for a prediction).
    arr_predicted_ratings = np.empty([num_user * num_item, 3])
    for user_id, item_id in itertools.product(range(num_user), range(num_item)):
        estimated_rating = svd.predict(user_id + 1, item_id + 1).est
        arr_predicted_ratings[item_id + user_id * num_item, :] = [user_id, item_id, estimated_rating]

    # arrange the predictions as a (num_user x num_item) sparse matrix; the index columns are
    # stored as floats in arr_predicted_ratings, so cast them back to integers first
    row_index = arr_predicted_ratings[:, 0].astype(np.int64)
    col_index = arr_predicted_ratings[:, 1].astype(np.int64)
    csr_predicted_ratings = csr_matrix((arr_predicted_ratings[:, 2], (row_index, col_index)), shape=(num_user, num_item))

    




