Reference code: https://github.com/franckjay/TorchEASE

In [1]:
import sys, os, random
import logging
import pandas as pd
import sys
import logging
import torch
from tqdm import tqdm 
from time import time
import scipy.sparse as sp
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset

def set_seed(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # some cudnn methods can be random even after fixing the seed
    # unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True

set_seed(seed=42)

# Dataset Define

In [2]:
class BaseDataset(Dataset):
    def __init__(self, path = '../data/', mode = 'train'):
        self.path = path # default: '../data/'

        #get number of users and items
        self.n_users, self.n_items = 0, 0
        self.n_train, self.n_test = 0, 0
        self.neg_pools = {}
        self.exist_users = []

        # data_path는 사용자의 디렉토리에 맞게 설정해야 합니다.
        data_path = os.path.join(self.path, 'train/train_ratings.csv')
        genre_path = os.path.join(self.path, 'train/genres.tsv')
        df = pd.read_csv(data_path)
        genre_data = pd.read_csv(genre_path, sep='\t')


        ############### item based outlier ###############
        # # 아이템 기준 outlier 제거 - 이용율 0.3% 미만인 아이템 제거 (영구히 제거)
        # item_freq_df = (df.groupby('item')['user'].count()/df.user.nunique()).reset_index()
        # item_freq_df.columns = ['item', 'item_freq']
        # # df = df.merge(item_freq_df, on='item').query('item_freq > 0.003')
        # df = df.merge(item_freq_df, on='item').query('item_freq > 0.005')
        # # df = df.merge(item_freq_df, on='item').query('item_freq > 0.01')
        # del df['item_freq'] # 소명을 다하고 삭제! 

        self.ratings_df = df.copy() # for submission
        self.n_train = len(df)

        item_ids = df['item'].unique() # 아이템 고유 번호 리스트
        user_ids = df['user'].unique() # 유저 고유 번호 리스트
        self.n_items, self.n_users = len(item_ids), len(user_ids)
        
        # user, item indexing
        # item2idx = pd.Series(data=np.arange(len(item_ids))+1, index=item_ids) # item re-indexing (1~num_item) ; 아이템을 1부터 설정하는이유? 0을 아무것도 아닌 것으로 blank 하기 위해서
        self.item2idx = pd.Series(data=np.arange(len(item_ids)), index=item_ids) # item re-indexing (0~num_item-1) ; 아이템을 1부터 설정하는이유? 0을 아무것도 아닌 것으로 blank 하기 위해서
        self.user2idx = pd.Series(data=np.arange(len(user_ids)), index=user_ids) # user re-indexing (0~num_user-1)

        # dataframe indexing
        df = pd.merge(df, pd.DataFrame({'item': item_ids, 'item_idx': self.item2idx[item_ids].values}), on='item', how='inner')
        df = pd.merge(df, pd.DataFrame({'user': user_ids, 'user_idx': self.user2idx[user_ids].values}), on='user', how='inner')
        df.sort_values(['user_idx', 'time'], inplace=True)
        genre_data = df.merge(genre_data, on = 'item').copy()
        del df['item'], df['user'], genre_data['item'], genre_data['user']

        self.exist_items = list(df['item_idx'].unique())
        self.exist_users = list(df['user_idx'].unique())

        ############### Used by Sampler ###############
        # # 1. user-based outlier - 상위 20퍼센트 영화를 본 친구들 Weight=0 지정
        # self.user_weights = np.ones_like(self.exist_users)
        # outlier_users = df['user_idx'].unique()[df.groupby('user_idx').item_idx.count()/df['item_idx'].nunique() >= 0.4]
        # self.user_weights[outlier_users] = 0

        t1 = time()
        self.train_items, self.valid_items = {}, {}
        
        items = df.groupby("user_idx")["item_idx"].apply(list) # 유저 아이디 상관 없이, 순서대로 
        if mode == 'train':
            print('Creating interaction Train/ Vaild Split...')
            for uid, item in enumerate(items):            
                num_u_valid_items = min(int(len(item)*0.125), 10) # 유저가 소비한 아이템의 12.5%, 그리고 최대 10개의 데이터셋을 무작위로 Validation Set으로 활용한다.
                ####### Original method : RANDOM #######
                # u_valid_items = np.random.choice(item, size=num_u_valid_items, replace=False)
                # self.valid_items[uid] = u_valid_items
                # self.train_items[uid] = list(set(item) - set(u_valid_items))

                ####### method-1 : Last sequence ####### 마지막 sequence에 있는 정보를 제거
                # u_valid_items = item[-num_u_valid_items:]
                # self.valid_items[uid] = u_valid_items
                # self.train_items[uid] = list(set(item) - set(u_valid_items))

                ####### method-2 : hybrid ####### 마지막꺼:무작위= 1:1
                # num_random = int(num_u_valid_items//2 + num_u_valid_items%2) # 홀수일때는, 무작위로 뽑는것이 1개 더 많게
                # num_last = int(num_u_valid_items - num_random)
                # last_items = item[-num_last:]
                # random_items = np.random.choice(item[:-num_last], size=num_random, replace=False).tolist()
                # u_valid_items = random_items + last_items
                # self.valid_items[uid] = u_valid_items
                # self.train_items[uid] = list(set(item) - set(u_valid_items))

                ####### method-3 : hybrid ####### 마지막꺼:무작위= 6:4
                num_random = np.floor(num_u_valid_items*0.6).astype(int) # 홀수일때는, 무작위로 뽑는것이 1개 더 많게
                num_last = int(num_u_valid_items - num_random)
                last_items = item[-num_last:]
                random_items = np.random.choice(item[:-num_last], size=num_random, replace=False).tolist()
                u_valid_items = random_items + last_items
                self.valid_items[uid] = u_valid_items
                self.train_items[uid] = list(set(item) - set(u_valid_items))




            self.train_data = pd.concat({k: pd.Series(v) for k, v in self.train_items.items()}).reset_index(0)
            self.train_data.columns = ['user', 'item']

            self.valid_data = pd.concat({k: pd.Series(v) for k, v in self.valid_items.items()}).reset_index(0)
            self.valid_data.columns = ['user', 'item']
        
        if mode == 'train_all': #else
            print('Preparing interaction all train set')
            # for uid, item in enumerate(items):            
            #     self.train_items[uid] = item

            # self.train_data = pd.concat({k: pd.Series(v) for k, v in train_items.items()})
            # self.train_data.reset_index(0, inplace=True)
            # self.train_data.columns = ['user', 'item']
            self.train_data = pd.DataFrame()
            self.train_data['user'] = df['user_idx']
            self.train_data['item'] = df['item_idx']

        print('Train/Vaild Split Complete. Takes in', time() - t1, 'sec')
        
        rows, cols = self.train_data['user'], self.train_data['item']
        self.train_input_data = sp.csr_matrix(
            (np.ones_like(rows), (rows, cols)), 
            dtype='float32',
            shape=(self.n_users, self.n_items))
        self.train_input_data = self.train_input_data.toarray()


        print('Making Genre filter ... ')
        genre2item = genre_data.groupby('genre')['item_idx'].apply(set).apply(list)
        # user2genre = genre_data.groupby('user_idx')['genre'].apply(set).apply(list)

        genre_data_freq = genre_data.groupby('user_idx')['genre'].value_counts(normalize=True)
        genre_data_freq_over_5p = genre_data_freq[genre_data_freq > 0.003].reset_index('user_idx')
        genre_data_freq_over_5p.columns = ['user_idx', 'tobedroped']
        genre_data_freq_over_5p = genre_data_freq_over_5p.drop('tobedroped', axis = 1).reset_index()
        user2genre = genre_data_freq_over_5p.groupby('user_idx')['genre'].apply(set).apply(list)

        genre2item_dict = genre2item.to_dict()
        all_set_genre = set(genre_data['genre'].unique())
        user_genre_filter_dict = {}
        for user, genres in tqdm(enumerate(user2genre)):
            unseen_genres = all_set_genre - set(genres) # set
            unseen_genres_item = set(sum([genre2item_dict[genre] for genre in unseen_genres], []))
            user_genre_filter_dict[user] = pd.Series(list(unseen_genres_item), dtype=np.int32)

        user_genre_filter_df = pd.concat(user_genre_filter_dict).reset_index(0)
        user_genre_filter_df.columns = ['user', 'item']
        user_genre_filter_df.index = range(len(user_genre_filter_df))

        rows, cols = user_genre_filter_df['user'], user_genre_filter_df['item']
        self.user_genre_filter = sp.csr_matrix(
            (np.ones_like(rows), (rows, cols)), 
            dtype='float32',
            shape=(self.n_users, self.n_items))
        self.user_genre_filter = self.user_genre_filter.toarray()

    def __len__(self):
        return self.n_users

    def __getitem__(self, idx):
        return self.train_input_data[idx,:]

# Model Define

In [3]:
class TorchEASE:
    def __init__(
        self, train, user_col="user_id", item_col="item_id", score_col=None, reg=250.0, dataset = None):
        """
        :param train: Training DataFrame of user, item, score(optional) values
        :param user_col: Column name for users
        :param item_col: Column name for items
        :param score_col: Column name for scores. Implicit feedback otherwise
        :param reg: Regularization parameter.
                    Change by orders of magnitude to tune (2e1, 2e2, ...,2e4)
        """
        logging.basicConfig(
            format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
            level=logging.INFO,
            datefmt="%Y-%m-%d %H:%M:%S",
            stream=sys.stdout,
        )
        if dataset:
            self.dataset = dataset

        self.logger = logging.getLogger("notebook")
        self.logger.info("Building user + item lookup")
        # How much regularization do you need?
        self.reg = reg

        self.user_col = user_col
        self.item_col = item_col

        self.user_id_col = user_col + "_id"
        self.item_id_col = item_col + "_id"

        self.user_lookup = self.generate_labels(train, self.user_col)
        self.item_lookup = self.generate_labels(train, self.item_col)

        self.item_map = {}
        self.logger.info("Building item hashmap")
        for _item, _item_id in self.item_lookup.values:
            self.item_map[_item_id] = _item

        train = pd.merge(train, self.user_lookup, on=[self.user_col])
        train = pd.merge(train, self.item_lookup, on=[self.item_col])
        self.logger.info("User + item lookup complete")
        self.indices = torch.LongTensor(
            train[[self.user_id_col, self.item_id_col]].values
        )

        if not score_col:
            # Implicit values only
            self.values = torch.ones(self.indices.shape[0])
        else:
            # TODO: Test if score_col works correctly
            self.values = torch.FloatTensor(train[score_col])

        # TODO: Is Sparse the best implementation?
        self.sparse = torch.sparse.FloatTensor(self.indices.t(), self.values)

        self.logger.info("Sparse data built")

    def generate_labels(self, df, col):
        dist_labels = df[[col]].drop_duplicates()
        dist_labels[col + "_id"] = dist_labels[col].astype("category").cat.codes

        return dist_labels

    def fit(self):
        self.logger.info("Building G Matrix")
        G = self.sparse.to_dense().t() @ self.sparse.to_dense()
        G += torch.eye(G.shape[0]) * self.reg

        P = G.inverse()

        self.logger.info("Building B matrix")
        B = P / (-1 * P.diag())
        # Set diagonals to 0. TODO: Use .fill_diag_
        B = B + torch.eye(B.shape[0])

        # Predictions for user `_u` will be self.sparse.to_dense()[_u]@self.B
        self.B = B

        return

    def predict_all(self, pred_df, k=5, remove_owned=True, genre_filter = False):
        """
        :param pred_df: DataFrame of users that need predictions
        :param k: Number of items to recommend to each user
        :param remove_owned: Do you want previously interacted items included?
        :return: DataFrame of users + their predictions in sorted order
        """
        pred_df = pred_df[[self.user_col]].drop_duplicates()
        n_orig = pred_df.shape[0]

        # Alert to number of dropped users in prediction set
        pred_df = pd.merge(pred_df, self.user_lookup, on=[self.user_col])
        n_curr = pred_df.shape[0]
        if n_orig - n_curr:
            self.logger.info(
                "Number of unknown users from prediction data = %i" % (n_orig - n_curr)
            )

        _output_preds = []
        # Select only user_ids in our user data
        _user_tensor = self.sparse.to_dense().index_select(
            dim=0, index=torch.LongTensor(pred_df[self.user_id_col])
        )

        # Make our (raw) predictions
        _preds_tensor = _user_tensor @ self.B
        self.logger.info("Predictions are made")
        if remove_owned:
            # Discount these items by a large factor (much faster than list comp.)
            self.logger.info("Removing owned items")
            _preds_tensor += -1.0 * _user_tensor

        if genre_filter:
            self.logger.info("Removing never seen genre movies")
            _preds_tensor += -1.0 * self.dataset.user_genre_filter


        self.logger.info("TopK selected per user")
        for _preds in _preds_tensor:
            # Very quick to use .topk() vs. argmax()
            _output_preds.append(
                [self.item_map[_id] for _id in _preds.topk(k).indices.tolist()]
            )

        pred_df["predicted_items"] = _output_preds
        self.logger.info("Predictions are returned to user")
        return pred_df

    def score_predictions(self):
        # TODO: Implement this with some common metrics
        return None

# Training

In [4]:
# parameter setting
reg = 600
is_genre_filter = False
score_col=None
k = 10

# dataset setting
dataset = BaseDataset(path = '../data/', mode = 'train')

# model setting
model = TorchEASE(dataset.train_data, user_col='user', item_col='item', score_col=score_col, reg=reg, dataset = dataset)

# fit
model.fit()

Creating interaction Train/ Vaild Split...
Train/Vaild Split Complete. Takes in 20.065412759780884 sec
Making Genre filter ... 


31360it [00:06, 4728.64it/s]


2022-04-14 02:59:31 [INFO] notebook - Building user + item lookup
2022-04-14 02:59:31 [INFO] notebook - Building item hashmap
2022-04-14 02:59:32 [INFO] notebook - User + item lookup complete
2022-04-14 02:59:32 [INFO] notebook - Sparse data built
2022-04-14 02:59:32 [INFO] notebook - Building G Matrix
2022-04-14 02:59:42 [INFO] notebook - Building B matrix


In [5]:
# Validation
output = model.predict_all(dataset.train_data, k=k, genre_filter = is_genre_filter)
solution = output.drop('user_id', axis=1).set_index('user')['predicted_items']
answer = dataset.valid_data.groupby('user')['item'].apply(list)

# Get Recall@10
recall = 0.0
for ans, sol in zip(answer, solution):
    # print(ans, sol)    
    ans_set = set(ans)
    sol_set = set(sol)
    denominator = min(10, len(ans))
    numerator = len(ans_set.intersection(sol_set))
    recall += numerator/denominator
    
recall = recall/len(answer)
print(recall)

2022-04-14 02:59:49 [INFO] notebook - Predictions are made
2022-04-14 02:59:49 [INFO] notebook - Removing owned items
2022-04-14 02:59:49 [INFO] notebook - TopK selected per user
2022-04-14 02:59:50 [INFO] notebook - Predictions are returned to user
0.1588710089690593


In [6]:
###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### 
###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### 
###### ###### ###### ###### ###### ######    submission area   ###### ###### ###### ###### ###### ###### ######
###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### 
###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### ###### 

submission_dataset = BaseDataset(path = '../data/', mode = 'train_all')
submission_model = TorchEASE(submission_dataset.train_data, user_col='user', item_col='item', score_col=None, reg=600)
submission_model.fit()
submission_output = submission_model.predict_all(submission_dataset.train_data, k=10)

submission_output = pd.concat({k: pd.Series(v) for k, v in submission_output.drop('user_id', axis = 1).set_index('user')['predicted_items'].items()}).reset_index(0)
submission_output.columns = ['user', 'item']

idx2item = submission_dataset.item2idx.reset_index()
idx2item.columns = ['item', 'item_idx']
idx2item = idx2item.set_index('item_idx')

idx2user = submission_dataset.user2idx.reset_index()
idx2user.columns = ['user', 'user_idx']
idx2user = idx2user.set_index('user_idx')

submission_output['item'] = submission_output['item'].replace(idx2item.to_dict()['item'])
submission_output['user'] = submission_output['user'].replace(idx2user.to_dict()['user'])
submission_output = submission_output.sort_values('user')
submission_output.to_csv("submission_ease.csv", index=False)

Preparing interaction all train set
Train/Vaild Split Complete. Takes in 2.2489964962005615 sec
Making Genre filter ... 


31360it [00:06, 4637.27it/s]


2022-04-14 03:00:25 [INFO] notebook - Building user + item lookup
2022-04-14 03:00:25 [INFO] notebook - Building item hashmap
2022-04-14 03:00:26 [INFO] notebook - User + item lookup complete
2022-04-14 03:00:26 [INFO] notebook - Sparse data built
2022-04-14 03:00:26 [INFO] notebook - Building G Matrix
2022-04-14 03:00:36 [INFO] notebook - Building B matrix
2022-04-14 03:00:43 [INFO] notebook - Predictions are made
2022-04-14 03:00:43 [INFO] notebook - Removing owned items
2022-04-14 03:00:43 [INFO] notebook - TopK selected per user
2022-04-14 03:00:45 [INFO] notebook - Predictions are returned to user
