In [1]:
import sys
import os
import numpy as np
import pandas as pd
from datetime import date
from tqdm import tqdm

from recommenders.utils.timer import Timer
from recommenders.datasets.split_utils import min_rating_filter_pandas
from recommenders.datasets.python_splitters import numpy_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.datasets.pandas_df_utils import filter_by, negative_feedback_sampler
from recommenders.datasets.python_splitters import python_stratified_split

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))

System version: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
Pandas version: 1.4.1


In [None]:
# from scipy.sparse import csr_matrix
# import numpy as np
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder
# from tqdm import tqdm


# class EASE:
#     def __init__(self):
#         self.user_enc = LabelEncoder()
#         self.item_enc = LabelEncoder()

#     def _get_users_and_items(self, df):
#         users = self.user_enc.fit_transform(df.loc[:, 'userID'])
#         items = self.item_enc.fit_transform(df.loc[:, 'itemID'])
#         return users, items

#     def fit(self, df, lambda_: float = 0.5, implicit=True):
#         """
#         df: pandas.DataFrame with columns user_id, item_id and (rating)
#         lambda_: l2-regularization term
#         implicit: if True, ratings are ignored and taken as 1, else normalized ratings are used
#         """
#         users, items = self._get_users_and_items(df)
#         values = np.ones(df.shape[0]) if implicit else df['rating'].to_numpy() / df['rating'].max()

#         X = csr_matrix((values, (users, items)))
#         self.X = X

#         G = X.T.dot(X).toarray()
#         diagIndices = np.diag_indices(G.shape[0])
#         G[diagIndices] += lambda_
#         P = np.linalg.inv(G)
#         B = P / (-np.diag(P))
#         B[diagIndices] = 0

#         self.B = B
#         self.pred = X.dot(B)

#     def predict(self, train, users, items, k):
#         df = pd.DataFrame()
#         items = self.item_enc.transform(items)
#         dd = train.loc[train.userID.isin(users)]
#         dd['ci'] = self.item_enc.transform(dd.itemID)
#         dd['cu'] = self.user_enc.transform(dd.userID)
#         g = dd.groupby('userID')
#         for user, group in tqdm(g):
#             watched = set(group['ci'])
#             candidates = [item for item in items if item not in watched]
#             u = group['cu'].iloc[0]
#             pred = np.take(self.pred[u, :], candidates)
#             res = np.argpartition(pred, -k)[-k:]
#             r = pd.DataFrame({
#                 "userID": [user] * len(res),
#                 "itemID": np.take(candidates, res),
#                 "score": np.take(pred, res)
#             }).sort_values('score', ascending=False)
#             df = df.append(r, ignore_index=True)
#         df['itemID'] = self.item_enc.inverse_transform(df['itemID'])
#         return df

In [37]:
from scipy.sparse import csr_matrix
import scipy.sparse as sp
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


class EASE:
    def __init__(self):
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        users = self.user_enc.fit_transform(df.loc[:, 'userID'])
        items = self.item_enc.fit_transform(df.loc[:, 'itemID'])
        return users, items

    def fit(self, df, lambda_: float = 0.5, implicit=True):
        """
        df: pandas.DataFrame with columns user_id, item_id and (rating)
        lambda_: l2-regularization term
        implicit: if True, ratings are ignored and taken as 1, else normalized ratings are used
        """
        users, items = self._get_users_and_items(df)
        values = np.ones(df.shape[0]) if implicit else df['rating'].to_numpy() / df['rating'].max()

        X = csr_matrix((values, (users, items))).astype(np.float32)
        self.X = X

        G = X.T.dot(X).toarray()
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = np.linalg.inv(G)
        B = P / (-np.diag(P))
        B[diagIndices] = 0
#         reg_weight = 250.

#         G = X.T @ X

#         # add reg to diagonal
#         G += reg_weight * sp.identity(G.shape[0])

#         # convert to dense because inverse will be dense
#         G = G.todense()

#         # invert. this takes most of the time
#         P = np.linalg.inv(G)
#         B = P / (-np.diag(P))
#         # zero out diag
#         np.fill_diagonal(B, 0.)

        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        df = pd.DataFrame()
        items = self.item_enc.transform(items)
        dd = train.loc[train.userID.isin(users)]
        dd['ci'] = self.item_enc.transform(dd.itemID)
        dd['cu'] = self.user_enc.transform(dd.userID)
        g = dd.groupby('userID')
        for user, group in tqdm(g):
            watched = set(group['ci'])
            last_year = group['rated_year'].max()
            years = dd[dd['year'] > last_year]['ci'].unique()
            watched.update(years)
            candidates = [item for item in items if item not in watched]
            u = group['cu'].iloc[0]
            pred = np.take(self.pred[u, :], candidates)
            res = np.argpartition(pred, -k)[-k:]
            r = pd.DataFrame({
                "userID": [user] * len(res),
                "itemID": np.take(candidates, res),
                "score": np.take(pred, res)
            }).sort_values('score', ascending=False)
            df = df.append(r, ignore_index=True)
        df['itemID'] = self.item_enc.inverse_transform(df['itemID'])
        return df

In [23]:
from scipy.sparse import csr_matrix
import scipy.sparse as sp
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


class EASE:
    def __init__(self):
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        users = self.user_enc.fit_transform(df.loc[:, 'userID'])
        items = self.item_enc.fit_transform(df.loc[:, 'itemID'])
        return users, items

    def fit(self, df, lambda_: float = 0.5, implicit=True):
        """
        df: pandas.DataFrame with columns user_id, item_id and (rating)
        lambda_: l2-regularization term
        implicit: if True, ratings are ignored and taken as 1, else normalized ratings are used
        """
        users, items = self._get_users_and_items(df)
        values = np.ones(df.shape[0]) if implicit else df['rating'].to_numpy() / df['rating'].max()

        X = csr_matrix((values, (users, items)))
        self.X = X

        reg_weight = 250.

        G = X.T @ X

        # add reg to diagonal
        G += reg_weight * sp.identity(G.shape[0])

        # convert to dense because inverse will be dense
        G = G.todense()

        # invert. this takes most of the time
        P = np.linalg.inv(G)
        B = P / (-np.diag(P))
        # zero out diag
        np.fill_diagonal(B, 0.)

        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        df = pd.DataFrame()
        items = self.item_enc.transform(items)
        dd = train.loc[train.userID.isin(users)]
        dd['ci'] = self.item_enc.transform(dd.itemID)
        dd['cu'] = self.user_enc.transform(dd.userID)
        g = dd.groupby('userID')
        for user, group in tqdm(g):
            watched = set(group['ci'])
            candidates = [item for item in items if item not in watched]
            u = group['cu'].iloc[0]
            pred = np.take(self.pred[u, :], candidates)
            res = np.argpartition(pred, -k)[-k:]
            r = pd.DataFrame({
                "userID": [user] * len(res),
                "itemID": np.take(candidates, res),
                "score": np.take(pred, res)
            }).sort_values('score', ascending=False)
            df = df.append(r, ignore_index=True)
        df['itemID'] = self.item_enc.inverse_transform(df['itemID'])
        return df

In [3]:
# top k items to recommend
TOP_K = 10

# Dataset

In [4]:
COL_USER = "userID"
COL_ITEM = "itemID"
COL_RATING = "rating"
COL_PREDICTION = "rating"
COL_TIMESTAMP = "timestamp"

root_dir = '/opt/ml/input/data/train/'
df = pd.read_csv(os.path.join(root_dir,'train_ratings.csv'), names=[COL_USER, COL_ITEM, COL_TIMESTAMP], header=0)
df.head()

Unnamed: 0,userID,itemID,timestamp
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563


In [5]:
df['rating'] = 1

In [6]:
tqdm.pandas()
df['rated_year'] = df['timestamp'].progress_apply(lambda x: date.fromtimestamp(x).year)

100%|██████████| 5154471/5154471 [00:07<00:00, 712179.77it/s]


In [9]:
df

Unnamed: 0,userID,itemID,timestamp,rating,rated_year
0,11,4643,1230782529,1,2009
1,11,170,1230782534,1,2009
2,11,531,1230782539,1,2009
3,11,616,1230782542,1,2009
4,11,2140,1230782563,1,2009
...,...,...,...,...,...
5154466,138493,44022,1260209449,1,2009
5154467,138493,4958,1260209482,1,2009
5154468,138493,68319,1260209720,1,2009
5154469,138493,40819,1260209726,1,2009


In [8]:
years = pd.read_csv('./years_fixed.tsv', sep='\t', names=[COL_ITEM, 'year'], header=0)

In [10]:
df = pd.merge(df, years, how='left', on='itemID')

In [11]:
df

Unnamed: 0,userID,itemID,timestamp,rating,rated_year,year
0,11,4643,1230782529,1,2009,2001
1,11,170,1230782534,1,2009,1995
2,11,531,1230782539,1,2009,1993
3,11,616,1230782542,1,2009,1970
4,11,2140,1230782563,1,2009,1982
...,...,...,...,...,...,...
5154466,138493,44022,1260209449,1,2009,2006
5154467,138493,4958,1260209482,1,2009,2001
5154468,138493,68319,1260209720,1,2009,2009
5154469,138493,40819,1260209726,1,2009,2005


In [12]:
train, test = python_stratified_split(df, ratio=0.75)

# EASE

In [38]:
model = EASE()

In [39]:
# inference할 때는 전체 데이터(df)로 학습 필요
# evaluation할 때는 train data로만 학습
model.fit(train)

In [40]:
users = df['userID'].unique()
items = df['itemID'].unique()

In [41]:
print(len(users), len(items))

31360 6807


In [42]:
model.pred.shape

(31360, 6807)

# Inference

In [43]:
# inference할 때는 전체 데이터(df)로 inference 필요
# evaluation할 때는 train data로 inference
predictions = model.predict(train, users, items, 10)

  df = df.append(r, ignore_index=True)
  0%|          | 71/31360 [00:03<25:38, 20.34it/s] 


KeyboardInterrupt: 

# Evaluation

In [163]:
predictions = predictions.rename({"score": "prediction"}, axis=1)

### 연도 + 1 이후로 추천

In [154]:
eval_recall = recall_at_k(test, predictions, k=TOP_K)
print("Recall@K:\t%f" % eval_recall)

Recall@K:	0.126524


### 해당 연도 이후 추천 안함

In [145]:
eval_recall = recall_at_k(test, predictions, k=TOP_K)
print("Recall@K:\t%f" % eval_recall)

Recall@K:	0.126577


### 연도 상관없이 다 추천

In [32]:
eval_recall = recall_at_k(test, predictions, k=TOP_K)
print("Recall@K:\t%f" % eval_recall)

Recall@K:	0.126496


# Submission

In [164]:
submission_df = predictions[['userID', 'itemID']]
submission_df.rename(columns = {"userID": "user", "itemID": "item"}, inplace=True)
submission_df.to_csv('/opt/ml/input/submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df.rename(columns = {"userID": "user", "itemID": "item"}, inplace=True)


# Hyperparameter

In [None]:
df['rating'] = 1
hyperparameter = [0.1, 0.3, 0.5, 0.7, 0.9]
for param in hyperparameter:
    model = EASE()
    model.fit(train, lambda_=param)
    predictions = model.predict(train, users, items, 10)
    predictions = predictions.rename({"score": "prediction"}, axis=1)
    eval_recall = recall_at_k(test, predictions, k=TOP_K)
    print("Lambda: {} / Recall@K:\t{}".format(param, eval_recall))

  df = df.append(r, ignore_index=True)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31360/31360 [10:56<00:00, 47.79it/s]


Lambda: 0.1 / Recall@K:	0.12586906030068523


  df = df.append(r, ignore_index=True)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31360/31360 [11:13<00:00, 46.56it/s]


Lambda: 0.3 / Recall@K:	0.12616284261738195


  df = df.append(r, ignore_index=True)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31360/31360 [10:53<00:00, 47.97it/s]
 21%|████████████████████████████████▊                                                                                                                           | 6591/31360 [02:13<08:38, 47.79it/s]