In [1]:
import numpy as np
import pandas as pd
import torch
from scipy.sparse import csr_matrix
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool, cpu_count
# Directory that holds the project's CSV inputs.
path = '/opt/ml/input/project/model/data/'

# Training interactions; the bare name on the last line renders the frame
# as the cell output.
train = pd.read_csv(filepath_or_buffer=path + 'S_train.csv')
train


Unnamed: 0,user,item,userid,rest
0,0,3,5b62e8065fffc95678a5a628,838819922
1,0,4,5b62e8065fffc95678a5a628,36466212
2,0,5,5b62e8065fffc95678a5a628,21900266
3,0,6,5b62e8065fffc95678a5a628,1756032132
4,0,7,5b62e8065fffc95678a5a628,11623967
...,...,...,...,...
98620,10083,722,636275962c777da5c98d7695,1679960108
98621,10083,344,636275962c777da5c98d7695,37921873
98622,10083,816,636275962c777da5c98d7695,1220094431
98623,10083,634,636275962c777da5c98d7695,31510379


In [2]:
# Note cell: a bare string that is echoed back as the cell output.
# English translation of the note:
#   "train => a CSV of 5154471 rows × 3 columns made up of user, item, time"
# NOTE(review): the frame displayed by the loading cell shows the columns
# user, item, userid, rest and a last row index of 98623, so this
# description looks stale — confirm against the actual data.
'''
train => user, item, time 으로 이루어진 
5154471 rows × 3 columns 크기의 csv
'''

'\ntrain => user, item, time 으로 이루어진 \n5154471 rows × 3 columns 크기의 csv\n'

In [3]:
class EASE:
    """Closed-form item-item linear recommender (EASE-style).

    ``fit`` builds a user-item matrix ``X``, inverts the regularised Gram
    matrix ``X^T X + lambda * I`` and derives an item-item weight matrix ``B``
    with a zero diagonal; every (user, item) score is then ``X @ B``.

    Attributes set by ``fit``:
        X: scipy CSR user-item matrix (rows = encoded users, cols = encoded items).
        B: dense item-item weight matrix, diagonal forced to 0.
        pred: dense score matrix ``X @ B``.
    """

    def __init__(self):
        # Encoders that map raw user / item ids to integer indices.
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        """Fit both encoders on ``df`` and return (encoded users, encoded items)."""
        users = self.user_enc.fit_transform(df.loc[:, 'user'])
        items = self.item_enc.fit_transform(df.loc[:, 'item'])
        return users, items

    def fit(self, df, lambda_: float = 500, implicit=True):
        """Fit the item-item weights on the interactions in ``df``.

        df: pandas.DataFrame with columns ``user`` and ``item``; a ``rating``
            column is additionally required when ``implicit`` is False.
        lambda_: l2-regularization term added to the diagonal of the Gram matrix.
        implicit: if True, ratings are ignored and every row contributes 1,
            else ratings divided by the maximum rating are used.

        Note: ``csr_matrix`` sums entries that share the same (user, item)
        coordinates, so repeated rows in ``df`` produce values above 1 even
        in implicit mode.
        """
        users, items = self._get_users_and_items(df)
        values = (
            np.ones(df.shape[0])
            if implicit
            else df['rating'].to_numpy() / df['rating'].max()
        )

        X = csr_matrix((values, (users, items)))
        self.X = X

        # Dense (n_items x n_items) Gram matrix, regularised on the diagonal.
        G = X.T.dot(X).toarray()
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = np.linalg.inv(G)
        # Column j of P is divided by -P[j, j]; self-similarity is then removed.
        B = P / (-np.diag(P))
        B[diagIndices] = 0

        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        """Return the top-``k`` unseen items for each requested user.

        train: interactions frame (columns ``user`` and ``item``) used to
            exclude items a user has already interacted with.
        users: raw user ids to produce recommendations for.
        items: raw item ids that may be recommended.
        k: number of items to return per user (fewer are returned when a user
            has fewer than ``k`` unseen candidates).

        Returns a DataFrame with columns ``user``, ``item`` (both as raw ids)
        and ``score``; it is empty when none of ``users`` occurs in ``train``.
        """
        items = self.item_enc.transform(items)
        # Work on a copy: the helper columns must not be written into a slice
        # of the caller's frame (this is what triggers SettingWithCopyWarning).
        dd = train.loc[train['user'].isin(users)].copy()
        dd['ci'] = self.item_enc.transform(dd['item'])
        dd['cu'] = self.user_enc.transform(dd['user'])
        tasks = [
            (user, group, self.pred[user, :], items, k)
            for user, group in dd.groupby('cu')
        ]
        if not tasks:
            # pd.concat raises on an empty list; return an empty result instead.
            return pd.DataFrame(columns=['user', 'item', 'score'])
        with Pool(cpu_count()) as p:
            user_preds = p.starmap(self.predict_for_user, tasks)
        df = pd.concat(user_preds)
        df['item'] = self.item_enc.inverse_transform(df['item'])
        df['user'] = self.user_enc.inverse_transform(df['user'])
        return df

    @staticmethod
    def predict_for_user(user, group, pred, items, k):
        """Rank the unseen candidate items of a single user.

        user: encoded user index, copied into the ``user`` column of the result.
        group: that user's interactions; its ``ci`` column holds encoded item
            indices to exclude.
        pred: 1-D array of scores for this user, indexed by encoded item.
        items: encoded item indices that may be recommended.
        k: maximum number of rows to return.

        Returns a DataFrame with columns ``user``, ``item`` and ``score``,
        sorted by descending score, holding at most ``k`` rows.
        """
        watched = set(group['ci'])
        # Explicit integer dtype keeps an empty candidate list usable as an index.
        candidates = np.asarray(
            [item for item in items if item not in watched], dtype=np.int64
        )
        # argpartition rejects k larger than the array, so clamp it.
        k = min(k, candidates.size)
        if k <= 0:
            return pd.DataFrame(
                {
                    "user": np.array([], dtype=np.int64),
                    "item": np.array([], dtype=np.int64),
                    "score": np.array([], dtype=float),
                }
            )
        scores = np.take(pred, candidates)
        # Unordered top-k positions; the final ordering comes from sort_values.
        top = np.argpartition(scores, -k)[-k:]
        r = pd.DataFrame(
            {
                "user": [user] * len(top),
                "item": np.take(candidates, top),
                "score": np.take(scores, top),
            }
        ).sort_values('score', ascending=False)
        return r
        


In [4]:
# Fit on the training interactions, then ask for the top-3 unseen items of
# every user over the full item catalogue; only (user, item) pairs are kept.
model = EASE()
model.fit(train)
predict = model.predict(
    train,
    train['user'].unique(),
    train['item'].unique(),
    3,
).drop(columns='score')


In [6]:
# Persist the recommendations without the positional index column.
predict.to_csv(path_or_buf='../output/ease_S.csv', index=False)

# Test (Recall@K)

In [7]:
# Held-out interactions used as ground truth, and the saved recommendations.
answer = pd.read_csv(filepath_or_buffer='/opt/ml/input/project/model/data/S_test.csv')
predict = pd.read_csv(filepath_or_buffer='/opt/ml/input/project/model/output/' + 'ease_S.csv')



In [8]:
# One list of item ids per user, indexed by user id: first for the
# ground truth, then for the recommendations.
answer_user = answer.groupby(by='user')['item'].apply(list)
predict_user = predict.groupby(by='user')['item'].apply(list)


In [9]:
# Per-user recall: the fraction of each user's held-out items that appear in
# that user's recommendation list.
_recall = []

for user, ans in answer_user.items():
    # Look recommendations up by user id (the Series index label) rather than
    # by loop position; a user without recommendations scores 0 instead of
    # raising KeyError.
    recommended = set(predict_user.get(user, []))
    hits = sum(1 for j in ans if j in recommended)
    # Normalise by this user's own number of held-out items instead of a
    # hard-coded 2, so users with a different count are scored correctly.
    _recall.append(hits / len(ans))

In [10]:
# Average the per-user recall values into a single score and display it.
n_users = len(_recall)
recall = sum(_recall) / n_users
recall

0.07566441888139627

In [None]:
# NOTE(review): `test_score`, `args`, `epoch` and `train_dataloader` are not
# defined anywhere in the visible notebook, and this cell has no execution
# count (In [None]). It looks like a leftover from another training script
# and would raise NameError on Restart & Run All — confirm or remove.
scores = test_score(args, epoch, train_dataloader, model)
print("recall_k = ", scores)