In [3]:
import pandas as pd
import numpy as np
import os
import time

from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool, cpu_count

In [4]:
import numpy as np

# This is not a deep-learning model, so it does not need to inherit from nn.Module.
class EASE_base:
    """Closed-form EASE item-item recommender.

    Training solves a ridge-regularised item-item regression in closed form:
    ``P = (X'X + lambda * I)^-1`` and ``B_ij = -P_ij / P_jj`` with a zero
    diagonal. It is not a deep-learning model, so there is no need to inherit
    from ``nn.Module``.

    After ``train`` the instance holds:
      * ``X``    - sparse user x item interaction matrix,
      * ``B``    - dense item x item weight matrix,
      * ``pred`` - dense user x item score matrix ``X @ B``.
    """

    def __init__(self, _lambda):
        """Create an untrained model.

        Args:
            _lambda: Value added to the diagonal of ``X'X`` before inversion
                (the regularisation strength).
        """
        self.B = None
        self._lambda = _lambda
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def train(self, df):
        """Fit the encoders and compute ``B`` and the full score matrix.

        Args:
            df: DataFrame with ``'user'`` and ``'item'`` columns, one row per
                interaction.
        """
        X = self.generate_rating_matrix(df)
        self.X = X
        G = X.T.dot(X).toarray()  # G = X'X
        G[np.diag_indices_from(G)] += self._lambda  # X'X + lambda * I
        P = np.linalg.inv(G)  # P = (X'X + lambda * I)^(-1)

        # Broadcasting divides column j by -P_jj, i.e. B_ij = -P_ij / P_jj.
        B = P / -np.diag(P)
        # An item must not be used to predict itself.
        np.fill_diagonal(B, 0)
        self.B = B
        self.pred = X.dot(B)

    def generate_rating_matrix(self, df):
        """Fit both label encoders on ``df`` and build the interaction matrix.

        Args:
            df: DataFrame with ``'user'`` and ``'item'`` columns.

        Returns:
            A ``csr_matrix`` with one row per encoded user and one column per
            encoded item. Every row of ``df`` contributes a 1.0; per the SciPy
            documentation duplicate (user, item) pairs are summed, so repeated
            interactions yield values greater than 1.
        """
        users = self.user_enc.fit_transform(df.loc[:, 'user'])
        items = self.item_enc.fit_transform(df.loc[:, 'item'])
        data = np.ones(df.shape[0])
        return csr_matrix((data, (users, items)))

    def forward(self, df, top_k, n_jobs=None):
        """Recommend up to ``top_k`` unseen items for every user in ``df``.

        Args:
            df: DataFrame with ``'user'`` and ``'item'`` columns. Its values
                are passed to the encoders fitted in ``train``; ``df`` itself
                is not modified.
            top_k: Maximum number of items to return per user.
            n_jobs: Number of worker processes. ``None`` (default) uses
                ``cpu_count()`` as before; ``1`` runs in the current process,
                which is useful where multiprocessing is unavailable.

        Returns:
            DataFrame with columns ``user``, ``item`` (original ids) and
            ``score``; rows for each user are sorted by descending score.
        """
        items = self.item_enc.transform(df['item'].unique())
        # Build a fresh two-column frame instead of writing into a slice of
        # ``df``: this avoids SettingWithCopyWarning and keeps the per-user
        # payload sent to worker processes small.
        labelled = pd.DataFrame(
            {
                'label_user': self.user_enc.transform(df['user']),
                'label_item': self.item_enc.transform(df['item']),
            }
        )
        tasks = [
            (user, group, self.pred[user, :], items, top_k)
            for user, group in labelled.groupby('label_user')
        ]

        if n_jobs == 1 or not tasks:
            user_preds = [self.predict_by_user(*task) for task in tasks]
        else:
            with Pool(n_jobs or cpu_count()) as p:
                user_preds = p.starmap(self.predict_by_user, tasks)

        if not user_preds:
            # pd.concat raises on an empty list; return an empty result instead.
            return pd.DataFrame(columns=['user', 'item', 'score'])

        pred_df = pd.concat(user_preds)
        pred_df['user'] = self.user_enc.inverse_transform(pred_df['user'])
        pred_df['item'] = self.item_enc.inverse_transform(pred_df['item'])
        return pred_df

    @staticmethod
    def predict_by_user(user, group, pred, items, top_k):
        """Pick the highest-scoring unseen items for a single user.

        Args:
            user: Encoded user id, copied into the ``user`` column.
            group: DataFrame whose ``'label_item'`` column lists the encoded
                items the user has already interacted with.
            pred: 1-D score array indexed by encoded item id.
            items: Encoded item ids eligible for recommendation.
            top_k: Maximum number of items to return. If fewer unseen items
                exist, all of them are returned; values <= 0 return no rows.

        Returns:
            DataFrame with columns ``user``, ``item``, ``score`` sorted by
            descending score.
        """
        items = np.asarray(items)
        watched = group['label_item'].to_numpy()
        # Keep only unseen items, preserving the order of ``items``.
        candidates = items[~np.isin(items, watched)]
        scores = np.take(pred, candidates)

        # Clamp k: np.argpartition raises when k exceeds the array length.
        k = max(0, min(top_k, len(candidates)))
        if k > 0:
            # Indices of the k largest scores (unordered); sorted below.
            top = np.argpartition(scores, -k)[-k:]
        else:
            top = np.empty(0, dtype=np.intp)

        return pd.DataFrame(
            {
                "user": np.full(len(top), user),
                "item": candidates[top],
                "score": scores[top],
            }
        ).sort_values('score', ascending=False)

In [5]:
# Input data location and output directory for the prediction file.
path = '/opt/ml/input/data/train/'
dir_path = 'output/'

# Interaction log passed to the model below.
train = pd.read_csv(os.path.join(path, 'train_ratings.csv'))

# Regularisation strength for the model and number of items to recommend per user.
lambda_ = 400
top = 10

In [6]:
# Build an untrained model with the configured regularisation strength.
model = EASE_base(_lambda=lambda_)

In [7]:
# Fit the encoders and compute the item-item weights from the full interaction log.
model.train(df=train)

In [8]:
# Recommend `top` unseen items per user; the score column is dropped before saving.
predict = model.forward(train, top).drop(columns='score')

In [9]:
# exist_ok=True replaces the check-then-create pattern: it is race-free and
# keeps the cell safe to re-run.
os.makedirs(dir_path, exist_ok=True)
predict.to_csv(
    os.path.join(dir_path, 'my_ease_{}_{}.csv'.format(lambda_, top)),
    index=False,
)