In [1]:
import datetime
import os
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
import pytz
import torch
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Load the interaction log from a path relative to the notebook's working
# directory. Later cells read its `user` and `item` columns.
train = pd.read_csv('data/train/train_ratings.csv')

In [3]:
# Build a "<YYYYMMDD>_<HHMMSS>" tag from the current Asia/Seoul wall-clock time;
# the export cell embeds it in the output file name.
korea_timezone = pytz.timezone('Asia/Seoul')
now_korea = datetime.datetime.now(tz=korea_timezone)
now_date, now_hour = (
    now_korea.strftime(fmt) for fmt in ('%Y%m%d', '%H%M%S')
)
date_time = f"{now_date}_{now_hour}"

In [13]:
class EaseModel():
    """EASE-style item-item recommender for implicit (user, item) interactions.

    ``fit`` computes a closed-form item-item weight matrix from the inverse of
    the L2-regularised Gram matrix and caches the dense score matrix.
    ``predict`` turns those scores into per-user top-k lists that skip items
    the user already interacted with.
    """

    def __init__(self):
        # Encoders map raw user/item ids to contiguous matrix indices.
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        """Fit both encoders on ``df`` and return the encoded user/item indices."""
        users = self.user_enc.fit_transform(df.loc[:, 'user'])
        items = self.item_enc.fit_transform(df.loc[:, 'item'])
        return users, items

    def fit(self, df, lambda_: float = 150):
        """Learn the item-item weights and cache scores for every user/item pair.

        Args:
            df: pandas.DataFrame with ``user`` and ``item`` columns, one row
                per observed interaction.
            lambda_: L2-regularisation term added to the diagonal of the
                item-item Gram matrix before inversion.

        Side effects:
            Refits both label encoders and sets ``self.X`` (sparse user-item
            matrix), ``self.B`` (item-item weights with a zero diagonal) and
            ``self.pred`` (``X @ B`` score matrix).
        """
        users, items = self._get_users_and_items(df)
        # Every observed interaction is stored with the same weight of 0.9.
        values = np.full(df.shape[0], 0.9)

        X = csr_matrix((values, (users, items)))
        self.X = X

        G = X.T.dot(X).toarray()
        diag_indices = np.diag_indices(G.shape[0])
        G[diag_indices] += lambda_
        P = np.linalg.inv(G)
        # Divide column j by -P[j, j], then zero the diagonal so that an item
        # never contributes to its own score.
        B = P / (-np.diag(P))
        B[diag_indices] = 0

        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        """Return up to ``k`` unseen items for each requested user.

        Args:
            train: interaction DataFrame with ``user`` and ``item`` columns,
                used to find what each user has already interacted with.
            users: raw user ids to produce recommendations for.
            items: raw item ids allowed as candidates.
            k: maximum number of items returned per user.

        Returns:
            DataFrame with columns ``user`` and ``item`` (raw ids) and
            ``score``. Users with no row in ``train`` get no recommendations;
            if that applies to every requested user the frame is empty.
        """
        items = self.item_enc.transform(items)
        # Work on an explicit copy of the filtered rows so adding the encoded
        # columns does not trigger pandas' SettingWithCopyWarning.
        history = train.loc[train['user'].isin(users)].copy()
        history['ci'] = self.item_enc.transform(history['item'])
        history['cu'] = self.user_enc.transform(history['user'])

        tasks = [
            (user, group, self.pred[user, :], items, k)
            for user, group in history.groupby('cu')
        ]
        if not tasks:
            # pd.concat raises on an empty list; return an empty result instead.
            return pd.DataFrame(columns=['user', 'item', 'score'])

        with Pool(cpu_count()) as p:
            user_preds = p.starmap(self.predict_for_user, tasks)

        df = pd.concat(user_preds)
        df['item'] = self.item_enc.inverse_transform(df['item'])
        df['user'] = self.user_enc.inverse_transform(df['user'])
        return df

    @staticmethod
    def predict_for_user(user, group, pred, items, k):
        """Pick the best-scoring candidate items that one user has not seen.

        Args:
            user: encoded user index, copied into the ``user`` column.
            group: that user's interactions; its ``ci`` column holds the
                encoded indices of already-seen items.
            pred: 1-D score vector indexed by encoded item index.
            items: encoded item indices eligible for recommendation.
            k: number of items to return; clamped to the number of unseen
                candidates.

        Returns:
            DataFrame with columns ``user``, ``item`` (encoded) and ``score``,
            sorted by descending score. Empty when there is no unseen
            candidate or ``k`` is not positive.
        """
        watched = set(group['ci'])
        candidates = [item for item in items if item not in watched]

        # np.argpartition raises when asked for more elements than exist, so
        # never request more than the number of available candidates.
        top_n = min(k, len(candidates))
        if top_n <= 0:
            return pd.DataFrame(
                {
                    "user": np.empty(0, dtype=np.int64),
                    "item": np.empty(0, dtype=np.int64),
                    "score": np.empty(0, dtype=np.float64),
                }
            )

        candidates = np.asarray(candidates)
        scores = np.take(pred, candidates)
        # Unordered positions of the top_n largest scores; sorted below.
        top = np.argpartition(scores, -top_n)[-top_n:]
        r = pd.DataFrame(
            {
                "user": [user] * len(top),
                "item": np.take(candidates, top),
                "score": np.take(scores, top),
            }
        ).sort_values('score', ascending=False)
        return r

    def forward(self):
        """Placeholder; intentionally does nothing."""
        pass

In [15]:
# Fit the recommender on the full interaction log, using fit()'s default
# regularisation strength.
model = EaseModel()
model.fit(train)

In [17]:
# Recommend up to 10 not-yet-seen items for every user in the training log,
# drawing candidates from every item in the training log.
predict = model.predict(train, train['user'].unique(), train['item'].unique(), 10)
# Only the (user, item) pairs are written out; the score column is dropped.
predict = predict.drop('score', axis = 1)
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing (needed on a fresh checkout).
os.makedirs('output', exist_ok=True)
predict.to_csv(f'output/{date_time} EASE.csv', index=False)