In [1]:
from multiprocessing import Pool, cpu_count
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder


class EASE:
    """Closed-form item-item linear recommender (EASE-style).

    ``fit`` builds a sparse user-item matrix from an interaction frame, solves
    for an item-item weight matrix ``B`` with a zero diagonal, and caches the
    dense score matrix ``X @ B`` in ``self.pred``. ``predict`` then selects the
    top-k not-yet-seen items for each requested user.
    """

    def __init__(self):
        # Encoders map raw user/item ids to contiguous row/column indices.
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        """Fit both encoders on ``df`` and return the encoded (users, items) index arrays."""
        users = self.user_enc.fit_transform(df.loc[:, 'user'])
        items = self.item_enc.fit_transform(df.loc[:, 'item'])
        return users, items

    def fit(self, df, lambda_: float = 500, implicit=True):
        """
        Fit the item-item weight matrix and cache the dense score matrix.

        df: pandas.DataFrame with columns 'user', 'item' and (only when
            implicit is False) 'rating'
        lambda_: l2-regularization term added to the diagonal of the Gram matrix
        implicit: if True, ratings are ignored and taken as 1, else ratings
            divided by the maximum rating are used

        Sets ``self.X`` (sparse interactions), ``self.B`` (item-item weights)
        and ``self.pred`` (dense users x items scores).
        """
        users, items = self._get_users_and_items(df)
        if implicit:
            values = np.ones(df.shape[0])
        else:
            values = df['rating'].to_numpy() / df['rating'].max()

        # NOTE(review): repeated (user, item) rows appear to be summed by the
        # sparse constructor - confirm the input has no duplicate interactions.
        X = csr_matrix((values, (users, items)))
        self.X = X

        # Regularised item-item Gram matrix.
        G = X.T.dot(X).toarray()
        diag_idx = np.diag_indices(G.shape[0])
        G[diag_idx] += lambda_
        P = np.linalg.inv(G)
        # Column j is divided by -P[j, j]; the diagonal is then zeroed so an
        # item never contributes to its own score.
        B = P / (-np.diag(P))
        B[diag_idx] = 0

        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        """
        Return the top-k unseen items for each requested user.

        train: interaction frame with 'user' and 'item' columns; used to find
            the items each user has already interacted with
        users: raw user ids to score; ids with no rows in ``train`` produce no output
        items: raw candidate item ids (must be known to the fitted item encoder)
        k: number of recommendations per user

        Returns a DataFrame with columns 'user', 'item' (raw ids) and 'score',
        sorted by descending score within each user. The result is empty when
        none of ``users`` appears in ``train``.
        """
        items = self.item_enc.transform(items)
        # Copy so the helper columns are added to an independent frame rather
        # than to a slice of the caller's ``train`` (avoids SettingWithCopyWarning).
        dd = train.loc[train['user'].isin(users)].copy()
        dd['ci'] = self.item_enc.transform(dd['item'])
        dd['cu'] = self.user_enc.transform(dd['user'])
        tasks = [
            (user, group, self.pred[user, :], items, k)
            for user, group in dd.groupby('cu')
        ]
        if not tasks:
            # pd.concat raises on an empty list; return an empty result instead.
            return pd.DataFrame({'user': [], 'item': [], 'score': []})
        with Pool(cpu_count()) as p:
            user_preds = p.starmap(self.predict_for_user, tasks)
        df = pd.concat(user_preds)
        df['item'] = self.item_enc.inverse_transform(df['item'])
        df['user'] = self.user_enc.inverse_transform(df['user'])
        return df

    @staticmethod
    def predict_for_user(user, group, pred, items, k):
        """
        Pick the k highest-scoring candidate items for one user.

        user: encoded user index, repeated in the output 'user' column
        group: that user's rows; its 'ci' column holds already-seen encoded items
        pred: 1-D score vector indexed by encoded item
        items: encoded candidate item indices
        k: number of items to return; clamped to the number of unseen candidates

        Returns a DataFrame with columns 'user', 'item', 'score' sorted by
        descending score. It is empty when k <= 0 or no candidate is left.
        """
        items = np.asarray(items)
        candidates = items[~np.isin(items, group['ci'].to_numpy())]
        # argpartition raises when k exceeds the array length, and ``[-0:]``
        # would select everything, so clamp k and short-circuit.
        k = min(k, candidates.size)
        if k <= 0:
            return pd.DataFrame(
                {
                    "user": np.array([], dtype=np.intp),
                    "item": np.array([], dtype=np.intp),
                    "score": np.array([], dtype=float),
                }
            )
        scores = np.take(pred, candidates)
        top = np.argpartition(scores, -k)[-k:]
        return pd.DataFrame(
            {
                "user": [user] * len(top),
                "item": candidates[top],
                "score": scores[top],
            }
        ).sort_values('score', ascending=False)

In [2]:
# Load the raw training interactions from the competition data directory.
inters = pd.read_csv(
    filepath_or_buffer="/opt/ml/input/data/train/train_ratings.csv",
)

In [3]:
# The model only needs the user/item pairs, so discard the timestamp column.
# errors='ignore' keeps this cell idempotent: re-running it after 'time' has
# already been removed no longer raises KeyError.
inters = inters.drop(columns='time', errors='ignore')

In [4]:
# Train the recommender on every interaction (implicit feedback, lambda = 500).
print('### Fit...')
ease = EASE()
ease.fit(df=inters, lambda_=500)
print('### Fit Done!')

### Fit...
### Fit Done!


In [5]:
# Recommend k items to every user seen in training, drawing candidates from
# every item seen in training.
k = 10
users = inters.loc[:, 'user'].unique()
items = inters.loc[:, 'item'].unique()

print('### Pred...')
preds = ease.predict(train=inters, users=users, items=items, k=k)
print('### Pred done!')

### Pred...
### Pred done!


In [6]:
# Inspect the recommendations: one row per (user, item) with its score.
preds

Unnamed: 0,user,item,score
8,11,4370,0.949198
9,11,4886,0.858283
7,11,40815,0.704758
6,11,47,0.678571
5,11,32587,0.667589
...,...,...,...
3,138493,5349,0.599230
2,138493,8970,0.570437
4,138493,32587,0.565791
1,138493,4022,0.546019


In [12]:
# Export the recommendations twice: once with scores (for later inspection)
# and once as the plain user/item submission file.
output_dir = Path('./output/EASE')
# to_csv does not create missing directories, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)

preds = preds.reset_index(drop=True)
# On a re-run 'score' has already been dropped from preds; skip the scored
# export then, rather than overwriting it with a score-less frame.
if 'score' in preds.columns:
    preds.to_csv(output_dir / 'EASE_Ver_0_1_5_submission_with_item_scores.csv', index=False)
# errors='ignore' keeps the cell idempotent when 'score' is already gone.
preds = preds.drop(columns='score', errors='ignore')
preds.to_csv(output_dir / 'EASE_Ver_0_1_5_submission.csv', index=False)

In [13]:
# Display preds again after the export cell, which drops the 'score' column.
# NOTE(review): the saved output under this cell still lists 'score', so it
# looks stale (execution counts also jump from 6 to 12) - re-run to confirm.
preds

Unnamed: 0,user,item,score
0,11,4370,0.949198
1,11,4886,0.858283
2,11,40815,0.704758
3,11,47,0.678571
4,11,32587,0.667589
...,...,...,...
313595,138493,5349,0.599230
313596,138493,8970,0.570437
313597,138493,32587,0.565791
313598,138493,4022,0.546019
