In [1]:
import numpy as np
import pandas as pd
import torch
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool, cpu_count
import random



In [2]:
path = '/opt/ml/input/data/train/'

train = pd.read_csv(path + 'train_ratings.csv')


In [3]:
train['valid'] = False

In [4]:
train

Unnamed: 0,user,item,time,valid
0,11,4643,1230782529,False
1,11,170,1230782534,False
2,11,531,1230782539,False
3,11,616,1230782542,False
4,11,2140,1230782563,False
...,...,...,...,...
5154466,138493,44022,1260209449,False
5154467,138493,4958,1260209482,False
5154468,138493,68319,1260209720,False
5154469,138493,40819,1260209726,False


In [5]:
tmp = train.groupby('user').sample(3)

In [6]:
train['valid'][tmp.index] = True


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['valid'][tmp.index] = True


In [7]:
valid = train[train['valid'] == True]
train = train[train['valid'] == False]
valid = valid.drop(['time','valid'],axis = 1)

In [8]:
class EASE:
    def __init__(self):
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        users = self.user_enc.fit_transform(df.loc[:, 'user'])
        items = self.item_enc.fit_transform(df.loc[:, 'item'])
        return users, items

    def fit(self, df, lambda_: float = 0.5, implicit=True):
        """
        df: pandas.DataFrame with columns user_id, item_id and (rating)
        lambda_: l2-regularization term
        implicit: if True, ratings are ignored and taken as 1, else normalized ratings are used
        """
        users, items = self._get_users_and_items(df)
        values = (
            np.ones(df.shape[0])
            if implicit
            else df['rating'].to_numpy() / df['rating'].max()
        )

        X = csr_matrix((values, (users, items)))
        self.X = X

        G = X.T.dot(X).toarray()
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = np.linalg.inv(G)
        B = P / (-np.diag(P))
        B[diagIndices] = 0

        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        items = self.item_enc.transform(items)
        dd = train.loc[train.user.isin(users)]
        dd['ci'] = self.item_enc.transform(dd.item)
        dd['cu'] = self.user_enc.transform(dd.user)
        g = dd.groupby('cu')
        with Pool(cpu_count()) as p:
            user_preds = p.starmap(
                self.predict_for_user,
                [(user, group, self.pred[user, :], items, k) for user, group in g],
            )
        df = pd.concat(user_preds)
        df['item'] = self.item_enc.inverse_transform(df['item'])
        df['user'] = self.user_enc.inverse_transform(df['user'])
        return df

    @staticmethod
    def predict_for_user(user, group, pred, items, k):
        watched = set(group['ci'])
        candidates = [item for item in items if item not in watched]
        pred = np.take(pred, candidates)
        res = np.argpartition(pred, -k)[-k:]
        r = pd.DataFrame(
            {
                "user": [user] * len(res),
                "item": np.take(candidates, res),
                "score": np.take(pred, res),
            }
        ).sort_values('score', ascending=False)
        return r

In [9]:
model = EASE()


In [10]:
best = 0
for i in range(1,10):
    print(i/10)
    model.fit(train,i/10)
    predict = model.predict(train,train['user'].unique(),train['item'].unique(),10)
    predict = predict.drop('score',axis = 1)
    loss = 0
    for val,pre in zip(valid.groupby('user'),predict.groupby('user')):
        #print(val[1]['item'])
        #print(pre[1]['item'])
        #print(val[1]['item'].isin(pre[1]['item']).sum())
        loss += val[1]['item'].isin(pre[1]['item']).sum()

    if loss != 0: 
        loss = loss / len(valid)
    if best <loss:
        best = loss
        best_predict = predict
    print("lambda = ", i/10, " loss = ", loss)
best_predict.to_csv('../output/ease.csv', index=False)


0.1
lambda =  0.1  loss =  0.1992878401360544
0.2
