In [4]:
import numpy as np
import pandas as pd
import torch
from scipy.sparse import csr_matrix
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool, cpu_count
path = '/opt/ml/input/project/model/data/'

train = pd.read_csv(path + 'G_train2.csv')   
train

Unnamed: 0,userid,rest,user_code,rest_code
0,5b61c7658f8242cb2a1b1028,1.172647e+07,166651,2073
1,5b61c7658f8242cb2a1b1028,2.140700e+07,166651,1604
2,5b61c7658f8242cb2a1b1028,1.176442e+07,166651,1160
3,5b61c7658f8242cb2a1b1028,1.987767e+07,166651,1571
4,5b61c7658f8242cb2a1b1028,1.100141e+09,166651,658
...,...,...,...,...
338124,6396b0ffadfbb231c28da55d,1.551480e+09,320304,2235
338125,6396b0ffadfbb231c28da55d,1.016763e+09,320304,2166
338126,6396b0ffadfbb231c28da55d,2.062259e+07,320304,2324
338127,6396b0ffadfbb231c28da55d,1.357503e+07,320304,2100


In [None]:
'''
train => user, item, time 으로 이루어진 
5154471 rows × 3 columns 크기의 csv
'''

'\ntrain => user, item, time 으로 이루어진 \n5154471 rows × 3 columns 크기의 csv\n'

In [14]:
class EASE:
    def __init__(self):
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        users = self.user_enc.fit_transform(df.loc[:, 'user_code'])
        items = self.item_enc.fit_transform(df.loc[:, 'rest_code'])
        return users, items

    def fit(self, df, lambda_: float = 500, implicit=True):
        """
        df: pandas.DataFrame with columns user_id, item_id and (rating)
        lambda_: l2-regularization term
        implicit: if True, ratings are ignored and taken as 1, else normalized ratings are used
        """
        users, items = self._get_users_and_items(df)
        values = (
            np.ones(df.shape[0])
            if implicit
            else df['rating'].to_numpy() / df['rating'].max()
        )

        X = csr_matrix((values, (users, items)))
        self.X = X

        G = X.T.dot(X).toarray()
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = np.linalg.inv(G)
        B = P / (-np.diag(P))
        B[diagIndices] = 0

        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        items = self.item_enc.transform(items)
        dd = train.loc[train.user_code.isin(users)]
        dd['ci'] = self.item_enc.transform(dd.rest_code)
        dd['cu'] = self.user_enc.transform(dd.user_code)
        g = dd.groupby('cu')
        with Pool(cpu_count()) as p:
            user_preds = p.starmap(
                self.predict_for_user,
                [(user, group, self.pred[user, :], items, k) for user, group in g],
            )
        df = pd.concat(user_preds)
        df['rest_code'] = self.item_enc.inverse_transform(df['rest_code'])
        df['user_code'] = self.user_enc.inverse_transform(df['user_code'])
        return df

    @staticmethod
    def predict_for_user(user, group, pred, items, k):
        watched = set(group['ci'])
        candidates = [item for item in items if item not in watched]
        pred = np.take(pred, candidates)
        res = np.argpartition(pred, -k)[-k:]
        r = pd.DataFrame(
            {
                "user_code": [user] * len(res),
                "rest_code": np.take(candidates, res),
                "score": np.take(pred, res),
            }
        ).sort_values('score', ascending=False)
        return r
        


In [15]:
model = EASE()
model.fit(train)


# 모델 저장

In [5]:
with open('model.pt', 'wb') as f:
    torch.save(model, f)

# 모델 적용

In [7]:
with open('model.pt', 'rb') as f:
    model = torch.load(f)

In [34]:
input = pd.read_csv(path + 'input.csv')
'''
input = pd.DataFrame({'user':[0],
                      'item':[3],
                      'userid':['5b62e8065fffc95678a5a628'],
                      'rest' :['838819922']})
'''

In [35]:
predict = model.predict(input,input['user'].unique(),input['item'].unique(),3)
predict = predict.drop('score',axis=1)

In [36]:
predict

Unnamed: 0,user,item
2,0,24
1,0,22
0,0,20
2,1,7
1,1,5
0,1,4


In [15]:
predict = model.predict(train,train['user'].unique(),train['item'].unique(),3)
predict = predict.drop('score',axis = 1)


In [17]:
predict

Unnamed: 0,user,item
2,0,502
1,0,451
0,0,210
2,1,154
1,1,41
...,...,...
1,10082,230
0,10082,41
2,10083,293
1,10083,545


In [6]:
predict.to_csv('./output/ease_G.csv', index=False)

# Test(recall K)

In [27]:
predict = pd.read_csv('/opt/ml/input/project/model/EASE/output/' + 'ease_G.csv')
answer = pd.read_csv('/opt/ml/input/project/model/data/G_test2.csv')


In [28]:
answer

Unnamed: 0,userid,rest,user_code,rest_code
0,5b61c7658f8242cb2a1b1028,1.757111e+09,166651,1447
1,5b61c7658f8242cb2a1b1028,1.375758e+09,166651,1501
2,5b62e6f60364c573d06114c1,1.415009e+09,98831,456
3,5b62e6f60364c573d06114c1,1.374996e+09,98831,356
4,5b62e707d6205156e6fa96f0,1.611312e+09,174612,750
...,...,...,...,...
65359,6353b5db1566f10006660c21,1.116538e+09,173687,1923
65360,63568cefbc1c4a789f7aec09,1.222516e+09,188377,858
65361,63568cefbc1c4a789f7aec09,1.647491e+09,188377,1025
65362,6396b0ffadfbb231c28da55d,6.301172e+08,320304,2120


In [29]:
predict_user = predict.groupby('user_code')['rest_code'].apply(list) 
answer_user = answer.groupby('user_code')['rest_code'].apply(list)

In [40]:
predict_user = predict_user.reset_index(drop=True)
answer_user = answer_user.reset_index(drop=True)

In [42]:
_recall = []

for i, ans in enumerate(answer_user):
    a = 0
    for j in ans:
        if j in predict_user[i]:
            a += 1 
    _recall.append(a/2)

In [43]:
recall = sum(_recall) / len(_recall)
recall

0.07219570405727924

In [None]:
scores = test_score(args, epoch, train_dataloader, model)
print("recall_k = ", scores)