<a href="https://colab.research.google.com/github/cars1015/recsys_MyReserch/blob/main/EASE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive

# Googleドライブをマウント
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:

from scipy import sparse
import os

In [6]:
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm#処理の進捗状況を知らせる機能


class EASE:
    def __init__(self):
      #LabelEncoder()は文字列を数値に変えてくれる
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        users = self.user_enc.fit_transform(df.loc[:, 'uid'])
        items = self.item_enc.fit_transform(df.loc[:, 'sid'])
        return users, items

    def fit(self, df, lambda_: float = 0.5, implicit=True):
        """
        df: pandas.DataFrame with columns user_id, item_id and (rating)
        lambda_: l2-regularization term
        implicit: if True, ratings are ignored and taken as 1, else normalized ratings are used
        """
        users, items = self._get_users_and_items(df)
        values = np.ones(df.shape[0]) if implicit else df['rating'].to_numpy() / df['rating'].max()

        X = csr_matrix((values, (users, items)))
        self.X = X

        G = X.T.dot(X).toarray()
        print(G)
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = np.linalg.inv(G)
        B = P / (-np.diag(P))
        B[diagIndices] = 0

        self.B = B
        # self.pred = X.dot(B)


In [5]:
#data = "ml-20m-clean-new"
data="ml-20m"
dir = "/content/drive/MyDrive/recommend/" + data + "/pro_sg/"

import pandas as pd

#df = pd.read_csv(dir + "clean_train.csv")

df=pd.read_csv(dir+"train.csv")


# df = pd.read_csv(dir + "train-editor-ml-20m-movie.csv")

model = EASE()
model.fit(df, lambda_ = 200.0)
#netflix-prizeの最適lambdaは1000
#ML-20mは500
#MSDは200


In [None]:
#df_test = pd.read_csv(dir + "clean_test_tr.csv")
df_test=pd.read_csv(dir+"test_tr.csv")

users = df_test.loc[:, 'uid']
items = df_test.loc[:, 'sid']

u_enc = LabelEncoder()
users_id = u_enc.fit_transform(users)
items_id = model.item_enc.transform(items)

# 
values = np.ones(df_test.shape[0])
shape = (u_enc.classes_.size, model.item_enc.classes_.size)
X = csr_matrix((values, (users_id, items_id)), shape=shape)
print(X)
print(X.toarray())
#
print(X.T.dot(X).toarray())

pred = X.dot(model.B)

In [13]:
print(model.B)

[[ 0.00000000e+00 -4.19207706e-03  1.64331381e-02 ...  3.17997256e-06
   3.17997256e-06  3.17997256e-06]
 [-2.28424525e-03  0.00000000e+00  3.99140542e-03 ... -2.06805163e-05
  -2.06805163e-05 -2.06805163e-05]
 [ 9.55797281e-03  4.26047141e-03  0.00000000e+00 ...  7.21536996e-05
   7.21536996e-05  7.21536996e-05]
 ...
 [ 6.42326384e-05 -7.66620259e-04  2.50579596e-03 ...  0.00000000e+00
   3.20409650e-03  3.20409650e-03]
 [ 6.42326384e-05 -7.66620259e-04  2.50579596e-03 ...  3.20409650e-03
   0.00000000e+00  3.20409650e-03]
 [ 6.42326384e-05 -7.66620259e-04  2.50579596e-03 ...  3.20409650e-03
   3.20409650e-03  0.00000000e+00]]


In [11]:
np.savetxt(dir+"pred_white.txt",pred)
#x=np.loadtxt(dir+"pred.txt")

In [23]:
df_results = pd.DataFrame()

dd_test = df_test.groupby('uid')

k=100
#ラベル名を渡している
item_set = model.item_enc.classes_

for user, group in tqdm(dd_test):
  #重複なしのsid格納
  watched = set(group['sid'])
  #触れられなかったアイテムを格納
  candidates = [item for item in item_set if item not in watched]
  uid = u_enc.transform([user])
  #ここに評価されていないアイテムの予測評価値が格納されている
  p = np.take(pred[uid[0], :], candidates)
  #配列全体をソートせずに上位k件についてインデックスを見つけソート
  res = np.argpartition(p, -k)[-k:]
  #rに一人のユーザの上位100件のおすすめデータを格納し、スコアでソートしている。
  r = pd.DataFrame({
                "uid": [user] * len(res),
                "sid": np.take(candidates, res),
                "score": np.take(p, res)
      }).sort_values('score', ascending=False)
  df_results = df_results.append(r, ignore_index=True)
#すべてのユーザに予測された評価値上位100個のアイテムを出力
df_results.to_csv(dir + "ease_results_500.csv", index=False, header=False)

100%|██████████| 40000/40000 [15:15<00:00, 43.70it/s]
