# EASER Movie Recommendation

## Library

In [1]:
import os
import sys
import random
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd
from scipy import sparse

## Config

In [2]:
import yaml

def load_config(config_file):
    with open(config_file, 'r') as stream:
        try:
            config = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    return config

cfg = load_config('config.yaml')

In [3]:
if torch.cuda.is_available():
    print('CUDA is available')
    cfg['device'] = True

device = torch.device('cuda' if cfg['device'] else 'cpu')
device

CUDA is available


device(type='cuda')

In [4]:
random.seed(cfg['seed'])
np.random.seed(cfg['seed'])
torch.manual_seed(cfg['seed'])


<torch._C.Generator at 0x7f9d7f1d0590>

In [5]:
raw_data = pd.read_csv(os.path.join(cfg['DATA_DIR'], cfg['data']), header=0, usecols=[0, 1])

In [6]:
df_user_movie = raw_data[['user', 'item']]
df_user_movie

Unnamed: 0,user,item
0,11,4643
1,11,170
2,11,531
3,11,616
4,11,2140
...,...,...
5154466,138493,44022
5154467,138493,4958
5154468,138493,68319
5154469,138493,40819


In [7]:
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()
    return count

def filter_triplets(tp, min_uc=5, min_sc=0):
    if min_sc > 0:
        itemcount = get_count(tp, 'item')
        tp = tp[tp['item'].isin(itemcount[itemcount['size'] >= min_sc]['item'])]

    if min_uc > 0:
        usercount = get_count(tp, 'user')
        tp = tp[tp['user'].isin(usercount[usercount['size'] >= min_uc]['user'])]

    usercount, itemcount = get_count(tp, 'user'), get_count(tp, 'item')
    return tp, usercount, itemcount

def split_train_test_proportion(data, test_prop=0.2):
    data_grouped_by_user = data.groupby('user')
    tr_list, te_list = list(), list()

    for _, group in data_grouped_by_user:
        n_items_u = len(group)

        if n_items_u >= 5:
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])

        else:
            tr_list.append(group)

    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list)

    return data_tr, data_te

def numerize(tp, user2id, item2id):
    uid = tp['user'].apply(lambda x: user2id[x])
    sid = tp['item'].apply(lambda x: item2id[x])
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

def denumerize(tp, id2user, id2item):
    user = tp['uid'].apply(lambda x: id2user[x])
    item = tp['sid'].apply(lambda x: id2item[x])
    return pd.DataFrame(data={'user': user, 'item': item}, columns=['user', 'item'])

In [8]:
raw_data, user_activity, item_popularity = filter_triplets(raw_data, min_uc=5, min_sc=10)

print("유저별 리뷰수\n",user_activity)
print("아이템별 리뷰수\n",item_popularity)

유저별 리뷰수
          user  size
0          11   376
1          14   180
2          18    77
3          25    91
4          31   154
...       ...   ...
31355  138473    63
31356  138475   124
31357  138486   137
31358  138492    68
31359  138493   314

[31360 rows x 2 columns]
아이템별 리뷰수
         item   size
0          1  12217
1          2   3364
2          3    734
3          4     43
4          5    590
...      ...    ...
6802  118700     54
6803  118900     60
6804  118997     52
6805  119141    122
6806  119145     78

[6807 rows x 2 columns]


In [9]:
# Shuffle User Indices
unique_uid = user_activity['user'].unique()
unique_sid = item_popularity['item'].unique()
print("(BEFORE) unique_uid:",unique_uid)

np.random.seed(cfg['seed'])
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]
print("(AFTER) unique_uid:",unique_uid)

n_users = unique_uid.size #31360
n_items = unique_sid.size #5000


(BEFORE) unique_uid: [    11     14     18 ... 138486 138492 138493]
(AFTER) unique_uid: [ 81259  11986  67552 ...   3671  69383 103755]


In [10]:
raw_data

Unnamed: 0,user,item
0,11,4643
1,11,170
2,11,531
3,11,616
4,11,2140
...,...,...
5154466,138493,44022
5154467,138493,4958
5154468,138493,68319
5154469,138493,40819


In [11]:
unique_item = raw_data['item'].unique()


In [12]:
item2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
user2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))

pro_dir = os.path.join('pro_sg')

if not os.path.exists(pro_dir):
    os.makedirs(pro_dir)

with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)

with open(os.path.join(pro_dir, 'unique_uid.txt'), 'w') as f:
    for uid in unique_uid:
        f.write('%s\n' % uid)



In [13]:
num_heldout_users = cfg['FOLD_SIZE']

In [14]:
df_for_ease = numerize(raw_data, user2id, item2id)
df_for_ease

Unnamed: 0,uid,sid
0,13266,2505
1,13266,109
2,13266,319
3,13266,368
4,13266,1183
...,...,...
5154466,4927,4882
5154467,4927,2652
5154468,4927,5768
5154469,4927,4791


In [15]:
df_for_ease['watched'] = [1] * len(df_for_ease)
df_for_ease

Unnamed: 0,uid,sid,watched
0,13266,2505,1
1,13266,109,1
2,13266,319,1
3,13266,368,1
4,13266,1183,1
...,...,...,...
5154466,4927,4882,1
5154467,4927,2652,1
5154468,4927,5768,1
5154469,4927,4791,1


In [16]:
pivot_table = df_for_ease.pivot_table(index=["uid"], columns=["sid"], values="watched")
X = pivot_table.to_numpy()
X = np.nan_to_num(X)

In [17]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [34]:
from copy import deepcopy
from tqdm import tqdm

In [35]:
class EASER():
    def __init__(self, threshold = 3500, lambdaBB = 500, lambdaCC = 10000, rho = 50000, epochs = 40):
        self.threshold = threshold
        self.lambdaBB = lambdaBB
        self.lambdaCC = lambdaCC
        self.rho = rho
        self.epochs = epochs

    def create_list_feature_pairs(self, XtX):
        AA = np.triu(np.abs(XtX))
        AA[ np.diag_indices(AA.shape[0]) ]=0.0
        ii_pairs = np.where((AA > self.threshold) == True)
        return ii_pairs

    def create_matrix_Z(self, ii_pairs, X):
        MM = np.zeros( (len(ii_pairs[0]), X.shape[1]),    dtype=np.float64)
        MM[np.arange(MM.shape[0]) , ii_pairs[0]   ]=1.0
        MM[np.arange(MM.shape[0]) , ii_pairs[1]   ]=1.0
        CCmask = 1.0-MM
        MM = sparse.csc_matrix(MM.T)
        Z=  X * MM
        Z= (Z == 2.0 )
        Z=Z*1.0
        return Z, CCmask

    def train_higher(self, XtX, XtXdiag, ZtZ, ZtZdiag, CCmask, ZtX):
        ii_diag=np.diag_indices(XtX.shape[0])
        XtX[ii_diag] = XtXdiag + self.lambdaBB
        PP = np.linalg.inv(XtX)
        ii_diag_ZZ=np.diag_indices(ZtZ.shape[0])
        ZtZ[ii_diag_ZZ] = ZtZdiag + self.lambdaCC + self.rho
        QQ=np.linalg.inv(ZtZ)
        CC = np.zeros( (ZtZ.shape[0], XtX.shape[0]),dtype=np.float64 )
        DD = np.zeros( (ZtZ.shape[0], XtX.shape[0]),dtype=np.float64 )
        UU = np.zeros( (ZtZ.shape[0], XtX.shape[0]),dtype=np.float64 )

        for iter in tqdm(range(self.epochs)):
            # learn BB
            XtX[ii_diag] = XtXdiag
            BB= PP.dot(XtX-ZtX.T.dot(CC))
            gamma = np.diag(BB) / np.diag(PP)
            BB-= PP * gamma
            # learn CC
            CC= QQ.dot(ZtX-ZtX.dot(BB) + self.rho * (DD-UU))
            # learn DD
            DD=  CC  * CCmask
            #DD= np.maximum(0.0, DD) # if you want to enforce non-negative parameters
            # learn UU (is Gamma in paper)
            UU+= CC-DD

        return BB, DD

    def fit(self, X):
        print(' --- init')
        XtX = (X.T @ X)
        XtXdiag = deepcopy(np.diag(XtX))
        ii_pairs = self.create_list_feature_pairs(XtX)
        Z, CCmask = self.create_matrix_Z(ii_pairs, X)

        ZtZ = (Z.transpose() @ Z)
        ZtZdiag = deepcopy(np.diag(ZtZ))

        ZtX = (Z.transpose() @ X)

        print(' --- iteration start.')
        BB, CC = self.train_higher(XtX, XtXdiag, ZtZ, ZtZdiag, CCmask, ZtX)
        print(' --- iteration end.')

        self.pred = torch.from_numpy(X.dot(BB) + Z.dot(CC))

In [36]:
easer = EASER()
easer.fit(X)

 --- init
 --- iteration start.


100%|██████████| 40/40 [13:29<00:00, 20.23s/it]


 --- iteration end.


In [42]:
X_pred = easer.pred.numpy()

In [43]:
X_pred

array([[ 1.54421716e-01,  5.91912625e-02,  2.07301154e-02, ...,
         1.04652035e-02, -8.30006771e-04, -1.24459579e-02],
       [ 4.99935844e-01,  3.09546568e-01,  1.14500023e-01, ...,
         2.83030480e-03, -1.06144294e-02,  1.21164947e-02],
       [ 6.40607516e-01,  3.05062113e-01,  5.66899892e-03, ...,
        -5.99358232e-03, -1.08049459e-02,  3.70181809e-03],
       ...,
       [ 9.46996138e-01,  1.81685244e-01,  1.11324227e-01, ...,
         1.39981929e-02, -2.70518413e-02,  1.68039900e-02],
       [ 1.50066500e-01,  6.59546285e-02,  7.39451284e-03, ...,
        -2.08051370e-03, -3.67058869e-03,  5.11150823e-04],
       [ 3.38351033e-01,  2.38472199e-01, -2.66798391e-02, ...,
        -4.88278140e-03, -8.84184706e-03, -1.21180404e-02]])

In [44]:
X_pred[X.nonzero()] = -np.inf

In [None]:
result = easer.forward(X[:, :])
print(result)

[[ 1.83566345e+00  6.25712395e-02  2.33214792e-02 ...  8.58501692e-03
  -3.66008969e-04 -1.65193833e-02]
 [ 2.22127863e+00  1.45964979e-01  1.37979592e-01 ...  4.32863994e-03
  -1.74153992e-02 -2.55486916e-03]
 [-1.64288337e+00  1.75651106e-01 -1.22299960e-02 ... -7.69506705e-03
  -1.44144038e-02 -3.01526096e-03]
 ...
 [ 1.32627349e-01  9.14687324e-02  1.45398930e-01 ...  9.30167743e-03
  -1.87203732e-02  1.89461318e-02]
 [-3.44706299e-03  9.12737008e-02 -7.56348240e-03 ... -5.11154187e-03
   1.00204462e-03 -4.58949981e-03]
 [-1.12498007e+00  7.78999262e-02 -2.95719179e-02 ... -4.35460561e-03
   4.15031578e-03 -9.43776163e-03]]


In [None]:
print(X.nonzero())


(array([    0,     0,     0, ..., 31359, 31359, 31359]), array([ 136,  153,  380, ..., 5771, 5855, 6084]))


In [None]:
result[X.nonzero()] = -np.inf
print(result)

[[ 1.83566345e+00  6.25712395e-02  2.33214792e-02 ...  8.58501692e-03
  -3.66008969e-04 -1.65193833e-02]
 [ 2.22127863e+00            -inf  1.37979592e-01 ...  4.32863994e-03
  -1.74153992e-02 -2.55486916e-03]
 [           -inf            -inf -1.22299960e-02 ... -7.69506705e-03
  -1.44144038e-02 -3.01526096e-03]
 ...
 [           -inf  9.14687324e-02  1.45398930e-01 ...  9.30167743e-03
  -1.87203732e-02  1.89461318e-02]
 [-3.44706299e-03  9.12737008e-02 -7.56348240e-03 ... -5.11154187e-03
   1.00204462e-03 -4.58949981e-03]
 [-1.12498007e+00  7.78999262e-02 -2.95719179e-02 ... -4.35460561e-03
   4.15031578e-03 -9.43776163e-03]]


In [45]:
X_pred

array([[ 0.15442172,  0.05919126,  0.02073012, ...,  0.0104652 ,
        -0.00083001, -0.01244596],
       [ 0.49993584,        -inf,  0.11450002, ...,  0.0028303 ,
        -0.01061443,  0.01211649],
       [       -inf,        -inf,  0.005669  , ..., -0.00599358,
        -0.01080495,  0.00370182],
       ...,
       [       -inf,  0.18168524,  0.11132423, ...,  0.01399819,
        -0.02705184,  0.01680399],
       [ 0.1500665 ,  0.06595463,  0.00739451, ..., -0.00208051,
        -0.00367059,  0.00051115],
       [ 0.33835103,  0.2384722 , -0.02667984, ..., -0.00488278,
        -0.00884185, -0.01211804]])

In [46]:
import bottleneck as bn

In [47]:
top_items_by_user = bn.argpartition(-X_pred, 10, axis=1)[:, :10]
print(top_items_by_user)

[[4065 5147 4101 ... 5514 4610 4735]
 [ 931 2653  650 ...  604 4101  226]
 [2672  406  894 ... 1161  105    9]
 ...
 [  41 4621  146 ... 1494 1435 2619]
 [1400  890  178 ...  657 2304 2670]
 [ 237 3420  328 ...  492 3240 1949]]


In [48]:
user_result = []
item_result =[]

for id, top_k in enumerate(top_items_by_user):
    user_result.extend([id] * 10)
    item_result.extend(top_k)

df_user_result = pd.DataFrame(user_result, columns=['uid'])
df_item_result = pd.DataFrame(item_result, columns=['sid'])
df_result = pd.concat([df_user_result, df_item_result], axis=1)

In [49]:
df_result

Unnamed: 0,uid,sid
0,0,4065
1,0,5147
2,0,4101
3,0,5936
4,0,6173
...,...,...
313595,31359,4000
313596,31359,3994
313597,31359,492
313598,31359,3240


In [50]:
id2item = dict((i, sid) for (i, sid) in enumerate(unique_sid.squeeze()))
id2user = dict((i, pid) for (i, pid) in enumerate(unique_uid.squeeze()))


In [51]:
df_infer = denumerize(df_result, id2user, id2item)
df_infer.columns = ['user', 'item']
df_infer = df_infer.sort_values('user')
df_infer

Unnamed: 0,user,item
132668,11,2987
132669,11,4886
132667,11,3996
132666,11,4370
132665,11,47
...,...,...
49275,138493,53125
49276,138493,1270
49277,138493,4022
49279,138493,32587


In [None]:
df_infer.to_csv(os.path.join('submission_EASER.csv'), index=False)
