# EASE Movie Recommendation

## Library

In [1]:
import os
import sys
import random
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd
from scipy import sparse

## Config

In [2]:
import yaml

def load_config(config_file):
    """Load a YAML configuration file and return its parsed content.

    :param config_file: path of the YAML file to read
    :return: the object produced by ``yaml.safe_load`` for that file
    :raises yaml.YAMLError: if the file content is not valid YAML
    """
    with open(config_file, 'r') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Keep the original diagnostic print, then propagate the real
            # error: previously execution fell through to ``return config``
            # with ``config`` unbound, which masked the parse failure behind
            # an UnboundLocalError.
            print(exc)
            raise

# Notebook-wide settings. Later cells read cfg['device'], cfg['seed'],
# cfg['DATA_DIR'], cfg['data'] and cfg['FOLD_SIZE'] from this object.
cfg = load_config('config.yaml')

In [3]:
# Select the compute device from what PyTorch can actually see.
cuda_available = torch.cuda.is_available()
if cuda_available:
    print('CUDA is available')
    cfg['device'] = True

# Decide from the runtime check rather than from cfg['device']: that key is
# only written above when CUDA is present, so on a CPU-only machine reading it
# could raise KeyError (key absent from the config) or select 'cuda' on a
# machine with no GPU (key truthy in the config).
device = torch.device('cuda' if cuda_available else 'cpu')
device

CUDA is available


device(type='cuda')

In [4]:
# Seed the Python, NumPy and PyTorch random generators with the same
# configured value so stochastic steps are repeatable.
seed = cfg['seed']
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)


<torch._C.Generator at 0x7f027a924590>

In [5]:
# Read only the first two columns (selected by position) of the interaction
# file; the next cell addresses them by the header names 'user' and 'item'.
data_path = os.path.join(cfg['DATA_DIR'], cfg['data'])
raw_data = pd.read_csv(data_path, header=0, usecols=[0, 1])

In [6]:
# Keep just the (user, item) pairs and display them.
df_user_movie = raw_data.loc[:, ['user', 'item']]
df_user_movie

Unnamed: 0,user,item
0,11,4643
1,11,170
2,11,531
3,11,616
4,11,2140
...,...,...
5154466,138493,44022
5154467,138493,4958
5154468,138493,68319
5154469,138493,40819


In [7]:
def get_count(tp, id):
    """Count how many rows of ``tp`` share each distinct value of column ``id``.

    :param tp: DataFrame containing the column named by ``id``
    :param id: name of the column to group on
    :return: DataFrame with the grouping column and a ``size`` column
    """
    return tp[[id]].groupby(id, as_index=False).size()

def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop rare items, then inactive users, and report the remaining counts.

    Items are filtered first; user counts are computed on the item-filtered
    data. Each filter is applied once (no iteration to a fixed point).

    :param tp: DataFrame with ``user`` and ``item`` columns
    :param min_uc: keep users with at least this many rows (0 disables)
    :param min_sc: keep items with at least this many rows (0 disables)
    :return: ``(filtered_tp, usercount, itemcount)`` where the counts are
             recomputed on the filtered data
    """
    if min_sc > 0:
        item_sizes = get_count(tp, 'item')
        kept_items = item_sizes.loc[item_sizes['size'] >= min_sc, 'item']
        tp = tp[tp['item'].isin(kept_items)]

    if min_uc > 0:
        user_sizes = get_count(tp, 'user')
        kept_users = user_sizes.loc[user_sizes['size'] >= min_uc, 'user']
        tp = tp[tp['user'].isin(kept_users)]

    return tp, get_count(tp, 'user'), get_count(tp, 'item')

def split_train_test_proportion(data, test_prop=0.2, min_items=5):
    """Hold out a random share of each user's rows as test data.

    Users with fewer than ``min_items`` rows contribute all of their rows to
    the training split. Sampling uses the global NumPy random state.

    :param data: DataFrame with a ``user`` column
    :param test_prop: fraction of each eligible user's rows to hold out
                      (the count is truncated with ``int``)
    :param min_items: minimum number of rows a user needs before any of them
                      are held out
    :return: ``(data_tr, data_te)``; either part is an empty frame with the
             columns of ``data`` when no rows fall into it
    """
    tr_list, te_list = [], []

    for _, group in data.groupby('user'):
        n_items_u = len(group)

        if n_items_u < min_items:
            tr_list.append(group)
            continue

        held_out = np.zeros(n_items_u, dtype='bool')
        # astype keeps the result usable as an index even when size is 0.
        picked = np.random.choice(
            n_items_u, size=int(test_prop * n_items_u), replace=False
        ).astype('int64')
        held_out[picked] = True

        tr_list.append(group[np.logical_not(held_out)])
        te_list.append(group[held_out])

    # pd.concat raises ValueError on an empty list, which happened whenever
    # no user reached the threshold (or the input was empty).
    empty = data.iloc[:0].copy()
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

def numerize(tp, user2id, item2id):
    """Translate raw user/item identifiers into their index values.

    :param tp: DataFrame with ``user`` and ``item`` columns
    :param user2id: mapping from raw user id to index
    :param item2id: mapping from raw item id to index
    :return: DataFrame with ``uid`` and ``sid`` columns on the index of ``tp``
    :raises KeyError: if an identifier is missing from its mapping
    """
    encoded = {
        'uid': tp['user'].map(user2id.__getitem__),
        'sid': tp['item'].map(item2id.__getitem__),
    }
    return pd.DataFrame(data=encoded, columns=['uid', 'sid'])

def denumerize(tp, id2user, id2item):
    """Translate index values back into raw user/item identifiers.

    :param tp: DataFrame with ``uid`` and ``sid`` columns
    :param id2user: mapping from user index to raw user id
    :param id2item: mapping from item index to raw item id
    :return: DataFrame with ``user`` and ``item`` columns on the index of ``tp``
    :raises KeyError: if an index is missing from its mapping
    """
    decoded = {
        'user': tp['uid'].map(id2user.__getitem__),
        'item': tp['sid'].map(id2item.__getitem__),
    }
    return pd.DataFrame(data=decoded, columns=['user', 'item'])

In [8]:
# Keep items with at least 10 interactions and users with at least 5, then
# print the per-user and per-item interaction counts (the printed labels are
# Korean for "reviews per user" / "reviews per item").
MIN_USER_INTERACTIONS = 5
MIN_ITEM_INTERACTIONS = 10

raw_data, user_activity, item_popularity = filter_triplets(
    raw_data,
    min_uc=MIN_USER_INTERACTIONS,
    min_sc=MIN_ITEM_INTERACTIONS,
)

print("유저별 리뷰수\n", user_activity)
print("아이템별 리뷰수\n", item_popularity)

유저별 리뷰수
          user  size
0          11   376
1          14   180
2          18    77
3          25    91
4          31   154
...       ...   ...
31355  138473    63
31356  138475   124
31357  138486   137
31358  138492    68
31359  138493   314

[31360 rows x 2 columns]
아이템별 리뷰수
         item   size
0          1  12217
1          2   3364
2          3    734
3          4     43
4          5    590
...      ...    ...
6802  118700     54
6803  118900     60
6804  118997     52
6805  119141    122
6806  119145     78

[6807 rows x 2 columns]


In [9]:
# Shuffle user indices: users are put in a seeded random order; items keep the
# order in which they appear in item_popularity.
unique_uid = user_activity['user'].unique()
unique_sid = item_popularity['item'].unique()
print("(BEFORE) unique_uid:", unique_uid)

# Re-seed immediately before permuting so the user order depends only on
# cfg['seed'] and not on earlier draws from the global generator.
np.random.seed(cfg['seed'])
idx_perm = np.random.permutation(len(unique_uid))
unique_uid = unique_uid[idx_perm]
print("(AFTER) unique_uid:", unique_uid)

# The stored run output lists 31360 users and 6807 items after filtering.
n_users, n_items = unique_uid.size, unique_sid.size


(BEFORE) unique_uid: [    11     14     18 ... 138486 138492 138493]
(AFTER) unique_uid: [ 81259  11986  67552 ...   3671  69383 103755]


In [10]:
# Display the interactions that remain after filtering (pandas truncates the
# rendering of long frames).
raw_data

Unnamed: 0,user,item
0,11,4643
1,11,170
2,11,531
3,11,616
4,11,2140
...,...,...
5154466,138493,44022
5154467,138493,4958
5154468,138493,68319
5154469,138493,40819


In [11]:
# Distinct raw item ids in order of first appearance in raw_data.
# NOTE(review): not referenced by any other cell visible in this notebook;
# the index maps below are built from unique_sid instead - confirm it is needed.
unique_item = raw_data['item'].unique()


In [12]:
# Raw id -> contiguous index, following the order of unique_sid / unique_uid.
item2id = {sid: i for i, sid in enumerate(unique_sid)}
user2id = {pid: i for i, pid in enumerate(unique_uid)}

# Persist both orderings (one raw id per line, line number == index).
pro_dir = 'pro_sg'

# exist_ok=True replaces the separate "exists?" check, which left a window
# between the check and the creation of the directory.
os.makedirs(pro_dir, exist_ok=True)

with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)

with open(os.path.join(pro_dir, 'unique_uid.txt'), 'w') as f:
    f.writelines('%s\n' % uid for uid in unique_uid)



In [13]:
# Read the configured fold size.
# NOTE(review): not referenced by any other cell visible in this notebook -
# presumably a leftover from a train/validation split; confirm it is needed.
num_heldout_users = cfg['FOLD_SIZE']

In [14]:
# Re-express every interaction as (uid, sid) index pairs using the maps built
# above, then display the result.
df_for_ease = numerize(raw_data, user2id, item2id)
df_for_ease

Unnamed: 0,uid,sid
0,13266,2505
1,13266,109
2,13266,319
3,13266,368
4,13266,1183
...,...,...
5154466,4927,4882
5154467,4927,2652
5154468,4927,5768
5154469,4927,4791


In [15]:
# Implicit-feedback indicator: every observed (uid, sid) pair is marked 1.
# The stored outputs of this cell and of the interaction matrix below show 1,
# not the 0.9 the code previously assigned, so the notebook could not be
# reproduced with Restart & Run All. A scalar is broadcast by pandas, which
# also avoids building a Python list with one element per row.
df_for_ease['watched'] = 1
df_for_ease

Unnamed: 0,uid,sid,watched
0,13266,2505,1
1,13266,109,1
2,13266,319,1
3,13266,368,1
4,13266,1183,1
...,...,...,...
5154466,4927,4882,1
5154467,4927,2652,1
5154468,4927,5768,1
5154469,4927,4791,1


In [16]:
# Pivot the long (uid, sid, watched) table into a dense user x item array;
# pairs with no interaction come out of the pivot as NaN and are replaced by 0.
# NOTE(review): later cells treat row/column positions as uid/sid, which
# assumes every uid and sid occurs at least once in df_for_ease - confirm.
pivot_table = df_for_ease.pivot_table(values="watched", index=["uid"], columns=["sid"])
X = np.nan_to_num(pivot_table.to_numpy())

In [17]:
# Display the dense interaction matrix (NumPy abbreviates large arrays).
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
class EASE:
    """
    Embarrassingly Shallow Autoencoders model class

    The model is a single item-item weight matrix ``B`` with a zero diagonal,
    computed in closed form from the regularized Gram matrix of the
    interaction matrix.
    """

    def __init__(self, lambda_):
        """
        :param lambda_: regularization strength added to the diagonal of the
                        Gram matrix before inversion
        """
        self.B = None
        self.lambda_ = lambda_

    def train(self, interaction_matrix):
        """
        train pass: compute the item-item weights and store them in ``self.B``

        :param interaction_matrix: user x item matrix; a NumPy array or a
                                   SciPy sparse matrix, of any numeric dtype
        """
        G = interaction_matrix.T @ interaction_matrix
        if sparse.issparse(G):
            # np.linalg.inv needs a dense array.
            G = G.toarray()
        G = np.asarray(G)
        if not np.issubdtype(G.dtype, np.floating):
            # With an integer Gram matrix the in-place addition of lambda_
            # below cannot be stored, so promote to float first.
            G = G.astype(np.float64)

        G[np.diag_indices_from(G)] += self.lambda_
        P = np.linalg.inv(G)

        # Closed form: B[i, j] = -P[i, j] / P[j, j] for i != j, B[j, j] = 0.
        # Broadcasting divides every column j by -P[j, j].
        B = P / -np.diag(P)
        np.fill_diagonal(B, 0)
        self.B = B

    def forward(self, user_row):
        """
        forward pass: score every item for the given interaction row(s)

        :param user_row: interaction vector or matrix whose last dimension
                         matches the number of items used in ``train``
        :return: ``user_row @ self.B``
        :raises RuntimeError: if called before ``train``
        """
        if self.B is None:
            raise RuntimeError("EASE.forward() called before EASE.train()")
        return user_row @ self.B

In [19]:
# Fit the item-item weights on the full interaction matrix.
EASE_LAMBDA = 600  # regularization strength passed to EASE
ease = EASE(lambda_=EASE_LAMBDA)
ease.train(X)

In [20]:
# Score every item for every user in one matrix product.
result = ease.forward(X)
print(result)

[[ 1.53742638e-01  4.97771689e-02  1.86853641e-02 ...  1.18158275e-02
  -7.07904956e-04 -1.25905044e-02]
 [ 5.87536974e-01  2.92324895e-01  1.33843414e-01 ...  5.10958433e-03
  -9.87448250e-03  7.45281427e-03]
 [ 6.59899150e-01  2.57923893e-01  2.32991596e-04 ... -8.25163264e-03
  -7.69776453e-03  6.02780975e-03]
 ...
 [ 9.19670589e-01  2.29087867e-01  1.12606528e-01 ...  1.02980241e-02
  -2.59433605e-02  1.45164188e-02]
 [ 2.16129025e-01  4.95342147e-02  1.73911521e-02 ... -2.39265827e-03
  -3.93880791e-03  2.73377878e-03]
 [ 3.77747832e-01  2.57649935e-01 -4.26564546e-02 ... -2.69299005e-03
  -4.95828317e-03 -1.50368717e-02]]


In [21]:
# Show the (row indices, column indices) of the non-zero entries of X, i.e.
# the observed interactions.
print(X.nonzero())


(array([    0,     0,     0, ..., 31359, 31359, 31359]), array([ 136,  153,  380, ..., 5771, 5855, 6084]))


In [22]:
# Give already-seen (user, item) pairs a score of -inf so they sort below
# every unseen item when the top scores are picked.
seen_rows, seen_cols = X.nonzero()
result[seen_rows, seen_cols] = -np.inf
print(result)

[[ 1.53742638e-01  4.97771689e-02  1.86853641e-02 ...  1.18158275e-02
  -7.07904956e-04 -1.25905044e-02]
 [ 5.87536974e-01            -inf  1.33843414e-01 ...  5.10958433e-03
  -9.87448250e-03  7.45281427e-03]
 [           -inf            -inf  2.32991596e-04 ... -8.25163264e-03
  -7.69776453e-03  6.02780975e-03]
 ...
 [           -inf  2.29087867e-01  1.12606528e-01 ...  1.02980241e-02
  -2.59433605e-02  1.45164188e-02]
 [ 2.16129025e-01  4.95342147e-02  1.73911521e-02 ... -2.39265827e-03
  -3.93880791e-03  2.73377878e-03]
 [ 3.77747832e-01  2.57649935e-01 -4.26564546e-02 ... -2.69299005e-03
  -4.95828317e-03 -1.50368717e-02]]


In [23]:
import bottleneck as bn



In [24]:
# Column indices of the 10 highest-scoring items per user. Scores are negated
# because the partition puts the smallest values first; the first TOP_K
# columns are then kept. A partition does not order those TOP_K entries
# among themselves.
TOP_K = 10
top_items_by_user = bn.argpartition(-result, TOP_K, axis=1)[:, :TOP_K]
print(top_items_by_user)

[[4101 5147 5936 ... 4610 5737 4882]
 [1048  226  356 ... 4101 2653  604]
 [ 105  406 2619 ...    9 2672  894]
 ...
 [  41 2619 4621 ...  146 4101 1494]
 [ 735  890 1284 ...  178 2670  657]
 [1949 3420  237 ... 4000 3240   92]]


In [25]:
# Flatten the (n_users, k) matrix of recommended item indices into long
# format: one (uid, sid) row per recommendation, users in row order.
# The list length k is read from the matrix itself instead of repeating a
# literal, and the work is vectorized; the previous loop also rebound the
# builtin name `id` in the notebook's global namespace.
n_rec_users, n_rec_items = top_items_by_user.shape
user_result = np.repeat(np.arange(n_rec_users), n_rec_items)
item_result = top_items_by_user.ravel()

df_user_result = pd.DataFrame(user_result, columns=['uid'])
df_item_result = pd.DataFrame(item_result, columns=['sid'])
df_result = pd.concat([df_user_result, df_item_result], axis=1)

In [26]:
# Display the long-format (uid, sid) recommendations.
df_result

Unnamed: 0,uid,sid
0,0,4101
1,0,5147
2,0,5936
3,0,5514
4,0,6173
...,...,...
313595,31359,1414
313596,31359,492
313597,31359,4000
313598,31359,3240


In [27]:
# Inverse lookups: contiguous index -> raw id, in the same order used to build
# item2id / user2id.
id2item = dict(enumerate(unique_sid.squeeze()))
id2user = dict(enumerate(unique_uid.squeeze()))


In [28]:
# Map the recommendations back to raw ids and order the rows by user.
recommendations = denumerize(df_result, id2user, id2item)
recommendations.columns = ['user', 'item']
df_infer = recommendations.sort_values(by='user')
df_infer

Unnamed: 0,user,item
132668,11,7438
132669,11,33004
132667,11,2
132666,11,7373
132665,11,32587
...,...,...
49275,138493,2628
49276,138493,8961
49277,138493,110
49279,138493,53125


In [29]:
# Write the recommendations to the working directory without the index column.
df_infer.to_csv('submission_EASE.csv', index=False)
