In [1]:
import os
import random
import pandas as pd
import numpy as np
from datetime import datetime
import warnings; warnings.filterwarnings("ignore")

from IPython.display import Image

import scipy
from scipy import sparse
from scipy.sparse import csr_matrix, lil_matrix

import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm

%matplotlib inline

## data load

In [2]:
from pathlib import Path

# Data root, resolved relative to the directory the kernel was started in.
file_path = Path.cwd().joinpath("data")

# train/ and eval/ are sibling sub-folders of the data root.
train_path = file_path.joinpath("train")
eval_path = file_path.joinpath("eval")

In [3]:
# Load the user-item interaction log (columns: user, item, time).
ratings_data = pd.read_csv(train_path.joinpath("train_ratings.csv"))

ratings_data.shape

# Side-information tables; not used anywhere below. Uncomment to load them.
# year_data = pd.read_csv(train_path / 'years.tsv', sep='\t')
# writer_data = pd.read_csv(train_path / 'writers.tsv', sep='\t')
# title_data = pd.read_csv(train_path / 'titles.tsv', sep='\t')
# genre_data = pd.read_csv(train_path / 'genres.tsv', sep='\t')
# director_data = pd.read_csv(train_path / 'directors.tsv', sep='\t')

# year_data.shape, writer_data.shape, title_data.shape, genre_data.shape, director_data.shape

(5154471, 3)

In [4]:
# Peek at the first interaction row to confirm the column layout.
ratings_data.head(1)

Unnamed: 0,user,item,time
0,11,4643,1230782529


In [5]:
# Number of distinct users and distinct items in the interaction log.
tuple(ratings_data[col].nunique() for col in ("user", "item"))

(31360, 6807)

In [6]:
# Column dtypes and memory footprint of the interaction log.
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5154471 entries, 0 to 5154470
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   user    int64
 1   item    int64
 2   time    int64
dtypes: int64(3)
memory usage: 118.0 MB


## preprocessing

**category 변경**

In [7]:
# Re-encode the raw ids as pandas categoricals and expose their integer codes
# as contiguous 0-based ids (user -> user_id, item -> item_id).
for col in ("user", "item"):
    ratings_data[col] = ratings_data[col].astype("category")
    ratings_data[f"{col}_id"] = ratings_data[col].cat.codes

In [8]:
# Inspect the frame after encoding: user_id / item_id now sit next to the raw ids.
ratings_data.head()

Unnamed: 0,user,item,time,user_id,item_id
0,11,4643,1230782529,0,2505
1,11,170,1230782534,0,109
2,11,531,1230782539,0,319
3,11,616,1230782542,0,368
4,11,2140,1230782563,0,1183


In [9]:
# Lookup tables from the encoded ids back to the raw ids in the data:
# position in the categorical's category index -> original value.
user_id_to_user_map = {
    code: user for code, user in enumerate(ratings_data['user'].cat.categories)
}  # new user_id => original user
item_id_to_item_map = {
    code: item for code, item in enumerate(ratings_data['item'].cat.categories)
}  # new item_id => original item

In [10]:
# Inverse lookup tables: original id -> encoded id.
# Built by flipping the (encoded id, original id) pairs of the forward maps.
user_to_user_id_map = {
    user: user_id for user_id, user in user_id_to_user_map.items()
}  # original user => new user_id
item_to_item_id_map = {
    item: item_id for item_id, item in item_id_to_item_map.items()
}  # original item => new item_id

**CTR 예측 데이터처럼 변환: 유저가 아이템에 평점을 매겼으면(1), 매기지 않았으면(0)으로 변환**

In [11]:
# Pivot the interaction log into a user_id x item_id table whose cells hold
# the number of logged interactions (0 where the pair never occurs).
# NOTE(review): this materialises a dense users x items frame; it is large.
df = ratings_data.pivot_table(
    values=["time"],
    index="user_id",
    columns="item_id",
    aggfunc="count",
    fill_value=0,
)

In [12]:
# Display the pivoted table (pandas truncates the rendering of this wide frame).
df

Unnamed: 0_level_0,time,time,time,time,time,time,time,time,time,time,time,time,time,time,time,time,time,time,time,time,time
item_id,0,1,2,3,4,5,6,7,8,9,...,6797,6798,6799,6800,6801,6802,6803,6804,6805,6806
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31355,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31356,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31357,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31358,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**sparse matrix**

In [13]:
# Build the user x item interaction matrix directly in sparse form from the
# integer codes instead of converting the dense pivoted frame `df`.
# Rows are user_id, columns are item_id. Each log row contributes 1, and the
# CSR constructor sums repeated (row, col) pairs, so every cell holds the
# interaction count, matching the count-based pivot.
user_codes = ratings_data["user_id"].to_numpy()
item_codes = ratings_data["item_id"].to_numpy()

sparse_user_item = sparse.csr_matrix(
    (np.ones(len(ratings_data), dtype=np.int64), (user_codes, item_codes)),
    # Codes are 0-based and contiguous, so max + 1 is the number of categories.
    shape=(int(user_codes.max()) + 1, int(item_codes.max()) + 1),
)
sparse_user_item.shape

(31360, 6807)

In [14]:
# Sanity check: one matrix row per distinct user_id, one column per distinct item_id.
assert sparse_user_item.shape == (
    ratings_data["user_id"].nunique(),
    ratings_data["item_id"].nunique(),
)

## ML model with ALS method

In [15]:
# NOTE(review): third-party import placed mid-notebook; consider moving it to
# the top import cell so dependencies are visible in one place.
import implicit

# Record the installed version. The recommend() calls below unpack a pair of
# arrays, which presumably depends on the installed release - TODO confirm the
# minimum supported version.
implicit.__version__

'0.6.2'

**CFG**

In [42]:
# ALS hyperparameters consumed by the model constructor below.
factors, regularization, iterations = (
    20,   # number of latent factors per user / item vector
    0.1,  # regularization factor passed to ALS
    1,    # ALS iterations; NOTE(review): 1 looks like a smoke-test value
)

In [16]:
# Declare the ALS model.
model = implicit.als.AlternatingLeastSquares(
    factors=factors,  # number of latent factors to compute
    regularization=regularization,
    iterations=iterations,  # higher values take much longer (100 was too slow to finish)
    calculate_training_loss=False,
    use_gpu=False,
    # Fix the seed of the factor initialisation so that a fresh
    # Restart-and-Run-All reproduces the same factors and recommendations.
    random_state=42,
)

model

<implicit.cpu.als.AlternatingLeastSquares at 0x16a7de8f0>

In [17]:
# https://benfred.github.io/implicit/api/models/cpu/als.html#implicit.cpu.als.AlternatingLeastSquares.fit
# Train on the user x item matrix (rows = user_id, columns = item_id).
model.fit(sparse_user_item)

  0%|          | 0/1 [00:00<?, ?it/s]

**factorized된 행렬 확인**

$$ R \approx P * Q^T = \hat{R}  $$

In [18]:
# Check the dimensions of the learned user and item factor matrices.
user_vecs = model.user_factors  # user matrix
item_vecs = model.item_factors  # item matrix

# With `factors` latent dimensions the expected shapes are
# (number of users, factors) and (number of items, factors).
print(user_vecs.shape, item_vecs.shape, sep="\n")

(31360, 20)
(6807, 20)


In [19]:
# Recommend the 10 best-scoring items for user_id 0.
# https://benfred.github.io/implicit/api/models/cpu/als.html#implicit.cpu.als.AlternatingLeastSquares.recommend
# `a` holds encoded item_id values (not the original item ids); `b` holds the scores.
a, b = model.recommend(0, sparse_user_item[0], N=10)

list(zip(a, b))

[(5078, 0.81997836),
 (4065, 0.7368049),
 (5279, 0.7208816),
 (6084, 0.7151063),
 (5162, 0.7071584),
 (5059, 0.70668733),
 (2653, 0.616813),
 (4581, 0.61572254),
 (5386, 0.61473006),
 (4790, 0.61186194)]

## submission

In [30]:
# Check the Python type of the original user ids stored in the lookup table.
type(user_id_to_user_map[0])

int

In [39]:
# Build the submission records: the top-10 recommended items for every user.
#
# One batched recommend() call replaces the per-user Python loop. Passing an
# array of user ids together with the matching rows of the interaction matrix
# returns 2-D arrays of shape (n_users, 10).
user_ids = ratings_data["user_id"].unique()  # same user order as the original loop
rec_item_ids, _ = model.recommend(user_ids, sparse_user_item[user_ids], N=10)

# Map encoded ids back to the original user / item ids, one record per
# (user, recommended item) pair, keeping each user's ranking order.
l = [
    {
        "user": user_id_to_user_map[user_id],
        "item": item_id_to_item_map[rec_item],
    }
    for user_id, user_recs in zip(user_ids, rec_item_ids)
    for rec_item in user_recs
]

Unnamed: 0,user,item


In [41]:
# Write the (user, item) recommendation records to output.csv without the index column.
pd.DataFrame(l).to_csv("output.csv", index=False)