In [1]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import random
from datetime import datetime
from time import time
import scipy.sparse as sp

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from box import Box

import warnings

warnings.filterwarnings(action='ignore')

# 1. 학습 설정

In [2]:
# Every tunable value for this experiment lives in this single Box, which
# allows attribute-style access (config.lr) as used throughout the notebook.
config = Box({
    # --- Paths ---
    'data_path' : "/opt/ml/input/data/train",  # data directory
    'submission_path' : "../submission",
    'submission_name' : 'DeepFM_v1_submission.csv',
    'model_path' : "../model",  # directory where the model is saved
    'model_name' : 'DeepFM_v1.pt',

    # --- Optimisation ---
    'num_epochs' : 15,
    'lr' : 0.005,
    'batch_size' : 2048,

    # --- Model ---
    "num_factor" : 128,
    "num_layers" : 3,
    "dropout" : 0.5,

    # --- Validation / reproducibility ---
    'valid_samples' : 10,  # number of samples used for validation
    'seed' : 22,
})

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed every random number generator the notebook draws from (Python's
# `random`, NumPy and PyTorch) so that Restart Kernel -> Run All is repeatable.
random.seed(config.seed)
np.random.seed(config.seed)
torch.manual_seed(config.seed)

In [3]:
# Create the model directory, including any missing parent directories.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(config.model_path, exist_ok=True)

In [4]:
# Create the submission directory, including any missing parent directories.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(config.submission_path, exist_ok=True)

# 2. 데이터 전처리

In [5]:
class MakeFMDataSet():
    """
    Build the data structures used for training and evaluation.

    Reads ``train_ratings.csv`` and ``genres.tsv`` from ``config.data_path``
    and derives:

    * encoders / decoders mapping raw item, user and genre ids to contiguous
      indices (and back),
    * ``user_train`` / ``user_valid``: ``{user_idx: [item_idx, ...]}``,
    * ``item_idx2genre_list``: ``{item_idx: [genre_idx, ...]}`` where every list
      has exactly ``MAX_GENRE_LEN`` entries, padded with ``GENRE_PAD_IDX``.
    """

    # Index reserved for "no genre"; real genre indices start at 1.
    GENRE_PAD_IDX = 0
    # Fixed length of every per-item genre list.
    MAX_GENRE_LEN = 10

    def __init__(self, config):
        """
        Args:
            config: object exposing ``data_path``, ``seed`` and ``valid_samples``.
        """
        self.config = config
        self.df = pd.read_csv(os.path.join(self.config.data_path, 'train_ratings.csv'))
        self.genre_df = pd.read_csv(os.path.join(self.config.data_path, 'genres.tsv'), sep='\t')

        self.item_encoder, self.item_decoder = self.generate_encoder_decoder(col = 'item', df = self.df)
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder(col = 'user', df = self.df)
        self.genre_encoder, self.genre_decoder = self.generate_encoder_decoder(col = 'genre', df = self.genre_df)

        self.num_item, self.num_user, self.num_genre = len(self.item_encoder), len(self.user_encoder), len(self.genre_encoder)

        self.df['item_idx'] = self.df['item'].map(self.item_encoder)
        self.df['user_idx'] = self.df['user'].map(self.user_encoder)

        # genres.tsv may list items that never occur in the ratings. Such items
        # have no item index, so drop their rows instead of failing with KeyError.
        genre_item_idx = self.genre_df['item'].map(self.item_encoder)
        self.genre_df = self.genre_df[genre_item_idx.notna()].copy()
        self.genre_df['item_idx'] = genre_item_idx.dropna().astype(int)
        # Shift by one so that index 0 stays free for padding.
        self.genre_df['genre_idx'] = self.genre_df['genre'].map(self.genre_encoder) + 1

        self.exist_users = list(range(self.num_user))
        self.exist_items = list(range(self.num_item))
        self.user_train, self.user_valid = self.generate_sequence_data()
        self.item_idx2genre_list = self.generate_genre_data()

    def make_item2genre(self, items):
        """
        Look up the padded genre list of every item index in ``items``.

        Items that have no genre information get an all-padding list, so the
        result is always rectangular (``len(items)`` x ``MAX_GENRE_LEN``).
        """
        no_genre = [self.GENRE_PAD_IDX] * self.MAX_GENRE_LEN
        return [self.item_idx2genre_list.get(item, no_genre) for item in items]

    def generate_genre_data(self):
        """
        Build ``{item_idx: genre index list}`` from ``self.genre_df``.

        Every list is truncated / right-padded to exactly ``MAX_GENRE_LEN``
        entries so the lists can be stacked into one tensor.
        """
        max_len = self.MAX_GENRE_LEN
        item_idx2genre_list = {}
        for item_idx, item_genres in self.genre_df.groupby('item_idx'):
            # Truncate first: an item with more than max_len genres would
            # otherwise produce a longer row than all the others.
            genre_list = item_genres['genre_idx'].tolist()[:max_len]
            genre_list += [self.GENRE_PAD_IDX] * (max_len - len(genre_list))
            item_idx2genre_list[item_idx] = genre_list

        return item_idx2genre_list

    def generate_encoder_decoder(self, col : str, df) -> tuple:
        """
        Create an encoder and a decoder for one column.

        Args:
            col (str): name of the column to encode
            df: DataFrame containing that column
        Returns:
            tuple: ``(encoder, decoder)`` where ``encoder`` maps each unique
            value to its index (in order of first appearance) and ``decoder``
            is the inverse mapping.
        """
        ids = df[col].unique()
        encoder = {_id: idx for idx, _id in enumerate(ids)}
        decoder = {idx: _id for idx, _id in enumerate(ids)}

        return encoder, decoder

    def generate_sequence_data(self) -> tuple:
        """
        Split every user's interactions into train and validation items.

        ``config.valid_samples`` items per user are drawn at random (without
        replacement) for validation, to mimic the target task as closely as
        possible; the remaining distinct items form the training set.

        Returns:
            tuple: ``(user_train, user_valid)``, both ``{user_idx: [item_idx, ...]}``
        Raises:
            ValueError: raised by NumPy when a user has fewer than
                ``config.valid_samples`` interactions.
        """
        users = defaultdict(list)
        for user, item in zip(self.df['user_idx'], self.df['item_idx']):
            users[user].append(item)

        # Seed once, on a private generator. Re-seeding inside the loop would
        # give every user the same draw (identical held-out positions for
        # histories of equal length) and would also reset NumPy's global state.
        rng = np.random.RandomState(self.config.seed)

        user_train = {}
        user_valid = {}
        for user, user_total in users.items():
            valid = rng.choice(user_total, size = self.config.valid_samples, replace = False).tolist()
            train = list(set(user_total) - set(valid))

            user_train[user] = train
            user_valid[user] = valid

        return user_train, user_valid

    def neg_sampling(self, users, neg_sampling_cnt = 3):
        """
        Draw negative items for each user in ``users``.

        For every entry of ``users`` (repeats included), ``neg_sampling_cnt``
        distinct items are sampled from the items that are not in that user's
        training set. Validation items are not excluded.

        Args:
            users: iterable of user indices, one per positive sample
            neg_sampling_cnt (int): negatives drawn per entry (default 3)
        Returns:
            tuple: ``(_users, neg_items)``, two parallel lists of length
            ``len(users) * neg_sampling_cnt``.
        """
        # Built once per call rather than once per user.
        all_items = set(self.exist_items)

        _users, neg_items = [], []
        for user in users:
            candidates = list(all_items - set(self.user_train[user]))
            neg_items += random.sample(candidates, neg_sampling_cnt)
            _users += [user] * neg_sampling_cnt

        return _users, neg_items

    def get_train_valid_data(self):
        """Return ``(user_train, user_valid)``."""
        return self.user_train, self.user_valid

In [6]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class CFDataset(Dataset):
    """
    Flat view of a ``{user: [item, ...]}`` mapping.

    Each sample is one ``(user, item)`` pair; ``users`` and ``items`` are
    parallel lists holding the two halves of every pair.
    """

    def __init__(self, user_train):
        self.users = []
        self.items = []
        for user, interacted in user_train.items():
            self.items.extend(interacted)
            # Repeat the user once per item so both lists stay aligned.
            self.users.extend([user] * len(interacted))

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx]

# 3. 모델

In [7]:
class DeepFM(nn.Module):
    """
    Scores (user, item, genres) triples with a probability in (0, 1).

    The user, item and summed genre embeddings are concatenated and passed
    through a single linear layer followed by a sigmoid.

    NOTE: no MLP tower is built at the moment; ``num_layers`` is accepted but
    unused and ``dropout`` is only stored on the instance. Both stay in the
    signature so existing callers keep working.

    Args:
        num_user (int): number of user indices
        num_item (int): number of item indices
        num_genre (int): number of real genres (index 0 is added for padding)
        num_factor (int): width of the user and item embeddings
        num_layers (int): currently unused
        dropout (float): stored as ``self.dropout``; currently not applied
        genre_factor (int): width of the genre embedding (default 20)
    """

    def __init__(self, num_user, num_item, num_genre, num_factor, num_layers, dropout, genre_factor = 20):
        super(DeepFM, self).__init__()
        self.dropout = dropout
        self.user_emb = nn.Embedding(num_user, num_factor)
        self.item_emb = nn.Embedding(num_item, num_factor)
        # One extra row: index 0 is the padding genre and always embeds to zeros.
        self.genre_emb = nn.Embedding(num_genre + 1, genre_factor, padding_idx = 0)
        input_size = num_factor * 2 + genre_factor

        self.predict_layer = nn.Sequential(
            nn.Linear(input_size, 1, bias = True),
            nn.Sigmoid()
        )

        self._init_weight_()

    def _init_weight_(self):
        """Initialise embeddings with N(0, 0.01^2) and linear weights with Kaiming uniform."""
        nn.init.normal_(self.user_emb.weight, std=0.01)
        nn.init.normal_(self.item_emb.weight, std=0.01)
        # Skip row 0 so the padding embedding remains all zeros.
        nn.init.normal_(self.genre_emb.weight[1:], std=0.01)

        for m in self.predict_layer:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight)

    def forward(self, user, item, genre):
        """
        Args:
            user: tensor of user indices
            item: tensor of item indices
            genre: tensor of genre indices per sample; embeddings are summed
                over dim 1, so padding entries (index 0) contribute nothing
        Returns:
            1-D tensor of sigmoid scores, one per sample.
        """
        user_emb = self.user_emb(user)
        item_emb = self.item_emb(item)
        genre_emb = self.genre_emb(genre).sum(dim = 1)

        cat_emb = torch.cat((user_emb, item_emb, genre_emb), -1)

        output = self.predict_layer(cat_emb)

        return output.view(-1)

# 4. 학습 함수

In [8]:
def train(model, data_loader, criterion, optimizer, make_data_set):
    """
    Run one training epoch and return the mean batch loss.

    Each batch of positive (user, item) pairs is extended with negatives from
    ``make_data_set.neg_sampling``; positives are labelled 1 and negatives 0.

    Args:
        model: called as ``model(users, items, genres)``
        data_loader: yields ``(users, items)`` tensor batches of positive pairs
        criterion: loss called as ``criterion(output, target)``
        optimizer: optimizer stepped once per batch
        make_data_set: provides ``neg_sampling`` and ``make_item2genre``
    Returns:
        float: sum of batch losses divided by ``len(data_loader)``
    """
    model.train()
    loss_val = 0

    for users, items in data_loader:
        neg_users, neg_items = make_data_set.neg_sampling(users.tolist())

        # The genre lookup works on plain Python ints, so do it on the CPU-side
        # lists before anything is moved to `device`; this avoids copying the
        # batch to the device and straight back again on every step.
        all_genres = make_data_set.make_item2genre(items.tolist() + neg_items)
        all_genres = torch.LongTensor(all_genres).to(device)

        all_users = torch.cat([users, torch.tensor(neg_users)]).to(device)
        all_items = torch.cat([items, torch.tensor(neg_items)]).to(device)

        # Positives first, negatives second: same order as the inputs above.
        target = torch.cat([torch.ones(len(items)), torch.zeros(len(neg_items))]).to(device)

        optimizer.zero_grad()

        output = model(all_users, all_items, all_genres)
        loss = criterion(output, target)

        loss.backward()
        optimizer.step()

        loss_val += loss.item()

    loss_val /= len(data_loader)

    return loss_val


def get_ndcg(pred_list, true_list):
    """
    Normalised discounted cumulative gain with binary relevance.

    Args:
        pred_list: recommended items, best first
        true_list: relevant items
    Returns:
        DCG of ``pred_list`` divided by the DCG of an ideal ranking; always in
        [0, 1]. Returns 0.0 when either list is empty.
    """
    relevant = set(true_list)

    # An ideal ranking places a relevant item at every position it can, which
    # is limited by both the list length and the number of relevant items.
    ideal_hits = min(len(pred_list), len(relevant))
    if ideal_hits == 0:
        return 0.0

    # Positions are 0-based, hence the "+ 2": position 0 gets 1 / log2(2) = 1.
    idcg = sum(1 / np.log2(rank + 2) for rank in range(ideal_hits))
    dcg = sum(
        1 / np.log2(rank + 2)
        for rank, pred in enumerate(pred_list)
        if pred in relevant
    )
    return dcg / idcg

# hit == recall == precision
def get_hit(pred_list, true_list):
    """
    Fraction of ``true_list`` that was recommended.

    Returns the number of distinct items present in both lists divided by
    ``len(true_list)``.
    """
    relevant = set(true_list)
    num_hits = len(relevant.intersection(pred_list))
    return num_hits / len(true_list)

def evaluate(model, user_train, user_valid, make_data_set, top_k = 10):
    """
    Compute mean NDCG@k and HIT@k over all users.

    For every user the model scores all items, items in the user's training
    set are excluded, and the ``top_k`` best remaining items are compared with
    the user's validation items.

    Args:
        model: called as ``model(users, items, genres)``, returns one score per item
        user_train: ``{user: [item, ...]}`` items to exclude from recommendation
        user_valid: ``{user: [item, ...]}`` held-out items to be recovered
        make_data_set: provides ``exist_users``, ``exist_items`` and ``make_item2genre``
        top_k (int): number of recommendations per user (default 10)
    Returns:
        tuple: ``(NDCG, HIT)`` averaged over ``make_data_set.exist_users``
    """
    model.eval()

    NDCG = 0.0 # NDCG@top_k
    HIT = 0.0 # HIT@top_k

    all_users = make_data_set.exist_users
    all_items = make_data_set.exist_items

    # The item and genre inputs are the same for every user: build them once.
    items = torch.tensor(all_items).to(device)
    genres = torch.tensor(make_data_set.make_item2genre(all_items)).to(device)

    with torch.no_grad():
        for user in all_users:
            users = torch.full((len(all_items),), user, dtype = torch.long, device = device)

            # Raw scores are ranked directly; a softmax would not change the order.
            output = model(users, items, genres)

            # Items already seen in training must never be recommended.
            seen = torch.tensor(user_train[user], dtype = torch.long, device = device)
            output[seen] = float('-inf')

            uv = user_valid[user]
            # Best first, so list position equals rank as get_ndcg expects.
            up = output.argsort(descending = True)[:top_k].cpu().numpy().tolist()

            NDCG += get_ndcg(pred_list = up, true_list = uv)
            HIT += get_hit(pred_list = up, true_list = uv)

    NDCG /= len(all_users)
    HIT /= len(all_users)

    return NDCG, HIT

# 5. 학습

In [9]:
# Load the raw files and build encoders, genre lookups and the per-user split.
make_fm_data_set = MakeFMDataSet(config)
user_train = make_fm_data_set.user_train
user_valid = make_fm_data_set.user_valid

In [10]:
# One sample per (user, item) training pair, reshuffled each epoch;
# the final partial batch is kept.
cf_dataset = CFDataset(user_train)
data_loader = DataLoader(dataset = cf_dataset, batch_size = config.batch_size, shuffle = True, drop_last = False)

In [11]:
# Model sized from the encoded dataset, placed on the selected device.
model = DeepFM(
    make_fm_data_set.num_user,
    make_fm_data_set.num_item,
    make_fm_data_set.num_genre,
    config.num_factor,
    config.num_layers,
    config.dropout,
)
model = model.to(device)

# The model outputs sigmoid probabilities, hence plain binary cross-entropy.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = config.lr)

In [None]:
best_hit = 0
for epoch in range(1, config.num_epochs + 1):
    # A one-step progress bar per epoch: its description is that epoch's log line.
    tbar = tqdm(range(1))
    for _ in tbar:
        train_loss = train(model, data_loader, criterion, optimizer, make_fm_data_set)
        ndcg, hit = evaluate(model, user_train, user_valid, make_fm_data_set)

        # Checkpoint only when validation HIT improves on the best so far.
        if hit > best_hit:
            best_hit = hit
            checkpoint_file = os.path.join(config.model_path, config.model_name)
            torch.save(model.state_dict(), checkpoint_file)

        tbar.set_description(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')