## import packages and install pip
### 본 노트북은 다음과 같은 구조일 때 동작합니다
```
   upper
     ├─ /code
     |    ├─ /data
     |    ├─ /src
     |    ├─ /submit
     |    ├─ ... 
     |    ├─ PipeLine.ipynb
```

In [1]:
#torch 안깔려 있으면, 까셔야 합니다 ..!
import warnings
warnings.filterwarnings('ignore')
!pip install --upgrade pip


[0m

In [2]:

!pip install -r ../code/requirement.txt
!pip install easydict

[0mCollecting easydict
  Using cached easydict-1.11.tar.gz (6.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: easydict
  Building wheel for easydict (setup.py) ... [?25ldone
[?25h  Created wheel for easydict: filename=easydict-1.11-py3-none-any.whl size=6660 sha256=d557359f241ca82b3b8ba8e566c751b0f3989010abce3671bd9f61f536ccde8d
  Stored in directory: /data/ephemeral/home/.cache/pip/wheels/58/05/80/2adaebf497dacf51a0267ec07b3451ce91b596e2b9502d67d6
Successfully built easydict
Installing collected packages: easydict
Successfully installed easydict-1.11
[0m

In [3]:

import time
import argparse
import pandas as pd
import easydict
from src.utils import Logger, Setting, models_load
from src.data import context_data_load, context_data_split, context_data_loader
from src.data import dl_data_load, dl_data_split, dl_data_loader
from src.data import image_data_load, image_data_split, image_data_loader
from src.data import text_data_load, text_data_split, text_data_loader
from src.train import train, test
import os
import tqdm
import torch
import torch.nn as nn
from torch.nn import MSELoss
from torch.optim import SGD, Adam

In [4]:
args = easydict.EasyDict({
    'data_path': './data/',  # Data path 설정
    'saved_model_path': './saved_models',  # Saved Model path 설정
    'model': "FM",  # 학습 및 예측할 모델 선택 (None으로 초기화, 사용 전에 설정 필요)
    'data_shuffle': True,  # 데이터 셔플 여부 조정
    'test_size': 0.2,  # Train/Valid split 비율 조정
    'seed': 42,  # Seed 값 조정
    'use_best_model': True,  # 검증 성능이 가장 좋은 모델 사용 여부 설정

    # TRAINING OPTION
    'batch_size': 1024,  # Batch size 조정
    'epochs': 10,  # Epoch 수 조정
    'lr': 1e-3,  # Learning Rate 조정
    'loss_fn': 'RMSE',  # 손실 함수 변경 (MSE 또는 RMSE)
    'optimizer': 'ADAM',  # 최적화 함수 변경 (SGD 또는 ADAM)
    'weight_decay': 1e-6,  # Adam optimizer에서 정규화에 사용하는 값 조정

    # GPU
    'device': 'cuda',  # 학습에 사용할 Device 조정

    # FM, FFM, NCF, WDN, DCN Common OPTION
    'embed_dim': 16,  # FM, FFM, NCF, WDN, DCN에서 embedding시킬 차원 조정
    'dropout': 0.2,  # NCF, WDN, DCN에서 Dropout rate 조정
    'mlp_dims': (16, 16),  # NCF, WDN, DCN에서 MLP Network의 차원 조정

    # DCN
    'num_layers': 3,  # Cross Network의 레이어 수 조정

    # CNN_FM
    'cnn_embed_dim': 64,  # CNN_FM에서 user와 item에 대한 embedding시킬 차원 조정
    'cnn_latent_dim': 12,  # CNN_FM에서 user/item/image에 대한 latent 차원 조정

    # DeepCoNN
    'vector_create': False,  # DEEP_CONN에서 text vector 생성 여부 조정 (최초 학습에만 True로 설정)
    'deepconn_embed_dim': 32,  # DEEP_CONN에서 user와 item에 대한 embedding시킬 차원 조정
    'deepconn_latent_dim': 10,  # DEEP_CONN에서 user/item/image에 대한 latent 차원 조정
    'conv_1d_out_dim': 50,  # DEEP_CONN에서 1D conv의 출력 크기 조정
    'kernel_size': 3,  # DEEP_CONN에서 1D conv의 kernel 크기 조정
    'word_dim': 768,  # DEEP_CONN에서 1D conv의 입력 크기 조정
    'out_dim': 32  # DEEP_CONN에서 1D conv의 출력 크기 조정
})

## Settings Seed

In [5]:
Setting.seed_everything(args.seed)

## DATA LOAD


In [6]:
print(f'--------------- {args.model} Load Data ---------------')
if args.model in ('FM', 'FFM'):
    data = context_data_load(args)
elif args.model in ('NCF', 'WDN', 'DCN'):
    data = dl_data_load(args)
elif args.model == 'CNN_FM':
    data = image_data_load(args)
elif args.model == 'DeepCoNN':
    import nltk
    nltk.download('punkt')
    data = text_data_load(args)
else:
    pass

--------------- FM Load Data ---------------


## DATA Split

In [7]:
######################## Train/Valid Split
print(f'--------------- {args.model} Train/Valid Split ---------------')
if args.model in ('FM', 'FFM'):
    data = context_data_split(args, data)
    data = context_data_loader(args, data)

elif args.model in ('NCF', 'WDN', 'DCN'):
    data = dl_data_split(args, data)
    data = dl_data_loader(args, data)

elif args.model=='CNN_FM':
    data = image_data_split(args, data)
    data = image_data_loader(args, data)

elif args.model=='DeepCoNN':
    data = text_data_split(args, data)
    data = text_data_loader(args, data)
else:
    pass

--------------- FM Train/Valid Split ---------------


## Logs settings

In [8]:
####################### Setting for Log
setting = Setting()

log_path = setting.get_log_path(args)
setting.make_dir(log_path)

logger = Logger(args, log_path)
logger.save_args()


## Define Model (FM 모델을 예시로 사용)

In [9]:
import numpy as np
import torch
import torch.nn as nn


# factorization을 통해 얻은 feature를 embedding 합니다.
class FeaturesEmbedding(nn.Module):
    def __init__(self, field_dims: np.ndarray, embed_dim: int):
        super().__init__()
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int32)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)


    def forward(self, x: torch.Tensor):
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(x)


# FM모델 등에서 활용되는 선형 결합 부분을 정의합니다.
class FeaturesLinear(nn.Module):
    def __init__(self, field_dims: np.ndarray, output_dim: int=1):
        super().__init__()
        self.fc = torch.nn.Embedding(sum(field_dims), output_dim)
        self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int32)


    def forward(self, x: torch.Tensor):
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return torch.sum(self.fc(x), dim=1) + self.bias


# feature 사이의 상호작용을 효율적으로 계산합니다.
class FactorizationMachine(nn.Module):
    def __init__(self, reduce_sum:bool=True):
        super().__init__()
        self.reduce_sum = reduce_sum


    def forward(self, x: torch.Tensor):
        square_of_sum = torch.sum(x, dim=1) ** 2
        sum_of_square = torch.sum(x ** 2, dim=1)
        ix = square_of_sum - sum_of_square
        if self.reduce_sum:
            ix = torch.sum(ix, dim=1, keepdim=True)
        return 0.5 * ix

# FM 모델을 구현합니다.
class FactorizationMachineModel(nn.Module):
    def __init__(self, args, data):
        super().__init__()
        self.field_dims = data['field_dims']
        self.embedding = FeaturesEmbedding(self.field_dims, args.embed_dim)
        self.linear = FeaturesLinear(self.field_dims)
        self.fm = FactorizationMachine(reduce_sum=True)


    def forward(self, x: torch.Tensor):
        x = self.linear(x) + self.fm(self.embedding(x))
        # return torch.sigmoid(x.squeeze(1))
        return x.squeeze(1)


In [22]:
import numpy as np
import torch
import torch.nn as nn


# feature 사이의 상호작용을 효율적으로 계산합니다.
class FactorizationMachine(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.v = nn.Parameter(torch.rand(input_dim, latent_dim), requires_grad = True)
        self.linear = nn.Linear(input_dim, 1, bias=True)


    def forward(self, x):
        linear = self.linear(x)
        square_of_sum = torch.mm(x, self.v) ** 2
        sum_of_square = torch.mm(x ** 2, self.v ** 2)
        pair_interactions = torch.sum(square_of_sum - sum_of_square, dim=1, keepdim=True)
        output = linear + (0.5 * pair_interactions)
        return output


# factorization을 통해 얻은 feature를 embedding 합니다.
class FeaturesEmbedding(nn.Module):
    def __init__(self, field_dims: np.ndarray, embed_dim: int):
        super().__init__()
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int32)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)


    def forward(self, x: torch.Tensor):
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(x)


# 텍스트 특징 추출을 위한 기초적인 CNN 1D Layer를 정의합니다.
class CNN_1D(nn.Module):
    def __init__(self, word_dim, out_dim, kernel_size, conv_1d_out_dim):
        super(CNN_1D, self).__init__()
        self.conv = nn.Sequential(
                                nn.Conv1d(
                                        in_channels=word_dim,
                                        out_channels=out_dim,
                                        kernel_size=kernel_size,
                                        padding=(kernel_size - 1) // 2),
                                nn.ReLU(),
                                nn.MaxPool2d(kernel_size=(kernel_size, 1)),
                                nn.Dropout(p=0.5)
                                )
        self.linear = nn.Sequential(
                                    nn.Linear(int(out_dim/kernel_size), conv_1d_out_dim),
                                    nn.ReLU(),
                                    nn.Dropout(p=0.5))


    def forward(self, vec):
        output = self.conv(vec)
        output = self.linear(output.reshape(-1, output.size(1)))
        return output


# 기존 유저/상품 벡터와 유저/상품 리뷰 벡터를 결합하여 FM으로 학습하는 모델을 구현합니다.
class DeepCoNN(nn.Module):
    def __init__(self, args, data):
        super(DeepCoNN, self).__init__()
        self.field_dims = np.array([len(data['user2idx']), len(data['isbn2idx'])], dtype=np.uint32)
        self.embedding = FeaturesEmbedding(self.field_dims, args.deepconn_embed_dim)
        self.cnn_u = CNN_1D(
                             word_dim=args.word_dim,
                             out_dim=args.out_dim,
                             kernel_size=args.kernel_size,
                             conv_1d_out_dim=args.conv_1d_out_dim,
                            )
        self.cnn_i = CNN_1D(
                             word_dim=args.word_dim,
                             out_dim=args.out_dim,
                             kernel_size=args.kernel_size,
                             conv_1d_out_dim=args.conv_1d_out_dim,
                            )
        self.fm = FactorizationMachine(
                                        input_dim=(args.conv_1d_out_dim * 2) + (args.deepconn_embed_dim*len(self.field_dims)),
                                        latent_dim=args.deepconn_latent_dim,
                                        )


    def forward(self, x):
        user_isbn_vector, user_text_vector, item_text_vector = x[0], x[1], x[2]
        user_isbn_feature = self.embedding(user_isbn_vector)
        user_text_feature = self.cnn_u(user_text_vector)
        item_text_feature = self.cnn_i(item_text_vector)
        feature_vector = torch.cat([
                                    user_isbn_feature.view(-1, user_isbn_feature.size(1) * user_isbn_feature.size(2)),
                                    user_text_feature,
                                    item_text_feature
                                    ], dim=1)
        output = self.fm(feature_vector)
        return output.squeeze(1)


## Load Model

In [23]:
######################## Model
print(f'--------------- INIT {args.model} ---------------')
model = DeepCoNN(args, data).to(args.device) #이부분수정하면됩니다

--------------- INIT FM ---------------


## Define RMSEloss

In [24]:
class RMSELoss(nn.Module):
    def __init__(self):
        super(RMSELoss, self).__init__()
        self.eps = 1e-6
    def forward(self, x, y):
        criterion = MSELoss()
        loss = torch.sqrt(criterion(x, y)+self.eps)
        return loss

## Train

In [25]:
model = train(args, model, data, logger, setting)
def train(args, model, dataloader, logger, setting):
    minimum_loss = 999999999
    if args.loss_fn == 'MSE':
        loss_fn = MSELoss()
    elif args.loss_fn == 'RMSE':
        loss_fn = RMSELoss()
    else:
        pass
    if args.optimizer == 'SGD':
        optimizer = SGD(model.parameters(), lr=args.lr)
    elif args.optimizer == 'ADAM':
        optimizer = Adam(model.parameters(), lr=args.lr)
    else:
        pass

    for epoch in tqdm.tqdm(range(args.epochs)):
        model.train()
        total_loss = 0
        batch = 0

        for idx, data in enumerate(dataloader['train_dataloader']):
            if args.model == 'CNN_FM':
                x, y = [data['user_isbn_vector'].to(args.device), data['img_vector'].to(args.device)], data['label'].to(args.device)
            elif args.model == 'DeepCoNN':
                x, y = [data['user_isbn_vector'].to(args.device), data['user_summary_merge_vector'].to(args.device), data['item_summary_vector'].to(args.device)], data['label'].to(args.device)
            else:
                x, y = data[0].to(args.device), data[1].to(args.device)
            y_hat = model(x)
            loss = loss_fn(y.float(), y_hat)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            batch +=1
        valid_loss = valid(args, model, dataloader, loss_fn)
        print(f'Epoch: {epoch+1}, Train_loss: {total_loss/batch:.3f}, valid_loss: {valid_loss:.3f}')
        logger.log(epoch=epoch+1, train_loss=total_loss/batch, valid_loss=valid_loss)
        if minimum_loss > valid_loss:
            minimum_loss = valid_loss
            os.makedirs(args.saved_model_path, exist_ok=True)
            torch.save(model.state_dict(), f'{args.saved_model_path}/{setting.save_time}_{args.model}_model.pt')
    logger.close()
    return model


def valid(args, model, dataloader, loss_fn):
    model.eval()
    total_loss = 0
    batch = 0

    for idx, data in enumerate(dataloader['valid_dataloader']):
        if args.model == 'CNN_FM':
            x, y = [data['user_isbn_vector'].to(args.device), data['img_vector'].to(args.device)], data['label'].to(args.device)
        elif args.model == 'DeepCoNN':
            x, y = [data['user_isbn_vector'].to(args.device), data['user_summary_merge_vector'].to(args.device), data['item_summary_vector'].to(args.device)], data['label'].to(args.device)
        else:
            x, y = data[0].to(args.device), data[1].to(args.device)
        y_hat = model(x)
        loss = loss_fn(y.float(), y_hat)
        total_loss += loss.item()
        batch +=1
    valid_loss = total_loss/batch
    return valid_loss


def test(args, model, dataloader, setting):
    predicts = list()
    if args.use_best_model == True:
        model.load_state_dict(torch.load(f'./saved_models/{setting.save_time}_{args.model}_model.pt'))
    else:
        pass
    model.eval()

    for idx, data in enumerate(dataloader['test_dataloader']):
        if args.model == 'CNN_FM':
            x, _ = [data['user_isbn_vector'].to(args.device), data['img_vector'].to(args.device)], data['label'].to(args.device)
        elif args.model == 'DeepCoNN':
            x, _ = [data['user_isbn_vector'].to(args.device), data['user_summary_merge_vector'].to(args.device), data['item_summary_vector'].to(args.device)], data['label'].to(args.device)
        else:
            x = data[0].to(args.device)
        y_hat = model(x)
        predicts.extend(y_hat.tolist())
    return predicts



  0%|          | 0/10 [00:00<?, ?it/s]


RuntimeError: The size of tensor a (10) must match the size of tensor b (2) at non-singleton dimension 1

In [13]:

######################## INFERENCE
print(f'--------------- {args.model} PREDICT ---------------')
predicts = test(args, model, data, setting)

--------------- FM PREDICT ---------------


## SAVE MODEL

In [14]:
######################## SAVE PREDICT
print(f'--------------- SAVE {args.model} PREDICT ---------------')
submission = pd.read_csv(args.data_path + 'sample_submission.csv')
if args.model in ('FM', 'FFM', 'NCF', 'WDN', 'DCN', 'CNN_FM', 'DeepCoNN'):
    submission['rating'] = predicts
else:
    pass

--------------- SAVE FM PREDICT ---------------


In [21]:
import sys
print(sys.prefix, sys.version)

/opt/conda 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0]


## 답안 제출 파일 생성

In [15]:
filename = setting.get_submit_filename(args)
submission.to_csv(filename, index=False)