## import packages and install pip
### 본 노트북은 다음과 같은 구조일 때 동작합니다
```
   upper
     ├─ /code
     |    ├─ /data
     |    ├─ /src
     |    ├─ /submit
     |    ├─ ... 
     |    ├─ PipeLine.ipynb
```

In [1]:
# If torch is not installed yet, install it before running this notebook.
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
!pip install --upgrade pip


[0m

In [2]:

# Install the project requirements, then easydict (imported by the next cell).
!pip install -r ../code/requirement.txt
!pip install easydict

[0m

In [3]:

import time
import argparse
import pandas as pd
import easydict
from src.utils import Logger, Setting, models_load
from src.data import context_data_load, context_data_split, context_data_loader
from src.data import dl_data_load, dl_data_split, dl_data_loader
from src.data import image_data_load, image_data_split, image_data_loader
from src.data import text_data_load, text_data_split, text_data_loader
from src.train import train, test
import os
import tqdm
import torch
import torch.nn as nn
from torch.nn import MSELoss
from torch.optim import SGD, Adam

In [4]:
# Display the installed PyTorch version.
torch.__version__

'1.12.1'

In [5]:
# Run configuration for the whole notebook. EasyDict exposes every entry as an
# attribute (args.model, args.lr, ...).
args = easydict.EasyDict(
    # BASIC OPTION
    data_path='./data/',  # directory holding the data files
    saved_model_path='./saved_models',  # directory for model checkpoints
    # Model to train and predict with. The cells below branch on: FM, FFM,
    # NCF, WDN, DCN, CNN_FM, CNN_NCF, DeepCoNN.
    model='CNN_NCF',
    data_shuffle=True,  # whether the data is shuffled
    test_size=0.2,  # train/valid split ratio
    seed=42,  # random seed
    use_best_model=True,  # predict with the best-validation checkpoint

    # TRAINING OPTION
    batch_size=1024,  # batch size
    epochs=10,  # number of epochs
    lr=1e-3,  # learning rate
    loss_fn='RMSE',  # loss function: 'MSE' or 'RMSE'
    optimizer='ADAM',  # optimizer: 'SGD' or 'ADAM'
    # Regularisation strength intended for Adam.
    # NOTE(review): the train() defined in this notebook does not pass it to
    # the optimizer - confirm whether that is intended.
    weight_decay=1e-6,

    # GPU
    device='cuda',  # device used for training

    # FM, FFM, NCF, WDN, DCN common options
    embed_dim=16,  # embedding dimension
    dropout=0.2,  # dropout rate (NCF, WDN, DCN)
    mlp_dims=(16, 16),  # hidden layer widths of the MLP (NCF, WDN, DCN)

    # DCN
    num_layers=3,  # number of cross-network layers

    # CNN_FM
    cnn_embed_dim=64,  # user/item embedding dimension for CNN_FM
    cnn_latent_dim=12,  # latent dimension for user/item/image features

    # DeepCoNN
    vector_create=False,  # build text vectors (set True only on the first run)
    deepconn_embed_dim=32,  # user/item embedding dimension for DeepCoNN
    deepconn_latent_dim=10,  # latent dimension for DeepCoNN
    conv_1d_out_dim=50,  # output size of the 1D convolution
    kernel_size=3,  # kernel size of the 1D convolution
    word_dim=768,  # input size of the 1D convolution
    # Output size for DeepCoNN. NOTE(review): the original note duplicated the
    # conv_1d_out_dim description; the exact role is not visible here.
    out_dim=32,
)

## Settings Seed

In [6]:
# Apply the configured seed through the project helper in src.utils
# (presumably seeds the Python/NumPy/torch RNGs - implementation not shown here).
Setting.seed_everything(args.seed)

## DATA LOAD


In [7]:
# Load the raw data with the loader that matches the configured model family.
print(f'--------------- {args.model} Load Data ---------------')
if args.model in ('FM', 'FFM'):
    data = context_data_load(args)
elif args.model in ('NCF', 'WDN', 'DCN'):
    data = dl_data_load(args)
elif args.model in ('CNN_FM', 'CNN_NCF'):
    data = image_data_load(args)
elif args.model == 'DeepCoNN':
    # nltk is only needed by the text pipeline, so it is imported here.
    import nltk
    nltk.download('punkt')
    data = text_data_load(args)
else:
    # Fail here rather than leaving `data` undefined (or stale from an earlier
    # run) for the cells that follow.
    raise ValueError(f'Unsupported model: {args.model!r}')

--------------- CNN_NCF Load Data ---------------


129777it [00:55, 2355.82it/s]
52000it [00:21, 2370.05it/s]


## DATA Split

In [8]:
######################## Train/Valid Split
# Split the loaded data into train/valid parts, then wrap them in data loaders,
# using the helpers that match the configured model family.
print(f'--------------- {args.model} Train/Valid Split ---------------')
if args.model in ('FM', 'FFM'):
    data = context_data_split(args, data)
    data = context_data_loader(args, data)

elif args.model in ('NCF', 'WDN', 'DCN'):
    data = dl_data_split(args, data)
    data = dl_data_loader(args, data)

elif args.model in ('CNN_FM', 'CNN_NCF'):
    data = image_data_split(args, data)
    data = image_data_loader(args, data)

elif args.model == 'DeepCoNN':
    data = text_data_split(args, data)
    data = text_data_loader(args, data)
else:
    # Fail here rather than silently passing un-split data to training.
    raise ValueError(f'Unsupported model: {args.model!r}')

--------------- CNN_NCF Train/Valid Split ---------------


## Logs settings

In [9]:
####################### Setting for Log
# Setting and Logger are project helpers imported from src.utils.
setting = Setting()

# Ask the helper for this run's log path and create that directory before the
# logger is constructed with it.
log_path = setting.get_log_path(args)
setting.make_dir(log_path)

# The logger receives the run arguments and the log path; save_args()
# presumably persists the configuration next to the logs (see src.utils).
logger = Logger(args, log_path)
logger.save_args()


## Define Model (CNN_NCF 모델을 예시로 사용)

In [44]:
import numpy as np
import torch
import torch.nn as nn


# Embeds the categorical feature indices of every field.
class FeaturesEmbedding(nn.Module):
    """Embedding lookup for multi-field categorical indices.

    All fields share one table with ``sum(field_dims)`` rows. Each field owns
    a contiguous slice of rows, and ``offsets`` stores the first row of every
    slice so that per-field indices can be shifted into table rows.
    """

    def __init__(self, field_dims: np.ndarray, embed_dim: int):
        super().__init__()
        table_rows = sum(field_dims)
        self.embedding = nn.Embedding(table_rows, embed_dim)
        # First table row of each field: 0, d0, d0 + d1, ...
        running_totals = np.cumsum(field_dims)
        self.offsets = np.array((0, *running_totals[:-1]), dtype=np.int32)
        nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, x: torch.Tensor):
        """Return the embeddings for ``x``.

        The last dimension of ``x`` indexes the fields (it is broadcast against
        ``offsets``); the result has one extra trailing embedding dimension.
        """
        # new_tensor keeps the dtype and device of ``x``.
        field_starts = x.new_tensor(self.offsets)
        table_rows = x + field_starts.unsqueeze(0)
        return self.embedding(table_rows)


# The NCF model combines an MLP branch with a GMF branch for its final output.
# This class implements the MLP branch.
class MultiLayerPerceptron(nn.Module):
    """Stack of ``Linear -> BatchNorm1d -> ReLU -> Dropout`` stages.

    One stage is built per entry of ``embed_dims``. When ``output_layer`` is
    true, a final ``Linear(last_width, 1)`` is appended.
    """

    def __init__(self, input_dim, embed_dims, dropout, output_layer=True):
        super().__init__()
        # widths[i] -> widths[i + 1] is the shape of the i-th Linear layer.
        widths = [input_dim, *embed_dims]
        stages = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stages.extend([
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(p=dropout),
            ])
        if output_layer:
            stages.append(nn.Linear(widths[-1], 1))
        self.mlp = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stacked layers."""
        return self.mlp(x)


# Basic CNN layers used to extract features from images.
class CNN_Base(nn.Module):
    """Two ``Conv2d -> ReLU -> MaxPool2d`` stages followed by a flatten.

    Takes 3-channel input and produces 12 channels. The flatten reshapes to
    rows of 12 values, so it yields one row per sample only when the spatial
    size has collapsed to 1x1 (e.g. 32x32 inputs with these strides).
    NOTE(review): the image size is set by the data pipeline, not visible here.
    """

    def __init__(self):
        super().__init__()
        first_stage = [
            nn.Conv2d(3, 6, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        second_stage = [
            nn.Conv2d(6, 12, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        self.cnn_layer = nn.Sequential(*first_stage, *second_stage)

    def forward(self, x):
        """Return the flattened feature rows for image batch ``x``."""
        feature_maps = self.cnn_layer(x)
        return feature_maps.view(-1, 12 * 1 * 1)

# GMF is built from the user and item latent factors; its result is
# concatenated with the MLP output and the CNN image features to produce the
# final NCF-style prediction.
class CNN_NCF(nn.Module):
    """NCF (GMF + MLP) extended with CNN image features.

    Args:
        args: configuration; reads ``embed_dim``, ``mlp_dims``, ``dropout``
            and ``cnn_latent_dim``.
        data: mapping that provides ``data['field_dims']``.
    """

    def __init__(self, args, data):
        super().__init__()
        self.field_dims = data['field_dims']
        # Field 0 is treated as the user column and field 1 as the item column.
        self.user_field_idx = np.array((0, ), dtype=np.int32)
        self.item_field_idx = np.array((1, ), dtype=np.int32)
        self.embedding = FeaturesEmbedding(self.field_dims, args.embed_dim)
        self.embed_output_dim = len(self.field_dims) * args.embed_dim
        self.mlp = MultiLayerPerceptron(self.embed_output_dim, args.mlp_dims, args.dropout, output_layer=False)
        # The head consumes [gmf | mlp | image] features. CNN_Base emits rows
        # of 12 values, so args.cnn_latent_dim must be 12 or the Linear layer
        # fails with a shape mismatch in forward().
        self.fc = torch.nn.Linear(args.mlp_dims[-1] + args.embed_dim + args.cnn_latent_dim, 1)

        self.cnn = CNN_Base()

    def forward(self, x):
        """Predict one value per sample.

        Args:
            x: pair ``(user_isbn_vector, img_vector)``; the first element holds
                the field indices, the second the image batch.

        Returns:
            Tensor of predictions with the trailing size-1 dimension removed.
        """
        user_isbn_vector, img_vector = x[0], x[1]
        user_isbn_feature = self.embedding(user_isbn_vector)

        # GMF branch: element-wise product of the user and item embeddings.
        user_x = user_isbn_feature[:, self.user_field_idx].squeeze(1)
        item_x = user_isbn_feature[:, self.item_field_idx].squeeze(1)
        gmf = user_x * item_x

        # MLP branch over all field embeddings flattened per sample.
        mlp_x = self.mlp(user_isbn_feature.view(-1, self.embed_output_dim))

        img_feature = self.cnn(img_vector)

        x = torch.cat([gmf, mlp_x, img_feature], dim=1)
        x = self.fc(x).squeeze(1)
        return x


## Load Model

In [45]:
######################## Model
# NOTE: the banner prints args.model, but the class instantiated below is
# hard-coded; keep the two in sync when switching models.
print(f'--------------- INIT {args.model} ---------------')
model = CNN_NCF(args, data).to(args.device) # edit this line to use a different model class

--------------- INIT CNN_NCF ---------------


## Define RMSEloss

In [46]:
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: ``sqrt(MSE(x, y) + eps)``."""

    def __init__(self):
        super().__init__()
        # Keeps the sqrt argument strictly positive, so its gradient stays
        # finite when the two inputs are identical.
        self.eps = 1e-6

    def forward(self, x, y):
        """Return the scalar RMSE between ``x`` and ``y``."""
        mean_squared_error = MSELoss()(x, y)
        return torch.sqrt(mean_squared_error + self.eps)

## Train

In [47]:
def train(args, model, dataloader, logger, setting):
    """Train ``model`` and checkpoint the weights with the lowest validation loss.

    Args:
        args: configuration; reads ``loss_fn`` ('MSE' or 'RMSE'), ``optimizer``
            ('SGD' or 'ADAM'), ``lr``, ``epochs``, ``model``, ``device`` and
            ``saved_model_path``.
        model: the ``nn.Module`` to optimise (updated in place).
        dataloader: mapping with a ``'train_dataloader'`` entry; it is also
            handed to ``valid()`` after every epoch.
        logger: object providing ``log(epoch=, train_loss=, valid_loss=)`` and
            ``close()``.
        setting: object providing ``save_time``, used in the checkpoint name.

    Returns:
        ``model`` with the weights of the last epoch. The best-validation
        weights are only written to disk, not restored into the model.

    Raises:
        ValueError: if ``args.loss_fn`` or ``args.optimizer`` is not supported.
    """
    # Reject unknown options up front instead of failing later with an
    # unrelated error about an unassigned variable.
    if args.loss_fn == 'MSE':
        loss_fn = MSELoss()
    elif args.loss_fn == 'RMSE':
        loss_fn = RMSELoss()
    else:
        raise ValueError(f"Unsupported loss_fn: {args.loss_fn!r} (expected 'MSE' or 'RMSE')")

    # NOTE(review): args.weight_decay exists in the notebook configuration but
    # is not forwarded to either optimizer - confirm whether that is intended.
    if args.optimizer == 'SGD':
        optimizer = SGD(model.parameters(), lr=args.lr)
    elif args.optimizer == 'ADAM':
        optimizer = Adam(model.parameters(), lr=args.lr)
    else:
        raise ValueError(f"Unsupported optimizer: {args.optimizer!r} (expected 'SGD' or 'ADAM')")

    minimum_loss = float('inf')
    try:
        for epoch in tqdm.tqdm(range(args.epochs)):
            # valid() switches the model to eval mode, so re-enable training
            # mode at the start of every epoch.
            model.train()
            total_loss = 0
            batch = 0

            for data in dataloader['train_dataloader']:
                # The batch layout depends on the model family.
                if args.model in ('CNN_FM', 'CNN_NCF'):
                    x, y = [data['user_isbn_vector'].to(args.device), data['img_vector'].to(args.device)], data['label'].to(args.device)
                elif args.model == 'DeepCoNN':
                    x, y = [data['user_isbn_vector'].to(args.device), data['user_summary_merge_vector'].to(args.device), data['item_summary_vector'].to(args.device)], data['label'].to(args.device)
                else:
                    x, y = data[0].to(args.device), data[1].to(args.device)
                y_hat = model(x)
                loss = loss_fn(y.float(), y_hat)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                batch += 1
            valid_loss = valid(args, model, dataloader, loss_fn)
            print(f'Epoch: {epoch+1}, Train_loss: {total_loss/batch:.3f}, valid_loss: {valid_loss:.3f}')
            logger.log(epoch=epoch+1, train_loss=total_loss/batch, valid_loss=valid_loss)
            # Keep only the checkpoint with the best validation loss so far.
            if minimum_loss > valid_loss:
                minimum_loss = valid_loss
                os.makedirs(args.saved_model_path, exist_ok=True)
                torch.save(model.state_dict(), f'{args.saved_model_path}/{setting.save_time}_{args.model}_model.pt')
    finally:
        # Close the logger even when training is interrupted or fails.
        logger.close()
    return model


def valid(args, model, dataloader, loss_fn):
    """Return the mean per-batch loss over the validation loader.

    Args:
        args: configuration; reads ``model`` (selects the batch layout) and
            ``device``.
        model: the ``nn.Module`` to evaluate. It is left in eval mode.
        dataloader: mapping with a ``'valid_dataloader'`` entry.
        loss_fn: callable invoked as ``loss_fn(y.float(), y_hat)``.

    Returns:
        The average of the per-batch loss values as a Python float.
    """
    model.eval()
    total_loss = 0
    batch = 0

    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time without changing the loss values.
    with torch.no_grad():
        for data in dataloader['valid_dataloader']:
            # The batch layout depends on the model family.
            if args.model in ('CNN_FM', 'CNN_NCF'):
                x, y = [data['user_isbn_vector'].to(args.device), data['img_vector'].to(args.device)], data['label'].to(args.device)
            elif args.model == 'DeepCoNN':
                x, y = [data['user_isbn_vector'].to(args.device), data['user_summary_merge_vector'].to(args.device), data['item_summary_vector'].to(args.device)], data['label'].to(args.device)
            else:
                x, y = data[0].to(args.device), data[1].to(args.device)
            y_hat = model(x)
            loss = loss_fn(y.float(), y_hat)
            total_loss += loss.item()
            batch += 1
    valid_loss = total_loss/batch
    return valid_loss


def test(args, model, dataloader, setting):
    """Predict on the test loader and return the predictions as a flat list.

    Args:
        args: configuration; reads ``use_best_model``, ``saved_model_path``,
            ``model`` and ``device``.
        model: the ``nn.Module`` used for inference. It is left in eval mode.
        dataloader: mapping with a ``'test_dataloader'`` entry.
        setting: object providing ``save_time``, used in the checkpoint name.

    Returns:
        List of predicted values, in loader order.
    """
    if args.use_best_model:
        # Load from the directory train() saves to, not a hard-coded path, so
        # a custom args.saved_model_path keeps working. map_location lets a
        # checkpoint saved on another device load onto args.device.
        checkpoint_path = f'{args.saved_model_path}/{setting.save_time}_{args.model}_model.pt'
        model.load_state_dict(torch.load(checkpoint_path, map_location=args.device))
    model.eval()

    predicts = list()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for data in dataloader['test_dataloader']:
            # The batch layout depends on the model family; labels are unused.
            if args.model in ('CNN_FM', 'CNN_NCF'):
                x = [data['user_isbn_vector'].to(args.device), data['img_vector'].to(args.device)]
            elif args.model == 'DeepCoNN':
                x = [data['user_isbn_vector'].to(args.device), data['user_summary_merge_vector'].to(args.device), data['item_summary_vector'].to(args.device)]
            else:
                x = data[0].to(args.device)
            y_hat = model(x)
            predicts.extend(y_hat.tolist())
    return predicts

# Run training with the train() defined in this cell (it shadows the one
# imported from src.train); train() returns the model it was given.
model = train(args, model, data, logger, setting)


 10%|█         | 1/10 [00:11<01:42, 11.40s/it]

Epoch: 1, Train_loss: 3.950, valid_loss: 2.514


 20%|██        | 2/10 [00:22<01:30, 11.26s/it]

Epoch: 2, Train_loss: 2.159, valid_loss: 2.258


 30%|███       | 3/10 [00:33<01:18, 11.28s/it]

Epoch: 3, Train_loss: 1.858, valid_loss: 2.282


 40%|████      | 4/10 [00:44<01:06, 11.17s/it]

Epoch: 4, Train_loss: 1.651, valid_loss: 2.303


 50%|█████     | 5/10 [00:56<00:55, 11.19s/it]

Epoch: 5, Train_loss: 1.518, valid_loss: 2.324


 60%|██████    | 6/10 [01:07<00:44, 11.22s/it]

Epoch: 6, Train_loss: 1.425, valid_loss: 2.340


 70%|███████   | 7/10 [01:18<00:33, 11.32s/it]

Epoch: 7, Train_loss: 1.353, valid_loss: 2.348


 80%|████████  | 8/10 [01:30<00:22, 11.31s/it]

Epoch: 8, Train_loss: 1.298, valid_loss: 2.346


 90%|█████████ | 9/10 [01:41<00:11, 11.30s/it]

Epoch: 9, Train_loss: 1.251, valid_loss: 2.364


100%|██████████| 10/10 [01:52<00:00, 11.27s/it]

Epoch: 10, Train_loss: 1.218, valid_loss: 2.361





In [26]:

######################## INFERENCE
# Predict on the test loader with the notebook-local test() defined above.
print(f'--------------- {args.model} PREDICT ---------------')
predicts = test(args, model, data, setting)

--------------- FM PREDICT ---------------


## SAVE PREDICT

In [27]:
######################## SAVE PREDICT
# Write the predictions into the 'rating' column of the sample submission.
print(f'--------------- SAVE {args.model} PREDICT ---------------')
submission = pd.read_csv(args.data_path + 'sample_submission.csv')
# 'CNN_NCF' must be listed here: without it the predictions of the configured
# model were never written and the sample ratings were submitted unchanged.
if args.model in ('FM', 'FFM', 'NCF', 'WDN', 'DCN', 'CNN_FM', 'CNN_NCF', 'DeepCoNN'):
    submission['rating'] = predicts
else:
    # Refuse to produce a submission that does not contain the predictions.
    raise ValueError(f'Unsupported model: {args.model!r}')

--------------- SAVE FM PREDICT ---------------


## 답안 제출 파일 생성

In [28]:
# Ask the project helper for the submission file name, then write the
# submission frame to it as CSV without the index column.
filename = setting.get_submit_filename(args)
submission.to_csv(filename, index=False)