## import packages and install pip
### 본 노트북은 다음과 같은 구조일 때 동작합니다
```
   upper
     ├─ /code
     |    ├─ /data
     |    ├─ /src
     |    ├─ /submit
     |    ├─ ... 
     |    ├─ PipeLine.ipynb
```

In [1]:
#torch 안깔려 있으면, 까셔야 합니다 ..!
import warnings
warnings.filterwarnings('ignore')
!pip install --upgrade pip


[0m

In [2]:

!pip install -r ../code/requirement.txt
!pip install easydict

[0m

In [3]:

import time
import argparse
import pandas as pd
import easydict
from src.utils import Logger, Setting, models_load
from src.data import context_data_load, context_data_split, text_img_data_loader, text_img_data_load, text_img_data_split 
from src.data import dl_data_load, dl_data_split, dl_data_loader
from src.data import image_data_load, image_data_split, image_data_loader
from src.data import text_data_load, text_data_split, text_data_loader
from src.train import train, test
import os
import tqdm
import torch
import torch.nn as nn
from torch.nn import MSELoss
from torch.optim import SGD, Adam

In [4]:
args = easydict.EasyDict({
    'data_path': './data/',  # Data path 설정
    'saved_model_path': './saved_models',  # Saved Model path 설정
    'model': "Custom",  # 학습 및 예측할 모델 선택 (None으로 초기화, 사용 전에 설정 필요)
    'data_shuffle': True,  # 데이터 셔플 여부 조정
    'test_size': 0.2,  # Train/Valid split 비율 조정
    'seed': 42,  # Seed 값 조정
    'use_best_model': True,  # 검증 성능이 가장 좋은 모델 사용 여부 설정

    # TRAINING OPTION
    'batch_size': 1024,  # Batch size 조정
    'epochs': 10,  # Epoch 수 조정
    'lr': 1e-3,  # Learning Rate 조정
    'loss_fn': 'RMSE',  # 손실 함수 변경 (MSE 또는 RMSE)
    'optimizer': 'ADAM',  # 최적화 함수 변경 (SGD 또는 ADAM)
    'weight_decay': 1e-6,  # Adam optimizer에서 정규화에 사용하는 값 조정

    # GPU
    'device': 'cuda',  # 학습에 사용할 Device 조정

    # FM, FFM, NCF, WDN, DCN Common OPTION
    'embed_dim': 16,  # FM, FFM, NCF, WDN, DCN에서 embedding시킬 차원 조정
    'dropout': 0.2,  # NCF, WDN, DCN에서 Dropout rate 조정
    'mlp_dims': (16, 16),  # NCF, WDN, DCN에서 MLP Network의 차원 조정

    # DCN
    'num_layers': 3,  # Cross Network의 레이어 수 조정

    # CNN_FM
    'cnn_embed_dim': 64,  # CNN_FM에서 user와 item에 대한 embedding시킬 차원 조정
    'cnn_latent_dim': 12,  # CNN_FM에서 user/item/image에 대한 latent 차원 조정

    # DeepCoNN
    'vector_create': False,  # DEEP_CONN에서 text vector 생성 여부 조정 (최초 학습에만 True로 설정)
    'deepconn_embed_dim': 32,  # DEEP_CONN에서 user와 item에 대한 embedding시킬 차원 조정
    'deepconn_latent_dim': 10,  # DEEP_CONN에서 user/item/image에 대한 latent 차원 조정
    'conv_1d_out_dim': 50,  # DEEP_CONN에서 1D conv의 출력 크기 조정
    'kernel_size': 3,  # DEEP_CONN에서 1D conv의 kernel 크기 조정
    'word_dim': 768,  # DEEP_CONN에서 1D conv의 입력 크기 조정
    'out_dim': 32  # DEEP_CONN에서 1D conv의 출력 크기 조정
})

## Settings Seed

In [5]:
Setting.seed_everything(args.seed)

## DATA LOAD


In [6]:
print(f'--------------- {args.model} Load Data ---------------')
if args.model in ('FM', 'FFM'):
    data = context_data_load(args)
elif args.model in ('NCF', 'WDN', 'DCN'):
    data = dl_data_load(args)
elif args.model == 'CNN_FM':
    data = image_data_load(args)
elif args.model == 'DeepCoNN':
    import nltk
    nltk.download('punkt')
    data = text_data_load(args)
elif args.model == 'Custom':
    data = text_img_data_load(args)
else:
    pass

--------------- Custom Load Data ---------------


129777it [01:05, 1974.73it/s]
52000it [00:26, 1968.87it/s]


Check Vectorizer
Vector Load
Check Vectorizer
Vector Load


## DATA Split

In [7]:
######################## Train/Valid Split
print(f'--------------- {args.model} Train/Valid Split ---------------')
if args.model in ('FM', 'FFM'):
    data = context_data_split(args, data)
    data = context_data_loader(args, data)

elif args.model in ('NCF', 'WDN', 'DCN'):
    data = dl_data_split(args, data)
    data = dl_data_loader(args, data)

elif args.model=='CNN_FM':
    data = image_data_split(args, data)
    data = image_data_loader(args, data)

elif args.model=='DeepCoNN':
    data = text_data_split(args, data)
    data = text_data_loader(args, data)
elif args.model == 'Custom':
    data = text_img_data_split(args,data)
   # print(data.keys())
    data = text_img_data_loader(args,data)
    #print(data.keys())
else:
    pass

--------------- Custom Train/Valid Split ---------------
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [8]:
data['X_train']['img_vector']

121312    [[[0.003921569, 0.003921569, 0.003921569, 0.00...
265089    [[[0.75686276, 0.81960785, 0.8980392, 0.874509...
60236     [[[0.06666667, 0.09411765, 0.08235294, 0.03529...
111218    [[[0.9098039, 0.9137255, 0.9098039, 0.89411765...
306001    [[[0.90588236, 0.81960785, 0.827451, 0.8235294...
                                ...                        
119879    [[[0.1882353, 0.46666667, 0.47058824, 0.427450...
259178    [[[0.6862745, 0.654902, 0.73333335, 0.6862745,...
131932    [[[0.94509804, 1.0, 1.0, 0.99607843, 1.0, 1.0,...
146867    [[[0.93333334, 0.9490196, 0.9647059, 0.9568627...
121958    [[[0.22745098, 0.22745098, 0.22352941, 0.21960...
Name: img_vector, Length: 245436, dtype: object

In [9]:
pd.concat((data['text_test'][['user_id', 'isbn', 'item_summary_vector']], data['img_test']['img_vector']),axis=1)

Unnamed: 0,user_id,isbn,item_summary_vector,img_vector
0,13,0,"[-0.21812883, -0.20391995, 0.06439816, -0.1359...","[[[0.99215686, 0.99607843, 0.9882353, 0.996078..."
1,13426,0,"[-0.21812883, -0.20391995, 0.06439816, -0.1359...","[[[0.99215686, 0.99607843, 0.9882353, 0.996078..."
2,26761,1,"[-0.53865755, 0.12224594, 0.078437455, -0.0562...","[[[0.42352942, 0.5921569, 0.6039216, 0.4156862..."
3,16495,2,"[-0.05029925, 0.13056105, -0.1790179, -0.15296...","[[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0..."
4,6225,3,"[0.12107262, 0.26905593, 0.2554203, -0.2838499...","[[[0.003921569, 0.003921569, 0.003921569, 0.00..."
...,...,...,...,...
76694,7728,149565,"[-0.15238026, 0.53894144, -0.103565834, -0.101...","[[[0.003921569, 0.003921569, 0.003921569, 0.00..."
76695,47785,149566,"[-0.64320135, 0.0661266, 0.11430665, -0.244580...","[[[0.18039216, 0.20784314, 0.12156863, 0.03529..."
76696,4209,149567,"[0.11831138, 0.26141447, 0.47210327, -0.080064...","[[[0.16078432, 0.11372549, 0.21176471, 0.41568..."
76697,40779,149568,"[0.06324439, 0.08317141, -0.20397493, 0.017179...","[[[0.1254902, 0.11372549, 0.105882354, 0.09803..."


In [10]:
#data['img_test']
pd.concat((data['text_test'][['user_id', 'isbn', 'item_summary_vector']], data['img_test']['img_vector']),axis=1).values

array([[13, 0,
        array([-2.18128830e-01, -2.03919947e-01,  6.43981621e-02, -1.35916039e-01,
                4.01943177e-01, -2.07400233e-01,  1.33493692e-01,  4.96284813e-01,
               -3.48474234e-01, -1.32244572e-01,  2.11999863e-01, -5.33152878e-01,
                2.35747918e-01,  4.32718217e-01, -4.74705637e-01,  3.83882105e-01,
                5.37673175e-01,  1.31260782e-01,  8.86616707e-02,  9.35770273e-02,
               -1.95306748e-01, -1.32464752e-01,  4.05294821e-03,  4.15047139e-01,
                2.82527000e-01, -2.49478459e-01,  3.26571688e-02, -3.96691114e-02,
               -2.52422571e-01,  1.95406936e-02,  3.23147446e-01, -1.34026229e-01,
               -5.02617732e-02,  1.10279344e-01, -4.35918495e-02,  3.84464264e-01,
               -6.92584962e-02, -2.20555767e-01, -2.66284615e-01,  3.83468717e-01,
               -9.66787577e-01, -5.49207389e-01,  2.08259493e-01,  4.38577980e-02,
                2.19534278e-01, -4.41265523e-01,  4.18543994e-01, -3.703

In [11]:
data['y_train'].values

array([ 4,  8,  7, ...,  5,  8, 10])

## Logs settings

In [12]:
####################### Setting for Log
setting = Setting()

log_path = setting.get_log_path(args)
setting.make_dir(log_path)

logger = Logger(args, log_path)
logger.save_args()


## Define Model (FM 모델을 예시로 사용)

In [19]:
import numpy as np
import torch
import torch.nn as nn


# feature 사이의 상호작용을 효율적으로 계산합니다.
class FactorizationMachine(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.v = nn.Parameter(torch.rand(input_dim, latent_dim), requires_grad = True)
        self.linear = nn.Linear(input_dim, 1, bias=True)


    def forward(self, x):
        linear = self.linear(x)
        square_of_sum = torch.mm(x, self.v) ** 2
        sum_of_square = torch.mm(x ** 2, self.v ** 2)
        pair_interactions = torch.sum(square_of_sum - sum_of_square, dim=1, keepdim=True)
        output = linear + (0.5 * pair_interactions)
        return output


# factorization을 통해 얻은 feature를 embedding 합니다.
class FeaturesEmbedding(nn.Module):
    def __init__(self, field_dims: np.ndarray, embed_dim: int):
        super().__init__()
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int32)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)


    def forward(self, x: torch.Tensor):
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(x)


# 이미지 특징 추출을 위한 기초적인 CNN Layer를 정의합니다.
class CNN_Base(nn.Module):
    def __init__(self, ):
        super(CNN_Base, self).__init__()
        self.cnn_layer = nn.Sequential(
                                        nn.Conv2d(3, 6, kernel_size=3, stride=2, padding=1),
                                        nn.ReLU(),
                                        nn.MaxPool2d(kernel_size=3, stride=2),
                                        nn.Conv2d(6, 12, kernel_size=3, stride=2, padding=1),
                                        nn.ReLU(),
                                        nn.MaxPool2d(kernel_size=3, stride=2),
                                        )
    def forward(self, x):
        x = self.cnn_layer(x)
        x = x.view(-1, 12 * 1 * 1)
        return x


# 기존 유저/상품 벡터와 이미지 벡터를 결합하여 FM으로 학습하는 모델을 구현합니다.
class CNN_Text_FM(torch.nn.Module):
    def __init__(self, args, data):
        super().__init__()
        self.field_dims = np.array([len(data['user2idx']), len(data['isbn2idx'])], dtype=np.uint32)
        self.embedding = FeaturesEmbedding(self.field_dims, args.cnn_embed_dim)
        self.cnn = CNN_Base()
        self.fm = FactorizationMachine(
                                        input_dim=(args.cnn_embed_dim * 2) + (12 * 1 * 1),
                                        latent_dim=args.cnn_latent_dim,
                                        )
        

    def forward(self, x):
        print(len(x))
        user_isbn_vector, img_vector, text_vector = x[0], x[1], x[2]
        user_isbn_feature = self.embedding(user_isbn_vector)
        img_feature = self.cnn(img_vector)
        feature_vector = torch.cat([
                                    user_isbn_feature.view(-1, user_isbn_feature.size(1) * user_isbn_feature.size(2)),
                                    img_feature
                                    ], dim=1)
        output = self.fm(feature_vector)
        return output.squeeze(1)


## Load Model

In [20]:
######################## Model
print(f'--------------- INIT {args.model} ---------------')
model = CNN_Text_FM(args, data).to(args.device) #이부분수정하면됩니다

--------------- INIT Custom ---------------


In [21]:
for i in data['train_dataloader']:
    print(i.keys())
    break

dict_keys(['user_isbn_vector', 'img_vector', 'text_vector', 'label'])


## Define RMSEloss

In [22]:
class RMSELoss(nn.Module):
    def __init__(self):
        super(RMSELoss, self).__init__()
        self.eps = 1e-6
    def forward(self, x, y):
        criterion = MSELoss()
        loss = torch.sqrt(criterion(x, y)+self.eps)
        return loss

## Train

In [23]:

def train(args, model, dataloader, logger, setting):
    minimum_loss = 999999999
    if args.loss_fn == 'MSE':
        loss_fn = MSELoss()
    elif args.loss_fn == 'RMSE':
        loss_fn = RMSELoss()
    else:
        pass
    if args.optimizer == 'SGD':
        optimizer = SGD(model.parameters(), lr=args.lr)
    elif args.optimizer == 'ADAM':
        optimizer = Adam(model.parameters(), lr=args.lr)
    else:
        pass

    for epoch in tqdm.tqdm(range(args.epochs)):
        model.train()
        total_loss = 0
        batch = 0

        for idx, data in enumerate(dataloader['train_dataloader']):
            ##print(data.keys())
            if args.model == 'CNN_FM':
                x, y = [data['user_isbn_vector'].to(args.device), data['img_vector'].to(args.device)], data['label'].to(args.device)
            elif args.model == 'DeepCoNN':
                x, y = [data['user_isbn_vector'].to(args.device), data['user_summary_merge_vector'].to(args.device), data['item_summary_vector'].to(args.device)], data['label'].to(args.device)
            elif args.model == 'Custom':
                x,y = [data['user_isbn_vector'].to(args.device), data['img_vector'].to(args.device), data['text_vector'].to(args.device)], data['label'].to(args.device)
            else:
                x, y = data[0].to(args.device), data[1].to(args.device)
            y_hat = model(x)
            loss = loss_fn(y.float(), y_hat)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            batch +=1
        valid_loss = valid(args, model, dataloader, loss_fn)
        print(f'Epoch: {epoch+1}, Train_loss: {total_loss/batch:.3f}, valid_loss: {valid_loss:.3f}')
        logger.log(epoch=epoch+1, train_loss=total_loss/batch, valid_loss=valid_loss)
        if minimum_loss > valid_loss:
            minimum_loss = valid_loss
            os.makedirs(args.saved_model_path, exist_ok=True)
            torch.save(model.state_dict(), f'{args.saved_model_path}/{setting.save_time}_{args.model}_model.pt')
    logger.close()
    return model


def valid(args, model, dataloader, loss_fn):
    model.eval()
    total_loss = 0
    batch = 0

    for idx, data in enumerate(dataloader['valid_dataloader']):
        if args.model == 'CNN_FM':
            x, y = [data['user_isbn_vector'].to(args.device), data['img_vector'].to(args.device)], data['label'].to(args.device)
        elif args.model == 'DeepCoNN':
            x, y = [data['user_isbn_vector'].to(args.device), data['user_summary_merge_vector'].to(args.device), data['item_summary_vector'].to(args.device)], data['label'].to(args.device)
        elif args.model == 'Custom':
            x,y = [data['user_isbn_vector'].to(args.device), data['img_vector'].to(args.device), data['text_vector'].to(args.device)], data['label'].to(args.device)
        else:
            x, y = data[0].to(args.device), data[1].to(args.device)
        y_hat = model(x)
        loss = loss_fn(y.float(), y_hat)
        total_loss += loss.item()
        batch +=1
    valid_loss = total_loss/batch
    return valid_loss


def test(args, model, dataloader, setting):
    predicts = list()
    if args.use_best_model == True:
        model.load_state_dict(torch.load(f'./saved_models/{setting.save_time}_{args.model}_model.pt'))
    else:
        pass
    model.eval()

    for idx, data in enumerate(dataloader['test_dataloader']):
        if args.model == 'CNN_FM' or args.model == 'Custom':
            x, _ = [data['user_isbn_vector'].to(args.device), data['img_vector'].to(args.device)], data['label'].to(args.device)
        elif args.model == 'DeepCoNN':
            x, _ = [data['user_isbn_vector'].to(args.device), data['user_summary_merge_vector'].to(args.device), data['item_summary_vector'].to(args.device)], data['label'].to(args.device)
        elif args.model == 'Custom':
            x,y = [data['user_isbn_vector'].to(args.device), data['img_vector'].to(args.device), data['text_vector'].to(args.device)], data['label'].to(args.device)
        else:
            x = data[0].to(args.device)
        y_hat = model(x)
        predicts.extend(y_hat.tolist())
    return predicts

data['train_dataloader']
model = train(args, model, data, logger, setting)



  0%|          | 0/10 [00:00<?, ?it/s]

3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3


  0%|          | 0/10 [00:16<?, ?it/s]

3
2





IndexError: list index out of range

In [None]:
data

In [None]:

######################## INFERENCE
print(f'--------------- {args.model} PREDICT ---------------')
predicts = test(args, model, data, setting)

## SAVE MODEL

In [None]:
######################## SAVE PREDICT
print(f'--------------- SAVE {args.model} PREDICT ---------------')
submission = pd.read_csv(args.data_path + 'sample_submission.csv')
if args.model in ('FM', 'FFM', 'NCF', 'WDN', 'DCN', 'CNN_FM', 'DeepCoNN'):
    submission['rating'] = predicts
else:
    pass

## 답안 제출 파일 생성

In [None]:
filename = setting.get_submit_filename(args)
submission.to_csv(filename, index=False)