In [2]:
#pip install --upgrade typing-extensions

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
#!pip install optuna

[0m

In [8]:
import argparse
import json
import os
import time
import warnings

import joblib
import numpy as np
import pandas as pd
import tqdm

#from src.models import FFMDCN
from src.models.FFMDCN import FFDCNModel

from src.data import context_data_load, context_data_split, context_data_loader
from src.utils import Logger, Setting, models_load
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from typing_extensions import Concatenate

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Silence every warning for the rest of the session: no category/module filter
# is given, so the 'ignore' action applies to all warnings raised after this line.
warnings.filterwarnings(action='ignore')

In [9]:
# Build the run configuration: start from an empty Namespace and copy every
# top-level key of config.json onto it as an attribute (e.g. args.seed).
parser = argparse.ArgumentParser()  # created but not used in this cell
args = argparse.Namespace()
with open('config.json', mode='rt') as config_file:
    vars(args).update(json.load(config_file))

In [10]:
# Seed the run from the value loaded out of config.json.
# NOTE(review): presumably seeds every RNG the project uses (python/numpy/torch)
# -- confirm in src.utils.Setting.
Setting.seed_everything(args.seed)

In [11]:
# Load the dataset through the project loader. Later cells read the keys
# 'train' (with a 'rating' column), 'test', 'field_dims', 'sub', 'idx2user'
# and 'idx2isbn' from the returned mapping.
data = context_data_load(args)

In [12]:
# Number of cross-validation folds.
fold_num = 5
# Shuffled, seeded splitter stratified on the 'rating' column.
skf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=args.seed)
# Materialise every (train_idx, valid_idx) pair up front so the training
# cell can look folds up by number.
folds = [
    (train_idx, valid_idx)
    for train_idx, valid_idx in skf.split(
        data['train'].drop(columns=['rating']),
        data['train']['rating'],
    )
]

In [16]:
# Train one model per fold and collect each model's test-set predictions.
predicts = []

# Loop-invariant: the feature frame and target column are identical for every
# fold, so build them once instead of re-dropping 'rating' on each iteration.
train_features = data['train'].drop(['rating'], axis=1)
train_target = data['train']['rating']

for i in range(fold_num):
    # Re-seed before every fold so each fold starts from the same RNG state.
    Setting.seed_everything(args.seed)
    # The banner says "epoch" but the counter printed is the fold number.
    print('---------------',i+1,' epoch ---------------')
    train_idx, valid_idx = folds[i]

    # train-valid set Split
    # Select rows positionally (.iloc) for BOTH features and targets. Plain
    # Series[...] indexing is label-based and only matches .iloc when the
    # frame's index happens to be 0..n-1.
    X_train = train_features.iloc[train_idx]
    X_valid = train_features.iloc[valid_idx]
    y_train = train_target.iloc[train_idx]
    y_valid = train_target.iloc[valid_idx]

    # Create Dataloader
    fold_data = {
        'X_train': X_train,
        'X_valid': X_valid,
        'y_train': y_train,
        'y_valid': y_valid,
        'test': data['test'],
        'field_dims': data['field_dims'],
        'sub': data['sub'],
        'idx2user': data['idx2user'],
        'idx2isbn': data['idx2isbn'],
    }
    # The loader's result is expected to expose 'test_dataloader' (read below).
    fold_data = context_data_loader(args, fold_data)

    # Create Model
    print('--------------- TRAINING ---------------')
    model = FFDCNModel(args, fold_data)
    model.train()
    # Kept for inspection; the value is not used further in this cell.
    log_score = model.predict_train()

    # Predict
    print('--------------- PREDICT ---------------')
    predict = model.predict(fold_data['test_dataloader'])
    predicts.append(predict)

--------------- 1  epoch ---------------
--------------- TRAINING ---------------


100%|██████████| 959/959 [01:27<00:00, 10.92it/s, loss=2.18]
100%|██████████| 240/240 [00:02<00:00, 86.00it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

epoch: 0 validation: rmse: 2.1629021481608874


100%|██████████| 240/240 [00:02<00:00, 83.27it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

--------------- PREDICT ---------------


100%|██████████| 300/300 [00:02<00:00, 104.48it/s]


--------------- 2  epoch ---------------
--------------- TRAINING ---------------


100%|██████████| 959/959 [01:26<00:00, 11.06it/s, loss=2.17]
100%|██████████| 240/240 [00:02<00:00, 84.98it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

epoch: 0 validation: rmse: 2.1664966743219716


100%|██████████| 240/240 [00:02<00:00, 85.80it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

--------------- PREDICT ---------------


100%|██████████| 300/300 [00:02<00:00, 102.36it/s]


--------------- 3  epoch ---------------
--------------- TRAINING ---------------


100%|██████████| 959/959 [01:30<00:00, 10.56it/s, loss=2.17]
100%|██████████| 240/240 [00:03<00:00, 74.83it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

epoch: 0 validation: rmse: 2.180985979636841


100%|██████████| 240/240 [00:03<00:00, 71.88it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

--------------- PREDICT ---------------


100%|██████████| 300/300 [00:03<00:00, 96.67it/s]


--------------- 4  epoch ---------------
--------------- TRAINING ---------------


100%|██████████| 959/959 [01:17<00:00, 12.44it/s, loss=2.17]
100%|██████████| 240/240 [00:02<00:00, 94.65it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

epoch: 0 validation: rmse: 2.160044918340023


100%|██████████| 240/240 [00:02<00:00, 93.42it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

--------------- PREDICT ---------------


100%|██████████| 300/300 [00:02<00:00, 119.39it/s]


--------------- 5  epoch ---------------
--------------- TRAINING ---------------


100%|██████████| 959/959 [01:08<00:00, 13.96it/s, loss=2.18]
100%|██████████| 240/240 [00:02<00:00, 97.90it/s]
  0%|          | 0/240 [00:00<?, ?it/s]

epoch: 0 validation: rmse: 2.1762367520301478


100%|██████████| 240/240 [00:02<00:00, 94.36it/s] 
  0%|          | 0/300 [00:00<?, ?it/s]

--------------- PREDICT ---------------


100%|██████████| 300/300 [00:02<00:00, 120.69it/s]


In [18]:
# Save Predicted
# Average the per-fold test predictions, cap them at 10 and write the
# submission CSV.
print('--------------- SAVE PREDICT ---------------')
if not predicts:
    # Without this guard the untouched sample file would be written out as if
    # it were a real prediction.
    raise ValueError('no fold predictions to average; run the training cell first')

submission = pd.read_csv(args.data_path + 'sample_submission.csv')
# Assign the mean across folds instead of accumulating onto whatever values
# the sample file already holds in 'rating', and divide by the number of
# predictions actually collected.
submission['rating'] = np.mean([np.asarray(fold) for fold in predicts], axis=0)
# Outlier handling: any rating above 10 is capped to 10.
submission['rating'] = submission['rating'].clip(upper=10.0)

# DataFrame.to_csv does not create missing directories; make sure the output
# folder exists so a long training run is not lost at the final step.
os.makedirs('submit', exist_ok=True)
submission.to_csv('submit/FFMDCN_5FOLD.csv', index=False)
print('------------------ FINISH ------------------')

--------------- SAVE PREDICT ---------------
------------------ FINISH ------------------
