# 0. 환경설정

self.args에 2가지 arguments 추가

- feats: 사용할 feature 목록들\
    dataloader > load_train_data > load_data_from_file > __feature_engineering
    
- cat_feats: labeling될 범주형 feature 목록들\
    dataloader > load_train_data > load_data_from_file > __preprocess


##### args.py에 4가지 arguments 추가

모든 feature 목록을 매번 넣었다 뺐다 하기 번거로워서, 항상 포함되는 base(default) 피처와 실험에 따라 추가·제거하는 유동적인 피처로 나눔

무조건 들어가는 애들 -> base_feats\
실험 대상인 애들 -> feats

- base_feats: 무조건 들어가는 feature들
- base_cat_feats: 무조건 들어가는 범주형 feature들

- feats: 유동적인 feature들
- cat_feats: 유동적인 범주형 feature들



In [46]:
import os

import numpy as np
import torch
import wandb

from dkt.dkt import trainer
# from dkt.dkt.args import parse_args
from dkt.dkt.dataloader import Preprocess
from dkt.dkt.utils import get_logger, set_seeds, logging_conf

from datetime import datetime
import pytz

import warnings
# Suppress every Python warning for the rest of this notebook session.
warnings.filterwarnings("ignore")

# Logger obtained from the project's get_logger helper using logging_conf.
logger = get_logger(logging_conf)
# Wall-clock time at cell execution, in the Asia/Seoul zone, as "MM-DD HH:MM".
korea = pytz.timezone('Asia/Seoul')
current_time = datetime.now(korea).strftime("%m-%d %H:%M")

In [47]:
import argparse


def parse_args(argv=None):
    """Build the experiment configuration namespace.

    Args:
        argv: Optional list of command-line style tokens to parse
            (e.g. ``["--feats", "elapsed"]``). When ``None`` (the default) an
            empty list is parsed, so every option takes its default value and
            ``sys.argv`` is never read. This keeps the function usable inside
            a notebook kernel, whose ``sys.argv`` holds kernel arguments.

    Returns:
        argparse.Namespace: the parsed options. After parsing,
        ``feats`` is ``base_feats`` followed by the experimental ``--feats``
        and ``cat_feats`` is ``base_cat_feats`` followed by the experimental
        ``--cat_feats``; both are de-duplicated and keep a stable order.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--seed", default=42, type=int, help="seed")
    parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu")
    parser.add_argument(
        "--data_dir",
        default="/data/ephemeral/home/level2-dkt-recsys-06/data/",
        type=str,
        help="data directory",
    )
    parser.add_argument(
        "--asset_dir", default="asset/", type=str, help="asset directory"
    )
    parser.add_argument(
        "--file_name", default="train_data.csv", type=str, help="train file name"
    )
    parser.add_argument(
        "--model_dir", default="models/", type=str, help="model directory"
    )
    parser.add_argument(
        "--model_name", default="best_model.pt", type=str, help="model file name"
    )
    parser.add_argument(
        "--output_dir", default="../submit/", type=str, help="output directory"
    )
    parser.add_argument(
        "--test_file_name", default="test_data.csv", type=str, help="test file name"
    )

    parser.add_argument(
        "--max_seq_len", default=20, type=int, help="max sequence length"
    )
    parser.add_argument("--num_workers", default=1, type=int, help="number of workers")

    # Model
    parser.add_argument(
        "--hidden_dim", default=64, type=int, help="hidden dimension size"
    )
    parser.add_argument("--n_layers", default=2, type=int, help="number of layers")
    parser.add_argument("--n_heads", default=2, type=int, help="number of heads")
    parser.add_argument("--drop_out", default=0.2, type=float, help="drop out rate")

    # Training
    parser.add_argument("--n_epochs", default=20, type=int, help="number of epochs")
    parser.add_argument("--batch_size", default=64, type=int, help="batch size")
    parser.add_argument("--lr", default=0.0001, type=float, help="learning rate")
    parser.add_argument("--clip_grad", default=10, type=int, help="clip grad")
    parser.add_argument("--patience", default=5, type=int, help="for early stopping")

    parser.add_argument(
        "--log_steps", default=50, type=int, help="print log per n steps"
    )

    ### Important ###
    parser.add_argument("--model", default="lstm", type=str, help="model type")
    parser.add_argument("--optimizer", default="adam", type=str, help="optimizer type")
    parser.add_argument(
        "--scheduler", default="plateau", type=str, help="scheduler type"
    )

    # Submission file
    parser.add_argument(
        "--submission_name",
        default="dkt_submission.csv",
        type=str,
        help="submission file name",
    )

    ### Feature engineering

    # Base: features that are always included.
    parser.add_argument(
        "--base_feats",
        nargs="+",
        default=["userID", "assessmentItemID", "testId", "KnowledgeTag", "answerCode"],
        help="features that are always used",
    )
    parser.add_argument(
        "--base_cat_feats",
        nargs="+",
        default=["userID", "assessmentItemID", "testId", "KnowledgeTag", "answerCode"],
        help="categorical features that are always used",
    )

    # Experimental: features switched on or off per experiment.
    parser.add_argument(
        "--feats", nargs="+", default=[], help="additional features to use"
    )
    parser.add_argument(
        "--cat_feats", nargs="+", default=[], help="additional categorical features"
    )

    # An explicit list is always passed so sys.argv is never consulted.
    args = parser.parse_args(args=[] if argv is None else list(argv))

    # Merge base + experimental lists. dict.fromkeys de-duplicates while
    # keeping first-seen order, so the feature order is identical on every run
    # (a set would reorder string features between interpreter sessions).
    args.feats = list(dict.fromkeys(args.base_feats + args.feats))
    # Categorical features come from the categorical lists only; merging the
    # plain feature lists here would label every extra feature as categorical.
    args.cat_feats = list(dict.fromkeys(args.base_cat_feats + args.cat_feats))

    return args


# 1. 두 모델 모두 학습 후 비교

In [None]:
# Build the run configuration from the notebook's parse_args defaults.
args = parse_args()

# Seed through the project's set_seeds helper, then override the parsed
# device: use CUDA whenever torch reports it as available.
set_seeds(args.seed)
args.device = "cuda" if torch.cuda.is_available() else "cpu"

logger.info("Preparing data ...")
# Load the training file through Preprocess, fetch the loaded data, and let
# split_data return the (train, validation) pair that is unpacked below.
preprocess = Preprocess(args)
preprocess.load_train_data(file_name=args.file_name)
train_data: np.ndarray = preprocess.get_train_data()
train_data, valid_data = preprocess.split_data(data=train_data)

In [None]:
# Ask the project's trainer for a model built from args (presumably selected
# by args.model -- confirm in trainer.get_model) and move it to args.device.
model: torch.nn.Module = trainer.get_model(args=args).to(args.device)

# Hand the train/validation data and the model to the project's trainer.
trainer.run(args=args, train_data=train_data, valid_data=valid_data, model=model)

# 2. 새로운 모델만 학습하기

In [32]:
train_data[0]

(array([ 90,  90,  90,  90,  90,  90, 959, 959, 959, 959, 959, 778, 778,
        778, 778, 778, 778, 778, 780, 780, 780, 780, 780, 780]),
 array([ 459,  460,  461,  462,  463,  464, 5260, 5261, 5262, 5263, 5264,
        4066, 4067, 4068, 4069, 4070, 4071, 4072, 4078, 4079, 4080, 4081,
        4082, 4083]),
 array([567, 567, 567, 567, 567, 567,  20,  20,  20,  20,  20, 235, 235,
        235, 235, 235, 235, 235, 237, 237, 237, 237, 237, 237]),
 array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
        0, 1]))

In [37]:
args.feats

AttributeError: 'Namespace' object has no attribute 'feats'

In [50]:
# Exploratory: call load_data_from_file directly (the notes at the top list it
# as the step load_train_data goes through) to inspect what it returns.
train = preprocess.load_data_from_file(file_name=args.file_name)

In [53]:
# NOTE(review): split_data's result is unpacked into two names in the data
# preparation cell, so `train` here holds the whole pair and train[0] is the
# first (training) part, not a single sample.
train = preprocess.split_data(train)

In [55]:
train[0]

array([(array([ 90,  90,  90,  90,  90,  90, 959, 959, 959, 959, 959, 778, 778,
              778, 778, 778, 778, 778, 780, 780, 780, 780, 780, 780]), array([ 459,  460,  461,  462,  463,  464, 5260, 5261, 5262, 5263, 5264,
              4066, 4067, 4068, 4069, 4070, 4071, 4072, 4078, 4079, 4080, 4081,
              4082, 4083]), array([567, 567, 567, 567, 567, 567,  20,  20,  20,  20,  20, 235, 235,
              235, 235, 235, 235, 235, 237, 237, 237, 237, 237, 237]), array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
              0, 1]))                                                                                                                          ,
       (array([200, 200, 200, 200, 200, 202, 202, 202, 202, 202, 192, 192, 192,
              192, 192, 204, 204, 204, 204, 204, 208, 208, 208, 208, 208, 210,
              210, 210, 210, 210, 206, 206, 206, 206, 206, 676, 676, 676, 676,
              676, 194, 194, 194, 194, 212, 212, 212, 212, 212, 214, 

In [59]:
train_data[0][0]

array([ 90,  90,  90,  90,  90,  90, 959, 959, 959, 959, 959, 778, 778,
       778, 778, 778, 778, 778, 780, 780, 780, 780, 780, 780])