# 환경설정

1. 위치\
level2-dkt-recsys-06/code/dkt

2. 사용파일: FE_v2.0.csv\
    파일 업데이트 되면 args_total에 컬럼 추가해줘야 합니다

3. 데이터를 사전에 로드해놓고, 필요한 feature를 그때그때 slicing만 하는 방식으로 작동

4. args 설명

    - feats: 사용할 feature들 전체 목록

    - base_num_feats: 무조건 들어가는 수치형 feature들
    - base_cat_feats: 무조건 들어가는 범주형 feature들

    - new_num_feats: 실험할 ***수치형*** feature들 (반드시 2개 이상 넣어야 합니다)
    - new_cat_feats: 실험할 ***범주형*** feature들


        1) args_total: 전체 변수들 미리 로드 용
        2) args1, args2로 비교

> 데이터 불러오기 부터 보시면 됩니다

In [1]:
import os

import pandas as pd
import numpy as np
import torch
import wandb

from dkt.dkt import trainer
# from dkt.dkt.args import parse_args
from dkt.dkt.dataloader import Preprocess
from dkt.dkt.dataloader import get_loaders
from dkt.dkt.utils import get_logger, set_seeds, logging_conf
from dkt.dkt.optimizer import get_optimizer
from dkt.dkt.scheduler import get_scheduler

from datetime import datetime
import pytz

import argparse

import warnings
# Suppress every warning so the notebook output stays readable.
warnings.filterwarnings("ignore")

# Logger built from the project's logging configuration.
logger = get_logger(logging_conf)

# Start time of this session in the Asia/Seoul timezone, as "MM-DD HH:MM".
korea = pytz.timezone("Asia/Seoul")
current_time = datetime.now(tz=korea).strftime("%m-%d %H:%M")

import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def parse_args():
    """Build the default experiment configuration.

    Nothing is read from the command line: every setting is hard-coded below
    and returned as a fresh ``argparse.Namespace`` on each call.

    ``n_questions``, ``n_tests`` and ``n_tags`` are the lengths of the
    ``*_classes.npy`` arrays found under ``asset_dir``. ``device`` is "cuda"
    when ``torch.cuda.is_available()`` is true and "cpu" otherwise.

    Returns:
        argparse.Namespace: the populated configuration.
    """
    asset_dir = "asset/"

    def count_classes(file_name):
        # Number of entries in the class array saved under asset_dir.
        return len(np.load(os.path.join(asset_dir, file_name)))

    # Insertion order is kept identical to the attribute order callers may
    # see when printing the namespace.
    settings = {
        "seed": 42,
        "device": "cuda" if torch.cuda.is_available() else "cpu",
        "data_dir": "/data/ephemeral/home/level2-dkt-recsys-06/data/",
        "asset_dir": asset_dir,
        "file_name": "FE_v2.0.csv",
        "model_dir": "models/",
        "model_name": "best_model.pt",
        "output_dir": "../submit/",
        "test_file_name": "test_data.csv",
        "max_seq_len": 20,
        "num_workers": 1,
        "hidden_dim": 64,
        "n_layers": 2,
        "n_heads": 2,
        "drop_out": 0.2,
        "n_epochs": 20,
        "batch_size": 64,
        "lr": 0.0001,
        "clip_grad": 10,
        "patience": 5,
        "log_steps": 50,
        "model": "lstm",
        "optimizer": "adam",
        "scheduler": "plateau",
        "submission_name": "dkt_submission.csv",
        "base_num_feats": [],
        "base_cat_feats": [
            "userID",
            "assessmentItemID",
            "testId",
            "answerCode",
            "KnowledgeTag",
        ],
    }

    settings["n_questions"] = count_classes("assessmentItemID_classes.npy")
    settings["n_tests"] = count_classes("testId_classes.npy")
    settings["n_tags"] = count_classes("KnowledgeTag_classes.npy")

    return argparse.Namespace(**settings)

In [3]:
def train(df, args):
    """Train one model on the feature subset selected in ``args``.

    The function derives the full feature lists from the ``base_*`` and
    ``new_*`` attributes of ``args``, slices those columns out of ``df``,
    groups them per user, and runs ``args.n_epochs`` epochs of training and
    validation.

    Args:
        df: DataFrame holding every column named in the resulting
            ``args.feats`` (including ``"userID"``).
        args: Namespace from ``parse_args`` with ``new_num_feats`` and
            ``new_cat_feats`` already set. It is mutated in place:
            ``num_feats``, ``cat_feats``, ``feats`` and ``n_cat_feats`` are
            (re)assigned here.

    Returns:
        dict: per-epoch lists under the keys ``train_auc``, ``train_acc``,
        ``train_loss``, ``val_auc``, ``val_acc`` and ``val_loss`` (in that
        order). Loss entries are detached and moved to the CPU.

    Note:
        Relies on the module-level ``preprocess`` object for the train/valid
        split, so the data-loading cell must have run first.
    """
    # Derive the complete feature lists for this experiment.
    args.num_feats = args.base_num_feats + args.new_num_feats
    args.cat_feats = args.base_cat_feats + args.new_cat_feats
    args.feats = args.cat_feats + args.num_feats
    # Number of distinct values of each experimental categorical feature.
    args.n_cat_feats = [df[cat].nunique() for cat in args.new_cat_feats]
    set_seeds(args.seed)

    print("loading data...")

    # Final feature selection. Selecting columns already yields a new frame,
    # so no extra copy of the whole DataFrame is needed.
    columns = args.feats
    print(f"-----------------------\ncolumns:{columns}\n------------------------")
    # Drop the grouping key by name instead of assuming it is the first column.
    sequence_cols = [col for col in columns if col != "userID"]
    group = (
        df[columns]
        .groupby("userID")
        .apply(lambda r: tuple(r[col].values for col in sequence_cols))
    ).values

    train_data = group

    print("preprocessing...")
    train_data, valid_data = preprocess.split_data(data=train_data)
    train_loader, valid_loader = get_loaders(
        args=args, train=train_data, valid=valid_data
    )
    model: torch.nn.Module = trainer.get_model(args=args).to(args.device)
    optimizer = get_optimizer(model=model, args=args)
    scheduler = get_scheduler(optimizer=optimizer, args=args)

    print("training")
    # Key order matters to callers that iterate over the returned dict.
    metrics = {
        "train_auc": [],
        "train_acc": [],
        "train_loss": [],
        "val_auc": [],
        "val_acc": [],
        "val_loss": [],
    }
    for _ in range(args.n_epochs):
        t_auc, t_acc, t_loss = trainer.train(
            train_loader=train_loader,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            args=args,
        )
        metrics["train_auc"].append(t_auc)
        metrics["train_acc"].append(t_acc)
        # Detach and move to CPU before storing the loss.
        metrics["train_loss"].append(t_loss.detach().cpu())

        v_auc, v_acc, v_loss = trainer.validate(
            valid_loader=valid_loader, model=model, args=args
        )
        metrics["val_auc"].append(v_auc)
        metrics["val_acc"].append(v_acc)
        metrics["val_loss"].append(v_loss.detach().cpu())

    return metrics

## 0. 데이터 불러오기

- 여기서 전체 컬럼들 목록 분류해서 넣어야 함!

In [4]:
# Configuration listing EVERY engineered column, used only to preload and
# preprocess the full dataset once; experiments slice from it afterwards.
args_total = parse_args()

args_total.new_cat_feats = ['Month', 'DayOfWeek', 'TimeOfDay']
# NOTE: 'CumulativeTime' and 'problems_cumulative' are two separate columns;
# a missing comma previously fused them into one non-existent name.
args_total.new_num_feats = [
    'SolvingTime', 'CumulativeTime', 'problems_cumulative',
    'problems_last7days', 'problems_last30days',
    'same_tag_last7days', 'same_tag_last30days', 'same_tag_cumulative',
    'UserCumulativeAnswerRate', 'TagAnswerRate', 'UserAnswerRate',
    'ItemAnswerRate', 'TestAnswerRate', 'C_SolvingTime', 'C_ItemAnswerRate',
    'C_TagAnswerRate', 'C_TestAnswerRate',
]

args_total.num_feats = args_total.base_num_feats + args_total.new_num_feats
args_total.cat_feats = args_total.base_cat_feats + args_total.new_cat_feats
args_total.feats = args_total.cat_feats + args_total.num_feats

csv_file_path = os.path.join(args_total.data_dir, args_total.file_name)
df = pd.read_csv(csv_file_path)

# Number of distinct values of each extra categorical feature in the raw data.
args_total.n_cat_feats = [df[cat].nunique() for cat in args_total.new_cat_feats]

preprocess = Preprocess(args_total)
# Categorical preprocessing. This calls the name-mangled private method
# ``__preprocessing`` of ``Preprocess`` directly.
df = preprocess._Preprocess__preprocessing(df, is_train=True)
# Order rows by user, then by timestamp within each user.
df = df.sort_values(by=["userID", "Timestamp"], axis=0)

## 1. 실험 feature 정하기

In [5]:
## Choose the variables to compare
# Two independent configurations, one per experiment.
args1, args2 = parse_args(), parse_args()

# Feature list 1: no extra features on top of the base ones.
args1.new_cat_feats, args1.new_num_feats = [], []

# Feature list 2: one extra categorical and two extra numeric features.
args2.new_cat_feats = ["DayOfWeek"]
args2.new_num_feats = ["problems_cumulative", "problems_last7days"]

## 2. 학습

In [None]:
# Train with feature list 1 and keep its per-epoch metrics.
metrics_1 = train(df, args1)

In [None]:
# Train with feature list 2 and keep its per-epoch metrics.
metrics_2 = train(df, args2)

## 3. 시각화

In [None]:
# One panel per metric on a 2x3 grid, comparing both experiments.
fig, axes = plt.subplots(2, 3, figsize=(20, 8), sharex=True)

# ``axes.flat`` walks the grid row by row, so panels are filled in the key
# order of ``metrics_1`` (three per row).
for ax, key in zip(axes.flat, metrics_1):
    ax.plot(metrics_1[key], linewidth=5, color='red', label='args 1')
    ax.plot(metrics_2[key], linewidth=5, color='blue', label='args 2')

    ax.set_title(key, fontsize=30)

    ax.legend(loc='right', fontsize=20)
    ax.grid()

plt.tight_layout()
plt.show()

# Validation metrics of the last five epochs for each experiment.
print(f"args 1 auc:{metrics_1['val_auc'][-5:]}")
print(f"args 2 auc:{metrics_2['val_auc'][-5:]}", end='\n\n')

print(f"args 1 acc:{metrics_1['val_acc'][-5:]}")
print(f"args 2 acc:{metrics_2['val_acc'][-5:]}", end='\n\n')

print(f"args 1 loss:{metrics_1['val_loss'][-5:]}")
print(f"args 2 loss:{metrics_2['val_loss'][-5:]}")