In [12]:
import json
import os
import time
import warnings
from collections import Counter
from datetime import datetime
from math import pi

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
# import ydata_profiling
from matplotlib.path import Path
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

In [14]:
%%time
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}
DATA_PATH = '/opt/ml/input/data'
train_path = os.path.join(DATA_PATH, 'train_data.csv')
test_path = os.path.join(DATA_PATH, 'test_data.csv')
train_df = pd.read_csv(train_path, dtype=dtype)
test_df = pd.read_csv(test_path, dtype=dtype)
df = pd.concat([train_df, test_df])
df.drop_duplicates(subset=["userID", "assessmentItemID"], keep="last", inplace=True)

CPU times: user 2.9 s, sys: 200 ms, total: 3.1 s
Wall time: 3.11 s


In [15]:
cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"]

for col in cate_cols:
    # Fit on every value present in the column plus one extra "unknown" class.
    known_classes = [*df[col].unique().tolist(), "unknown"]
    le = LabelEncoder().fit(known_classes)

    # Every column is assumed to be categorical, so its values are compared
    # with the fitted classes as strings before being replaced by their codes.
    df[col] = le.transform(df[col].astype(str))

def convert_time(s: str):
    """Turn a ``"%Y-%m-%d %H:%M:%S"`` string into whole epoch seconds.

    The parsed value goes through ``time.mktime``, which interprets it as
    local time, so the result depends on the machine's timezone setting.

    Args:
        s: timestamp text in the ``"%Y-%m-%d %H:%M:%S"`` layout.

    Returns:
        The epoch time truncated to an ``int``.

    Raises:
        ValueError: if ``s`` does not match the expected layout.
    """
    parsed = datetime.strptime(s, "%Y-%m-%d %H:%M:%S")
    epoch_seconds = time.mktime(parsed.timetuple())
    return int(epoch_seconds)

# Replace the timestamp strings with integer epoch seconds, one value at a
# time. Not re-runnable as-is: convert_time parses strings, and after this
# line the column holds integers.
df["Timestamp"] = df["Timestamp"].map(convert_time)

In [16]:
# Inspect the frame; the bare expression is rendered as this cell's output.
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,5354,975,1,1585009031,618
1,0,5355,975,1,1585009034,619
2,0,5356,975,1,1585009042,619
3,0,5357,975,1,1585009049,619
4,0,5358,975,1,1585009056,619
...,...,...,...,...,...,...
260109,7439,3728,713,0,1602716843,783
260110,7439,3729,713,1,1602716861,783
260111,7439,3730,713,1,1602716882,761
260112,7439,3731,713,1,1602716971,761


In [18]:
from torch.utils.data import Dataset

class BaseDataset(Dataset):
    """Per-user sequence dataset: one item is one user's fixed-length history.

    Every item is a dict of tensors whose first dimension is ``max_seq_len``.
    Histories longer than that keep only their last ``max_seq_len`` rows;
    shorter ones are left-padded with zeros, and ``mask`` marks the real rows.
    Rows are used in the order they appear in ``data`` (presumably already
    chronological per user -- TODO confirm against the caller).
    """

    def __init__(
        self,
        data: pd.DataFrame,
        idx: list,
        config: dict,
    ) -> None:
        """Select the requested users and pre-group their rows.

        Args:
            data: frame with ``userID`` and ``answerCode`` columns, one
                ``"<name>2idx"`` column per entry of ``config["cat_cols"]``
                and every column named in ``config["num_cols"]``.
            idx: userID values to keep; rows of all other users are dropped.
            config: must provide ``config["dataset"]["max_seq_len"]``,
                ``config["cat_cols"]`` and ``config["num_cols"]``.

        Raises:
            ValueError: if ``max_seq_len`` is not positive (negative slicing
                with such a length would silently return wrong-shaped items).
        """
        super().__init__()
        self.data = data[data["userID"].isin(idx)]
        self.user_list = self.data["userID"].unique().tolist()
        self.config = config
        self.max_seq_len = config["dataset"]["max_seq_len"]
        if self.max_seq_len <= 0:
            raise ValueError(
                f"max_seq_len must be positive, got {self.max_seq_len}"
            )

        self.Y = self.data.groupby("userID")["answerCode"]

        # userID is appended as the LAST column so the frames can be grouped
        # by it; __getitem__ strips that trailing column again.
        self.cur_cat_col = [f"{col}2idx" for col in config["cat_cols"]] + ["userID"]
        self.cur_num_col = config["num_cols"] + ["userID"]
        self.X_cat = self.data.loc[:, self.cur_cat_col].copy().groupby("userID")
        self.X_num = self.data.loc[:, self.cur_num_col].copy().groupby("userID")

        self.group_data = self.data.groupby("userID")

    def __len__(self) -> int:
        """Return the number of users (one item per user)."""
        return len(self.user_list)

    def _left_pad(self, values: np.ndarray, dtype: torch.dtype) -> torch.Tensor:
        """Keep the last ``max_seq_len`` rows of ``values`` and zero-pad on the left."""
        tail = torch.tensor(values[-self.max_seq_len:], dtype=dtype)
        missing = self.max_seq_len - tail.shape[0]
        if missing == 0:
            return tail
        padding = torch.zeros((missing, *tail.shape[1:]), dtype=dtype)
        return torch.cat((padding, tail))

    def __getitem__(self, index: int) -> dict:
        """Build the fixed-length tensors for the ``index``-th user.

        Returns:
            A dict with
            ``"cat"``: long tensor ``(max_seq_len, number of cat columns)``,
            ``"num"``: float32 tensor ``(max_seq_len, number of num columns)``,
            ``"answerCode"``: float32 tensor ``(max_seq_len,)``,
            ``"mask"``: long tensor ``(max_seq_len,)``, 1 on real rows and 0
            on left padding.
        """
        user = self.user_list[index]
        # Drop the trailing userID column that was only needed for grouping.
        cat = self.X_cat.get_group(user).values[:, :-1]
        num = self.X_num.get_group(user).values[:, :-1].astype(np.float32)
        y = self.Y.get_group(user).values

        # Number of real (non-padding) rows that survive truncation.
        kept = min(cat.shape[0], self.max_seq_len)
        mask = torch.zeros(self.max_seq_len, dtype=torch.long)
        mask[self.max_seq_len - kept:] = 1

        return {
            "cat": self._left_pad(cat, torch.long),
            "num": self._left_pad(num, torch.float32),
            "answerCode": self._left_pad(y, torch.float32),
            "mask": mask,
        }

In [None]:
def run_kfold(k, config, data, now):
    """Train one model per fold, splitting the data by user.

    Users (not rows) are partitioned into ``k`` shuffled folds with a fixed
    seed, so all rows of a user land on the same side of every split.

    Args:
        k: number of folds.
        config: configuration dict handed to ``models.get_models``,
            ``BaseDataset``, ``get_loader`` (``config["data_loader"]["args"]``)
            and ``BaseTrainer``.
        data: frame with a ``userID`` column, passed to ``BaseDataset``.
        now: not used inside this function; kept for call compatibility.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=22)

    # KFold.split yields POSITIONS into the sequence it is given, not the
    # values themselves. Keep the user IDs around so positions can be mapped
    # back to real IDs before filtering rows with them.
    user_ids = data["userID"].unique()

    for fold, (train_pos, val_pos) in enumerate(kf.split(user_ids)):
        print(
            f"-------------------------START FOLD {fold + 1} TRAINING---------------------------"
        )
        print(
            f"-------------------------START FOLD {fold + 1} MODEL LOADING----------------------"
        )

        # A fresh model is requested for every fold.
        model = models.get_models(config)

        print(
            f"-------------------------DONE FOLD {fold + 1} MODEL LOADING-----------------------"
        )

        train_users = user_ids[train_pos].tolist()
        val_users = user_ids[val_pos].tolist()

        train_set = BaseDataset(data, train_users, config)
        val_set = BaseDataset(data, val_users, config)

        train, valid = get_loader(train_set, val_set, config["data_loader"]["args"])

        trainer = BaseTrainer(
            model=model,
            train_data_loader=train,
            valid_data_loader=valid,
            config=config,
            fold=fold + 1,
        )

        trainer.train()
        print(
            f"---------------------------DONE FOLD {fold + 1} TRAINING--------------------------"
        )