In [1]:
from dataloader import Preprocess
from args import parse_args
from utils import get_logger, set_seeds, logging_conf
import os
import torch
import numpy as np
import pandas as pd
import os
import random
import time
from datetime import datetime
from typing import Tuple

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder

In [9]:

class Preprocess:
    """Load interaction logs from CSV and turn them into per-user sequences.

    The categorical columns ``assessmentItemID``, ``testId`` and
    ``KnowledgeTag`` are label-encoded. The fitted classes are written to
    ``asset_dir`` when loading training data and read back from there when
    loading test data, so training data must be loaded first.
    """

    def __init__(self, asset_dir: str = '/opt/ml/'):
        """
        Args:
            asset_dir: Directory in which the ``<column>_classes.npy`` files
                are written (train) and read (test).
        """
        self.train_data = None
        self.test_data = None
        self.asset_dir = asset_dir
        self.data_dir = None
        # Populated by load_data_from_file(); declared here so the attributes
        # always exist, even before any file has been loaded.
        self.df = None
        self.n_questions = None
        self.n_tests = None
        self.n_tags = None

    def get_train_data(self):
        """Return the grouped training data (``None`` until it is loaded)."""
        return self.train_data

    def get_test_data(self):
        """Return the grouped test data (``None`` until it is loaded)."""
        return self.test_data

    def split_data(self,
                   data: np.ndarray,
                   ratio: float = 0.7,
                   shuffle: bool = True,
                   seed: int = 0) -> Tuple[np.ndarray, np.ndarray]:
        """
        Split data into two parts with a given ratio.

        Args:
            data: Sequence to split along its first axis. When ``shuffle`` is
                true it is reordered IN PLACE before splitting.
            ratio: Fraction of the entries that goes into the first part.
            shuffle: Whether to shuffle before splitting.
            seed: Seed passed to ``random.seed`` so the split is reproducible.

        Returns:
            ``(data[:size], data[size:])`` with ``size = int(len(data) * ratio)``.
        """
        if shuffle:
            random.seed(seed)  # fix to default seed 0
            if isinstance(data, np.ndarray):
                # random.shuffle swaps elements pairwise; on an array with
                # more than one dimension the swapped "elements" are views,
                # so rows get duplicated and lost. Shuffling an index list
                # yields the same permutation for 1-D input and stays correct
                # for N-D input.
                order = list(range(len(data)))
                random.shuffle(order)
                data[:] = data[order]
            else:
                random.shuffle(data)

        size = int(len(data) * ratio)
        data_1 = data[:size]
        data_2 = data[size:]
        return data_1, data_2

    def __save_labels(self, encoder: "LabelEncoder", name: str) -> None:
        """Save ``encoder.classes_`` to ``<asset_dir>/<name>_classes.npy``."""
        le_path = os.path.join(self.asset_dir, name + "_classes.npy")
        np.save(le_path, encoder.classes_)

    def __n_classes(self, name: str) -> int:
        """Return how many classes are stored in ``<asset_dir>/<name>_classes.npy``."""
        return len(np.load(os.path.join(self.asset_dir, name + "_classes.npy")))

    def __preprocessing(self, df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:
        """Label-encode the categorical columns and convert ``Timestamp`` to int.

        Args:
            df: Frame holding the categorical columns and a ``Timestamp``
                column formatted as ``%Y-%m-%d %H:%M:%S``. Modified in place.
            is_train: If true, fit the encoders on ``df`` (plus an extra
                ``"unknown"`` class) and save their classes. Otherwise load
                the saved classes and map unseen values to ``"unknown"``.

        Returns:
            The same ``df`` object with the encoded columns.
        """
        cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"]

        os.makedirs(self.asset_dir, exist_ok=True)

        for col in cate_cols:
            le = LabelEncoder()
            if is_train:
                # For UNKNOWN class
                a = df[col].unique().tolist() + ["unknown"]
                le.fit(a)
                self.__save_labels(le, col)
            else:
                label_path = os.path.join(self.asset_dir, col + "_classes.npy")
                le.classes_ = np.load(label_path)

                # Set membership instead of scanning the class array once
                # per row.
                known = set(le.classes_.tolist())
                df[col] = df[col].apply(
                    lambda x: x if str(x) in known else "unknown"
                )

            # Assume every column is categorical.
            df[col] = df[col].astype(str)
            df[col] = le.transform(df[col])

        def convert_time(s: str):
            # time.mktime interprets the parsed time as LOCAL time, so the
            # resulting integer depends on the machine's time zone.
            timestamp = time.mktime(
                datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple()
            )
            return int(timestamp)

        df["Timestamp"] = df["Timestamp"].apply(convert_time)
        return df

    def __feature_engineering(self, df: pd.DataFrame) -> pd.DataFrame:
        """Hook for extra features; currently returns ``df`` unchanged."""
        # TODO: Fill in if needed
        return df

    def load_data_from_file(self, file_name: str, is_train: bool = True) -> np.ndarray:
        """Read a CSV, preprocess it and group the rows per user.

        Side effects: sets ``self.n_questions``, ``self.n_tests`` and
        ``self.n_tags`` from the saved class files and stores the
        preprocessed frame, sorted by ``userID`` and ``Timestamp``, in
        ``self.df``.

        Args:
            file_name: Path of the CSV file to read.
            is_train: Forwarded to the preprocessing step (fit vs. reuse the
                label encoders).

        Returns:
            Array with one entry per ``userID``; each entry is the tuple
            ``(testId, assessmentItemID, KnowledgeTag, answerCode)`` of value
            arrays for that user.
        """
        df = pd.read_csv(file_name)  # , nrows=100000)
        df = self.__feature_engineering(df)
        df = self.__preprocessing(df, is_train)

        # Used later to decide the input size of the embedding layers when
        # the features are embedded.
        self.n_questions = self.__n_classes("assessmentItemID")
        self.n_tests = self.__n_classes("testId")
        self.n_tags = self.__n_classes("KnowledgeTag")

        df = df.sort_values(by=["userID", "Timestamp"], axis=0)
        self.df = df
        columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"]
        group = (
            df[columns]
            .groupby("userID")
            .apply(
                lambda r: (
                    r["testId"].values,
                    r["assessmentItemID"].values,
                    r["KnowledgeTag"].values,
                    r["answerCode"].values,
                )
            )
        )
        return group.values

    def load_train_data(self, file_name: str) -> None:
        """Load ``file_name`` as training data into ``self.train_data``."""
        self.train_data = self.load_data_from_file(file_name)

    def load_test_data(self, file_name: str) -> None:
        """Load ``file_name`` as test data into ``self.test_data``."""
        self.test_data = self.load_data_from_file(file_name, is_train=False)

In [10]:
# Build the preprocessor with its default settings.
preprocess = Preprocess()
# Read the training CSV, encode it and store the per-user groups on the instance.
preprocess.load_train_data(file_name='/opt/ml/input/data/train_data.csv')
# Grouped training data: one entry per userID.
train_data= preprocess.get_train_data()

In [18]:
# Preprocessed frame that load_data_from_file() stored on the instance
# (label-encoded, sorted by userID and Timestamp).
df = preprocess.df
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,5354,975,1,1585034231,618
1,0,5355,975,1,1585034234,619
2,0,5356,975,1,1585034242,619
3,0,5357,975,1,1585034249,619
4,0,5358,975,1,1585034256,619
...,...,...,...,...,...,...
2266581,7441,2373,456,0,1591365021,375
2266582,7441,3909,748,1,1597997199,784
2266583,7441,3910,748,1,1597997210,784
2266584,7441,3911,748,1,1597997256,784


In [37]:
# Group the preprocessed rows by user; iterating yields (userID, sub-frame) pairs.
get_df = df.groupby('userID')
# Bare last expression: shows the GroupBy object's repr as the cell output.
get_df

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f917ff55f90>

In [28]:
# Columns needed to build each user's interaction sequence.
columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"]
# One entry per userID: a tuple of value arrays in the order
# (testId, assessmentItemID, KnowledgeTag, answerCode).
input_df = (
    df[columns]
    .groupby("userID")
    .apply(
        lambda rows: tuple(
            rows[name].values
            for name in ("testId", "assessmentItemID", "KnowledgeTag", "answerCode")
        )
    )
)

In [41]:
# Sliding-window augmentation: cut each user's history into windows of at most
# `max_len` interactions, anchored on the most recent interaction and moving
# back in time by `window` rows per step.
cat_feature_list = []
num_feature_list = []
answerCode_list = []
max_len = 5  # maximum number of interactions in one window
window = 2   # stride between the starts of two consecutive windows
cat_cols = ["assessmentItemID", "testId", "KnowledgeTag"]
for userID, grouped_df in get_df:
    # Reverse the rows so that index 0 is the user's latest interaction.
    cat_feature = grouped_df[cat_cols].values[::-1]
    answerCode = grouped_df['answerCode'].values[::-1]

    if len(grouped_df) <= max_len:
        # Short history: keep it as a single sample in its original order.
        cat_feature_list.append(cat_feature[::-1])
        answerCode_list.append(answerCode[::-1])
        continue

    # The bounded range guarantees termination and never yields an empty
    # window, whatever the stride is.
    for start_idx in range(0, len(grouped_df), window):
        stop_idx = start_idx + max_len
        cat_window = cat_feature[start_idx:stop_idx, :]
        # Flip each window back to its original row order before storing it.
        cat_feature_list.append(cat_window[::-1])
        answerCode_list.append(answerCode[start_idx:stop_idx][::-1])
        if len(cat_window) < max_len:
            # First window shorter than max_len: the history is exhausted.
            break
    