In [1]:
from argparse import ArgumentParser
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy import sparse
import os
import pickle
import time

In [2]:
def prepare_test_iscream(
    min_user_inter_num: int,
    data_path: str = "/opt/ml/input/data",
) -> None:
    """Preprocess the train/test interaction logs and write the results to disk.

    Reads ``train_data.csv`` and ``test_data.csv`` from ``data_path``,
    concatenates them, re-indexes users/items/skills to dense integer ids,
    builds the item-by-skill Q-matrix and writes everything below
    ``<data_path>/iscream`` (the directory is created if it is missing).

    Files written:
        * ``original_df.csv``          -- every column, sorted by user then time
        * ``preprocessed_df.csv``      -- users without any ``correct == -2`` row
        * ``preprocessed_test_df.csv`` -- users with at least one ``correct == -2`` row
        * ``question_skill_rel.pkl``   -- pickled CSR Q-matrix
        * ``q_mat.npz``                -- the same CSR Q-matrix in scipy's npz format

    Args:
        min_user_inter_num: Users with fewer interactions than this are dropped.
        data_path: Directory holding the input CSVs; outputs go to its
            ``iscream`` sub-directory.

    Returns:
        None. Summary statistics are printed and results are written to disk.
    """
    out_dir = os.path.join(data_path, "iscream")
    # Create the output directory up front so none of the writes below can
    # fail merely because it does not exist yet.
    os.makedirs(out_dir, exist_ok=True)

    train_raw = pd.read_csv(os.path.join(data_path, "train_data.csv"), sep=",")
    test_raw = pd.read_csv(os.path.join(data_path, "test_data.csv"), sep=",")
    # ignore_index avoids duplicated index labels coming from the two files.
    df = pd.concat([train_raw, test_raw], ignore_index=True)

    df = df.rename(
        columns={
            "userID": "user_id",
            "assessmentItemID": "item_id",
            "KnowledgeTag": "skill_id",
            "answerCode": "correct",
        }
    )

    # Seconds elapsed since the earliest event in the combined data.
    event_time = pd.to_datetime(df["Timestamp"])
    df["timestamp"] = (
        (event_time - event_time.min()).dt.total_seconds().astype(np.int64)
    )

    # Remove continuous outcomes.
    # The -1 marker is re-coded as -2 in the answer column ONLY; a frame-wide
    # replace would also rewrite any -1 found in unrelated columns.
    df["correct"] = df["correct"].replace(-1, -2)
    df = df[df["correct"].isin([0, 1, -2])].copy()
    df["correct"] = df["correct"].astype(np.int32)

    # Filter too short sequences
    inter_counts = df.groupby("user_id")["user_id"].transform("size")
    df = df[inter_counts >= min_user_inter_num].copy()

    # Map raw ids to dense 0..n-1 integer codes.
    df["user_id"] = np.unique(df["user_id"], return_inverse=True)[1]
    df["item_id"] = np.unique(df["item_id"], return_inverse=True)[1]
    df["skill_id"] = np.unique(df["skill_id"].astype(str), return_inverse=True)[1]

    # Build Q-matrix: Q_mat[i, s] == 1 iff item i appears with skill s.
    Q_mat = np.zeros((df["item_id"].nunique(), df["skill_id"].nunique()))
    Q_mat[df["item_id"].to_numpy(), df["skill_id"].to_numpy()] = 1

    # Remove row duplicates due to multiple skills for one item
    df = df.drop_duplicates(["user_id", "item_id"], keep="last")

    print("# Users: {}".format(df["user_id"].nunique()))
    print("# Skills: {}".format(df["skill_id"].nunique()))
    print("# Items: {}".format(df["item_id"].nunique()))
    print("# Interactions: {}".format(len(df)))

    # Get unique skill id from combination of all skill ids: items sharing the
    # same Q-matrix row receive the same id. The inverse is flattened because
    # its shape for axis-wise unique is not 1-D on every NumPy release.
    unique_skill_ids = np.unique(Q_mat, axis=0, return_inverse=True)[1].reshape(-1)
    df["skill_id"] = unique_skill_ids[df["item_id"].to_numpy()]

    print("# Preprocessed Skills: {}".format(df["skill_id"].nunique()))

    # Sort by user, then temporally within each user. A lexicographic sort
    # keeps rows with equal timestamps in their current order, so the output
    # is deterministic.
    df = df.sort_values(by=["user_id", "timestamp"], kind="mergesort")
    df.to_csv(os.path.join(out_dir, "original_df.csv"), sep="\t", index=False)

    df = df[["user_id", "item_id", "timestamp", "correct", "skill_id"]]
    df = df.reset_index(drop=True)

    # A user belongs to the test split when any of their rows carries -2.
    test_users = df.loc[df["correct"] == -2, "user_id"]
    is_test_user = df["user_id"].isin(test_users)
    train_df = df[~is_test_user].reset_index(drop=True)
    test_df = df[is_test_user].reset_index(drop=True)

    # Save data
    q_sparse = csr_matrix(Q_mat)
    with open(os.path.join(out_dir, "question_skill_rel.pkl"), "wb") as f:
        pickle.dump(q_sparse, f)

    sparse.save_npz(os.path.join(out_dir, "q_mat.npz"), q_sparse)
    train_df.to_csv(os.path.join(out_dir, "preprocessed_df.csv"), sep="\t", index=False)
    test_df.to_csv(os.path.join(out_dir, "preprocessed_test_df.csv"), sep="\t", index=False)

In [3]:
# Run the preprocessing, keeping only users with at least 10 interactions.
prepare_test_iscream(min_user_inter_num=10)

# Users: 7441
# Skills: 912
# Items: 9454
# Interactions: 2476697
# Preprocessed Skills: 912
