In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0" 

In [1]:
%pip install recbole
%pip install ray kmeans-pytorch

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
from logging import getLogger
import os
import json
import pandas as pd
import time, datetime

from recbole.config import Config
from recbole.data import create_dataset, data_preparation

from sklearn.metrics import accuracy_score, roc_auc_score

import torch

In [4]:
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}

path = '/data/ephemeral/data/'
train_data = pd.read_csv(path + 'train_data.csv', dtype=dtype, parse_dates=['Timestamp'])
test_data  = pd.read_csv(path + 'test_data.csv', dtype=dtype, parse_dates=['Timestamp'])

In [5]:
data = pd.concat([train_data, test_data])

In [6]:
data.head(1)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224


In [7]:
import numpy as np
def elapsed(df) :
    diff_train = df.loc[:, ['userID','Timestamp']].groupby('userID').diff().shift(-1)
    diff_train = diff_train['Timestamp'].apply(lambda x : x.total_seconds())
    df['elapsed'] = diff_train

    df.groupby('userID').apply(lambda x :x.iloc[:-1])

    # 한 시간이 지나면 outlier로 처리
    outlier = 1*3600
    non_outlier = df[df['elapsed'] <= outlier]
    # outlier에 해당하지 않는 row로 재구성 한 후 각 태그의 평균처리
    mean_elapsed = non_outlier.groupby('KnowledgeTag')['elapsed'].mean()
    df.loc[df['elapsed'] > outlier, 'elapsed'] = df[df['elapsed'] > outlier].apply(lambda x: mean_elapsed.get(x['KnowledgeTag'], x['elapsed']), axis=1)

    return df

def cumsum(df) :
    # 누적합
    _cumsum = df.loc[:, ['userID', 'answerCode']].groupby('userID').agg({'answerCode': 'cumsum'})
    # 누적갯수
    _cumcount = df.loc[:, ['userID', 'answerCode']].groupby('userID').agg({'answerCode': 'cumcount'}) + 1

    cum_ans = _cumsum / _cumcount
    df['cumulative'] = cum_ans['answerCode']

    df['paper_number'] = df['assessmentItemID'].apply(lambda x: x[7:]) # assessmentItemID의 뒤에 3자리를 의미 -> 각 시험지 별로 문제번호
    # item 열을 int16으로 변경
    df["paper_number"] = df["paper_number"].astype("int16")

    return df

def avg_percent(x) :
    return np.sum(x) / len(x)

def test_type(x) :
    # 전부 A로 동일
    return  x[0]
def paper_type(x) :
    # 0~9로 시험지 대분류로 가정
    return x[2]
def paper_subtype(x) :
    # ~~ 시험지 중분류로 가정
    return x[4:7]

def type_percent(df) :
    # 위에서 처리한 type을 변환하여 각각의 정답률 처리

    df['test_type'] = df['assessmentItemID'].apply(test_type)
    df['paper_type'] = df['assessmentItemID'].apply(paper_type).astype(int)
    df['paper_subtype'] = df['assessmentItemID'].apply(paper_subtype).astype(int)

    df['paper_number_percent'] = df.groupby('paper_number')['answerCode'].transform(avg_percent)
    df['paper_type_percent'] = df.groupby('paper_type')['answerCode'].transform(avg_percent)
    df['KnowledgeTag_percent'] = df.groupby('KnowledgeTag')['answerCode'].transform(avg_percent)

    return df

In [8]:
userid, itemid = list(set(data.userID)), list(set(data.assessmentItemID))
n_user, n_item = len(userid), len(itemid)

print(f"Train dataset")
display(data.head(5))
print(f" Num. Users    : {n_user}")
print(f" Max. UserID   : {max(userid)}")
print(f" Num. Items    : {n_item}")
print(f" Num. Records  : {len(train_data)}")

Train dataset


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225


 Num. Users    : 7442
 Max. UserID   : 7441
 Num. Items    : 9454
 Num. Records  : 2266586


In [9]:
data = elapsed(data)
data = cumsum(data)
data = type_percent(data)
data.drop(['test_type','paper_type','paper_subtype','paper_number_percent','paper_type_percent'], axis=1, inplace=True)

In [10]:
data.drop_duplicates(subset = ["userID", "assessmentItemID"],
                     keep = "last", inplace = True)

In [11]:
data_old = data.copy()
n_user_old, n_item_old = n_user, n_item

data  = data[data.answerCode>=0].copy()

userid, itemid = list(set(data.userID)), list(set(data.assessmentItemID))
n_user, n_item = len(userid), len(itemid)

display(data.tail(5))
print(f" Num. Users    : {n_user}->{n_user}")
print(f" Max. UserID   : {max(userid)}")
print(f" Num. Items    : {n_item}->{n_item}")
print(f" Num. Records  : {len(data_old)}->{len(data)}")



Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elapsed,cumulative,paper_number,KnowledgeTag_percent
260108,7439,A040197006,A040000197,1,2020-08-21 07:39:45,2132,60.158849,0.727273,6,0.763158
260109,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832,18.0,0.666667,1,0.637778
260110,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832,21.0,0.692308,2,0.637778
260111,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244,89.0,0.714286,3,0.7625
260112,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244,32.0,0.733333,4,0.7625


 Num. Users    : 7442->7442
 Max. UserID   : 7441
 Num. Items    : 9454->9454
 Num. Records  : 2476706->2475962


In [12]:
eval_data = data.copy()
eval_data.drop_duplicates(subset = ["userID"],
                     keep = "last", inplace = True)
display(eval_data.head(5))
display(eval_data.tail(5))
print(f" Num. Records  : {len(eval_data)}")

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elapsed,cumulative,paper_number,KnowledgeTag_percent
744,0,A080129006,A080000129,0,2020-12-23 03:40:19,2725,,0.630872,6,0.296875
1677,1,A090074006,A090000074,1,2020-11-13 02:47:20,2648,,0.853162,6,0.43
1953,2,A050139007,A050000139,0,2020-10-20 11:32:26,428,,0.612319,7,0.674105
2786,5,A080138007,A080000138,1,2020-12-11 22:48:28,8431,,0.795918,7,0.466087
3707,6,A030145005,A030000145,0,2020-10-26 09:52:14,7817,,0.442997,5,0.614667


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elapsed,cumulative,paper_number,KnowledgeTag_percent
260051,7395,A040122004,A040000122,0,2020-09-08 02:05:18,2102,2.0,0.304348,4,0.7475
260066,7404,A030111004,A030000111,1,2020-10-13 09:47:31,7636,107.0,0.5,4,0.824167
260081,7416,A050193003,A050000193,0,2020-10-04 02:44:17,10402,24.0,0.5,3,0.820896
260096,7417,A050193003,A050000193,0,2020-09-06 13:08:54,10402,21.0,0.142857,3,0.820896
260112,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244,32.0,0.733333,4,0.7625


 Num. Records  : 7442


In [13]:
data.drop(index=eval_data.index, inplace=True, errors='ignore')
display(data.tail(5))
print(f" Num. Records  : {len(data)}")

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elapsed,cumulative,paper_number,KnowledgeTag_percent
260107,7439,A040197005,A040000197,0,2020-08-21 07:39:40,2132,5.0,0.7,5,0.763158
260108,7439,A040197006,A040000197,1,2020-08-21 07:39:45,2132,60.158849,0.727273,6,0.763158
260109,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832,18.0,0.666667,1,0.637778
260110,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832,21.0,0.692308,2,0.637778
260111,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244,89.0,0.714286,3,0.7625


 Num. Records  : 2467499


In [14]:
userid, itemid = sorted(list(set(data.userID))), sorted(list(set(data.assessmentItemID)))
n_user, n_item = len(userid), len(itemid)

userid_2_index = {v:i        for i,v in enumerate(userid)}
itemid_2_index = {v:i+n_user for i,v in enumerate(itemid)}
id_2_index = dict(userid_2_index, **itemid_2_index)

In [19]:
train_yaml = """
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
RATING_FIELD: rating
TIME_FIELD: timestamp
ELAPSED : elapsed



load_col:
    inter: [user_id, item_id, rating, timestamp,elapsed]

user_inter_num_interval: "[0,inf)"
item_inter_num_interval: "[0,inf)"

eval_args:
  split: {"RS": [8,2,0]}
  order: "TO"

epochs: 10
learning_rate: 0.01
stopping_step: 20
"""

test_yaml = """
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
RATING_FIELD: rating
TIME_FIELD: timestamp
ELAPSED : elapsed

load_col:
    inter: [user_id, item_id, rating, timestamp,elapsed]

user_inter_num_interval: "[0,inf)"
item_inter_num_interval: "[0,inf)"

eval_args:
  split: {"RS": [10,0,0]}
  order: "TO"

"""

In [20]:
outpath = "/data/ephemeral/code/lightgcn/recbole/train_data/"
outfile = os.path.join(outpath, "train_data.inter")
yamlfile = os.path.join(outpath, "train_data.yaml")

os.makedirs(outpath, exist_ok=True)

print("Processing Start")
inter_table = []
for user, item, acode, tstamp, elapse in zip(data.userID, data.assessmentItemID, data.answerCode, data.Timestamp, data.elapsed):
    uid, iid = id_2_index[user], id_2_index[item]
    # tval = int(time.mktime(datetime.datetime.strptime(tstamp, "%Y-%m-%d %H:%M:%S").timetuple()))
    inter_table.append( [uid, iid, max(acode,0), elapse] )

print("Processing Complete")

print("Dump Start")
# 데이터 설정 파일 저장
with open(yamlfile, "w") as f:
    f.write(train_yaml)

# 데이터 파일 저장
with open(outfile, "w") as f:
    # write header
    f.write("user_id:token\titem_id:token\trating:float\ttimestamp:float\n")
    for row in inter_table:
        f.write("\t".join([str(x) for x in row])+"\n")

print("Dump Complete")


Processing Start
Processing Complete
Dump Start
Dump Complete


In [21]:
outpath = "/data/ephemeral/code/lightgcn/recbole/test_data"
outfile = "/data/ephemeral/code/lightgcn/recbole/test_data/test_data.inter"
yamlfile = "/data/ephemeral/code/lightgcn/recbole/test_data/test_data.yaml"
os.makedirs(outpath, exist_ok=True)

print("Processing Start")
inter_table = []
for user, item, acode, tstamp, elapse in zip(eval_data.userID, eval_data.assessmentItemID, eval_data.answerCode, eval_data.Timestamp, eval_data.elapsed):
    uid, iid = id_2_index[user], id_2_index[item]
    # tval = int(time.mktime(datetime.datetime.strptime(tstamp, "%Y-%m-%d %H:%M:%S").timetuple()))
    inter_table.append( [uid, iid, max(acode,0), elapse] )

print("Processing Complete")

print("Dump Start")
# 데이터 설정 파일 저장
with open(yamlfile, "w") as f:
    f.write(test_yaml)

# 데이터 파일 저장
with open(outfile, "w") as f:
    # write header
    f.write("user_id:token\titem_id:token\trating:float\ttimestamp:float\n")
    for row in inter_table:
        f.write("\t".join([str(x) for x in row])+"\n")

print("Dump Complete")


Processing Start
Processing Complete
Dump Start
Dump Complete


In [22]:
%pip install pyarrow


[0mNote: you may need to restart the kernel to use updated packages.


In [23]:
from recbole.quick_start import run_recbole

# run_recbole(model='LightGCN', dataset='train_data', config_file_list=['/data/ephemeral/code/lightgcn/recbole/train_data.yaml'])
run_recbole(model='LightGCN', dataset='train_data', config_file_list=['/data/ephemeral/code/lightgcn/recbole/train_data.yaml'])


  from .autonotebook import tqdm as notebook_tqdm
2024-01-25 01:30:45,382	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-01-25 01:30:45,692	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
25 Jan 01:30    INFO  ['/data/ephemeral/miniconda3/envs/gcn/lib/python3.10/site-packages/ipykernel_launcher.py', '--f=/opt/ml/.local/share/jupyter/runtime/kernel-v2-772165KdDT5QODGGvv.json']
25 Jan 01:30    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /data/ephemeral/code/lightgcn/recbole/train_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 2
train_batch_size = 2048
learner = 

KeyboardInterrupt: 

In [29]:
%pip install charset_normalizer

Note: you may need to restart the kernel to use updated packages.


In [34]:
# Load Model
from recbole.quick_start import load_data_and_model
from glob import glob

model_path = sorted(glob("saved/*"))[-1]

config, model, dataset, train_data, valid_data, test_data = load_data_and_model(model_path)

16 Jan 15:17    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /data/ephemeral/code/lightgcn/recbole/train_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 2
train_batch_size = 2048
learner = adam
learning_rate = 0.01
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [8, 2, 0]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dat

In [36]:
# Load Test Data

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction

config = Config(model='LightGCN', dataset='test_data', config_file_list=['/data/ephemeral/code/lightgcn/recbole/test_data.yaml'])
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

TypeError: cannot perform __add__ with this index type: MultiIndex

In [37]:
from tqdm import tqdm

true = []
prob = []
pred = []

model.eval()
for data in tqdm(dataset):
    # user item별 score 예측
    score = model.full_sort_predict(data)

    true.append(data.rating)
    prob.append(float(score[data.item_id]))
    pred.append(1 if score[data.item_id] >= 0.5 else 0)

100%|██████████| 2467499/2467499 [59:27<00:00, 691.64it/s]  


In [38]:
print("Test data prediction")
print(f" - Accuracy = {100*accuracy_score(true, pred):.2f}%")
print(f" - ROC-AUC  = {100*roc_auc_score(true, prob):.2f}%")

Test data prediction
 - Accuracy = 16.13%


ValueError: multi_class must be in ('ovo', 'ovr')

In [None]:
pred = pred.detach().cpu().numpy()
output_dir = '/data/ephemeral/code/lightgcn/outputs/'
os.makedirs(name=output_dir, exist_ok=True)
write_path = os.path.join(output_dir, "submission.csv")
result = pd.DataFrame({
    'user_id' : [data.user_id],
    'prediction' : pred
}).to_csv(path_or_buf = output_dir, index_label='id')