# test data로만 train 해보기 
- train data : 기존 test data에서 각 사용자(userID)의 마지막 행을 제외한 나머지 행
- test data : 기존 test data에서 각 사용자(userID)의 마지막 행

### train

In [42]:
import numpy as np
import pandas as pd
import torch
from config import CFG, logging_conf
# from lightgcn.datasets import prepare_dataset
from lightgcn.models import build
from lightgcn.utils import class2dict, get_logger
from sklearn.metrics import accuracy_score, roc_auc_score

In [43]:
# Build the project logger from the imported logging configuration.
logger = get_logger(logging_conf)
# Use CUDA only when torch reports it as available AND the config allows it;
# otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available() and CFG.use_cuda_if_available
device = torch.device("cuda" if use_cuda else "cpu")
# Show which device was selected.
print(device)

cuda


In [44]:
import os
def load_data(basepath):
    """Read ``test_data.csv`` from ``basepath`` and de-duplicate its answered rows.

    Rows with ``answerCode >= 0`` are reduced to the last occurrence of each
    (``userID``, ``assessmentItemID``) pair. Rows with ``answerCode < 0`` are
    kept as they are and placed after the answered rows.

    Args:
        basepath: Directory that contains ``test_data.csv``.

    Returns:
        A DataFrame with the de-duplicated answered rows followed by the
        negative-``answerCode`` rows; the original row index is preserved.
    """
    frame = pd.read_csv(os.path.join(basepath, "test_data.csv"))

    # Answered interactions: keep only the most recent row per (user, item).
    answered = frame[frame.answerCode >= 0].drop_duplicates(
        subset=["userID", "assessmentItemID"], keep="last"
    )
    # Rows with a negative answerCode are passed through unchanged.
    unanswered = frame[frame.answerCode < 0].copy()

    return pd.concat([answered, unanswered])

In [45]:
def separate_data(data):
    """Split ``data`` into each user's last row (test) and everything else (train).

    Args:
        data: DataFrame with a ``userID`` column. It is not modified.

    Returns:
        ``(train_data, test_data)`` where ``test_data`` holds the last row of
        every ``userID`` and ``train_data`` holds the rows whose index labels
        do not appear in ``test_data``.
    """
    # The final occurrence of every user becomes that user's held-out row.
    last_rows = data.drop_duplicates(subset=["userID"], keep="last")
    # Everything else stays in the training split; rows are matched by index label.
    remaining = data.drop(index=last_rows.index, errors="ignore")

    return remaining, last_rows

In [46]:
def indexing_data(data):
    """Assign one contiguous node index to every user and every item.

    Users are numbered ``0 .. n_user - 1`` in sorted order; items follow,
    numbered ``n_user .. n_user + n_item - 1`` in sorted order.

    Args:
        data: Object exposing ``userID`` and ``assessmentItemID`` iterables.

    Returns:
        A single dict mapping each user ID and each item ID to its node index.
    """
    users = sorted(set(data.userID))
    items = sorted(set(data.assessmentItemID))
    item_offset = len(users)

    user_index = dict(zip(users, range(item_offset)))
    item_index = {
        item: item_offset + position for position, item in enumerate(items)
    }

    # Keyword expansion is how the two maps are merged, so item IDs must be strings.
    return dict(user_index, **item_index)

In [47]:
def process_data(data, id_2_index, device):
    """Convert interaction rows into edge and label tensors on ``device``.

    Args:
        data: Object exposing ``userID``, ``assessmentItemID`` and
            ``answerCode`` iterables that are walked in lockstep.
        id_2_index: Mapping from user IDs and item IDs to node indices.
        device: Target passed to ``Tensor.to`` for both tensors.

    Returns:
        A dict with ``edge`` (the transposed LongTensor of
        ``[user_index, item_index]`` pairs) and ``label`` (a LongTensor of the
        answer codes).
    """
    rows = [
        ([id_2_index[user], id_2_index[item]], answer)
        for user, item, answer in zip(
            data.userID, data.assessmentItemID, data.answerCode
        )
    ]

    # One column per interaction after the transpose.
    edge_tensor = torch.LongTensor([pair for pair, _ in rows]).T
    label_tensor = torch.LongTensor([answer for _, answer in rows])

    return {"edge": edge_tensor.to(device), "label": label_tensor.to(device)}

In [48]:
def prepare_dataset(device, basepath, verbose=True, logger=None):
    """Load the CSV under ``basepath`` and return tensors ready for the model.

    Args:
        device: Device on which the edge/label tensors are placed.
        basepath: Directory handed to ``load_data``.
        verbose: Accepted for interface compatibility; not used here.
        logger: Accepted for interface compatibility; not used here.

    Returns:
        ``(train, test, n_node)`` where ``train`` and ``test`` are the dicts
        produced by ``process_data`` and ``n_node`` is the number of entries
        in the user/item index built from the whole dataset.
    """
    frame = load_data(basepath)
    train_frame, test_frame = separate_data(frame)

    # Indices come from the full frame so both splits share one node numbering.
    node_index = indexing_data(frame)

    train_graph, test_graph = (
        process_data(part, node_index, device)
        for part in (train_frame, test_frame)
    )
    return train_graph, test_graph, len(node_index)

In [49]:
# Prepare the data: build the train/test edge-label dicts and the node count
# (size of the combined user + item index) from the CSV under CFG.basepath.
train_data, test_data, n_node = prepare_dataset(
    device, CFG.basepath, verbose=CFG.loader_verbose, logger=logger.getChild("data")
)

In [50]:
# Build the model through the project's `build` helper for `n_node` graph
# nodes, taking every hyperparameter from CFG.
model = build(
    n_node,
    embedding_dim=CFG.embedding_dim,
    num_layers=CFG.num_layers,
    alpha=CFG.alpha,
    logger=logger.getChild("build"),
    **CFG.build_kwargs
)
# Move the model to the selected device; as the last expression of the cell,
# its value is what the notebook echoes.
model.to(device)

LightGCN(10198, 64, num_layers=1)

In [51]:
def train(
    model,
    train_data,
    valid_data=None,
    n_epoch=100,
    learning_rate=0.01,
    use_wandb=False,
    weight=None,
    logger=None,
):
    """Train ``model`` full-batch on ``train_data`` and checkpoint by validation AUC.

    Each epoch runs one forward/backward pass over all training edges, then
    scores the model on ``valid_data`` (accuracy at a 0.5 threshold and
    ROC-AUC).

    Args:
        model: Model exposing ``__call__(edge)``, ``link_pred_loss(pred, label)``
            and ``predict_link(edge, prob=True)``.
        train_data: Dict with ``"edge"`` and ``"label"`` tensors.
        valid_data: Optional dict with ``"edge"`` and ``"label"``. When omitted,
            up to 50 randomly chosen *training* edges are used, so the reported
            metrics are measured on data the model has seen.
        n_epoch: Number of epochs to run.
        learning_rate: Learning rate for the Adam optimizer.
        use_wandb: When true, log per-epoch metrics and the best AUC to wandb.
        weight: Directory for ``best_model.pt`` / ``last_model.pt``. When falsy
            (the default ``None``), no checkpoints are written.
        logger: Logger for progress messages; a module logger is used when
            omitted.
    """
    import logging

    # The defaults must be usable: fall back to a module logger instead of
    # failing on `None.info(...)`.
    if logger is None:
        logger = logging.getLogger(__name__)
    if use_wandb:
        import wandb

    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Only touch the filesystem when a checkpoint directory was requested;
    # exist_ok avoids the check-then-create race.
    if weight:
        os.makedirs(weight, exist_ok=True)

    if valid_data is None:
        # No validation set given: sample up to 50 training edges for monitoring.
        eids = np.arange(len(train_data["label"]))
        eids = np.random.permutation(eids)[:50]
        edge, label = train_data["edge"], train_data["label"]
        label = label.to("cpu").detach().numpy()
        valid_data = dict(edge=edge[:, eids], label=label[eids])

    logger.info(f"Training Started : n_epoch={n_epoch}")
    best_auc, best_epoch = 0, -1
    # Tracked separately from the loop variable so n_epoch == 0 stays well-defined.
    last_epoch = 0
    for e in range(n_epoch):
        # forward
        pred = model(train_data["edge"])
        loss = model.link_pred_loss(pred, train_data["label"])

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Evaluate without building a graph for the validation pass.
        with torch.no_grad():
            prob = model.predict_link(valid_data["edge"], prob=True)
            prob = prob.detach().cpu().numpy()
            acc = accuracy_score(valid_data["label"], prob > 0.5)
            auc = roc_auc_score(valid_data["label"], prob)
            logger.info(
                f" * In epoch {(e+1):04}, loss={loss:.03f}, acc={acc:.03f}, AUC={auc:.03f}"
            )
            if use_wandb:
                wandb.log(dict(loss=loss, acc=acc, auc=auc))

        if auc > best_auc:
            logger.info(
                f" * In epoch {(e+1):04}, loss={loss:.03f}, acc={acc:.03f}, AUC={auc:.03f}, Best AUC"
            )
            best_auc, best_epoch = auc, e
            if weight:
                torch.save(
                    {"model": model.state_dict(), "epoch": e + 1},
                    os.path.join(weight, "best_model.pt"),
                )

        if use_wandb:
            wandb.run.summary['best_auc'] = best_auc

        last_epoch = e + 1

    if weight:
        torch.save(
            {"model": model.state_dict(), "epoch": last_epoch},
            os.path.join(weight, "last_model.pt"),
        )
    logger.info(f"Best Weight Confirmed : {best_epoch+1}'th epoch")


In [52]:
# Train the model. No `valid_data` is passed, so `train` monitors accuracy/AUC
# on a random sample of the training edges; checkpoints are written under
# CFG.weight_basepath.
train(
    model,
    train_data,
    n_epoch=CFG.n_epoch,
    learning_rate=CFG.learning_rate,
    use_wandb=False,
    weight=CFG.weight_basepath,
    logger=logger.getChild("train"),
)
logger.info("Task Complete")

2022-11-25 02:27:38,446 - root - INFO - Task Complete


### inference

In [53]:
from lightgcn.datasets import prepare_dataset
from lightgcn.models import build, inference
from lightgcn.utils import get_logger

In [54]:
# Re-create the logger and repeat the device selection for the inference stage.
logger = get_logger(logging_conf)
use_cuda = torch.cuda.is_available() and CFG.use_cuda_if_available
device = torch.device("cuda" if use_cuda else "cpu")

# Make sure the output directory exists. exist_ok=True keeps the cell safe to
# re-run and avoids the race between checking for the directory and creating it.
os.makedirs(CFG.output_dir, exist_ok=True)

In [55]:
# Run the project's `inference` helper on the held-out rows (each user's last
# row) with the model trained above.
# NOTE(review): the `.detach().cpu().numpy()` calls in later cells imply `pred`
# is a torch tensor, presumably of probabilities - confirm against
# lightgcn.models.inference.
pred = inference(model, test_data, logger=logger.getChild("infer"))

In [56]:
# Copy the predictions and the held-out labels to NumPy arrays on the CPU.
a_prob = pred.detach().cpu().numpy()
a_true = test_data["label"].detach().cpu().numpy()
# Round every predicted value to the nearest integer to get hard labels.
# NOTE(review): a_true / a_pred are not referenced again in the visible cells.
a_pred = [round(v) for v in a_prob]

실제 평가 데이터로 제출 파일 만들기

In [57]:
# Convert the predictions to a NumPy array and write them as the submission CSV
# (row index column labelled "id", single "prediction" column).
# NOTE(review): `pred` is rebound to the NumPy array here, so re-running this
# cell without re-running inference would fail on `.detach()`.
pred = pred.detach().cpu().numpy()
pd.DataFrame({"prediction": pred}).to_csv(
    os.path.join(CFG.output_dir, 'only_test_submission.csv'), index_label="id"
)

=> 예측값이 모두 0.5 주변이다. 왜 그럴까?
데이터 양이 적어서 학습이 충분히 이루어지지 않았기 때문일까? 어쨌든 신뢰할 수 없는 결과이다.