# Import

In [1]:
import torch
import pandas as pd
import os
import numpy as np

from sklearn.metrics import accuracy_score, roc_auc_score
from torch_geometric.nn.models import LightGCN

from src.datasets import prepare_dataset   # lightgcn
from src.utils import class2dict, get_logger   # lightgcn

In [2]:
class CFG:
    # Notebook-wide settings, read elsewhere as plain class attributes (e.g. `CFG.n_epoch`).

    # The GPU is selected only when this flag is True AND torch reports CUDA as available.
    use_cuda_if_available = True
    # NOTE(review): spelled `user_wandb` (not `use_wandb`). The only visible readers are the
    # disabled wandb snippets, which use the same spelling, so renaming needs both changed.
    user_wandb = True
    # Keyword arguments forwarded to `wandb.init` by the (currently disabled) wandb snippet.
    wandb_kwargs = dict(project="dkt-gcn")

    # data
    # Passed to `prepare_dataset` as the data location; absolute, machine-specific path.
    basepath = "/opt/ml/workspace/kch_dkt/data/"
    # Forwarded to `prepare_dataset` as `verbose`.
    loader_verbose = True

    # dump
    # Only referenced by the (currently disabled) submission-writing snippet.
    output_dir = "/opt/ml/workspace/kch_dkt/code/lightgcn/output"
    pred_file = "submission.csv"

    # build
    # These four values are forwarded to `build(...)` and from there to `LightGCN(...)`.
    embedding_dim = 64  # 64 (presumably the previous/baseline value — TODO confirm)
    num_layers = 6  # 1 (presumably the previous/baseline value — TODO confirm)
    alpha = None  # Optional[Union[float, Tensor]]
    build_kwargs = {}  # other arguments
    # NOTE(review): not passed to `build(...)` in the visible cells, so no checkpoint is loaded.
    weight = "./weight/best_model.pt"

    # train
    n_epoch = 30    # 20 (presumably the previous/baseline value — TODO confirm)
    learning_rate = 0.0005   # 0.001 (presumably the previous/baseline value — TODO confirm)
    # Passed to `train(...)` as `weight`, i.e. the checkpoint directory.
    weight_basepath = "./weight"

In [3]:
# Dict-style logging configuration handed to `get_logger`.
# Original note: only used when 'user_wandb==False'.
# NOTE(review): the data cell calls `get_logger(logging_conf)` unconditionally.
logging_conf = dict(
    version=1,
    formatters=dict(
        basic=dict(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"),
    ),
    handlers=dict(
        # INFO and above go to stdout.
        console={
            "class": "logging.StreamHandler",
            "level": "INFO",
            "formatter": "basic",
            "stream": "ext://sys.stdout",
        },
        # DEBUG and above are accepted by the file handler writing to run.log.
        file_handler={
            "class": "logging.FileHandler",
            "level": "DEBUG",
            "formatter": "basic",
            "filename": "run.log",
        },
    ),
    root=dict(level="INFO", handlers=["console", "file_handler"]),
)

# Data

In [4]:
# Root logger for the notebook, configured from `logging_conf`.
logger = get_logger(logging_conf)

# Use CUDA only when it is both available and enabled in the config.
use_cuda = torch.cuda.is_available() and CFG.use_cuda_if_available
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the train/test splits and the total node count.
train_data, test_data, n_node = prepare_dataset(
    device,
    CFG.basepath,
    verbose=CFG.loader_verbose,
    logger=logger.getChild("data"),
)

2022-12-08 05:46:40,744 - data - INFO - Train Dataset Info
2022-12-08 05:46:40,745 - data - INFO -  * Num. Users    : 7442
2022-12-08 05:46:40,746 - data - INFO -  * Max. UserID   : 7441
2022-12-08 05:46:40,747 - data - INFO -  * Num. Items    : 9454
2022-12-08 05:46:40,747 - data - INFO -  * Num. Records  : 2475962
2022-12-08 05:46:40,748 - data - INFO - Test Dataset Info
2022-12-08 05:46:40,749 - data - INFO -  * Num. Users    : 744
2022-12-08 05:46:40,750 - data - INFO -  * Max. UserID   : 7439
2022-12-08 05:46:40,750 - data - INFO -  * Num. Items    : 444
2022-12-08 05:46:40,751 - data - INFO -  * Num. Records  : 744


# LightGCN

### Build

In [5]:
# Disabled cell: the wandb initialisation below is wrapped in a string literal, so it is
# never executed; the cell merely evaluates (and echoes) the string.
"""
if CFG.user_wandb:
    import wandb

    wandb.init(**CFG.wandb_kwargs, config=class2dict(CFG))
"""

'\nif CFG.user_wandb:\n    import wandb\n\n    wandb.init(**CFG.wandb_kwargs, config=class2dict(CFG))\n'

In [6]:
def build(n_node, weight=None, logger=None, **kwargs):
    """Create a LightGCN model and optionally restore its parameters from a checkpoint.

    Args:
        n_node: number of graph nodes, passed as the first argument to `LightGCN`.
        weight: optional path to a checkpoint file holding a dict with a "model"
            entry (a state dict). When falsy, a freshly initialised model is returned.
        logger: logger used for progress messages; falls back to this module's logger
            when omitted.
        **kwargs: forwarded unchanged to `LightGCN` (e.g. embedding_dim, num_layers).

    Returns:
        The constructed model, still on the device `LightGCN` created it on.

    Raises:
        FileNotFoundError: if `weight` is given but is not an existing file.
    """
    import logging  # local import keeps this cell self-contained

    if logger is None:
        logger = logging.getLogger(__name__)

    model = LightGCN(n_node, **kwargs)
    if not weight:
        logger.info("No load model")
        return model

    if not os.path.isfile(weight):
        # Fail here with a clear message instead of letting torch.load fail later.
        logger.critical("Model Weight File Not Exist")
        raise FileNotFoundError(f"Model weight file not found: {weight}")

    logger.info("Load model")
    # Deserialise onto the CPU so a checkpoint written on a GPU machine can still be
    # restored on a CPU-only host; load_state_dict copies into the model's own tensors.
    state = torch.load(weight, map_location="cpu")["model"]
    model.load_state_dict(state)
    return model

In [7]:
# Instantiate LightGCN from the CFG hyper-parameters. No `weight` is passed, so build()
# returns a freshly initialised model.
model = build(
    n_node,
    logger=logger.getChild("build"),
    embedding_dim=CFG.embedding_dim,
    num_layers=CFG.num_layers,
    alpha=CFG.alpha,
    **CFG.build_kwargs,
)
# Last expression of the cell: moves the model to `device` and echoes its repr.
model.to(device)

2022-12-08 05:46:41,030 - build - INFO - No load model


LightGCN(16896, 64, num_layers=6)

### Train

In [8]:
# Disabled cell: `wandb.watch(model)` is wrapped in a string literal and never runs.
"""
if CFG.user_wandb:
        wandb.watch(model)
"""

'\nif CFG.user_wandb:\n        wandb.watch(model)\n'

In [9]:
def train(
    model,
    train_data,
    valid_data=None,
    n_epoch=100,
    learning_rate=0.01,
    use_wandb=False,
    weight=None,
    logger=None,
):
    """Train `model` with one full-batch Adam step per epoch, logging validation metrics.

    Args:
        model: object exposing `train()`, `parameters()`, `__call__(edge)`,
            `link_pred_loss(pred, label)` and `predict_link(edge, prob=True)`.
        train_data: mapping with "edge" and "label" entries; the whole set is fed to
            the model in a single forward pass every epoch.
        valid_data: optional mapping with the same keys. When omitted, up to 1000
            randomly chosen training records are reused for validation, so the
            reported metrics are not a true hold-out estimate.
        n_epoch: number of epochs (optimisation steps).
        learning_rate: learning rate for Adam.
        use_wandb: accepted for interface compatibility only; wandb logging is
            currently disabled, so the flag has no effect.
        weight: checkpoint directory, created when missing. May be None or empty, in
            which case no directory is touched. NOTE: writing `best_model.pt` /
            `last_model.pt` is currently disabled, so nothing is saved either way.
        logger: logger for progress messages; falls back to this module's logger
            when omitted.

    Returns:
        None. Progress and the best epoch (by validation AUC) are reported via `logger`.
    """
    import logging  # local import keeps this cell self-contained

    if logger is None:
        logger = logging.getLogger(__name__)

    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # `weight` defaults to None; os.path.exists(None) raises TypeError, so only touch
    # the filesystem when a directory was actually requested.
    if weight and not os.path.exists(weight):
        os.makedirs(weight, exist_ok=True)

    if valid_data is None:
        # No validation split supplied: sample (at most) 1000 training records.
        eids = np.arange(len(train_data["label"]))
        eids = np.random.permutation(eids)[:1000]
        edge, label = train_data["edge"], train_data["label"]
        label = label.to("cpu").detach().numpy()
        valid_data = dict(edge=edge[:, eids], label=label[eids])

    valid_edge = valid_data["edge"]
    valid_label = valid_data["label"]
    if torch.is_tensor(valid_label):
        # The sklearn metrics need host-side arrays; caller-supplied labels may be tensors.
        valid_label = valid_label.detach().cpu().numpy()

    logger.info(f"Training Started : n_epoch={n_epoch}")
    best_auc, best_epoch = 0, -1
    for e in range(n_epoch):
        # forward
        pred = model(train_data["edge"])
        loss = model.link_pred_loss(pred, train_data["label"])

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Validation metrics, computed without tracking gradients.
        with torch.no_grad():
            prob = model.predict_link(valid_edge, prob=True)
            prob = prob.detach().cpu().numpy()
        acc = accuracy_score(valid_label, prob > 0.5)
        auc = roc_auc_score(valid_label, prob)
        logger.info(
            f" * In epoch {(e+1):04}, loss={loss:.03f}, acc={acc:.03f}, AUC={auc:.03f}"
        )
        # NOTE: wandb.log(dict(loss=loss, acc=acc, auc=auc)) is disabled here.

        if auc > best_auc:
            logger.info(
                f" * In epoch {(e+1):04}, loss={loss:.03f}, acc={acc:.03f}, AUC={auc:.03f}, Best AUC"
            )
            best_auc, best_epoch = auc, e
            # NOTE: saving {"model": state_dict, "epoch": e + 1} to
            # os.path.join(weight, "best_model.pt") is disabled here.

    # NOTE: saving the final state to os.path.join(weight, "last_model.pt") is disabled too.
    logger.info(f"Best Weight Confirmed : {best_epoch+1}'th epoch")

In [10]:
# Run training with the CFG hyper-parameters. `valid_data` is not supplied, so train()
# validates on a random sample of the training records.
train(
    model=model,
    train_data=train_data,
    n_epoch=CFG.n_epoch,
    learning_rate=CFG.learning_rate,
    weight=CFG.weight_basepath,
    logger=logger.getChild("train"),
)

2022-12-08 05:46:41,284 - train - INFO - Training Started : n_epoch=30
2022-12-08 05:46:41,415 - train - INFO -  * In epoch 0001, loss=0.693, acc=0.488, AUC=0.495
2022-12-08 05:46:41,416 - train - INFO -  * In epoch 0001, loss=0.693, acc=0.488, AUC=0.495, Best AUC
2022-12-08 05:46:41,532 - train - INFO -  * In epoch 0002, loss=0.693, acc=0.490, AUC=0.497
2022-12-08 05:46:41,533 - train - INFO -  * In epoch 0002, loss=0.693, acc=0.490, AUC=0.497, Best AUC
2022-12-08 05:46:41,673 - train - INFO -  * In epoch 0003, loss=0.693, acc=0.490, AUC=0.498
2022-12-08 05:46:41,674 - train - INFO -  * In epoch 0003, loss=0.693, acc=0.490, AUC=0.498, Best AUC
2022-12-08 05:46:41,809 - train - INFO -  * In epoch 0004, loss=0.693, acc=0.489, AUC=0.500
2022-12-08 05:46:41,810 - train - INFO -  * In epoch 0004, loss=0.693, acc=0.489, AUC=0.500, Best AUC
2022-12-08 05:46:41,940 - train - INFO -  * In epoch 0005, loss=0.693, acc=0.488, AUC=0.501
2022-12-08 05:46:41,941 - train - INFO -  * In epoch 0005, lo

### Inference

In [11]:
def inference(model, data, logger=None):
    """Return link probabilities for `data["edge"]` with the model in eval mode.

    Args:
        model: object exposing `eval()` and `predict_link(edge, prob=True)`.
        data: mapping whose "edge" entry is passed to `predict_link`.
        logger: unused; kept so callers can pass one.

    Returns:
        Whatever `model.predict_link(..., prob=True)` returns, computed with
        gradient tracking switched off.
    """
    model.eval()
    with torch.no_grad():
        link_prob = model.predict_link(data["edge"], prob=True)
    return link_prob

In [12]:
# Disabled cell: running inference on the test set and writing the submission CSV is
# wrapped in a string literal, so no prediction file is produced by this notebook run.
"""
pred = inference(model, test_data, logger=logger.getChild("infer"))
pred = pred.detach().cpu().numpy()
pd.DataFrame({"prediction": pred}).to_csv(
    os.path.join(CFG.output_dir, CFG.pred_file), index_label="id"
)
"""

'\npred = inference(model, test_data, logger=logger.getChild("infer"))\npred = pred.detach().cpu().numpy()\npd.DataFrame({"prediction": pred}).to_csv(\n    os.path.join(CFG.output_dir, CFG.pred_file), index_label="id"\n)\n'

# get_embedding

edge_index를 인풋으로 받음: (2, k) 형태, device 통일


In [13]:
# Reference-only cell: a `get_embedding` method body kept inside a string literal so it
# is not executed. Presumably copied from the LightGCN implementation — TODO confirm.
"""
def get_embedding(self, edge_index: Adj) -> Tensor:
        x = self.embedding.weight
        out = x * self.alpha[0]

        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            out = out + x * self.alpha[i + 1]

        return out
"""

'\ndef get_embedding(self, edge_index: Adj) -> Tensor:\n        x = self.embedding.weight\n        out = x * self.alpha[0]\n\n        for i in range(self.num_layers):\n            x = self.convs[i](x, edge_index)\n            out = out + x * self.alpha[i + 1]\n\n        return out\n'

In [14]:
# Embeddings computed from the train edges and from the test edges, in that order.
gcn_embedding_train, gcn_embedding_test = (
    model.get_embedding(train_data["edge"]),
    model.get_embedding(test_data["edge"]),
)

In [19]:
# Build k random edges and compute the embeddings they induce.
# 7442 and 9454 equal the user and item counts printed in the training-data log.
# Presumably user nodes occupy [0, 7442) and item nodes follow, offset by the user
# count — TODO confirm against prepare_dataset.
k = 20  # number of random edges; also read by the next cell
a = torch.randint(0, 7442, size=(1, k)).to(device)  # row 0: indices in [0, 7442)
b = torch.randint(7442, 7442+9454, size=(1, k)).to(device)  # row 1: indices in [7442, 7442+9454)
c = torch.cat([a,b], dim=0)  # stacked into a (2, k) edge index
gcn_embedding_c = model.get_embedding(c)

In [20]:
# Same experiment with both rows drawn from the full range [0, 7442+9454);
# relies on `k` defined in the previous cell.
d = torch.randint(7442+9454, size=(2, k)).to(device)
gcn_embedding_d = model.get_embedding(d)

In [35]:
# Same experiment with a single random edge, i.e. an edge index of shape (2, 1).
e = torch.randint(7442+9454, size=(2, 1)).to(device)
gcn_embedding_e = model.get_embedding(e)

In [21]:
# Each flag is True when at least one element differs from gcn_embedding_c.
print(
    "model.embedding.weight:", bool((model.embedding.weight != gcn_embedding_c).any()),
    "\ngcn_embedding_train:", bool((gcn_embedding_train != gcn_embedding_c).any()),
    "\ngcn_embedding_test:", bool((gcn_embedding_test != gcn_embedding_c).any()),
)

model.embedding.weight: True 
gcn_embedding_train: False 
gcn_embedding_test: False


In [22]:
# Each flag is True when at least one element differs from gcn_embedding_d.
print(
    "model.embedding.weight:", bool((model.embedding.weight != gcn_embedding_d).any()),
    "\ngcn_embedding_train:", bool((gcn_embedding_train != gcn_embedding_d).any()),
    "\ngcn_embedding_test:", bool((gcn_embedding_test != gcn_embedding_d).any()),
)

model.embedding.weight: True 
gcn_embedding_train: False 
gcn_embedding_test: False


In [36]:
# Each flag is True when at least one element differs from gcn_embedding_e.
print(
    "model.embedding.weight:", bool((model.embedding.weight != gcn_embedding_e).any()),
    "\ngcn_embedding_train:", bool((gcn_embedding_train != gcn_embedding_e).any()),
    "\ngcn_embedding_test:", bool((gcn_embedding_test != gcn_embedding_e).any()),
)

model.embedding.weight: True 
gcn_embedding_train: False 
gcn_embedding_test: False
