# LightGNN 구현

In [1]:
import os, random
from tqdm import tqdm

import numpy as np
import pandas as pd

import torch
import torch.nn as nn

import torch_geometric
import torch_geometric.nn as geonn
from torch_geometric.data.data import Data

from sklearn.metrics import roc_auc_score, accuracy_score

In [2]:
# pip install torch_geometric

In [3]:
# fix random seed
seed = 42
random.seed(seed)
np.random.seed(seed)
_ = torch.manual_seed(seed)

## load data

In [4]:
# load data
data_dir = '../../../dkt/data'
train_df = pd.read_csv(os.path.join(data_dir, 'train_data.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'test_data.csv'))
submission_df = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

train_df.shape, test_df.shape

((2266586, 6), (260114, 6))

## 학습 및 테스트 데이터에서 맨 마지막 레코드 제외

In [5]:
train_last = train_df.drop_duplicates(['userID'], keep='last')
test_last = test_df.drop_duplicates(['userID'], keep='last')

# train_df2 = train_df.drop(index=train_last.index).drop_duplicates(['userID', 'assessmentItemID'], keep='last')
train_df2 = train_df.drop_duplicates(['userID', 'assessmentItemID'], keep='last')
test_df2 = test_df.drop(index=test_last.index).drop_duplicates(['userID', 'assessmentItemID'], keep='last')

base_df = pd.concat([train_df2, test_df2], axis=0)
base_df.shape

(2475974, 6)

### 유저, 아이템 ID 정보 추출 및 사전 만들기

In [6]:
# total user id and item id
user_ids = np.concatenate([train_df.userID.unique(), test_df.userID.unique()])
item_ids = train_df.assessmentItemID.unique()
len(user_ids), len(item_ids)

(7442, 9454)

In [7]:
user_id2index = {uid:i for i, uid in enumerate(user_ids)}
item_id2index = {iid:i+len(user_ids) for i, iid in enumerate(item_ids)}

### 만든 사전으로 ID를 숫자로 변환하기

In [8]:
base_df['userID'] = base_df['userID'].map(lambda x: user_id2index.get(x,x))
base_df['assessmentItemID'] = base_df['assessmentItemID'].map(lambda x: item_id2index.get(x,x))

In [9]:
train_last['userID'] = train_last['userID'].map(lambda x: user_id2index.get(x,x))
train_last['assessmentItemID'] = train_last['assessmentItemID'].map(lambda x: item_id2index.get(x,x))

test_last['userID'] = test_last['userID'].map(lambda x: user_id2index.get(x,x))
test_last['assessmentItemID'] = test_last['assessmentItemID'].map(lambda x: item_id2index.get(x,x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_last['userID'] = train_last['userID'].map(lambda x: user_id2index.get(x,x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_last['assessmentItemID'] = train_last['assessmentItemID'].map(lambda x: item_id2index.get(x,x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_last['userID'] =

In [10]:
# # define positive edge and negative edge
# pos_edge = base_df[base_df['answerCode'] == 1][['userID','assessmentItemID']].values
# neg_edge = base_df[base_df['answerCode'] == 0][['userID','assessmentItemID']].values
# pos_edge_label = base_df[base_df['answerCode'] == 1][['answerCode']].values
# neg_edge_label = base_df[base_df['answerCode'] == 0][['answerCode']].values
# pos_edge.shape, neg_edge.shape

### 전체 레코드 데이터 추출

In [11]:
total_edges = base_df[['userID','assessmentItemID']].values
total_edge_labels = base_df[['answerCode']].values
total_edges.shape

(2475974, 2)

In [12]:
train_last_edges = train_last[['userID','assessmentItemID']].values
train_last_edge_labels = train_last[['answerCode']].values

test_last_edges = test_last[['userID','assessmentItemID']].values
test_last_edge_labels = test_last[['answerCode']].values
train_last_edges.shape, test_last_edges.shape

((6698, 2), (744, 2))

### 데이터 셔플

In [13]:
shuffle_index = list(range(len(total_edges))) 
np.random.shuffle(shuffle_index)

total_edges = total_edges[shuffle_index]
total_edge_labels = total_edge_labels[shuffle_index]

### 학습, 검증 데이터 분할

In [14]:
from sklearn.model_selection import train_test_split

train_edges, valid_edges, train_edge_labels, valid_edge_labels = \
    train_test_split(total_edges, total_edge_labels, 
                     test_size=.15, shuffle=True, stratify=total_edge_labels)
# train_edges, train_edge_labels = total_edges, total_edge_labels
# valid_edges, valid_edge_labels = train_last_edges, train_last_edge_labels

## 학습에 필요한 변수 지정

In [15]:
batch_size = 4096
learning_rate = 0.01
epochs = 500
patience = 5

embedding_dim = 128
num_layers = 3

total_edge_index = range(total_edges.shape[0])
num_nodes = len(user_ids)+len(item_ids)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [16]:
model = geonn.LightGCN(num_nodes, embedding_dim, num_layers).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5, mode='max', verbose=True)

In [17]:
def train(optimizer, model, edge_index, edge_labels):
    model.train()

    preds = model.predict_link(edge_index, prob=True)
    
    optimizer.zero_grad()
    loss = model.link_pred_loss(preds, edge_labels)
    loss.backward()
    optimizer.step()

    preds_ = preds.detach().cpu().numpy()
    labels = edge_labels.detach().cpu().numpy()
    
    auroc = roc_auc_score(labels, preds_)
    accuracy = accuracy_score(labels, preds_ > 0.5)

    return loss, auroc, accuracy

def validate(optimizer, model, edge_index, edge_labels):
    model.eval()

    preds = model.predict_link(edge_index, prob=True)
    loss = model.link_pred_loss(preds, edge_labels)

    preds_ = preds.detach().cpu().numpy()
    labels = edge_labels.detach().cpu().numpy()

    auroc = roc_auc_score(labels, preds_)
    accuracy = accuracy_score(labels, preds_>0.5)

    return loss, auroc, accuracy

In [18]:
# 학습, 검증 엣지 텐서로 변환 & transpose
train_edges = torch.tensor(train_edges).T.to(device)
valid_edges = torch.tensor(valid_edges).T.to(device)

# 학습, 검증 레이블 텐서로 변환 & unsqueeze
train_edge_labels = torch.tensor(train_edge_labels).squeeze(-1).to(device)
valid_edge_labels = torch.tensor(valid_edge_labels).squeeze(-1).to(device)

In [19]:
from copy import deepcopy

In [20]:
least_loss, num = 1e+9, 0

for e in range(epochs):
    # train
    train_loss, train_auroc, train_accuracy = train(optimizer, model, train_edges, train_edge_labels)
    # valid
    valid_loss, valid_auroc, valid_accuracy = validate(optimizer, model, valid_edges, valid_edge_labels)
    print(f"epochs {e}: train loss: {train_loss}, train auroc: {train_auroc:.4f}, train accuracy: {train_accuracy:.4f}")
    print(f"            valid loss: {valid_loss}, valid auroc: {valid_auroc:.4f}, valid accuracy: {valid_accuracy:.4f}")
    
    if valid_loss < least_loss:
        print(f'minimum valid loss is {valid_loss:.4f} at {e} epoch')
        least_loss, num = valid_loss, 0
        best_auc, best_epochs = valid_auroc, e
        best_model = deepcopy(model.state_dict())
    else:
        num += 1
        if num >= patience:
            print(f'early stopped at {e} epoch')
            break

    # lr_scheduler.step(valid_loss)

epochs 0: train loss: 0.6469267010688782, train auroc: 0.4997, train accuracy: 0.4998
            valid loss: 0.6469271183013916, valid auroc: 0.4997, valid accuracy: 0.4989
minimum valid loss is 0.6469 at 0 epoch
epochs 1: train loss: 0.6469259858131409, train auroc: 0.5421, train accuracy: 0.5276
            valid loss: 0.6469270586967468, valid auroc: 0.5006, valid accuracy: 0.4998
minimum valid loss is 0.6469 at 1 epoch
epochs 2: train loss: 0.6469252705574036, train auroc: 0.5838, train accuracy: 0.5551
            valid loss: 0.646926999092102, valid auroc: 0.5037, valid accuracy: 0.5027
minimum valid loss is 0.6469 at 2 epoch
epochs 3: train loss: 0.6469244360923767, train auroc: 0.6250, train accuracy: 0.5827
            valid loss: 0.6469268202781677, valid auroc: 0.5105, valid accuracy: 0.5071
minimum valid loss is 0.6469 at 3 epoch
epochs 4: train loss: 0.6469236016273499, train auroc: 0.6658, train accuracy: 0.6105
            valid loss: 0.6469265818595886, valid auroc: 0.

In [21]:
model.load_state_dict(best_model)

<All keys matched successfully>

In [34]:
best_epochs

274

In [22]:
train_loss, train_auroc, train_accuracy = train(optimizer, model, train_edges, train_edge_labels)
print(f"train loss: {train_loss}, train auroc: {train_auroc:.4f}, train accuracy: {train_accuracy:.4f}")
valid_loss, valid_auroc, valid_accuracy = validate(optimizer, model, valid_edges, valid_edge_labels)
print(f"valid loss: {valid_loss}, valid auroc: {valid_auroc:.4f}, valid accuracy: {valid_accuracy:.4f}")

train loss: 0.528569757938385, train auroc: 0.8732, train accuracy: 0.8420
valid loss: 0.5568188428878784, valid auroc: 0.8254, valid accuracy: 0.7675


In [23]:
model.predict_link(valid_edges, prob=True)

tensor([9.9723e-01, 1.6548e-01, 9.9336e-01,  ..., 6.0056e-04, 9.9950e-01,
        9.9557e-01], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [24]:
model.predict_link(train_edges, prob=True)

tensor([0.0182, 0.9775, 0.5714,  ..., 0.9961, 0.9993, 0.8813], device='cuda:0',
       grad_fn=<SigmoidBackward0>)

In [25]:
# 학습, 검증 엣지 텐서로 변환 & transpose
train_last_edges = torch.tensor(train_last_edges).T.to(device)
test_last_edges = torch.tensor(test_last_edges).T.to(device)

# 학습, 검증 레이블 텐서로 변환 & unsqueeze
train_last_edge_labels = torch.tensor(train_last_edge_labels).squeeze(-1).to(device)
test_last_edge_labels = torch.tensor(test_last_edge_labels).squeeze(-1).to(device)

train_last_loss, train_last_auroc, train_last_accuracy =\
    validate(optimizer, model, train_last_edges, train_last_edge_labels)
train_last_loss, train_last_auroc, train_last_accuracy

(tensor(0.6054, device='cuda:0',
        grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 0.8513841649458016,
 0.8156166019707375)

In [26]:
model.eval()
preds = model.predict_link(test_last_edges, prob=True)
preds = preds.detach().cpu().numpy()

In [27]:
np.set_printoptions(precision=6, suppress=True)
preds[:10]

array([0.08815 , 0.97398 , 0.006145, 0.683157, 0.023222, 0.968662,
       0.022186, 0.03584 , 0.000186, 0.741616], dtype=float32)

In [28]:
from datetime import datetime as dt
submission_df = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
modelname = 'lightgcn'

In [33]:
submission_df['prediction'] = preds

In [30]:
submission_df['prediction']

0      0.088150
1      0.973980
2      0.006145
3      0.683157
4      0.023222
         ...   
739    0.022927
740    0.325030
741    0.973013
742    0.758820
743    0.428707
Name: prediction, Length: 744, dtype: float32

In [32]:
now = dt.strftime(dt.now(), '%y%m%d-%H%M%S')
result_dir = '../results/'
savename = f'{modelname}_{now}_{valid_auroc:.4f}.csv'
submission_df.to_csv(os.path.join(result_dir, savename), index=False)

In [38]:
model.get_embedding(train_last_edges).size()

torch.Size([16896, 128])

In [46]:
# 학습, 검증 엣지 텐서로 변환 & transpose
total_edges = torch.tensor(total_edges).T.to(device)
node_embeddings = model.get_embedding(total_edges)
node_embeddings.size()

  total_edges = torch.tensor(total_edges).T.to(device)


torch.Size([16896, 128])

In [50]:
user_index2id = {index: uid for uid,index in user_id2index.items()}
item_index2id = {index: iid for iid,index in item_id2index.items()}

In [61]:
user_embeddings = {uid: ue for uid,ue in zip(user_index2id.values(), node_embeddings)}
item_embeddings = {iid: ue for iid,ue in zip(item_index2id.values(), node_embeddings)}

In [None]:
data = {
    'user_embeddings': user_embeddings,
    'item_embeddings': item_embeddings
}

In [None]:
import pickle

with open(f'{modelname}_{now}_{valid_auroc:.4f}_embeddings.pickle', 'wb') as f:
    pickle.dump(data, f)

dict_values(['A060001001', 'A060001002', 'A060001003', 'A060001004', 'A060001005', 'A060001007', 'A060003001', 'A060003002', 'A060003003', 'A060003004', 'A060003005', 'A060003006', 'A060003007', 'A060005001', 'A060005002', 'A060005003', 'A060005004', 'A060005005', 'A060005006', 'A060005007', 'A060007001', 'A060007002', 'A060007003', 'A060007004', 'A060007005', 'A060007006', 'A060007007', 'A080002001', 'A080002002', 'A080002003', 'A080002004', 'A080002005', 'A080002006', 'A060009001', 'A060009002', 'A060009003', 'A060009004', 'A060009005', 'A060009006', 'A060009007', 'A060016001', 'A060016002', 'A060016003', 'A060016004', 'A060016005', 'A060016006', 'A060016007', 'A080004001', 'A080004002', 'A080004003', 'A080004004', 'A080004005', 'A080004006', 'A080004008', 'A080004007', 'A080006001', 'A080006002', 'A080006003', 'A080006004', 'A080006005', 'A080006006', 'A080006007', 'A080006008', 'A060018001', 'A060018002', 'A060018003', 'A060018004', 'A060018005', 'A060018006', 'A060018007', 'A06002