In [1]:
import os
import pandas as pd
import torch
import numpy as np
import pickle
from torch_geometric.nn.models import LightGCN
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
def prepare_dataset(device, basepath):
    """Load the interaction table and turn it into graph tensors.

    Args:
        device: torch device (or device string) the tensors are moved to.
        basepath: directory handed to ``load_data``.

    Returns:
        ``(train, valid, test, id2index, n_user, n_item)``. The first three
        are the dicts produced by ``process_data`` for each split; the index
        map and counts come from ``indexing_data`` on the full table, so all
        three splits share one node numbering.
    """
    data = load_data(basepath)
    id2index, n_user, n_item = indexing_data(data)

    # separate_data yields (train, valid, test) in that order.
    train_proc, valid_proc, test_proc = (
        process_data(split, id2index, device) for split in separate_data(data)
    )

    return train_proc, valid_proc, test_proc, id2index, n_user, n_item


def load_data(basepath):
    """Read ``total_data.csv`` from ``basepath`` and return it de-duplicated and sorted.

    For every (userID, assessmentItemID) pair only the last row in file order
    is kept. The result is ordered by userID, then Timestamp, with a fresh
    0..n-1 index.
    """
    csv_path = os.path.join(basepath, "total_data.csv")
    return (
        pd.read_csv(csv_path)
        .drop_duplicates(subset=["userID", "assessmentItemID"], keep="last")
        .sort_values(by=['userID', 'Timestamp'])
        .reset_index(drop=True)
    )

def separate_data(data):
    """Split the interaction table into train / validation / test frames.

    * test: rows whose ``answerCode`` is -1.
    * valid: the last row of each run of equal ``userID`` values, unless that
      row is a test row.
    * train: every remaining row.

    Each returned frame is an independent copy.
    """
    is_test = data['answerCode'].eq(-1)
    # The final row compares against NaN after the shift, so it is always
    # flagged as the end of a user block.
    is_last_of_user = data['userID'].ne(data['userID'].shift(-1))

    train_data = data.loc[~(is_test | is_last_of_user)].copy()
    valid_data = data.loc[is_last_of_user & ~is_test].copy()
    test_data = data.loc[is_test].copy()

    return train_data, valid_data, test_data


def indexing_data(data):
    """Assign one contiguous node index to every user and every item.

    Users take indices ``0 .. n_user-1`` in sorted id order; items follow
    with ``n_user .. n_user+n_item-1``, also in sorted id order.

    Args:
        data: frame with ``userID`` and ``assessmentItemID`` columns.

    Returns:
        ``(id_2_index, n_user, n_item)`` where ``id_2_index`` maps both user
        ids and item ids to their node index.

    Note:
        Users and items share one dict. If a user id is equal to an item id,
        the item entry overwrites the user entry.
    """
    userid = sorted(set(data.userID))
    itemid = sorted(set(data.assessmentItemID))
    n_user, n_item = len(userid), len(itemid)

    userid_2_index = {v: i for i, v in enumerate(userid)}
    itemid_2_index = {v: i for i, v in enumerate(itemid, start=n_user)}
    # Merge by unpacking: dict(a, **b) only accepts string keys in b and
    # raises TypeError for e.g. integer item ids.
    id_2_index = {**userid_2_index, **itemid_2_index}

    return id_2_index, n_user, n_item


def process_data(data, id_2_index, device):
    """Convert interaction rows into edge and label tensors.

    Args:
        data: frame with ``userID``, ``assessmentItemID`` and ``answerCode``.
        id_2_index: mapping from user/item id to node index.
        device: target device for the returned tensors.

    Returns:
        dict with
        ``edge``  - LongTensor of shape ``(2, N)``; row 0 holds user node
                    indices, row 1 item node indices.
        ``label`` - LongTensor of shape ``(N,)`` with the answer codes.

    Raises:
        KeyError: if a user or item id is missing from ``id_2_index``.
    """
    pairs = [
        [id_2_index[user], id_2_index[item]]
        for user, item in zip(data.userID, data.assessmentItemID)
    ]
    labels = list(data.answerCode)

    # The explicit reshape keeps the (2, N) layout for an empty split;
    # transposing the 1-D tensor built from [] would leave shape (0,).
    edge = torch.LongTensor(pairs).reshape(len(pairs), 2).T
    label = torch.LongTensor(labels)

    return dict(edge=edge.to(device), label=label.to(device))

def build(n_node, weight=None, **kwargs):
    """Create a LightGCN model and optionally restore saved weights.

    Args:
        n_node: number of graph nodes, passed to ``LightGCN``.
        weight: optional checkpoint path. The file must contain a mapping
            whose ``"model"`` entry is the state dict. Falsy values skip
            loading.
        **kwargs: forwarded to ``LightGCN``.

    Returns:
        The constructed model. It is built on the CPU; the caller moves it
        to the target device.
    """
    model = LightGCN(n_node, **kwargs)
    if weight:
        # map_location="cpu" lets a checkpoint saved from a GPU run load on a
        # host without CUDA.
        # NOTE(review): torch.load unpickles the file - only load checkpoints
        # from a trusted source.
        state = torch.load(weight, map_location="cpu")["model"]
        model.load_state_dict(state)
    return model

In [3]:
# Fall back to the CPU when no GPU is present; the previous `"cuda" if True`
# condition always selected CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
basepath = "/opt/ml/project/data/"
train_data, valid_data, test_data, id_2_index, n_user, n_item = prepare_dataset(
    device, basepath
)
# One node per user plus one per item; hyper-parameters must match the
# checkpoint being restored.
model = build(
    len(id_2_index),
    weight='/opt/ml/project/code/lightgcn/weight/layer_3_emb_512.pt',
    embedding_dim=512,
    num_layers=3,
    alpha=None,
).to(device)

In [4]:
# Look up one embedding row per node. The node count is the same value that
# was passed to build(), instead of a hard-coded 16896.
# NOTE(review): this reads the embedding table directly, without any graph
# propagation - confirm that the raw table is what is wanted here.
indices = torch.arange(len(id_2_index), device=device)
emb_outs = model.embedding(indices).detach().cpu().numpy()

In [5]:
# Invert the id -> node-index map so embedding rows can be keyed by original id.
reverse_id2index = {index: key for key, index in id_2_index.items()}

# Node indices 0..n_user-1 are users; the next n_item indices are items.
emb_dict = {
    'user': {reverse_id2index[idx]: emb_outs[idx] for idx in range(n_user)},
    'item': {
        reverse_id2index[idx]: emb_outs[idx]
        for idx in range(n_user, n_user + n_item)
    },
}

with open('./assets3/gcn_embedding.pickle','wb') as f:
    pickle.dump(emb_dict, f)

In [7]:
GCN_EMB_DIM = 512
with open('./assets3/gcn_embedding.pickle','rb') as f:
    gcn_embedding = pickle.load(f)


def _embedding_frame(vectors, id_column, prefix):
    """Return a frame with one row per id: the id column, then GCN_EMB_DIM numbered columns."""
    frame = pd.DataFrame.from_dict(vectors).T.reset_index()
    frame.columns = [id_column] + [
        f'{prefix}{dim}' for dim in range(1, GCN_EMB_DIM + 1)
    ]
    return frame


gcn_user_embedding = _embedding_frame(
    gcn_embedding['user'], 'userID', 'gcn_user_embedding'
)
gcn_item_embedding = _embedding_frame(
    gcn_embedding['item'], 'assessmentItemID', 'gcn_question_embedding'
)

In [2]:
# Reuse load_data (defined in the helper cell near the top of the notebook)
# instead of a copy of its body, so the de-duplication and sort order used
# here cannot drift from the ones used to build the graph.
basepath = "/opt/ml/project/data/"
df = load_data(basepath)

In [3]:
# Rows with answerCode == -1 are the ones to predict; everything else is labelled.
is_test_row = df['answerCode'].eq(-1)
test_df = df[is_test_row].copy()
train_df = df[~is_test_row].copy()

# Distinct users / items that occur in the test rows, in order of first appearance.
test_users = test_df['userID'].unique()
test_items = test_df['assessmentItemID'].unique()

In [10]:
# gcn_user_embedding carries the ids in its first column. Passing the whole
# frame to cosine_similarity would treat userID as a feature, so split ids
# and vectors first.
user_ids = gcn_user_embedding['userID'].to_numpy()
user_vectors = gcn_user_embedding.drop(columns='userID').to_numpy()

cos_sim = cosine_similarity(user_vectors)
# A user must never be returned as their own neighbour.
np.fill_diagonal(cos_sim, -np.inf)

# Rows of cos_sim follow the row order of gcn_user_embedding, which matches
# the userID value only when ids happen to be 0..n-1. Translate explicitly.
row_of_user = {uid: pos for pos, uid in enumerate(user_ids)}
test_rows = [row_of_user[uid] for uid in test_users]
test_cossim = cos_sim[test_rows, :]

# Most similar first, expressed as userIDs (not row positions) so the
# values can be compared with the userIDs stored in item_user_dict.
sorted_cossim = user_ids[np.argsort(test_cossim)[:, ::-1]]

# userID -> item of that user's test row.
user_item_dict = test_df[['userID','assessmentItemID']].set_index('userID').to_dict()['assessmentItemID']
# test item -> users with a labelled interaction on that item.
item_user_dict = train_df[train_df['assessmentItemID'].isin(test_items)].groupby('assessmentItemID').userID.apply(list).to_dict()

In [51]:
TOP_K = 40  # number of similar users kept per test user

total_valid = []
for test_uid, sim_list in tqdm(zip(test_users, sorted_cossim), total=len(test_users)):
    # Users with a labelled answer on this test user's item. A set makes each
    # membership check O(1); scanning the list dominated the run time.
    solved_users = set(item_user_dict[user_item_dict[test_uid]])

    # Walk the neighbours from most to least similar and keep the first TOP_K
    # that solved the item. Fewer than TOP_K matches yields a shorter row.
    valid_users = []
    for uid in sim_list:
        if uid in solved_users:
            valid_users.append(uid)
            if len(valid_users) == TOP_K:
                break
    total_valid.append(valid_users)

744it [00:54, 13.61it/s]


In [52]:
# Stack the per-user neighbour lists into one array and persist it so the
# similarity search does not have to be repeated.
total_valid = np.array(total_valid)
np.save(file='assets3/total_valid.npy', arr=total_valid)

In [3]:
# Reload the cached neighbour table and keep at most the first 40 columns.
total_valid = np.load('assets3/total_valid.npy')
total_valid = total_valid[:, :40]
total_valid.shape

(744, 40)

In [None]:
total_valid_idx = []
total_users = []

# Items of the test rows, in frame order, paired row-by-row with total_valid.
test_row_items = df.loc[df['answerCode'].eq(-1), 'assessmentItemID'].to_numpy()

for ti, valid_users in zip(test_row_items, total_valid):
    # First candidate that has not been claimed by an earlier test row, if any.
    picked = next((vu for vu in valid_users if vu not in total_users), None)
    if picked is not None:
        total_users.append(picked)
        total_valid_idx.append((picked, ti))

# Boolean mask over df marking the chosen (userID, assessmentItemID) rows.
total_valid_idx = df.set_index(['userID','assessmentItemID']).index.isin(total_valid_idx)