MF를 이용한 간단한 변수 추출

In [1]:
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from copy import deepcopy

import warnings

warnings.filterwarnings(action='ignore')
torch.set_printoptions(sci_mode=True)

# 간단한 데이터 전처리

In [2]:
class MakeDataset():
    """Load the raw interaction CSVs and build train / valid / test frames.

    The constructor reads ``train_data.csv`` and ``test_data.csv`` from
    ``DATA_PATH``, derives three string features from ``assessmentItemID``,
    splits the rows, and adds integer-encoded ``<column>2idx`` columns.

    Splitting rules:
      * a fixed-seed random sample of train users (as many as there are
        distinct test users) contributes its last row to ``valid``;
      * every test user contributes its last row to ``test``;
      * every remaining row (from both files) goes to ``train``.

    Attributes:
        train, valid, test: the three resulting DataFrames.
        num_userID: size needed for an embedding table indexed by raw userID.
        num_assessmentItemID, num_testId, num_KnowledgeTag,
        num_large_paper_number: vocabulary sizes of the encoded columns.
    """

    def __init__(self, DATA_PATH):
        """Read the CSVs under ``DATA_PATH`` and prepare all splits.

        Args:
            DATA_PATH: directory containing ``train_data.csv`` and
                ``test_data.csv``.

        Raises:
            ValueError: from ``random.sample`` when the test file has more
                distinct users than the train file.
        """
        dtype = {
            'userID': 'int16',
            'answerCode': 'int8',
            'KnowledgeTag': 'int16'
        }

        train_df = pd.read_csv(os.path.join(DATA_PATH, 'train_data.csv'), dtype=dtype, parse_dates=['Timestamp'])
        train_df = train_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
        test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_data.csv'), dtype=dtype, parse_dates=['Timestamp'])

        # Slice the item id into its sub-fields with vectorised string ops
        # instead of a Python-level lambda per row.
        for frame in (train_df, test_df):
            item_id = frame['assessmentItemID']
            frame['paper_number'] = item_id.str[1:-3]
            frame['paper_question_number'] = item_id.str[-3:]
            frame['large_paper_number'] = item_id.str[1:4]

        total_user_list = train_df['userID'].unique().tolist()

        # Fixed seed keeps the validation users identical across runs.
        random.seed(22)
        # A set makes the per-user membership test below O(1).
        val_users = set(random.sample(total_user_list, test_df['userID'].nunique()))

        train = []
        valid = []
        test = []

        for userID, df in train_df.groupby('userID'):
            if userID in val_users:
                # Hold out the last row of a validation user.
                train.append(df.iloc[:-1, :])
                valid.append(df.iloc[-1:, :])
            else:
                train.append(df)

        for userID, df in test_df.groupby('userID'):
            # The last row of each test user is the prediction target; the
            # rows before it are used as additional training data.
            train.append(df.iloc[:-1, :])
            test.append(df.iloc[-1:, :])

        train = pd.concat(train)
        valid = pd.concat(valid)
        test = pd.concat(test)

        def get_val2idx(val_list : list) -> dict:
            """Map each value to its position in ``val_list``."""
            return {val: idx for idx, val in enumerate(val_list)}

        all_df = pd.concat([train, valid, test])

        # One value -> index table per categorical column, built over every
        # split so that valid/test values are always known.
        encoders = {
            column: get_val2idx(all_df[column].unique().tolist())
            for column in ('assessmentItemID', 'testId', 'KnowledgeTag', 'large_paper_number')
        }

        for frame in (train, valid, test):
            for column, val2idx in encoders.items():
                frame[f'{column}2idx'] = frame[column].map(val2idx)

        self.train, self.valid, self.test = train, valid, test
        # userID is fed to the embedding layer as-is (it is never re-indexed),
        # so the table needs max(userID) + 1 rows. Counting distinct users is
        # too small whenever ids are not contiguous or a user has no row in
        # ``train``.
        self.num_userID = int(all_df['userID'].max()) + 1
        self.num_assessmentItemID = len(encoders['assessmentItemID'])
        self.num_testId = len(encoders['testId'])
        self.num_KnowledgeTag = len(encoders['KnowledgeTag'])
        self.num_large_paper_number = len(encoders['large_paper_number'])

    def get_data(self):
        """Return the ``(train, valid, test)`` DataFrames."""
        return self.train, self.valid, self.test

In [3]:
class CustomDataset(Dataset):
    """Map-style dataset that serves one interaction row per index.

    Each item is the tuple ``(userID, assessmentItemID, testId, KnowledgeTag,
    large_paper_number)``, taken from the raw ``userID`` column and the
    ``*2idx`` columns of ``df``. Unless ``test`` is true, the row's
    ``answerCode`` is appended as a float.
    """

    def __init__(self, df, test = False):
        """Copy the needed columns of ``df`` into plain Python lists.

        Args:
            df: DataFrame with ``userID`` and the four ``*2idx`` columns
                (plus ``answerCode`` when ``test`` is false).
            test: when true, no labels are stored or returned.
        """
        self.test = test

        # attribute name -> source column
        column_of = {
            'userID': 'userID',
            'assessmentItemID': 'assessmentItemID2idx',
            'testId': 'testId2idx',
            'KnowledgeTag': 'KnowledgeTag2idx',
            'large_paper_number': 'large_paper_number2idx',
        }
        for attr, column in column_of.items():
            setattr(self, attr, df[column].tolist())

        if not test:
            self.answerCode = df['answerCode'].tolist()

    def __len__(self):
        """Return the number of rows."""
        return len(self.userID)

    def __getitem__(self, idx):
        """Return the feature tuple for row ``idx`` (with label unless in test mode)."""
        features = (
            self.userID[idx],
            self.assessmentItemID[idx],
            self.testId[idx],
            self.KnowledgeTag[idx],
            self.large_paper_number[idx],
        )
        if self.test:
            return features
        return features + (float(self.answerCode[idx]),)

# 간단한 모델

In [4]:
class MF(nn.Module):
    """Embedding-sum model followed by a small MLP and a sigmoid output.

    Five embedding tables (user, item, test, knowledge tag, large paper
    number) share the width ``num_factor``. Their vectors are added
    element-wise, passed through ``num_layers`` blocks of
    Dropout -> Linear -> ReLU (each Linear halves the width), and reduced to
    one sigmoid-activated value per sample.
    """

    def __init__(self, num_userID, num_assessmentItemID, num_testId, num_KnowledgeTag, num_large_paper_number, num_factor = 50, num_layers = 3, dropout_rate = 0.5):
        """Create the embedding tables, the MLP tower and the output head.

        Args:
            num_userID, num_assessmentItemID, num_testId, num_KnowledgeTag,
            num_large_paper_number: number of rows of each embedding table.
            num_factor: embedding width, also the MLP input width.
            num_layers: number of Dropout/Linear/ReLU blocks.
            dropout_rate: dropout probability used in every block.
        """
        super().__init__()

        self.userID_emb = nn.Embedding(num_userID, num_factor)
        self.assessmentItemID_emb = nn.Embedding(num_assessmentItemID, num_factor)
        self.testId_emb = nn.Embedding(num_testId, num_factor)
        self.KnowledgeTag_emb = nn.Embedding(num_KnowledgeTag, num_factor)
        self.large_paper_number_emb = nn.Embedding(num_large_paper_number, num_factor)

        # Layer widths: num_factor, then halved (integer division) per block.
        widths = [num_factor]
        for _ in range(num_layers):
            widths.append(widths[-1] // 2)

        tower = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            tower.append(nn.Dropout(p = dropout_rate))
            tower.append(nn.Linear(fan_in, fan_out))
            tower.append(nn.ReLU())
        self.MLP_layers = nn.Sequential(*tower)

        head = [nn.Linear(widths[-1], 1, bias = True), nn.Sigmoid()]
        self.predict_layer = nn.Sequential(*head)

        self._init_weight_()

    def _init_weight_(self):
        """Re-initialise embedding and Linear weights (biases keep their defaults)."""
        tables = (
            self.userID_emb,
            self.assessmentItemID_emb,
            self.testId_emb,
            self.KnowledgeTag_emb,
            self.large_paper_number_emb,
        )
        # Embeddings: zero-mean normal with std = 1 / embedding width.
        for table in tables:
            nn.init.normal_(table.weight, mean = 0, std = 1.0 / table.embedding_dim)

        for layer in self.MLP_layers:
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.xavier_uniform_(layer.weight)

        for layer in self.predict_layer:
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.kaiming_uniform_(layer.weight, a=1)

    def forward(self, userID, assessmentItemID, testId, KnowledgeTag, large_paper_number):
        """Return one sigmoid score per sample as a flat tensor.

        Each argument is an index tensor looked up in the embedding table of
        the same name.
        """
        looked_up = (
            self.userID_emb(userID),
            self.assessmentItemID_emb(assessmentItemID),
            self.testId_emb(testId),
            self.KnowledgeTag_emb(KnowledgeTag),
            self.large_paper_number_emb(large_paper_number),
        )

        # Left-to-right element-wise sum of the five embedding vectors.
        emb = looked_up[0]
        for vector in looked_up[1:]:
            emb = emb + vector

        hidden = self.MLP_layers(emb)
        score = self.predict_layer(hidden)

        return score.view(-1)

In [5]:
from sklearn.metrics import roc_auc_score

def train(model, data_loader, criterion, optimizer):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: module called with the five feature tensors of a batch.
        data_loader: sized iterable of batches laid out as ``(userID,
            assessmentItemID, testId, KnowledgeTag, large_paper_number,
            answerCode)``.
        criterion: loss called as ``criterion(output, answerCode)``.
        optimizer: optimizer stepped once per batch.

    Returns:
        Sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()

    # Use the device the model actually lives on instead of relying on a
    # module-level ``device`` variable being defined (and being in sync).
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_val = 0

    for *features, answerCode in data_loader:
        features = [feature.to(run_device) for feature in features]
        # The loss expects float32 targets.
        answerCode = answerCode.type(torch.float32).to(run_device)

        optimizer.zero_grad()

        output = model(*features)
        loss = criterion(output, answerCode)

        loss.backward()
        optimizer.step()

        loss_val += loss.item()

    loss_val /= len(data_loader)

    return loss_val

def evaluate(model, data_loader):
    """Score ``model`` on labelled batches and return the ROC AUC.

    Args:
        model: module called with the five feature tensors of a batch.
        data_loader: iterable of batches laid out as ``(userID,
            assessmentItemID, testId, KnowledgeTag, large_paper_number,
            answerCode)``.

    Returns:
        ``roc_auc_score(target, pred)`` over all rows of ``data_loader``.

    NOTE(review): ``roc_auc_score`` is expected to reject a target list that
    contains a single class - confirm against the scikit-learn docs.
    """
    model.eval()

    # Use the device the model actually lives on instead of relying on a
    # module-level ``device`` variable being defined (and being in sync).
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    target = []
    pred = []

    with torch.no_grad():
        for *features, answerCode in data_loader:
            features = [feature.to(run_device) for feature in features]

            output = model(*features)

            # Labels are only consumed on the host, so they are not moved to
            # the model's device first.
            target.extend(answerCode.type(torch.float32).cpu().tolist())
            pred.extend(output.cpu().tolist())

    roc_auc = roc_auc_score(target, pred)

    return roc_auc


def predict(model, data_loader):
    """Return the model outputs for every row of an unlabelled loader.

    Args:
        model: module called with the five feature tensors of a batch.
        data_loader: iterable of batches laid out as ``(userID,
            assessmentItemID, testId, KnowledgeTag, large_paper_number)``.

    Returns:
        Flat list of Python floats, in the order the loader yields rows.
    """
    model.eval()

    # Use the device the model actually lives on instead of relying on a
    # module-level ``device`` variable being defined (and being in sync).
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    pred = []

    with torch.no_grad():
        for features in data_loader:
            features = [feature.to(run_device) for feature in features]

            output = model(*features)

            pred.extend(output.cpu().tolist())

    return pred

# 학습

In [6]:
# Optimisation hyper-parameters.
batch_size, epochs, lr = 5000, 20, 0.001

# Run on the GPU whenever PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Input data, checkpoint and submission directories.
DATA_PATH = '/opt/ml/input/data'
MODEL_PATH = '/opt/ml/model'
SUBMISSION_PATH = '/opt/ml/submission'

# File names written inside MODEL_PATH and SUBMISSION_PATH respectively.
model_name, submission_name = 'MF-base.pt', 'baseline-MF.csv'

In [7]:
# Create the checkpoint directory (and any missing parents); a directory that
# already exists is fine, and there is no check-then-create race.
os.makedirs(MODEL_PATH, exist_ok=True)

In [8]:
# Create the submission directory (and any missing parents); a directory that
# already exists is fine, and there is no check-then-create race.
os.makedirs(SUBMISSION_PATH, exist_ok=True)

In [9]:
# Build the splits once; the instance is kept around because the model is
# sized from its num_* attributes.
make_dataset = MakeDataset(DATA_PATH)
(train_df, valid_df, test_df) = make_dataset.get_data()

In [10]:
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and force deterministic cuDNN settings."""
    # Host-side generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU, current GPU, and every GPU (multi-GPU runs).
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Deterministic cuDNN kernels, no auto-tuner.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed_everything(22)

In [11]:
# Settings shared by all three loaders; only `shuffle` differs between them.
common_loader_args = {'batch_size': batch_size, 'drop_last': False}

# Training rows are reshuffled by the loader.
train_dataset = CustomDataset(train_df)
train_data_loader = DataLoader(train_dataset, shuffle = True, **common_loader_args)

# Validation rows keep their order.
valid_dataset = CustomDataset(valid_df)
valid_data_loader = DataLoader(valid_dataset, shuffle = False, **common_loader_args)

# Test rows carry no label and keep their order.
test_dataset = CustomDataset(test_df, test = True)
test_data_loader = DataLoader(test_dataset, shuffle = False, **common_loader_args)

In [12]:
# Embedding-table sizes come from the prepared dataset.
table_sizes = {
    'num_userID': make_dataset.num_userID,
    'num_assessmentItemID': make_dataset.num_assessmentItemID,
    'num_testId': make_dataset.num_testId,
    'num_KnowledgeTag': make_dataset.num_KnowledgeTag,
    'num_large_paper_number': make_dataset.num_large_paper_number,
}
model = MF(**table_sizes).to(device)

# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [None]:
# Train for `epochs` epochs, checkpointing whenever validation ROC AUC improves.
best_roc_auc = 0
for epoch in range(1, epochs + 1):
    # One single-step progress bar per epoch; its description carries the
    # epoch's metrics.
    progress = tqdm(range(1))
    for _ in progress:
        train_loss = train(model, train_data_loader, criterion, optimizer)
        roc_auc = evaluate(model, valid_data_loader)

        if roc_auc > best_roc_auc:
            best_roc_auc = roc_auc
            checkpoint_file = os.path.join(MODEL_PATH, model_name)
            torch.save(model.state_dict(), checkpoint_file)

        progress.set_description(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| roc_auc: {roc_auc:.5f}')

In [15]:
# Rebuild the architecture and restore the saved checkpoint before predicting
# on the test rows.
model = MF(
    num_userID = make_dataset.num_userID, 
    num_assessmentItemID = make_dataset.num_assessmentItemID, 
    num_testId = make_dataset.num_testId, 
    num_KnowledgeTag = make_dataset.num_KnowledgeTag, 
    num_large_paper_number = make_dataset.num_large_paper_number,).to(device)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can still be loaded on a CPU-only machine.
state_dict = torch.load(os.path.join(MODEL_PATH, model_name), map_location = device)
model.load_state_dict(state_dict)
pred_list = predict(model = model, data_loader = test_data_loader)

In [16]:
# One row per prediction; `id` is the row's position in pred_list.
submission = pd.DataFrame(np.array(pred_list), columns = ['prediction'])
submission = submission.reset_index().rename(columns = {'index': 'id'})
submission = submission[['id', 'prediction']]

submission_file = os.path.join(SUBMISSION_PATH, submission_name)
submission.to_csv(submission_file, index = False)