In [1]:
# Run switches.
COMPILE = 0        # truthy -> skip training and the local test loop
VERSION = 'v2'     # sub-folder of MODELS_DIR that holds the saved weights

# Read-only Kaggle inputs.
DATA_DIR = '/kaggle/input/riiid-test-answer-prediction/'
PARQUETS_DIR = '/kaggle/input/parquets/'
MODELS_DIR = '/kaggle/input/riiid-answer-correctness-prediction-models/'

# Writable output location.
OUT_DIR = '/kaggle/working/'

In [2]:
# Import the usual packages and fix the random seeds.
# Reminder: the torch seed must be fixed too, not only the NumPy one.
# NB: many of these imports are unused; they are kept for convenience.
import riiideducation 

import os
import numpy as np
import pandas as pd
import pickle
import gc
import time
import itertools

from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score, auc, roc_curve
from sklearn.model_selection import train_test_split

from contextlib import contextmanager
from pathlib import Path
from time import time
from collections import namedtuple

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

# Seed every RNG the notebook relies on so that the train/validation split,
# the per-user window sampling and the weight initialisation are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
gc.enable()

from torch.nn.modules.dropout import Dropout
from torch.nn.modules.activation import MultiheadAttention
from torch.nn.modules.normalization import LayerNorm

In [3]:
TARGET = 'answered_correctly'   # label column
KEY_FEATURE = 'user_id'         # column the interactions are grouped by

# Only these three inputs are active. The other candidate columns
# (prior_question_had_explanation, tag1..tag6 and the user/content/tag
# count, mean and harmonic-mean aggregates) are currently disabled.
FEATURES = ['content_id', 'prior_question_elapsed_time', 'part']

# Python type each active feature is cast to before it reaches the model.
DTYPES = {
    'content_id': int,
    'prior_question_elapsed_time': int,
    'part': int,
}

CAT_FEATURES = ['part']

# Transformer Model

In [4]:
# File name of the saved model weights (read by load_model, written by train_transformer).
MODEL_FILENAME = 'transformer_best.pth'
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Length of every input window; create_model also passes this value as d_model.
SEQ_LEN = 100
# Value handed to the datasets as pad_value.
PAD_VALUE = 0
# Decoder token used where no previous answer is available (first shifted target / first window slot).
START_TOKEN = 2

In [5]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class TransformerModel(nn.Transformer):
    """Encoder-decoder transformer over exercise sequences.

    The encoder consumes the sum of exercise, position, part and elapsed-time
    embeddings; the decoder consumes embeddings of the shifted answers
    (0, 1 or the start token 2) under a causal mask. A final linear layer
    maps every decoder position to a single logit.

    All keyword arguments are forwarded unchanged to ``nn.Transformer``
    (``d_model``, ``nhead``, ``num_encoder_layers``, ...).

    NOTE: the positional embedding table has ``d_model`` rows, so sequences
    longer than ``d_model`` positions cannot be embedded.
    """

    def __init__(self, **params):
        print(params)  # echo the hyper-parameters into the notebook output
        super(TransformerModel, self).__init__(**params)
        self.pos_embedding = nn.Embedding(self.d_model, self.d_model)  # positional embeddings
        self.exercise_embeddings = nn.Embedding(num_embeddings=13523, embedding_dim=self.d_model)  # content_id
        self.part_embeddings = nn.Embedding(num_embeddings=7 + 1, embedding_dim=self.d_model)  # part
        self.elapsed_time_embeddings = nn.Embedding(num_embeddings=301, embedding_dim=self.d_model)  # prior_question_elapsed_time
        self.target_embeddings = nn.Embedding(num_embeddings=3, embedding_dim=self.d_model)  # 0 / 1 / start token
        self.linear = nn.Linear(self.d_model, 1)
        self.init_weights()

    @property
    def device(self):
        """Device the parameters currently live on (follows ``.to()`` / ``.cuda()``)."""
        return next(self.parameters()).device

    def get_future_mask(self):
        """Return a (d_model, d_model) boolean mask that is True strictly above the diagonal."""
        future_mask = np.triu(np.ones((self.d_model, self.d_model)), k=1).astype('bool')
        return torch.from_numpy(future_mask)

    def init_weights(self):
        """Re-initialise the feature and target embeddings uniformly in [-0.1, 0.1]."""
        initrange = 0.1
        self.exercise_embeddings.weight.data.uniform_(-initrange, initrange)
        self.part_embeddings.weight.data.uniform_(-initrange, initrange)
        self.elapsed_time_embeddings.weight.data.uniform_(-initrange, initrange)
        self.target_embeddings.weight.data.uniform_(-initrange, initrange)

    def forward(self, encoder_input, decoder_input, mask_pad=None):
        """Compute one logit per sequence position.

        With S the sequence length, N the batch size and E = d_model:

        encoder_input: dict of (N, S) integer tensors keyed by 'content_id',
            'part' and 'prior_question_elapsed_time'.
        decoder_input: (N, S) integer tensor of shifted answers.
        mask_pad: optional (N, S) boolean tensor, True where the position is
            padding; it is passed as ``src_key_padding_mask``.

        Returns an (N, S) tensor of logits.
        """
        content_ids = encoder_input['content_id']
        # Positions are created on the input's device and broadcast over the batch.
        positions = torch.arange(content_ids.shape[1], device=content_ids.device)

        embedded_src = (self.exercise_embeddings(content_ids)
                        + self.pos_embedding(positions)
                        + self.part_embeddings(encoder_input['part'])
                        + self.elapsed_time_embeddings(encoder_input['prior_question_elapsed_time'])
                        )  # (N, S, E)
        embedded_src = embedded_src.transpose(0, 1) * math.sqrt(self.d_model)  # (S, N, E)

        embedded_tgt = self.target_embeddings(decoder_input).transpose(0, 1)  # (S, N, E)

        # Build the causal mask for the actual target length on the actual
        # device; a mask frozen at construction time breaks as soon as the
        # sequence length differs from d_model or the model is moved.
        tgt_mask = self.generate_square_subsequent_mask(embedded_tgt.shape[0]).to(embedded_tgt.device)

        output = super(TransformerModel, self).forward(src=embedded_src, tgt=embedded_tgt,
                                                       tgt_mask=tgt_mask,
                                                       src_key_padding_mask=mask_pad)

        return self.linear(output.transpose(1, 0)).squeeze(-1)  # (N, S)

In [6]:
def load_data():
    """Read the merged training parquet, keeping only the key, feature and target columns."""
    wanted_columns = [KEY_FEATURE, *FEATURES, TARGET]
    parquet_path = PARQUETS_DIR + 'train_merged.parquet'
    return pd.read_parquet(parquet_path, columns=wanted_columns)

def split_train_valid(dt, size):
    """Split ``dt`` into train and validation frames by user.

    dt: DataFrame containing the ``KEY_FEATURE`` column.
    size: fraction of the distinct users that go to the validation frame.

    Returns ``(train, valid)``; all rows of a given user end up in exactly
    one of the two frames. Uses the global NumPy RNG.
    """
    uids = dt[KEY_FEATURE].unique()
    n_val = int(len(uids) * size)
    # replace=False: sampling with replacement returns duplicate users and
    # silently shrinks the validation set below the requested fraction.
    val_uids = np.random.choice(uids, n_val, replace=False)
    # A boolean mask works for any index, whereas drop(val.index) removes
    # the wrong rows when index labels repeat.
    in_val = dt[KEY_FEATURE].isin(val_uids)
    return dt[~in_val], dt[in_val]

def preprocess(data):
    """Fill missing ``prior_question_elapsed_time`` with 0 and floor-divide it by 1000.

    The column is rewritten on ``data`` itself and the same frame is
    returned, so calling this twice on one frame divides twice.
    """
    column = "prior_question_elapsed_time"
    # Assign the result back instead of fillna(inplace=True) on a column
    # selection: the chained in-place form does nothing under pandas
    # copy-on-write and would leave the NaNs in place.
    data[column] = data[column].fillna(0) // 1000
    return data

def pad_batch(batch, batch_len=SEQ_LEN, pad_value=0):
    """Left-pad ``batch`` along its first axis up to ``batch_len`` entries.

    Only the leading axis grows; the new leading entries are filled with
    ``pad_value`` and every other axis is left untouched.
    """
    n_missing = batch_len - batch.shape[0]
    trailing_axes = [(0, 0)] * (batch.ndim - 1)
    pad_widths = [(n_missing, 0)] + trailing_axes
    return np.pad(batch, pad_widths, constant_values=pad_value)

def rolling_window(a, w):
    """Return every length-``w`` run of consecutive rows of the 2-D array ``a``.

    The result has shape (rows - w + 1, w, cols) and is a strided view on
    ``a``: windows overlap in memory, so a write through one window is
    visible in its neighbours. Copy before modifying.
    """
    row_stride, col_stride = a.strides
    n_rows, n_cols = a.shape
    window_shape = (n_rows - w + 1, w, n_cols)
    window_strides = (row_stride, row_stride, col_stride)
    return np.lib.stride_tricks.as_strided(a, shape=window_shape, strides=window_strides)

def make_time_series(x, windows_size, pad_value=0):
    """Turn a 2-D array into one trailing window per row.

    ``x`` is left-padded with ``windows_size - 1`` rows of ``pad_value`` and
    then cut into overlapping windows, so window ``i`` ends at row ``i`` of
    the original array. The result has shape (len(x), windows_size, cols)
    and is a strided view on the padded array (see ``rolling_window``).
    """
    # Honour pad_value: it used to be accepted but ignored in favour of a literal 0.
    padded = np.pad(x, [[windows_size - 1, 0], [0, 0]], constant_values=pad_value)
    return rolling_window(padded, windows_size)

def create_model():
    """Build a fresh, untrained TransformerModel with the notebook's fixed hyper-parameters."""
    hyper_params = dict(
        d_model=SEQ_LEN,  # the embedding width is tied to the window length
        nhead=4,
        num_encoder_layers=3,
        num_decoder_layers=3,
        dim_feedforward=128,
        dropout=0.3,
    )
    return TransformerModel(**hyper_params)

def load_model():
    """Create the model and load saved weights into it.

    The checkpoint is looked up, in order, in the current directory, in
    ``OUT_DIR`` (where ``train_transformer`` saves it) and finally in the
    versioned folder under ``MODELS_DIR``. The returned model lives on
    ``DEVICE``.
    """
    estimator = create_model()
    local_candidates = [MODEL_FILENAME, OUT_DIR + MODEL_FILENAME]
    filepath = next((path for path in local_candidates if os.path.exists(path)),
                    MODELS_DIR + f'transformer/{VERSION}/{MODEL_FILENAME}')
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only
    # machine; .to(DEVICE) keeps the inference-only path (no training call
    # that would move the model) consistent with the inputs sent to DEVICE.
    estimator.load_state_dict(torch.load(filepath, map_location=DEVICE))
    return estimator.to(DEVICE)

In [7]:
class Riiid(torch.utils.data.Dataset):
    """Per-user dataset: one item is the stack of trailing windows of one user.

    groups: a pandas groupby (keyed by user) whose groups hold the feature
        columns followed by the target column.
    seq_len: window length.
    pad_value: stored for callers; the windows themselves are always
        zero-padded because the padding mask is derived from those zeros.
    max_samples_per_user: if set, at most this many windows are returned
        per user, drawn at random with the global NumPy RNG.
    """

    def __init__(self, groups, seq_len, pad_value=0, max_samples_per_user=None):
        self.groups = groups
        self.uids = list(groups.groups)
        self.seq_len = seq_len
        self.pad_value = pad_value
        self.max_samples_per_user = max_samples_per_user

    def __len__(self):
        return len(self.uids)

    def __getitem__(self, idx):
        """Return an array of shape (n_windows, seq_len, n_columns).

        The last three columns are, in order: target, padding mask
        (1 = padded position) and decoder input (previous answer, with
        ``START_TOKEN`` in the first slot of every window).
        """
        uid = self.uids[idx]
        g = self.groups.get_group(uid).copy()
        g['mask'] = 1  # real rows are 1, zero-padding is 0; inverted below
        g['decoder_input'] = g[TARGET].shift(fill_value=START_TOKEN)
        windows = make_time_series(g.values, self.seq_len, pad_value=0)

        n_sequences = len(windows)
        limit = self.max_samples_per_user
        if limit is not None and n_sequences > limit:
            # Sample distinct windows; integer-array indexing returns a copy.
            chosen = np.random.choice(n_sequences, limit, replace=False)
            windows = windows[chosen]
        else:
            # The windows are an overlapping strided view. Writing the start
            # token through the view would overwrite the decoder input of
            # rows shared with other windows, so detach from it first.
            windows = windows.copy()

        windows[:, :, -2] = np.logical_not(windows[:, :, -2])
        windows[:, 0, -1] = START_TOKEN
        return windows

def collate_fn(batch):
    """Merge per-user window stacks into one array laid out as (column, sequence, position)."""
    all_windows = np.concatenate(batch)
    return np.transpose(all_windows, axes=(2, 0, 1))

In [8]:
def train_epoch(estimator, train_iterator, optim, criterion, device="cpu", batch_limit=128):
    """Run one training pass and return ``(mean_loss, accuracy)``.

    train_iterator yields arrays laid out as (column, sequence, position):
    the feature columns in ``FEATURES`` order first and target, padding mask
    and decoder input as the last three columns. Each yielded array is
    processed in chunks of at most ``batch_limit`` sequences, with one
    optimizer step per chunk.

    Loss and accuracy are computed over non-padded positions only, and the
    accuracy is the fraction of those positions predicted correctly.
    """
    estimator.train()

    tbar = tqdm(train_iterator)
    num_corrects = 0
    loss_sum = 0
    batch_count = 0
    sample_count = 0

    for batch in tbar:
        encoder_input = {}
        for i, feat in enumerate(FEATURES):
            if DTYPES[feat] is int:
                encoder_input[feat] = torch.as_tensor(batch[i]).to(device).long()
            elif DTYPES[feat] is bool:
                encoder_input[feat] = torch.as_tensor(batch[i]).to(device).bool()

        decoder_input = torch.as_tensor(batch[-1]).to(device).long()
        mask = torch.as_tensor(batch[-2]).to(device).bool()
        labels = torch.as_tensor(batch[-3]).to(device).float()

        n_samples = len(labels)
        n_batches = int(np.ceil(n_samples / batch_limit))
        for nbatch in range(n_batches):
            chunk = slice(nbatch * batch_limit, (nbatch + 1) * batch_limit)

            optim.zero_grad()
            output = estimator(encoder_input={name: feat[chunk] for name, feat in encoder_input.items()},
                               decoder_input=decoder_input[chunk],
                               mask_pad=mask[chunk])

            # Padded slots carry a dummy label of 0; scoring them would teach
            # the model to predict "wrong" on padding and distort the metrics.
            valid = ~mask[chunk]
            valid_logits = output[valid]
            valid_labels = labels[chunk][valid]

            loss = criterion(valid_logits, valid_labels)
            loss.backward()
            optim.step()

            loss_sum += loss.item()
            pred = (torch.sigmoid(valid_logits) >= 0.5).float()
            num_corrects += (pred == valid_labels).sum().item()
            batch_count += 1
            # Count scored positions, not sequences, so accuracy stays in [0, 1].
            sample_count += valid_labels.numel()

            tbar.set_description(f'{nbatch+1}/{n_batches} | ' + 'trn_loss - {:.4f}'.format(loss_sum / batch_count))

    acc = num_corrects / max(sample_count, 1)
    loss = loss_sum / max(batch_count, 1)

    return loss, acc

In [9]:
def val_epoch(estimator, val_iterator, criterion, device="cpu", batch_limit=128):
    """Evaluate the model and return ``(mean_loss, accuracy, auc)``.

    val_iterator yields arrays laid out as (column, sequence, position):
    the feature columns in ``FEATURES`` order first and target, padding mask
    and decoder input as the last three columns. Each yielded array is
    evaluated in chunks of at most ``batch_limit`` sequences.

    All three metrics are computed over non-padded positions only; the AUC
    is a single score over the flattened logits of those positions.
    """
    estimator.eval()

    loss_sum = 0
    batch_count = 0
    num_corrects = 0
    sample_count = 0
    targets = []
    outs = []

    tbar = tqdm(val_iterator)
    for batch in tbar:
        encoder_input = {}
        for i, feat in enumerate(FEATURES):
            if DTYPES[feat] is int:
                encoder_input[feat] = torch.as_tensor(batch[i]).to(device).long()
            elif DTYPES[feat] is bool:
                encoder_input[feat] = torch.as_tensor(batch[i]).to(device).bool()

        decoder_input = torch.as_tensor(batch[-1]).to(device).long()
        mask = torch.as_tensor(batch[-2]).to(device).bool()
        labels = torch.as_tensor(batch[-3]).to(device).float()

        n_samples = len(labels)
        n_batches = int(np.ceil(n_samples / batch_limit))
        for nbatch in range(n_batches):
            chunk = slice(nbatch * batch_limit, (nbatch + 1) * batch_limit)
            with torch.no_grad():
                output = estimator(encoder_input={name: feat[chunk] for name, feat in encoder_input.items()},
                                   decoder_input=decoder_input[chunk],
                                   mask_pad=mask[chunk])

                # Score real positions only; padding carries a dummy label of 0.
                valid = ~mask[chunk]
                valid_logits = output[valid]
                valid_labels = labels[chunk][valid]
                loss = criterion(valid_logits, valid_labels)

            loss_sum += loss.item()
            batch_count += 1

            pred = (torch.sigmoid(valid_logits) >= 0.5).float()
            num_corrects += (pred == valid_labels).sum().item()
            sample_count += valid_labels.numel()
            # Keep 1-D chunks: passing 2-D arrays to roc_auc_score yields a
            # per-position macro average instead of one AUC.
            targets.append(valid_labels.cpu().numpy())
            outs.append(valid_logits.cpu().numpy())

            tbar.set_description(f'{nbatch+1}/{n_batches} | ' + 'val_loss - {:.4f}'.format(loss_sum / batch_count))

    acc = num_corrects / max(sample_count, 1)
    val_auc = roc_auc_score(np.concatenate(targets), np.concatenate(outs))
    loss = loss_sum / max(batch_count, 1)

    return loss, acc, val_auc
        

In [10]:
def train_transformer(estimator, train, valid, lr=1e-3, epochs=10, n_user_batches=32, batch_limit=128,
                      max_samples_per_user=100, device="cpu", early_stopping=2, eps=1e-4, nworkers=4):
    """Train ``estimator`` with Adam and BCE-with-logits, with early stopping on validation AUC.

    train / valid: DataFrames holding ``KEY_FEATURE``, ``FEATURES`` and ``TARGET``.
    n_user_batches: number of users per DataLoader batch.
    batch_limit: maximum number of sequences per optimizer step.
    max_samples_per_user: cap on training windows per user (validation uses all).
    early_stopping: stop after this many consecutive epochs without an AUC
        gain larger than ``eps``.

    Whenever the validation AUC improves, the weights are saved to
    ``OUT_DIR + MODEL_FILENAME``. The returned estimator carries the best
    saved weights, not necessarily those of the last epoch.
    """
    columns = FEATURES + [TARGET]
    checkpoint_path = OUT_DIR + MODEL_FILENAME

    trn_dataset = Riiid(groups=train.groupby(KEY_FEATURE)[columns],
                        seq_len=SEQ_LEN,
                        pad_value=PAD_VALUE,
                        max_samples_per_user=max_samples_per_user)
    # shuffle=True: otherwise every epoch visits the users in the same order.
    trn_dataloader = torch.utils.data.DataLoader(dataset=trn_dataset,
                                                 batch_size=n_user_batches,
                                                 shuffle=True,
                                                 collate_fn=collate_fn,
                                                 num_workers=nworkers)

    val_dataset = Riiid(groups=valid.groupby(KEY_FEATURE)[columns],
                        seq_len=SEQ_LEN, pad_value=PAD_VALUE,
                        max_samples_per_user=None)
    val_dataloader = torch.utils.data.DataLoader(dataset=val_dataset,
                                                 batch_size=n_user_batches,
                                                 collate_fn=collate_fn,
                                                 num_workers=nworkers)

    optimizer = torch.optim.Adam(estimator.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    estimator.to(device)
    criterion.to(device)

    stale_epochs = 0
    best_auc = 0
    checkpoint_saved = False
    for epoch in range(epochs):
        trn_loss, trn_acc = train_epoch(estimator, trn_dataloader, optimizer, criterion, device, batch_limit)
        print("Training epoch {} - loss:{:.4f} - acc: {:.4f}".format(epoch + 1, trn_loss, trn_acc))

        val_loss, val_acc, val_auc = val_epoch(estimator, val_dataloader, criterion, device, batch_limit)
        print("Validation epoch {} - loss: {:.4f} - acc: {:.4f}, auc: {:.6f}".format(epoch + 1, val_loss, val_acc, val_auc))

        if val_auc > best_auc + eps:
            best_auc = val_auc
            stale_epochs = 0
            torch.save(estimator.state_dict(), checkpoint_path)
            checkpoint_saved = True
        else:
            stale_epochs += 1

        if stale_epochs >= early_stopping:
            print("early stop epoch ", epoch + 1)
            break

    # If the final epoch(s) did not improve, roll back to the best checkpoint
    # so the in-memory model matches the file that was saved.
    if checkpoint_saved and stale_epochs > 0:
        estimator.load_state_dict(torch.load(checkpoint_path, map_location=device))

    return estimator

In [None]:
# Switches for this cell.
retrain_transformer = 1  # falsy -> just load the saved weights
cont = 0                 # truthy -> continue training from the saved weights

if COMPILE or not retrain_transformer:
    # Inference-only run: reuse the stored checkpoint.
    model = load_model()
else:
    data = preprocess(load_data())
    df_train, df_valid = split_train_valid(data, 0.2)
    # The full frame is no longer needed once it has been split.
    del data
    gc.collect()

    model = load_model() if cont else create_model()

    model = train_transformer(
        model, df_train, df_valid,
        lr=1e-3,
        epochs=1,
        n_user_batches=32,
        batch_limit=512,
        max_samples_per_user=32,
        early_stopping=2,
        eps=1e-4,
        nworkers=1,
        device=DEVICE,
    )

{'d_model': 100, 'nhead': 4, 'num_encoder_layers': 3, 'num_decoder_layers': 3, 'dim_feedforward': 128, 'dropout': 0.3}


HBox(children=(FloatProgress(value=0.0, max=5034.0), HTML(value='')))

# Test

In [None]:
class RiiidTest(torch.utils.data.Dataset):
    """Inference dataset: one item per query row, combined with that user's history.

    data: 2-D array of past interactions with columns
        ``[KEY_FEATURE] + FEATURES + [TARGET]``.
    queries: 2-D array of rows to predict with columns
        ``[KEY_FEATURE] + FEATURES``.
    seq_len: length every returned sequence is left-padded to.
    pad_value: fill value for padded feature and decoder slots.
    """

    def __init__(self, data, queries, seq_len, pad_value=0):
        self.data = data
        self.dcols = {col: i for i, col in enumerate([KEY_FEATURE] + FEATURES + [TARGET])}
        self.queries = queries
        self.seq_len = seq_len
        self.pad_value = pad_value

    def __len__(self):
        return len(self.queries)

    def __getitem__(self, idx):
        """Return a (seq_len, n_features + 2) array: features, padding mask, decoder input.

        The query occupies the last row. The mask is 1 on padded rows and 0
        elsewhere. The decoder column holds the start token followed by the
        answers of the history rows.
        """
        key_col = self.dcols[KEY_FEATURE]
        target_col = self.dcols[TARGET]

        query = self.queries[[idx]]
        uid = query[0, key_col]
        query = np.delete(query, key_col, axis=1)

        # Keep at most seq_len - 1 history rows so the query still fits.
        history = self.data[self.data[:, key_col] == uid][-self.seq_len + 1:]
        labels = history[:, target_col]
        history = np.delete(history, [key_col, target_col], axis=1)

        if len(history) == 0:
            encoder_data = query
        else:
            encoder_data = np.r_[history, query]
        # Use the shared START_TOKEN constant rather than a literal 2.
        decoder_data = np.r_[START_TOKEN, labels]
        mask = np.zeros(encoder_data.shape[0])

        encoder_data = pad_batch(encoder_data, self.seq_len, self.pad_value)
        decoder_data = pad_batch(decoder_data, self.seq_len, self.pad_value)
        mask = pad_batch(mask, self.seq_len, 1)

        return np.c_[encoder_data, mask, decoder_data]
    
def collate_fn_test(batch):
    """Stack per-query arrays and lay the result out as (column, query, position)."""
    stacked = np.array(batch)
    return np.transpose(stacked, axes=(2, 0, 1))

In [None]:
def predict_test(estimator, tst_iterator, device="cpu"):
    """Run ``estimator`` over ``tst_iterator`` and return per-position probabilities.

    tst_iterator yields arrays laid out as (column, query, position): the
    feature columns in ``FEATURES`` order first, padding mask and decoder
    input as the last two columns.

    Returns a list with one array per query holding the sigmoid of the
    model output at every sequence position.
    """
    estimator.eval()

    preds = []
    for batch in tst_iterator:
        encoder_input = {}
        for i, feat in enumerate(FEATURES):
            if DTYPES[feat] is int:
                encoder_input[feat] = torch.as_tensor(batch[i].astype(np.int64)).to(device).long()
            elif DTYPES[feat] is bool:
                encoder_input[feat] = torch.as_tensor(batch[i].astype(bool)).to(device).bool()

        decoder_input = torch.as_tensor(batch[-1].astype(np.int64)).to(device).long()
        mask = torch.as_tensor(batch[-2].astype(bool)).to(device).bool()

        with torch.no_grad():
            # Call the estimator that was passed in, not the notebook-global `model`.
            output = estimator(encoder_input=encoder_input, decoder_input=decoder_input, mask_pad=mask)
        output = torch.sigmoid(output)
        preds.extend(output.data.cpu().numpy())

    return preds

In [None]:
# Columns that predict_batch appends to every incoming test batch.
ADDED_FEATURES = ['part']

In [None]:
import psutil

def predict_batch(estimator, tst_batch, prev_batch, prev_data, batch_size=128):
    """Predict one test batch and roll the history forward.

    estimator: model passed on to ``predict_test``.
    tst_batch: DataFrame of the current test group (modified by ``preprocess``).
    prev_batch: array returned as 'prev_batch' by the previous call, or None.
    prev_data: 2-D history array with columns
        ``[KEY_FEATURE] + FEATURES + [TARGET]``.
    batch_size: DataLoader batch size used for inference.

    Returns a dict with
      'preds': DataFrame (row_id, TARGET) with one row per question row,
      'prev_batch': the current batch (plus 'part') to pass to the next call,
      'prev_data': the history, extended with the previous batch's answers.
    """
    import ast  # local import: only needed to parse the answers string safely

    all_cols = list(tst_batch.columns) + ADDED_FEATURES + [TARGET]
    all_cols = dict(zip(all_cols, range(len(all_cols))))
    used_cols = [all_cols[feat] for feat in [KEY_FEATURE] + FEATURES]

    tst_batch = preprocess(tst_batch).values

    # The first row of a group carries the answers of the previous group.
    # They are skipped when memory is nearly exhausted, which loses that
    # part of the history but keeps the run alive.
    if prev_batch is not None and psutil.virtual_memory().percent < 90:
        # literal_eval instead of eval: the string comes from the test data
        # and must never be executed as code.
        answers = ast.literal_eval(tst_batch[0, all_cols['prior_group_answers_correct']])
        prev_batch = np.c_[prev_batch, answers]
        prev_batch = prev_batch[prev_batch[:, all_cols['content_type_id']] == 0][:, used_cols + [all_cols[TARGET]]]
        prev_data = np.r_[prev_data, prev_batch]

    # Look up the 'part' of every content id; unknown ids get 0.
    parts = np.array([TAGS_DF[cid]['part'] if cid in TAGS_DF else 0
                      for cid in tst_batch[:, all_cols['content_id']]])
    tst_batch = np.c_[tst_batch, parts]
    prev_batch = tst_batch.copy()

    # Only rows with content_type_id == 0 are scored.
    qrows = tst_batch[:, all_cols['content_type_id']] == 0
    questions = tst_batch[qrows]
    tst_dataset = RiiidTest(prev_data, questions[:, used_cols], SEQ_LEN)
    tst_dataloader = torch.utils.data.DataLoader(dataset=tst_dataset, batch_size=batch_size,
                                                 collate_fn=collate_fn_test, num_workers=8)

    preds = np.asarray(predict_test(estimator, tst_dataloader, DEVICE), dtype=float)
    if preds.ndim == 2:
        # Sequences are left-padded, so the query sits in the LAST slot;
        # the first slot is usually padding.
        preds = preds[:, -1]

    # Pair predictions with the question rows they belong to; pairing them
    # with the unfiltered batch breaks whenever other rows are present.
    predictions = pd.DataFrame({'row_id': questions[:, all_cols['row_id']].astype(np.int64),
                                TARGET: preds})

    return {'preds': predictions,
            'prev_batch': prev_batch,
            'prev_data': prev_data}

In [None]:
# Truthy -> replay example_test.csv locally instead of using the competition API.
local_test = 1

# content_id -> {'part': ...} lookup used by predict_batch.
TAGS_DF = pd.read_parquet(PARQUETS_DIR + 'tags.parquet').to_dict('index')
# Seed the history with the last SEQ_LEN training interactions of every user.
previous_data = preprocess(load_data()).groupby(KEY_FEATURE).tail(SEQ_LEN).values
previous_batch = None
model.eval()

if local_test and not COMPILE:
    example_test = pd.read_csv(DATA_DIR + 'example_test.csv')

    # Collect the per-group frames and concatenate once: DataFrame.append
    # copies the whole frame on every call and no longer exists in pandas 2.
    group_predictions = []
    for gnum in tqdm(example_test['group_num'].unique()):
        test_batch = example_test[example_test['group_num'] == gnum].copy()
        result = predict_batch(model, test_batch, previous_batch, previous_data, batch_size=10)
        previous_batch = result['prev_batch']
        previous_data = result['prev_data']
        group_predictions.append(result['preds'])

    if group_predictions:
        submission = pd.concat(group_predictions, ignore_index=True)
    else:
        submission = pd.DataFrame(columns=['row_id', TARGET])
    print(submission)
else:
    env = riiideducation.make_env()
    for test_batch, _ in tqdm(env.iter_test()):
        result = predict_batch(model, test_batch, previous_batch, previous_data, batch_size=128)
        previous_batch = result['prev_batch']
        previous_data = result['prev_data']
        env.predict(result['preds'])