In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import pickle

## Load data

In [2]:
%%time
with open('../input/rid-group-w-lag-time/group_w_lag_time.p','rb') as f:
    group=pickle.load(f)

CPU times: user 7.89 s, sys: 3.64 s, total: 11.5 s
Wall time: 35 s


In [3]:
!cp ../input/rid-test110/Network.py .
from Network import *


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

n_skill=13523
MAX_SEQ=129


def _load_ensemble_member(checkpoint_path, nlayers, **model_kwargs):
    """Build one SAKTModel, restore its weights and return it in eval mode.

    Args:
        checkpoint_path: path of the saved state dict.
        nlayers: value forwarded to SAKTModel's `nlayers` argument.
        **model_kwargs: extra SAKTModel keyword arguments (e.g. `embed_dim`, `nheads`).

    Returns:
        The `nn.DataParallel`-wrapped model, moved to `device`, in eval mode.
    """
    model = SAKTModel(n_skill, max_seq=MAX_SEQ, nlayers=nlayers, **model_kwargs).to(device)
    # The state dict is loaded into the DataParallel wrapper, not the bare model
    # (presumably the checkpoints were saved from a wrapped model - TODO confirm).
    model = nn.DataParallel(model)
    # map_location keeps GPU-saved checkpoints loadable when `device` falls back to CPU.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()
    return model


layers=[6]

# One (checkpoint path, SAKTModel keyword arguments) pair per ensemble member.
ensemble_specs = [
    ("../input/rid-test110-loss-weight/model1.pth", {"embed_dim": 256}),
    ("../input/rid-test110-loss-weight-12head/model1.pth", {"embed_dim": 384, "nheads": 12}),
]

# Same order as before: every entry of `layers` for the first spec, then for the second.
models = [
    _load_ensemble_member(checkpoint_path, nlayer, **model_kwargs)
    for checkpoint_path, model_kwargs in ensemble_specs
    for nlayer in layers
    if nlayer is not None
]


In [5]:
# Keep only the first six entries of every user's stored history; the test
# dataset and the update loop below unpack exactly six of them.
for uid in group.index:
    full_history = group[uid]
    group[uid] = full_history[:6]

In [6]:
question_cluster=pd.read_csv('../input/rid-tag-community/question_cmnts.csv')

question_df=pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')


def _parse_tags(raw_tags):
    """Convert one `tags` cell into a list of integer tag ids.

    Args:
        raw_tags: whitespace-separated tag ids as a string, or a non-string
            value (e.g. NaN) when the question has no tags.

    Returns:
        List of ints; empty when `raw_tags` is not a string.

    Raises:
        ValueError: if a token is not an integer literal.
    """
    if not isinstance(raw_tags, str):
        # Missing tags: previously swallowed by a bare `except`, now handled explicitly.
        return []
    return [int(token) for token in raw_tags.split()]


tags_per_question = [_parse_tags(raw_tags) for raw_tags in question_df.tags]

# Distinct tag ids in first-seen order (dict keys preserve insertion order).
possible_tags = list(dict.fromkeys(tag for tags in tags_per_question for tag in tags))

n_tags = len(possible_tags)
# Tag ids are used directly as column indices below, so they must fit the matrix width.
if possible_tags and not (0 <= min(possible_tags) and max(possible_tags) < n_tags):
    raise ValueError(
        "tag ids must lie in [0, number of distinct tags) to be used as column indices"
    )

# Multi-hot matrix: one row per question, one column per tag id.
tag_encoding = np.zeros((len(question_df), n_tags))
for row, tags in enumerate(tags_per_question):
    for tag in tags:
        tag_encoding[row, tag] = 1

## Test

In [7]:
def task_mask(tasks):
    """Build a boolean (n, n) mask from a 1-D array of task container ids.

    Entry [i, j] is True when j > i (strictly above the diagonal) or when
    tasks[i] == tasks[j]; the diagonal itself is always False.

    Args:
        tasks: 1-D numpy array of length n.

    Returns:
        Boolean numpy array of shape (n, n).
    """
    n = len(tasks)
    ones = np.ones((n, n))
    # Broadcast the ids along rows and along columns, then compare element-wise.
    by_row = ones * tasks.reshape(-1, 1)
    by_col = ones * tasks.reshape(1, -1)
    same_container = by_col == by_row
    strictly_upper = np.triu(ones, k=1).astype('bool')
    combined = np.logical_or(strictly_upper, same_container)
    np.fill_diagonal(combined, False)
    return combined


class TestDataset(Dataset):
    """Dataset yielding, for each row of a test batch, that user's history plus the new row.

    Args:
        samples: object indexed by user_id (supports `.index` and `samples[user_id]`)
            whose entries unpack into six arrays: content ids, answers, elapsed
            times, explanation flags, timestamps and task container ids.
        test_df: DataFrame with one row per question to predict.
        question_cluster: DataFrame with a `community` column, indexed by question id.
        tag_encoding: 2-D array of per-question tag indicators.
        max_seq: number of history positions kept per user.
    """

    def __init__(self, samples, test_df, question_cluster=question_cluster, tag_encoding=tag_encoding, max_seq=MAX_SEQ):
        super(TestDataset, self).__init__()
        self.samples = samples
        self.user_ids = list(test_df["user_id"].unique())
        self.test_df = test_df
        # Also used as the padding question id: it indexes the extra row appended below.
        self.n_skill = 13523
        self.max_seq = max_seq
        # Extra trailing entry so the padding id has a cluster value (5).
        self.question_cluster=np.append(question_cluster.community.values,[5])
        # All-zero tag row for the padding id; width follows the encoding
        # instead of a hard-coded 188.
        pad_row = np.zeros((1, tag_encoding.shape[1]))
        self.tag_encoding=np.concatenate([tag_encoding,pad_row],0)

    def __len__(self):
        return self.test_df.shape[0]

    def __getitem__(self, index):
        """Return the model inputs for test row `index`.

        Returns:
            Tuple `(questions, xa, et, pq, ts, attention_mask, mask, cluster, tags)`.
            `questions`, `et`, `pq` and `ts` are the stored history shifted by
            one with the current row's value appended; `xa` is the stored
            answers without the oldest entry; `attention_mask` is the
            placeholder 0; `mask` marks padded positions (position 0 is always
            unmasked); `cluster` and `tags` are looked up by question id.
        """
        test_info = self.test_df.iloc[index]

        user_id = test_info["user_id"]
        target_id = test_info["content_id"]
        elapsed_time=test_info["prior_question_elapsed_time"]
        explanation=test_info["prior_question_had_explanation"]
        time_stamp=test_info["timestamp"]
        task_container=test_info["task_container_id"]

        # Defaults for users without history: padding question id, zeros elsewhere.
        q = np.full(self.max_seq, self.n_skill, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)
        et = np.zeros(self.max_seq, dtype=int)
        pq = np.zeros(self.max_seq, dtype=int)
        ts = np.zeros(self.max_seq, dtype=int)
        tasks = np.zeros(self.max_seq, dtype=int)

        if user_id in self.samples.index:
            q_, qa_, et_, pq_, ts_, tasks_ = self.samples[user_id]

            seq_len = len(q_)

            if seq_len >= self.max_seq:
                # Long history: keep the most recent max_seq entries.
                q = q_[-self.max_seq:]
                qa = qa_[-self.max_seq:]
                et = et_[-self.max_seq:]
                pq = pq_[-self.max_seq:]
                ts = ts_[-self.max_seq:]
                tasks = tasks_[-self.max_seq:]

            elif seq_len > 0:
                # Short history: right-align it, leaving padding on the left.
                # (seq_len == 0 is skipped: `q[-0:]` would address the whole array.)
                q[-seq_len:] = q_
                qa[-seq_len:] = qa_
                et[-seq_len:] = et_
                pq[-seq_len:] = pq_
                ts[-seq_len:] = ts_
                tasks[-seq_len:] = tasks_

        xa = qa[1:].copy()

        questions = np.append(q[2:], [target_id])
        pq = np.append(pq[2:], [explanation])
        et = np.append(et[2:], [elapsed_time])//1000
        # Gap between consecutive timestamps, scaled by 1/1000
        # (presumably milliseconds -> seconds; TODO confirm).
        ts = (np.append(ts[2:], [time_stamp])-ts[1:])/1000
        tasks= np.append(tasks[1:], [task_container])

        # `tasks` is one element longer than `ts`: tasks[i] and tasks[i+1] are the
        # containers of the two interactions whose gap is ts[i]. When they match,
        # reuse the previous gap; done sequentially so the value propagates
        # through a whole container.
        for i in range(1, len(ts)):
            if tasks[i]==tasks[i+1]:
                ts[i]=ts[i-1]

        et = np.clip(et,0,300)

        mask=(questions==self.n_skill)
        mask[0]=False
        cluster=self.question_cluster[questions]
        tags=self.tag_encoding[questions]

        # Placeholder: the prediction loop passes None to the model instead of a mask.
        attention_mask=0

        return questions, xa, et, pq, ts, attention_mask, mask, cluster, tags

In [8]:
import riiideducation

# Competition environment; `env.predict` is called once per batch in the prediction loop.
env = riiideducation.make_env()
# Iterator of (test_df, sample_prediction_df) pairs consumed by the prediction loop.
iter_test = env.iter_test()

In [9]:
import psutil



prev_test_df = None

# Per-user history columns, in the order TestDataset unpacks them.
HISTORY_COLUMNS = ['content_id', 'answered_correctly', 'prior_question_elapsed_time',
                   'prior_question_had_explanation', 'timestamp', 'task_container_id']

for (test_df, sample_prediction_df) in tqdm(iter_test):
    # ---- 1. Fold the previous batch (its answers arrive with this batch) into `group` ----
    # HDKIM: the update is skipped once RAM usage reaches 90%, so histories stop growing.
    if prev_test_df is not None and psutil.virtual_memory().percent<90:
        print(psutil.virtual_memory().percent)
        # NOTE(review): eval() on a string supplied by the API; consider
        # ast.literal_eval / json.loads if the input is not fully trusted.
        prev_test_df['answered_correctly'] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prev_test_df['prior_question_elapsed_time']=prev_test_df['prior_question_elapsed_time'].fillna(0)
        prev_test_df = prev_test_df[prev_test_df.content_type_id == False]
        prev_group = prev_test_df[['user_id'] + HISTORY_COLUMNS].groupby('user_id').apply(
            lambda r: tuple(r[col].values for col in HISTORY_COLUMNS))
        for prev_user_id in prev_group.index:
            user_history = prev_group[prev_user_id]
            if prev_user_id in group.index:
                # Known user: append the new interactions to each stored array.
                user_history = tuple(np.append(stored, new)
                                     for stored, new in zip(group[prev_user_id], user_history))
            # Keep at most the latest MAX_SEQ interactions (no-op for shorter histories).
            group[prev_user_id] = tuple(arr[-MAX_SEQ:] for arr in user_history)

    # ---- 2. Encode prior_question_had_explanation as 0 / 1, with 2 for missing ----
    explanation = test_df['prior_question_had_explanation']
    test_df['prior_question_had_explanation'] = (
        explanation.fillna(False).astype(int).mask(explanation.isna(), 2)
    )

    # Snapshot (lectures included) to merge into `group` on the next iteration.
    prev_test_df = test_df.copy()

    # Only question rows are predicted; copy so the column assignment below
    # does not write into a slice of the API-provided frame.
    test_df = test_df[test_df.content_type_id == False].copy()

    # ---- 3. Ensemble inference ----
    test_dataset = TestDataset(group, test_df)
    test_dataloader = DataLoader(test_dataset, batch_size=51200, shuffle=False)

    outs = []

    for item in tqdm(test_dataloader):
        target_id = item[0].to(device).long()
        xa = item[1].to(device).long()
        et = item[2].to(device).float()
        et = torch.clamp(et,0,300)
        et[et!=et] = 0  # NaN != NaN: zero out missing elapsed times
        pq = item[3].to(device).long()
        ts = item[4].to(device).float()
        ts = torch.clamp(ts,0,1440)
        ts[ts!=ts] = 0  # same NaN -> 0 replacement for the time gaps
        # item[5] (attention-mask placeholder) is unused: the models receive None.
        mask=item[6].to(device).bool()
        cluster=item[7].to(device).long()
        tags=item[8].to(device).float()

        with torch.no_grad():
            outputs = [model(target_id, xa, et, ts, pq, None, mask, cluster, tags)
                       for model in models]

        # Average the per-model sigmoid outputs, then keep the last position of
        # each sequence (the slot where the current row's question was appended).
        output=torch.sigmoid(torch.stack(outputs,0)).mean(0)
        output = output[:, -1]
        outs.extend(output.view(-1).detach().cpu().numpy())

    test_df['answered_correctly'] =  outs

    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

0it [00:00, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  1.05it/s]
1it [00:01,  1.05s/it]
100%|██████████| 1/1 [00:00<00:00, 14.94it/s]
2it [00:01,  1.28it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A

46.0
46.1


100%|██████████| 1/1 [00:00<00:00, 15.22it/s]

100%|██████████| 1/1 [00:00<00:00, 13.03it/s]
4it [00:01,  1.74it/s]

46.1


4it [00:01,  2.20it/s]
