In [1]:
import gc
import psutil
import joblib
import random
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score

import torch

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import math

In [174]:
TRAIN_SAMPLES = 320000

MAX_SEQ = 100
# MAX_SEQ = 128
MIN_SAMPLES = 5
EMBED_DIM = 128
DROPOUT_RATE = 0.2
LEARNING_RATE = 1e-3
# LEARNING_RATE = 1e-5
MAX_LEARNING_RATE = 2e-3
# EPOCHS = 30
EPOCHS = 10
# TRAIN_BATCH_SIZE = 2048
TRAIN_BATCH_SIZE = 64

In [3]:
%%time

dtypes = {'timestamp': 'int64', 'user_id': 'int32' ,'content_id': 'int16','content_type_id': 'int8','answered_correctly':'int8'}
# train_df = pd.read_feather('../input/riiid-cross-validation-dataset/train.feather')[[
#     'timestamp', 'user_id', 'content_id', 'content_type_id', 'answered_correctly'
# ]]
train_df = pd.read_csv('./input/riiid-test-answer-prediction/train.csv')[['timestamp', 'user_id', 'content_id', 'content_type_id', 'answered_correctly']]
for col, dtype in dtypes.items():
    train_df[col] = train_df[col].astype(dtype)
    
    
#train_df have only rows with False in content_type_id (0 if the event was a question being posed to the user)
train_df = train_df[train_df.content_type_id == False]  

train_df = train_df.sort_values(['timestamp'], ascending=True)
train_df.reset_index(drop=True, inplace=True)


skills = train_df["content_id"].unique()
# joblib.dump(skills, "skills.pkl.zip")
n_skill = len(skills)  # (unique content IDs)
print("number skills", n_skill)

group = train_df[['user_id', 'content_id', 'answered_correctly', 'timestamp']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values,
            r['timestamp'].values))

# joblib.dump(group, "group.pkl.zip")
del train_df
gc.collect()
# group

# The training data is sorted by timestamp and split into two sets using an 80/20 split
TRAIN_SAMPLES = int(len(group.index)*0.8)
print('TRAIN_SAMPLES',TRAIN_SAMPLES)


# The method then creates a dictionary of samples, where each key is a user ID and the corresponding value is a tuple containing the user's content IDs and answered correctly values.
train_indexes = list(group.index)[:TRAIN_SAMPLES]
valid_indexes = list(group.index)[TRAIN_SAMPLES:]
train_group = group[group.index.isin(train_indexes)]
valid_group = group[group.index.isin(valid_indexes)]
print('train_group \n', train_group[:5] )
print('valid_group \n', valid_group[:5] )

del group, train_indexes, valid_indexes
print(len(train_group), len(valid_group))

number skills 13523
TRAIN_SAMPLES 314924
train_group 
 user_id
115     ([5692, 5716, 128, 7860, 7922, 156, 51, 50, 78...
124     ([7900, 7876, 175, 1278, 2065, 2063, 2064, 336...
2746    ([5273, 758, 5976, 236, 404, 382, 405, 873, 53...
5382    ([5000, 3944, 217, 5844, 5965, 4990, 5235, 605...
8623    ([3915, 4750, 6456, 3968, 6104, 5738, 6435, 54...
dtype: object
valid_group 
 user_id
1720820513    ([3849, 1320, 5285, 8918, 3644, 6111, 8397, 94...
1720823127    ([7900, 7876, 175, 1278, 2064, 2065, 2063, 336...
1720823509    ([128, 7860, 7922, 156, 51, 50, 7896, 7863, 15...
1720827508    ([7900, 7876, 175, 1278, 2065, 2063, 2064, 336...
1720827841    ([7900, 7876, 175, 1278, 2065, 2063, 2064, 336...
dtype: object
314924 78732
CPU times: user 1min 2s, sys: 5.45 s, total: 1min 7s
Wall time: 1min 8s


In [4]:
len(train_group[115][2])

46

### The BERTDataset class returns three values for each sample in the dataset:

- `x`: the input data for the BERT model, which consists of the content IDs for each sample shifted by one and the answered correctly values added to the content IDs
- `target_id`: the target IDs for each sample, which consist of the content IDs shifted by one
- `label`: the labels for each sample, which consist of the answered correctly values shifted by one

These values are used as input to the BERT model and are used to calculate the model's performance during training and evaluation. The `x` and `target_id` arrays are used as input to the BERT model, while the `label` array is used to calculate the model's loss and accuracy.

The __init__ method iterates over the users in the `group` object and retrieves the questions and answers for each user. If a user has answered fewer than `min_samples` questions, their questions and answers are not included in the `samples` dictionary. If a user has answered more than `max_seq` questions, their questions and answers are split into multiple sequences of length `max_seq` and each sequence is added to the `samples` dictionary using a unique key that includes the user's ID and the sequence number. For example, if the user's ID is `123` and they have answered 150 questions, their questions and answers will be split into two sequences with lengths 128 and 22, and the keys `123_0` and `123_1` will be added to the samples dictionary with the values of the first and second sequence, respectively.

### decay rate parameter `k`:
- The value of the decay rate parameter k will depend on the time scale of the data and the desired forgetting rate. In general, a larger value of k will result in faster forgetting, while a smaller value will result in slower forgetting.
- To choose a suitable value for k, you will need to consider the range of the timestamps and the desired forgetting rate. For example, if you want the decay factor to decrease by half over a period of one hour (3600 seconds), you can set k = ln(2)/3600 so that the decay function f(t) = e^(-kt) satisfies f(3600) = 1/2
- It is difficult to recommend a specific value for k without knowing more about the data and the desired forgetting rate. You may need to experiment with different values of k to find the one that works best for your data.

In [5]:
class BERTDataset(Dataset):
    def __init__(self, group, n_skill, min_samples=1, max_seq=128):
        super(BERTDataset, self).__init__()
        self.max_seq = max_seq
        self.n_skill = n_skill
        self.samples = {}
        
        self.user_ids = []
        for user_id in group.index:
            q, qa, t = group[user_id]  # q:content_id(questions); qa:answered_correctly(user's question answer)
            if len(q) < min_samples:  # If a user has answered fewer than min_samples questions, their questions and answers are not included in the 'samples' dictionary
                continue 
            
            # Main Contribution
            if len(q) > self.max_seq:
                total_questions = len(q)
                initial = total_questions % self.max_seq
                if initial >= min_samples:
                    self.user_ids.append(f"{user_id}_0")
                    self.samples[f"{user_id}_0"] = (q[:initial], qa[:initial], t[:initial])
                for seq in range(total_questions // self.max_seq):
                    self.user_ids.append(f"{user_id}_{seq+1}")
                    start = initial + seq * self.max_seq
                    end = start + self.max_seq
                    self.samples[f"{user_id}_{seq+1}"] = (q[start:end], qa[start:end], t[start:end])
            else:
                user_id = str(user_id)
                self.user_ids.append(user_id)
                self.samples[user_id] = (q, qa, t)
    
    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        user_id = self.user_ids[index]
        q_, qa_, t_ = self.samples[user_id]
        seq_len = len(q_)

        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)
        t = np.zeros(self.max_seq, dtype=int)
        
        if seq_len == self.max_seq:
            q[:] = q_
            qa[:] = qa_
            t[:] = t_
        else:
            q[-seq_len:] = q_
            qa[-seq_len:] = qa_
            t[-seq_len:] = t_
        
        # 'x' also has a length of max_seq-1
        target_id = q[1:]  
        label = qa[1:]
        
        
        x = np.zeros(self.max_seq-1, dtype=int)
        x = q[:-1].copy()
        x += (qa[:-1] == 1) * self.n_skill  # the model needs to be able to distinguish between the question IDs and the correct answers in order to make predictions
        
        # To predict the next response from student, we shouldn't how long a student take for the next question
#         times = np.array(list(map(round, t[:-1]/1000)))  # (in seconds) round each number to the nearest integer.
        times = np.array(t[:-1])
        
        
#         # decay factor to decrease by half over a period of one hour (3600 seconds)
#         k = np.log(2) / 3600
#         elapsed_time = t[:-1]/1000  # Elapsed time in seconds
#         x = x * np.exp(-k*elapsed_time)  # Update x using the decay factor 
        
        return x, target_id, times, label 
    
    
train_dataset = BERTDataset(train_group, n_skill, min_samples=MIN_SAMPLES, max_seq=MAX_SEQ)
train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=8)
valid_dataset = BERTDataset(valid_group, n_skill, max_seq=MAX_SEQ)
valid_dataloader = DataLoader(valid_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=False, num_workers=8)


## Define model



- the `d_model` parameter specifies the size of the hidden states used by the model. This is also known as the "model size" or the "embedding size" of the model.
- The `d_model` parameter is used as a scaling factor when computing the dot product between the query and key vectors in the Attention mechanism. It is also used to specify the size of the input and output vectors for the linear layers in the MultiHeadedAttention class, as well as the size of the input and output vectors for the LayerNorm and SublayerConnection classes. In general, a larger d_model value will result in a more expressive BERT model, but will also increase the computational complexity and memory usage of the model.



### Sublayer Connection:
- ### residual connection:
    - A residual connection is a type of connection in a neural network that allows information to bypass one or more layers of the network. This allows the network to learn to perform tasks more efficiently by allowing the information to flow more directly from the input to the output layers. Residual connections can help improve the performance of the network, particularly on tasks that require the network to process long sequences of data. They are often used in deep learning networks, where they can help prevent the vanishing gradient problem, allowing the network to learn more effectively.
    
- ### LayerNorm:
    - the `eps` parameter specifies a small value used to stabilize the division operation in the layer normalization computation. This is necessary because division by zero is undefined, and division by a very small value can lead to numerical instability.

In [268]:
class Attention(nn.Module):
    """
    Compute 'Scaled Dot Product Attention' mechanism used by BERT
    """

    def forward(self, query, key, value, mask=None, dropout=None):
        scores = torch.matmul(query, key.transpose(-2, -1)) \
                 / math.sqrt(query.size(-1))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        p_attn = F.softmax(scores, dim=-1)

        if dropout is not None:
            p_attn = dropout(p_attn)

        return torch.matmul(p_attn, value), p_attn


# import torch.nn as nn
# from .single import Attention
class MultiHeadedAttention(nn.Module):
    """
    Take in model size and number of heads.
    This class extends the Attention class to support multiple "heads" for improved performance
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0

        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h

        self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
        self.output_linear = nn.Linear(d_model, d_model)
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
                             for l, x in zip(self.linear_layers, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch.
        x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)

        return self.output_linear(x)

    
# implements layer normalization for BERT
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
    
# implements the residual connections and layer normalization used by BERT to improve training
# from .layer_norm import LayerNorm    
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """

    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))
    

# activation function used by BERT
class GELU(nn.Module):
    """
    Paper Section 3.4, last paragraph notice that BERT used the GELU instead of RELU
    """

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

# from .gelu import GELU
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = GELU()

    def forward(self, x):
        return self.w_2(self.dropout(self.activation(self.w_1(x))))
    
    

# from .attention import MultiHeadedAttention
# from .utils import SublayerConnection, PositionwiseFeedForward
class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
    """

    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: hidden size of transformer
        :param attn_heads: head sizes of multi-head attention
        :param feed_forward_hidden: feed_forward_hidden, usually 4*hidden_size
        :param dropout: dropout rate
        """

        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)
    
# Copy from https://github.com/codertimo/BERT-pytorch
class BERT(nn.Module):
    """
    BERT model : Bidirectional Encoder Representations from Transformers.
    """

    def __init__(self, vocab_size, hidden=128, n_layers=12, attn_heads=12, dropout=0.1):
        """
        :param vocab_size: vocab_size of total words
        :param hidden: BERT model hidden size
        :param n_layers: numbers of Transformer blocks(layers)
        :param attn_heads: number of attention heads
        :param dropout: dropout rate
        """

        super().__init__()
        self.hidden = hidden
        self.n_layers = n_layers
        self.attn_heads = attn_heads
        
        # paper noted they used 4*hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = hidden * 4

        # embedding for BERT, sum of positional, segment, token embeddings
        self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden)
#         self.embedding = BERTEmbedding(vocab_size=2*vocab_size+1, embed_size=hidden)

        # multi-layers transformer blocks, deep network
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])
        
        
        
        
        # Cross_effect embedding
        self.alpha_inter_embeddings = torch.nn.Embedding(vocab_size * 2, hidden//2)
        self.alpha_skill_embeddings = torch.nn.Embedding(vocab_size, hidden//2)
        self.beta_inter_embeddings = torch.nn.Embedding(vocab_size * 2, hidden)
        self.beta_skill_embeddings = torch.nn.Embedding(vocab_size, hidden)
        
        
        
        
        
        self.pred = nn.Linear(hidden, 1)
        
        
        self.finalEmbedding = torch.nn.Embedding(vocab_size+1, 1)
        
        self.finalLinear = nn.Linear(99, 99)
        
        
    def forward(self, x, segment_info, times):
               

#         # inter is just the same as x, which is student response
#         # skills = target_id = segment_label
# #         x = x.long()
        alpha_src_emb = self.alpha_inter_embeddings(x)  # [bs, seq_len, emb]
        alpha_target_emb = self.alpha_skill_embeddings(segment_info)
        alphas = torch.matmul(alpha_src_emb, alpha_target_emb.transpose(-2, -1))  # [bs, seq_len, seq_len]
        
        beta_src_emb = self.beta_inter_embeddings(x)  # [bs, seq_len, emb]
        beta_target_emb = self.beta_skill_embeddings(segment_info)
        betas = torch.matmul(beta_src_emb, beta_target_emb.transpose(-2, -1))  # [bs, seq_len, seq_len]
        betas = torch.clamp(betas + 1, min=0, max=10)
#         betas = torch.clamp(betas, min=1e-10, max=10)
#         print('torch.isnan(betas)',torch.isnan(betas))
        
        seq_len = segment_info.shape[1]  # 99
#         times = times.unsqueeze(-1)  # Add a new dimension at the end of the tensor
#         times = times.expand(-1, -1, seq_len)  # Expand the tensor along the new dimension
        
        
        delta_t = (times[:, :, None] - times[:, None, :]).abs().double()
        delta_t = torch.log(delta_t + 1e-10) 
#         print('torch.isnan(delta_t)',torch.isnan(delta_t))
#         delta_t = torch.log(delta_t + 1e-10) / np.log(np.e)
#         delta_t = torch.log(delta_t + 1e-10) / np.log(self.time_log) 



#         betas = betas.unsqueeze(-1)  # Add a new dimension at the end of the tensor
        cross_effects = alphas * torch.exp(-betas * delta_t)  # [64,99,99]
#         print('torch.isnan(cross_effects)',torch.isnan(cross_effects), 'cross_effects.size()', cross_effects.size())
#         seq_len = segment_info.shape[1]  # 99
#         valid_mask = np.triu(np.ones((1, seq_len, seq_len)), k=1)
        valid_mask = np.triu(np.ones((1, seq_len, seq_len)), k=1)
        mask = (torch.from_numpy(valid_mask) == 0)
#         mask = mask.cuda() if self.gpu != '' else mask
        mask = mask.cuda()
#         sum_t = cross_effects.masked_fill(mask, 0)
        sum_t = cross_effects.masked_fill(mask, 0).sum(-2)
    
    
    
    
    
    
    
    
    
        
        # attention masking for padded token
        # torch.ByteTensor([batch_size, 1, seq_len, seq_len)
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
        
        # embedding the indexed sequence to sequence of vectors
        x = self.embedding(x, segment_info)  # [64, 99, 128]
        

        # running over multiple transformer blocks
        for transformer in self.transformer_blocks:
            x = transformer.forward(x, mask)  # [64, 99, 128]


        
        

        
        
        
        
        
#         x = x.squeeze(-1)

#         # apply the cross effects to the output of the transformer blocks before passing it through the prediction layer.
#         x = x * sum_t
        
        
#         x = x.squeeze(-1).long()
#         print('x1',x,'x.size()',x.size())
        
#         x = self.finalEmbedding(x).squeeze(dim=-1)
        
        
        
        x = self.pred(x)  # [64,99,1]
        x = x.squeeze(-1)

#         print('x.size(-1)',x.size(-1),'sum_t.size(-1)',sum_t.size(-1))
#         fc = nn.Linear(x.size(-1) + sum_t.size(-1), 1)
#         print(fc.size())

#         x = self.finalLinear((x+sum_t).float())
        
#         print('x',x,'x.size()',x.size())
#         x = fc(x + sum_t)  # [64,99]

#         return x.squeeze(-1)
        x = x+ sum_t
        return x



## Embedding
Each type of embedding serves a different purpose in the BERT model.

The BERTEmbedding class has four main parts:

- The `TokenEmbedding` module, which is a subclass of nn.Embedding that represents each token in the input sequence as a vector of fixed length.

- The `PositionalEmbedding` module, which adds position information to the input representation by using sinusoidal functions of the position index.

- The `SegmentEmbedding` module, which represents the sentence segment (e.g., "sentence A" or "sentence B") of each token in the input sequence as a vector of fixed length.

- The `Cross_effect` modules, which represent the interactions between different skills and interleavings as vectors of fixed length.

Together, these embeddings provide BERT with a rich representation of the input text that it can use to perform various natural language understanding tasks.

In [269]:
class TokenEmbedding(nn.Embedding):
    def __init__(self, vocab_size, embed_size=128):
        super().__init__(vocab_size, embed_size, padding_idx=0)

class PositionalEmbedding(nn.Module):

    def __init__(self, d_model, max_len=128):
#     def __init__(self, d_model, max_len=99):
        super().__init__()

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        pe.require_grad = False

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return self.pe[:, :x.size(1)]

class SegmentEmbedding(nn.Embedding):
#     def __init__(self, embed_size=128):
#         super().__init__(3, embed_size, padding_idx=0)
    def __init__(self, vocab_size, embed_size=128):
        super().__init__(vocab_size, embed_size, padding_idx=0)

 
        
        
        
class BERTEmbedding(nn.Module):
    """
    BERT Embedding which is consisted with under features
        1. TokenEmbedding : normal embedding matrix
        2. PositionalEmbedding : adding positional information using sin, cos
        2. SegmentEmbedding : adding sentence segment info, (sent_A:1, sent_B:2)
        sum of all these features are output of BERTEmbedding
    """

    def __init__(self, vocab_size, embed_size, dropout=0.1):
        """
        :param vocab_size: total vocab size
        :param embed_size: embedding size of token embedding
        :param dropout: dropout rate
        """
        super().__init__()
#The self.token.embedding_dim attribute specifies the size of the embedding dimension for the tokens. In other words, it determines the length of the vectors that will be used to represent each token in the input sequence. This is a fixed property of the model and cannot be changed once it has been initialized.
#          self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
        self.token = TokenEmbedding(vocab_size=2*vocab_size+1, embed_size=embed_size)
        self.position = PositionalEmbedding(d_model=self.token.embedding_dim)
#         self.position = PositionalEmbedding(max_len=99, d_model=self.token.embedding_dim)

# max_len=100 is also ok!!!
#         self.position = PositionalEmbedding(max_len=100, d_model=self.token.embedding_dim)
        self.segment = SegmentEmbedding(vocab_size=vocab_size+1, embed_size=self.token.embedding_dim)
        
        
        self.dropout = nn.Dropout(p=dropout)
        self.embed_size = embed_size

    def forward(self, sequence, segment_label):
        
        x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
        return self.dropout(x)

In [270]:
n_skill

13523

AssertionError: 
The error message you are seeing indicates that there is an issue with the BERT model that you are trying to instantiate. The specific error is that the d_model argument must be divisible by the h argument, but this is not the case in your code.

The d_model and h arguments correspond to the hidden size of the model and the number of attention heads, respectively. In order to fix the error, you need to make sure that the hidden size is divisible by the number of attention heads.
```
model = BERT(vocab_size=n_skill, hidden=128, attn_heads=4)
model = BERT(vocab_size=n_skill, hidden=128*4, attn_heads=4)
model = BERT(vocab_size=n_skill, hidden=128, attn_heads=2)
```

In [271]:
# # chatGPT model 
# # Create a BERT model with 4 attention heads and a model size of 128

# # Define the number of words in the input vocabulary
# num_embeddings = 10000

# # Define the maximum length of the input sequence
# num_positions = 512

# # Define the number of token types in the input
# # For example, the BERT tokenizer used in the example code uses the "bert-base-uncased" pre-trained model, which uses a vocabulary of 30,522 tokens. This means that the BERT model will use 30,522 different token types to represent the input text. However, this is just one example, and the number of token types used by a BERT model can vary depending on the specific pre-trained model and tokenization scheme that is used.
# num_token_types = 2


# # Create a BERT embedding layer with a model size of 128, a vocabulary size of 10000, and a dropout rate of 0.1
# # # Create a position-wise feed-forward network with a model size of 128 and a hidden layer size of 512
# model = BERT(4, 128, 512)


# Official BERT model
model = BERT(vocab_size=n_skill,attn_heads=8)
model

BERT(
  (embedding): BERTEmbedding(
    (token): TokenEmbedding(27047, 128, padding_idx=0)
    (position): PositionalEmbedding()
    (segment): SegmentEmbedding(13524, 128, padding_idx=0)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): ModuleList(
    (0): TransformerBlock(
      (attention): MultiHeadedAttention(
        (linear_layers): ModuleList(
          (0): Linear(in_features=128, out_features=128, bias=True)
          (1): Linear(in_features=128, out_features=128, bias=True)
          (2): Linear(in_features=128, out_features=128, bias=True)
        )
        (output_linear): Linear(in_features=128, out_features=128, bias=True)
        (attention): Attention()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): PositionwiseFeedForward(
        (w_1): Linear(in_features=128, out_features=512, bias=True)
        (w_2): Linear(in_features=512, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        

## Define Train process

In [272]:
def train_fn(model, dataloader, optimizer, scheduler, criterion, device="cpu"):
    model.train()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    for item in dataloader:
#         x = item[0]
#         print('x',x, 'x.size()',x.size())
        x = item[0].to(device).long()
        segment_info = item[1].to(device).long()
        times = item[2].to(device).long()
        
        label = item[3].to(device).float()
        

        
        optimizer.zero_grad()
        output = model(x, segment_info, times)
#         print('output',output, 'output.size()',output.size())
              
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss.append(loss.item())
        
        
        target_mask = (segment_info != 0)  # Create a mask indicating which values in segment_info are not 0
        last_nonzero_idx = target_mask.sum(dim=1) - 1  # Find the last non-zero value for each batch
        
        output = output[range(last_nonzero_idx.size()[0]), last_nonzero_idx]  # Index into the output tensor using last_nonzero_idx
        label = label[range(last_nonzero_idx.size()[0]), last_nonzero_idx]  # Index into the label tensor using last_nonzero_idx
        
        
        
        print('output',output, 'output.size()',output.size())
        print('label',label, 'label.size()',label.size())
              
#         print('output = output[range(64), last_nonzero_idx]', output,'output.size()',output.size())
    
        pred = (torch.sigmoid(output) >= 0.5).long()
        print('pred',pred, 'pred.size()',pred.size())
        
        
        
#         output = torch.masked_select(output, target_mask)
#         label = torch.masked_select(label, target_mask)
#         pred = (torch.sigmoid(output) >= 0.5).long()
        
#         print('output = torch.masked_select(output, target_mask)', output,'output.size()',output.size())
        
#         print('label = torch.masked_select(label, target_mask)', label,'label.size()',label.size())
#         print('pred = (torch.sigmoid(output) >= 0.5).long()', pred, 'pred.size()',pred.size())
        
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

#         labels.extend(label.view(-1).data.cpu().numpy())
#         outs.extend(output.view(-1).data.cpu().numpy())
        labels.extend(label.data.cpu().numpy())
        outs.extend(output.data.cpu().numpy())
        
    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.mean(train_loss)

    return loss, acc, auc

## Define Test process

In [273]:
def valid_fn(model, dataloader, criterion, device="cpu"):
    model.eval()

    valid_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    for item in dataloader:
        x = item[0].to(device).long()
        segment_info = item[1].to(device).long()
        times = item[2].to(device).long()
        
        label = item[3].to(device).float()
#         target_mask = (segment_info != 0)

        output= model(x, segment_info)
        loss = criterion(output, label)
        valid_loss.append(loss.item())

        
        
        target_mask = (segment_info != 0)  # Create a mask indicating which values in segment_info are not 0
        
        last_nonzero_idx = target_mask.sum(dim=1) - 1  # Find the last non-zero value for each batch
        
        output = output[range(last_nonzero_idx.size()[0]), last_nonzero_idx]  # Index into the output tensor using last_nonzero_idx
        label = label[range(last_nonzero_idx.size()[0]), last_nonzero_idx]  # Index into the label tensor using last_nonzero_idx

#         output = torch.masked_select(output, target_mask)
#         label = torch.masked_select(label, target_mask)
        pred = (torch.sigmoid(output) >= 0.5).long()
#         pred = (torch.sigmoid(output) >= 0.5)
    
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(output.view(-1).data.cpu().numpy())

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.mean(valid_loss)

    return loss, acc, auc

## Training

- **Criterion**: BCELoss and BCEWithLogitsLoss are both functions used to calculate the binary cross-entropy loss for a given set of predicted and target values. The main difference between the two is that BCEWithLogitsLoss applies a sigmoid function to the predicted values, whereas BCELoss expects the predicted values to already be in the range of 0 to 1. The negative log likelihood loss (which BERT use originally), also known as the cross-entropy loss, is a common loss function used in classification tasks, particularly when working with a multi-class classification problem.

In [274]:
# try using SGD with weight decay
# optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

criterion = nn.BCEWithLogitsLoss()
# criterion = nn.BCELoss()
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=MAX_LEARNING_RATE, steps_per_epoch=len(train_dataloader), epochs=EPOCHS)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion.to(device)


best_auc = 0
max_steps = 3
step = 0
for epoch in range(EPOCHS):
    loss, acc, auc = train_fn(model, train_dataloader, optimizer, scheduler, criterion, device)
    print("epoch - {}/{} train: loss-{:.4f}, acc-{:.4f}, auc - {:.4f}".format(epoch+1, EPOCHS, loss, acc, auc))
    loss, acc, auc = valid_fn(model, valid_dataloader, criterion, device)
    print("epoch - {}/{} valid: loss-{:.4f}, [acc-{:.4f}, auc - {:.4f}]".format(epoch+1, EPOCHS, loss, acc, auc))

    
    if auc > best_auc:
        best_auc = auc
        step = 0
        torch.save(model.state_dict(), "bert_cross_model.pt")
    else:
        step += 1
        if step >= max_steps:
            break

output tensor([ -5.2062e+01,  -6.9609e+00,   2.6880e+01,  -4.5676e+01,  -8.2700e+01,
          1.4646e+01,   1.2234e+02,  -2.5002e+01,  -5.3937e+01,  -5.1212e+01,
         -1.2794e+02,   6.7122e+01,   1.7777e+02, -2.2531e+100,  -6.5031e+01,
          9.7545e+01,  -6.9896e+99,   8.1997e+36,  -3.6284e+01,   2.4067e+02,
         -1.2594e+02,   1.6713e+02,  -4.2773e+01,  -1.8873e+00,   8.8455e+00,
          5.0615e+00,  -1.2903e+01,   1.3374e+02,  -4.1311e+72,   7.3560e+01,
          3.1650e+01,  -2.6523e+01,   1.7860e+02,  -2.4671e+01,   1.7900e+14,
          7.7845e+01,   4.6255e+91,   1.9556e+01,  -6.9970e+01,  -8.2393e+00,
         -6.1273e+01,   2.8261e+02,   3.4242e+01,   7.6701e+01,   8.4990e+01,
          1.7608e+01,   1.2970e+02,  -2.3921e+02,   7.8888e+01,   1.1543e+02,
          4.4623e+01,   2.3862e+02,   3.5455e+01,  -1.9240e+01,  -7.9682e+01,
          9.1805e+01,  -8.9153e+01,  -5.5195e+01,  -4.2838e+01,   1.1818e+01,
          1.7123e+02,   9.9363e+01,   1.7691e+02,   1.607

output tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       device='cuda:0', dtype=torch.float64, grad_fn=<IndexBackward0>) output.size() torch.Size([64])
label tensor([0., 0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1.,
        1., 0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0.,
        1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 0., 0.,
        1., 1., 1., 0., 0., 1., 0., 0., 1., 1.], device='cuda:0') label.size() torch.Size([64])
pred tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

output tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       device='cuda:0', dtype=torch.float64, grad_fn=<IndexBackward0>) output.size() torch.Size([64])
label tensor([0., 0., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0.,
        0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1.,
        0., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 0.], device='cuda:0') label.size() torch.Size([64])
pred tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

output tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       device='cuda:0', dtype=torch.float64, grad_fn=<IndexBackward0>) output.size() torch.Size([64])
label tensor([1., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
        1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1.,
        0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0.,
        0., 1., 0., 1., 1., 0., 1., 0., 0., 1.], device='cuda:0') label.size() torch.Size([64])
pred tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

output tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       device='cuda:0', dtype=torch.float64, grad_fn=<IndexBackward0>) output.size() torch.Size([64])
label tensor([0., 0., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
        1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1.,
        1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0.,
        1., 0., 1., 1., 1., 0., 1., 1., 0., 0.], device='cuda:0') label.size() torch.Size([64])
pred tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

KeyboardInterrupt: 

In [None]:
# 0.8087