In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pickle
import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, Dataset
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print ("device:[%s]."%(device))

In [270]:
#data 로드
with open('/opt/ml/data.pickle', 'rb') as fr:
    data = pickle.load(fr)
    
data.keys()

dict_keys(['train', 'test', 'field_dims', 'users', 'books', 'sub', 'idx2user', 'idx2isbn', 'user2idx', 'isbn2idx', 'X_train', 'X_valid', 'y_train', 'y_valid', 'train_dataloader', 'valid_dataloader', 'test_dataloader'])

In [257]:
data['train'] = data['train'].drop(["user_mean","book_mean"],axis=1)
data['test'] = data['test'].drop(["user_mean","book_mean"],axis=1)
data["field_dims"] = data["field_dims"][:7] 
data["field_dims"]
# data['X_train']

array([ 68069, 149570,      6,   3865,  11571,     27,  62059],
      dtype=uint32)

In [271]:
# X_train, X_valid, y_train, y_valid = train_test_split(
#                                                     data['train'].drop(['rating'], axis=1),
#                                                     data['train']['rating'],
#                                                     test_size=0.2,
#                                                     random_state=42,
#                                                     shuffle=True
#                                                     )
# data['X_train'], data['X_valid'], data['y_train'], data['y_valid'] = X_train, X_valid, y_train, y_valid
train_dataset = TensorDataset(torch.LongTensor(data['X_train'].values), torch.LongTensor(data['y_train'].values))
valid_dataset = TensorDataset(torch.LongTensor(data['X_valid'].values), torch.LongTensor(data['y_valid'].values))
test_dataset = TensorDataset(torch.LongTensor(data['test'].values))

train_dataloader = DataLoader(train_dataset, batch_size=100, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=100, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=100, shuffle=False)

data['train_dataloader'], data['valid_dataloader'], data['test_dataloader'] = train_dataloader, valid_dataloader, test_dataloader


In [272]:
class FeaturesEmbedding(nn.Module):

    def __init__(self, field_dims: np.ndarray, embed_dim: int):
        super().__init__()
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.long)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, x: torch.Tensor):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
 
        return self.embedding(x)


class MultiHeadAttention(nn.Module):
    def __init__(self,field_dims: np.ndarray,d_feat=128,n_head=5,
                 actv=F.relu,USE_BIAS=True,dropout=0.1,device=None):
        """
        :param d_feat: feature dimension
        :param n_head: number of heads
        :param actv: activation after each linear layer
        :param USE_BIAS: whether to use bias
        :param dropout_p: dropout rate
        :device: which device to use (e.g., cuda:0)
        """
        super(MultiHeadAttention,self).__init__()
        
        if (d_feat%n_head) != 0:
            raise ValueError("d_feat(%d) should be divisible by b_head(%d)"%(d_feat,n_head)) 
        self.d_feat = d_feat
        self.n_head = n_head
        self.d_head = self.d_feat // self.n_head
        self.actv = actv
        self.USE_BIAS = USE_BIAS
        self.dropout = dropout # prob. of zeroed
        self.embdding = FeaturesEmbedding(field_dims, self.d_feat)
        
        
        
        self.lin_Q = nn.Linear(self.d_feat,self.d_feat,self.USE_BIAS)
        self.lin_K = nn.Linear(self.d_feat,self.d_feat,self.USE_BIAS)
        self.lin_V = nn.Linear(self.d_feat,self.d_feat,self.USE_BIAS)
        self.lin_O = nn.Linear(self.d_feat,self.d_feat,self.USE_BIAS)

        self.dropout = nn.Dropout(p=self.dropout)
    
    def forward(self,X,mask=None):
        """
        :param Q: [n_batch, n_Q, d_feat]
        :param K: [n_batch, n_K, d_feat]
        :param V: [n_batch, n_V, d_feat] <= n_K and n_V must be the same 
        :param mask: 
        """
        Q = self.embdding(X)
        K = self.embdding(X)
        V = self.embdding(X)

        n_batch = Q.shape[0]
        Q_feat = self.lin_Q(Q) 
        K_feat = self.lin_K(K) 
        V_feat = self.lin_V(V)
        # Q_feat: [n_batch, n_Q, d_feat]
        # K_feat: [n_batch, n_K, d_feat]
        # V_feat: [n_batch, n_V, d_feat]

        # Multi-head split of Q, K, and V (d_feat = n_head*d_head)
        Q_split = Q_feat.view(n_batch, -1, self.n_head, self.d_head).permute(0, 2, 1, 3)
        K_split = K_feat.view(n_batch, -1, self.n_head, self.d_head).permute(0, 2, 1, 3)
        V_split = V_feat.view(n_batch, -1, self.n_head, self.d_head).permute(0, 2, 1, 3)
        # Q_split: [n_batch, n_head, n_Q, d_head]
        # K_split: [n_batch, n_head, n_K, d_head]
        # V_split: [n_batch, n_head, n_V, d_head]

        # Multi-Head Attention
        d_K = K.size()[-1] # key dimension
        scores = torch.matmul(Q_split,K_split.permute(0,1,3,2)) / np.sqrt(d_K)
        if mask is not None:
            scores = scores.masked_fill(mask==0,-1e9)
        attention = torch.softmax(scores,dim=-1)
        x_raw = torch.matmul(self.dropout(attention),V_split) # dropout is NOT mentioned in the paper
        # attention: [n_batch, n_head, n_Q, n_K]
        # x_raw: [n_batch, n_head, n_Q, d_head]

        # Reshape x
        x_rsh1 = x_raw.permute(0,2,1,3).contiguous()
        # x_rsh1: [n_batch, n_Q, n_head, d_head]
        x_rsh2 = x_rsh1.view(n_batch,-1,self.d_feat)
        # x_rsh2: [n_batch, n_Q, d_feat]

        # Linear
        x = self.lin_O(x_rsh2)
        # x: [n_batch, n_Q, d_feat]

        return x
    

class MultiLayerPerceptron(nn.Module):

    def __init__(self, field_dims, n_head, input_dim, embed_dim, embed_dims, USE_BIAS, dropout, device, output_layer=True):
        super().__init__()
        
        self.attention = MultiHeadAttention(field_dims = field_dims,
                                            d_feat=embed_dim,n_head=n_head,
                                            actv=F.relu,USE_BIAS=True,dropout=0.1)
        layers = list()
        d_feat=embed_dim
        self.input_dim = input_dim
        # self.embdding = FeaturesEmbedding(field_dims, embed_dim)
        for embed_dim in embed_dims:
            layers.append(torch.nn.Linear(input_dim, embed_dim))
            layers.append(torch.nn.BatchNorm1d(embed_dim))
            layers.append(torch.nn.LeakyReLU())
            layers.append(torch.nn.Dropout(p=dropout))
            input_dim = embed_dim
        # if output_layer:
        #     layers.append(torch.nn.Linear(input_dim, 1))
        self.mlp = torch.nn.Sequential(*layers)
        self.fc = torch.nn.Linear(embed_dims[-1] + d_feat, 1)
    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, embed_dim)``
        """
        # x = torch.tensor(x,dtype= np.long)
        # x_user = torch.cat((x[:,0].unsqueeze(1),x[:,2].unsqueeze(1)),dim=1)
        # x_item = torch.cat((x[:,1].unsqueeze(1),x[:,3:]),dim=1)
        # x_user = torch.tensor(x_user,dtype= np.long)
        # x_item = torch.tensor(x_item,dtype= np.long)
        # x_user = self.embdding(x_user)
        x = self.attention(x)
        print(x)
        # print(x_user)
        # print(x_item)
        user_x = x[:, np.array((0, ), dtype=np.long)].squeeze(1)
        item_x = x[:, np.array((1, ), dtype=np.long)].squeeze(1)
        gmf = user_x * item_x
        # x = torch.cat((x_user[:,0],x_item[:,0],x_user[:,1],x_item[:,1:]),dim=1)
        x = self.mlp(x.view(-1, self.input_dim))

        x = torch.cat([gmf, x], dim=1)

        x = self.fc(x).squeeze(1)
        return x

In [265]:
# field_dims = data['train'].drop(["rating"],axis=1).nunique().values
# field_dims = np.array([field_dims[1]]+list(field_dims[3:]))
# offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.long)
# x = torch.tensor(data['train'].drop(["rating"],axis=1).values)
# x = torch.tensor(x,dtype= np.long )
# x = torch.cat((x[:,1].unsqueeze(1),x[:,3:]),dim=1)
# print(x)
# f = FeaturesEmbedding(field_dims, 10)
# f(x)

# next(iter(train_dataloader))[0]

In [273]:
def rmse(real: list, predict: list) -> float:
    pred = np.array(predict)
    return np.sqrt(np.mean((real-pred) ** 2))

class RMSELoss(torch.nn.Module):
    def __init__(self):
        super(RMSELoss,self).__init__()
        self.eps = 1e-6

    def forward(self, x, y):
        criterion = nn.MSELoss()
        loss = torch.sqrt(criterion(x, y)+self.eps)
        return loss
    
    
class BST:

    def __init__(self, data):
        super().__init__()

        self.criterion = RMSELoss()

        self.train_dataloader = data['train_dataloader']
        self.valid_dataloader = data['valid_dataloader']
        self.field_dims = data['field_dims']
        self.n_head  = 10
        self.embed_dim = 60
        self.epochs = 5 #args.EPOCHS
        self.learning_rate = 5e-3 #args.LR
        self.weight_decay = 1e-5 #args.WEIGHT_DECAY
        self.log_interval = 100

        self.device = "cuda"#args.DEVICE

        self.mlp_dims = [1024,512,256]#args.NCF_MLP_DIMS
        self.dropout = 0.1 #args.NCF_DROPOUT

        self.model = MultiLayerPerceptron(field_dims=self.field_dims, n_head = self.n_head, 
                                          input_dim = len(self.field_dims) * self.embed_dim, embed_dim = self.embed_dim, embed_dims = self.mlp_dims,
                                        USE_BIAS=True,dropout=0.2,device=self.device, output_layer=True).to(self.device)
        
        
        self.optimizer = torch.optim.Adam(params=self.model.parameters(), lr=self.learning_rate, 
                                          amsgrad=True, weight_decay=self.weight_decay)


    def train(self):
      # model: type, optimizer: torch.optim, train_dataloader: DataLoader, criterion: torch.nn, device: str, log_interval: int=100
        for epoch in range(self.epochs):
            self.model.train()
            total_loss = 0
            tk0 = tqdm.tqdm(self.train_dataloader, smoothing=0, mininterval=1.0)
            for i, (fields, target) in enumerate(tk0):
                fields, target = fields.to(self.device), target.to(self.device)
                y = self.model(fields)
                loss = self.criterion(y, target.float())
                self.model.zero_grad()
                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()
                if (i + 1) % self.log_interval == 0:
                    tk0.set_postfix(loss=total_loss / self.log_interval)
                    total_loss = 0

            rmse_score = self.predict_train()
            print('epoch:', epoch, 'validation: rmse:', rmse_score)


    def predict_train(self):
        self.model.eval()
        targets, predicts = list(), list()
        with torch.no_grad():
            for fields, target in tqdm.tqdm(self.valid_dataloader, smoothing=0, mininterval=1.0):
                fields, target = fields.to(self.device), target.to(self.device)
                y = self.model(fields)
                targets.extend(target.tolist())
                predicts.extend(y.tolist())
        return rmse(targets, predicts)


    def predict(self, dataloader):
        self.model.eval()
        predicts = list()
        with torch.no_grad():
            for fields in tqdm.tqdm(dataloader, smoothing=0, mininterval=1.0):
                fields = fields[0].to(self.device)
                y = self.model(fields)
                predicts.extend(y.tolist())
        return predicts

In [274]:
model = BST(data)
model.train()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.long)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.