In [4]:
import numpy as np
import torch as th 
from torch import nn
# Seed every RNG this notebook draws from so that Restart & Run All reproduces
# the random tensors shown below; seeding NumPy alone does not seed PyTorch.
np.random.seed(seed=42)
th.manual_seed(42)

# Record the library versions and the tutorial this notebook follows.
print(
    np.__version__,
    th.__version__,
    'document reference : https://www.i32n.com/docs/pytorch/tutorials/beginner/basics/intro.html',
    sep='\n\n'
)


# Reference

# https://pytorch.org/tutorials/beginner/basics/intro.html

1.23.5

1.13.1+cu117

document reference : https://www.i32n.com/docs/pytorch/tutorials/beginner/basics/intro.html


# Quick Start

## Creating Model with nn.Module

In [5]:
# Pick the best available backend, preferring CUDA, then Apple MPS, then CPU.
if th.cuda.is_available():
    device = "cuda"
elif th.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")


Using cpu device


In [6]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten, then three Linear layers with ReLU between them.

    Args:
        in_features: number of values per sample after flattening
            (default ``28*28``).
        hidden_features: width of both hidden layers (default 512).
        num_classes: number of output logits (default 10).

    The defaults reproduce the original hard-coded 784 -> 512 -> 512 -> 10 stack.
    """

    def __init__(self, in_features=28*28, hidden_features=512, num_classes=10):
        super().__init__()
        # nn.Flatten() keeps dim 0 and collapses everything after it.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return the raw (un-normalised) logits for a batch ``x``."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

# Instantiate the network, move its parameters to the selected `device`,
# and print the module tree.
model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [9]:
from cp_spark_ml.algo.eges.model import EGES

# Instantiate the project-local EGES model; as the last expression of the cell
# its repr is displayed.
# NOTE(review): the meaning of `dim`, `num_nodes` and the side-info mapping is
# defined in cp_spark_ml, not in this notebook — confirm there.
EGES(
    dim=16, num_nodes=400, side_info_name_with_n_unique_values={'n_shop':2}
)

EGES(
  (side_info_weights): Embedding(400, 2)
)

## Tensors

In [16]:
# Build the same 2x2 tensor twice: from a nested Python list and from a NumPy array.
data = [[1, 2], [3, 4]]
np_array = np.array(data)
x_data, x_np = th.tensor(data), th.from_numpy(np_array)

print(x_data)
print(x_np)

# The ``*_like`` constructors take their shape from an existing tensor:
# ``ones_like`` keeps x_data's properties, ``rand_like`` overrides the dtype here.
x_ones = th.ones_like(x_data)
x_rand = th.rand_like(x_data, dtype=th.float)

print(f"Ones Tensor: \n {x_ones} \n")
print(f"Random Tensor: \n {x_rand} \n")


tensor([[1, 2],
        [3, 4]])
tensor([[1, 2],
        [3, 4]])
Ones Tensor: 
 tensor([[1, 1],
        [1, 1]]) 

Random Tensor: 
 tensor([[0.0247, 0.6037],
        [0.9539, 0.1645]]) 



In [18]:
# One shape tuple drives all three factory functions.
shape = (2, 3)
rand_tensor, ones_tensor, zeros_tensor = (
    factory(shape) for factory in (th.rand, th.ones, th.zeros)
)

print(f"Random Tensor: \n {rand_tensor} \n")
print(f"Ones Tensor: \n {ones_tensor} \n")
print(f"Zeros Tensor: \n {zeros_tensor}")

Random Tensor: 
 tensor([[0.9303, 0.8023, 0.2242],
        [0.7083, 0.8109, 0.1115]]) 

Ones Tensor: 
 tensor([[1., 1., 1.],
        [1., 1., 1.]]) 

Zeros Tensor: 
 tensor([[0., 0., 0.],
        [0., 0., 0.]])


In [19]:
# Inspect the three basic attributes of a freshly created tensor.
tensor = th.rand(3, 4)

print(
    f"Shape of tensor: {tensor.shape}",
    f"Datatype of tensor: {tensor.dtype}",
    f"Device tensor is stored on: {tensor.device}",
    sep="\n",
)


Shape of tensor: torch.Size([3, 4])
Datatype of tensor: torch.float32
Device tensor is stored on: cpu


## Tensor Operations

In [23]:
# Report whether CUDA is usable and, if it is, relocate the tensor onto the GPU.
is_gpu = th.cuda.is_available()
print(is_gpu)
tensor = tensor.to("cuda") if is_gpu else tensor

False


In [25]:
# NumPy-style indexing: rows, columns, and ellipsis all work on tensors.
tensor = th.ones(4, 4)
print(
    f"First row: {tensor[0]}",
    f"First column: {tensor[:, 0]}",
    f"Last column: {tensor[..., -1]}",
    sep="\n",
)

# Column slices are views, so filling one in place zeroes column 1 of `tensor`.
tensor[:, 1].fill_(0)
print(tensor)


First row: tensor([1., 1., 1., 1.])
First column: tensor([1., 1., 1., 1.])
Last column: tensor([1., 1., 1., 1.])
tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]])


In [27]:
# Join three copies of `tensor` side by side (dim=1 is the column axis).
t1 = th.cat([tensor] * 3, dim=1)
print(t1)


tensor([[1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
        [1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
        [1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
        [1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.]])


In [31]:
# Matrix multiplication written three equivalent ways; y1, y2 and y3 end up
# holding the same values.
# ``tensor.T`` returns the transpose of a tensor.
y1 = tensor @ tensor.T
y2 = tensor.matmul(tensor.T)

# y3 is only pre-allocated with y1's shape here; its random contents are
# overwritten because it is passed as ``out=``.
y3 = th.rand_like(y1)
th.matmul(tensor, tensor.T, out=y3)


# Element-wise product written three equivalent ways; z1, z2 and z3 end up
# holding the same values.
z1 = tensor * tensor
z2 = tensor.mul(tensor)

# Same pre-allocate-then-overwrite pattern as y3. This call is the last
# expression of the cell, so its result is what the notebook displays.
z3 = th.rand_like(tensor)
th.mul(tensor, tensor, out=z3)


tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]])

In [32]:
# Reduce the whole tensor to a one-element tensor, then unwrap it into a plain
# Python number with ``item()``.
agg = tensor.sum()
agg_item = agg.item()
print(agg_item, type(agg_item))


12.0 <class 'float'>


## Th <--> Np

In [33]:
# Tensor -> NumPy: ``numpy()`` hands back an array view of this tensor.
# The next cell shows that `t` and `n` change together.
t = th.ones(5)
print(f"t: {t}")
n = t.numpy()
print(f"n: {n}")

t: tensor([1., 1., 1., 1., 1.])
n: [1. 1. 1. 1. 1.]


In [34]:
# In-place update of the tensor (trailing underscore = in-place op);
# the NumPy array `n` obtained from it reflects the change as well.
t.add_(1)
print(f"t: {t}")
print(f"n: {n}")


t: tensor([2., 2., 2., 2., 2.])
n: [2. 2. 2. 2. 2.]


In [36]:
# NumPy -> Tensor: build a tensor from an existing array with ``from_numpy``.
n = np.ones(5)
t = th.from_numpy(n)


In [37]:
# Update the array in place (``out=n``); the tensor created from it with
# ``from_numpy`` shows the new values too.
np.add(n, 1, out=n)
print(f"t: {t}")
print(f"n: {n}")

t: tensor([2., 2., 2., 2., 2.], dtype=torch.float64)
n: [2. 2. 2. 2. 2.]


## ReShaping

In [62]:
# https://pytorch.org/docs/stable/generated/torch.Tensor.view.html

t = th.tensor([1,2,3])
# Two equivalent ways to add a leading axis of size 1:
t2 = t.unsqueeze(0)   # insert a new dimension at index 0
t3 = t.view(1, -1)    # reshape; -1 lets PyTorch infer the remaining size
print(t.shape, t2.shape, t3.shape)

torch.Size([3]) torch.Size([1, 3]) torch.Size([1, 3])


# Custom Layers

## DIN (Deep Interest Network)

* https://github.com/fanoping/DIN-pytorch/tree/master/din
* http://arxiv.org/pdf/1706.06978

## Dice Activation Function

* Basically, regularization based on the id frequency

In [41]:
import torch.nn as nn
import torch


class Dice(nn.Module):
    """Dice activation used by the Deep Interest Network.

    Computes ``alpha * (1 - p) * x + p * x`` with ``p = sigmoid(batch_norm(x))``.

    Args:
        num_features: size of the feature axis normalised by ``BatchNorm1d``.
        dim: rank of the input, 2 or 3. For ``dim == 3`` the input is
            transposed on axes (1, 2) before batch norm and transposed back
            afterwards, so the feature axis is expected last.

    Raises:
        AssertionError: if ``dim`` is neither 2 nor 3.
    """

    def __init__(self, num_features, dim=2):
        super(Dice, self).__init__()
        assert dim == 2 or dim == 3
        self.bn = nn.BatchNorm1d(num_features, eps=1e-9)
        self.sigmoid = nn.Sigmoid()
        self.dim = dim

        # alpha is registered as a Parameter so that it follows the module in
        # ``.to(device)``, is saved in ``state_dict`` and can be learned. A bare
        # tensor attribute would stay on the CPU and never receive updates.
        # The trailing singleton axis for dim == 3 lets alpha broadcast over
        # the sequence axis after the transpose in ``forward``.
        alpha_shape = (num_features, 1) if self.dim == 3 else (num_features,)
        self.alpha = nn.Parameter(th.zeros(alpha_shape))

    def forward(self, x):
        """Apply Dice to ``x``; the output has the same shape as the input."""
        if self.dim == 3:
            # BatchNorm1d normalises axis 1, so bring the feature axis there.
            x = th.transpose(x, 1, 2)

        x_p = self.sigmoid(self.bn(x))
        out = self.alpha * (1 - x_p) * x + x_p * x

        if self.dim == 3:
            out = th.transpose(out, 1, 2)

        return out

In [42]:
# Smoke test: a 2-D Dice over 32 features applied to a (10, 32) batch of zeros;
# the printed size shows the shape is preserved.
a = Dice(32)
b = torch.zeros((10, 32))
# (no transpose needed for the default dim=2 case)
c = a(b)
print(c.size())

torch.Size([10, 32])


## FC Layer

In [43]:
import torch.nn as nn
# from .dice import Dice
#from dice import Dice

class FullyConnectedLayer(nn.Module):
    """Stack of Linear layers with optional batch norm, activation and dropout between them.

    The stack is ``Linear(input_size, hidden_size[0])`` followed, for every
    further entry of ``hidden_size``, by
    ``[BatchNorm1d] -> activation -> Dropout -> Linear``.

    Args:
        input_size: number of input features of the first Linear layer.
        hidden_size: sequence of output widths, one per Linear layer (>= 1 entry).
        bias: sequence of booleans, one per Linear layer; ``bias[k]`` controls
            the layer that produces ``hidden_size[k]``.
        batch_norm: insert ``BatchNorm1d`` before each activation.
        dropout_rate: dropout probability used after each activation.
        activation: ``'relu'``, ``'dice'`` or ``'prelu'`` (case-insensitive).
        sigmoid: apply a final ``Sigmoid`` to the output.
        dice_dim: ``dim`` argument forwarded to ``Dice`` when it is used.

    Raises:
        AssertionError: if ``hidden_size``/``bias`` are empty or differ in length.
        NotImplementedError: for an unknown ``activation`` (only checked when
            at least one activation is actually built, i.e. more than one layer).
    """

    def __init__(self, input_size, hidden_size, bias, batch_norm=True, dropout_rate=0.5, activation='relu', sigmoid=False, dice_dim=2):
        super(FullyConnectedLayer, self).__init__()
        assert len(hidden_size) >= 1 and len(bias) >= 1
        assert len(bias) == len(hidden_size)
        self.sigmoid = sigmoid

        layers = [nn.Linear(input_size, hidden_size[0], bias=bias[0])]

        for i in range(len(hidden_size) - 1):
            width = hidden_size[i]

            if batch_norm:
                layers.append(nn.BatchNorm1d(width))

            kind = activation.lower()
            if kind == 'relu':
                layers.append(nn.ReLU(inplace=True))
            elif kind == 'dice':
                assert dice_dim
                layers.append(Dice(width, dim=dice_dim))
            elif kind == 'prelu':
                layers.append(nn.PReLU())
            else:
                raise NotImplementedError

            layers.append(nn.Dropout(p=dropout_rate))
            # This Linear produces hidden_size[i + 1], so its flag is
            # bias[i + 1]; using bias[i] would reuse bias[0] and never read
            # the last entry.
            layers.append(nn.Linear(width, hidden_size[i + 1], bias=bias[i + 1]))

        self.fc = nn.Sequential(*layers)
        if self.sigmoid:
            self.output_layer = nn.Sigmoid()

        # weight initialization xavier_normal (or glorot_normal in keras, tf)
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight.data, gain=1.0)
                if m.bias is not None:
                    nn.init.zeros_(m.bias.data)

    def forward(self, x):
        """Run the stack; apply the final sigmoid only when it was requested."""
        out = self.fc(x)
        return self.output_layer(out) if self.sigmoid else out

In [49]:
# Smoke test: a 2 -> 200 -> 80 -> 1 stack applied to a (3, 2) batch of zeros.
# Prints the input, the module tree, the output and the output size.
a = FullyConnectedLayer(input_size=2, hidden_size=[200, 80, 1], bias=[True,True,True])
b = torch.zeros((3, 2))
out = a(b)
print(
    b,
    a,
    out,
    out.size()
)

tensor([[0., 0.],
        [0., 0.],
        [0., 0.]]) FullyConnectedLayer(
  (fc): Sequential(
    (0): Linear(in_features=2, out_features=200, bias=True)
    (1): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=200, out_features=80, bias=True)
    (5): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=80, out_features=1, bias=True)
  )
) tensor([[0.],
        [0.],
        [0.]], grad_fn=<AddmmBackward0>) torch.Size([3, 1])


## LocalAttention

In [50]:
class LocalActivationUnit(nn.Module):
    """Scores every step of a user-behaviour sequence against a query embedding.

    Args:
        hidden_size: widths of the Linear layers in the first block.
        bias: bias flags for the first block, one per entry of ``hidden_size``.
        embedding_dim: size of the query / behaviour embeddings. The first
            block takes ``4 * embedding_dim`` inputs because ``forward``
            concatenates four tensors of that width.
        batch_norm: forwarded to both ``FullyConnectedLayer`` blocks.
    """

    # NOTE: the list defaults are shared between calls; they are only read,
    # never mutated, so this is safe as long as it stays that way.
    def __init__(self, hidden_size=[80, 40], bias=[True, True], embedding_dim=4, batch_norm=False):
        super(LocalActivationUnit, self).__init__()
        self.fc1 = FullyConnectedLayer(input_size=4*embedding_dim,
                                       hidden_size=hidden_size,
                                       bias=bias,
                                       batch_norm=batch_norm,
                                       activation='dice',
                                       dice_dim=3)

        # Single Linear projection from the last hidden width to one score.
        self.fc2 = FullyConnectedLayer(input_size=hidden_size[-1],
                                       hidden_size=[1],
                                       bias=[True],
                                       batch_norm=batch_norm,
                                       activation='dice',
                                       dice_dim=3)
        # TODO: fc_2 initialization

    def forward(self, query, user_behavior):
        """Return one score per behaviour step.

        Args:
            query: query ad, size batch_size * 1 * embedding_size.
            user_behavior: size batch_size * time_seq_len * embedding_size.

        Returns:
            Output of ``fc2(fc1(...))`` on the per-step features
            ``[q, b, q - b, q * b]`` concatenated on the last axis.
        """
        user_behavior_len = user_behavior.size(1)
        # Repeat the single query along the time axis. ``expand`` creates a
        # broadcast view instead of concatenating time_seq_len copies in a
        # Python loop.
        queries = query.expand(-1, user_behavior_len, -1)

        attention_input = torch.cat(
            [queries, user_behavior, queries - user_behavior, queries * user_behavior],
            dim=-1,
        )
        attention_output = self.fc1(attention_input)
        attention_output = self.fc2(attention_output)

        return attention_output

In [54]:
class AttentionSequencePoolingLayer(nn.Module):
    """Attention-weighted sum over a padded user-behaviour sequence.

    Args:
        embedding_dim: size of the query / behaviour embeddings, forwarded to
            ``LocalActivationUnit``.
    """

    def __init__(self, embedding_dim=4):
        super(AttentionSequencePoolingLayer, self).__init__()

        # TODO: DICE activation function
        # TODO: attention weight normalization
        self.local_att = LocalActivationUnit(hidden_size=[64, 16], bias=[True, True], embedding_dim=embedding_dim, batch_norm=False)

    def forward(self, query_ad, user_behavior, user_behavior_length):
        """Pool ``user_behavior`` with attention scores, ignoring padded steps.

        Args:
            query_ad: size batch_size * 1 * embedding_size.
            user_behavior: size batch_size * time_seq_len * embedding_size.
            user_behavior_length: number of valid steps per sample, given as
                batch_size * 1 or as a flat batch_size vector.

        Returns:
            Tensor of size batch_size * 1 * embedding_size.
        """
        attention_score = self.local_att(query_ad, user_behavior)
        attention_score = th.transpose(attention_score, 1, 2)  # B * 1 * T

        # Build the length mask on the same device as the inputs. The legacy
        # ``.type(th.LongTensor)`` / ``.type(th.FloatTensor)`` casts always
        # produce CPU tensors and therefore fail for GPU inputs.
        target_device = user_behavior.device
        # Reshape to B * 1 * 1 so both B * 1 and flat B lengths give a
        # B * 1 * T mask; a flat vector would otherwise broadcast to B * B * T.
        lengths = user_behavior_length.to(device=target_device, dtype=th.long).reshape(-1, 1, 1)
        positions = th.arange(user_behavior.size(1), device=target_device).view(1, 1, -1)
        mask = positions < lengths  # B * 1 * T

        # Zero the scores of padded steps.
        output = th.mul(attention_score, mask.to(attention_score.dtype))  # B * 1 * T

        # Weighted sum of behaviours: (B * 1 * T) @ (B * T * E) -> B * 1 * E
        output = th.matmul(output, user_behavior)

        return output

In [55]:
# Smoke test with all-zero embeddings:
#   b: query ad        (3, 1, 4)
#   c: user behaviour  (3, 20, 4)
#   d: valid lengths   (3, 1), all ones
# The call is the last expression, so the pooled (3, 1, 4) tensor is displayed.
a = AttentionSequencePoolingLayer()
b = th.zeros((3, 1, 4))
c = th.zeros((3, 20, 4))
d = th.ones((3, 1))
a(b, c, d)

tensor([[[0., 0., 0., 0.]],

        [[0., 0., 0., 0.]],

        [[0., 0., 0., 0.]]], grad_fn=<UnsafeViewBackward0>)