In [6]:
import torch
from torch import nn

# Show the installed PyTorch version so the outputs below can be tied to a release.
torch.__version__

'2.3.0'

# 1. Create the Tensor Directly on the Device

In [2]:
# Bad example: the tensor is first created without a device argument and is
# then copied to the GPU, so every iteration performs two steps instead of one.
for i in range(100):
    tensor = torch.ones([100, 2])  # allocated on the default device first
    tensor = tensor.cuda()  # then copied to the current CUDA device

In [3]:
# Better example: pass `device=` to the factory function so the tensor is
# allocated on the GPU right away and no separate transfer step is needed.
for i in range(100):
    tensor = torch.ones([100, 2], device="cuda:0")

# 2. Using 'Sequential' Layers when Possible

In [9]:
# Bad example: every layer is a separate attribute and forward() chains them by hand.
class ExampleModel(nn.Module):
    """Two-layer perceptron (Linear -> ReLU -> Linear) wired manually.

    Args:
        inp_shape: Number of input features.
        out_shape: Number of output features.
    """

    def __init__(self, inp_shape, out_shape):
        super().__init__()
        hidden_dim = 512  # width of the single hidden layer

        # Keep the sizes the model was built with.
        self.inp, self.out = inp_shape, out_shape

        self.layer1 = nn.Linear(inp_shape, hidden_dim)
        self.ac = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, out_shape)

    def forward(self, x):
        """Apply layer1, the activation and layer2, in that order."""
        hidden = self.ac(self.layer1(x))
        return self.layer2(hidden)


# Smoke test: a batch of 255 samples with 3 features each is mapped to
# 5 outputs per sample; the cell displays the resulting shape.
model = ExampleModel(3, 5)
tensor = torch.ones([255, 3])
model(tensor).shape

torch.Size([255, 5])

In [11]:
# Better example: the whole layer stack lives in a single nn.Sequential container.
class ExampleModel(nn.Module):
    """Two-layer perceptron (Linear -> ReLU -> Linear) held in ``nn.Sequential``.

    Args:
        inp_shape: Number of input features.
        out_shape: Number of output features.
    """

    def __init__(self, inp_shape, out_shape):
        super().__init__()
        hidden_dim = 512  # width of the single hidden layer

        # Keep the sizes the model was built with.
        self.inp, self.out = inp_shape, out_shape

        stack = [
            nn.Linear(inp_shape, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_shape),
        ]
        self.layer = nn.Sequential(*stack)

    def forward(self, x):
        """Run the input through the sequential stack and return the result."""
        out = self.layer(x)
        return out


# Same smoke test as for the hand-wired version: 255 samples with 3 features
# in, 5 outputs per sample out; the cell displays the resulting shape.
model = ExampleModel(3, 5)
tensor = torch.ones([255, 3])
model(tensor).shape

torch.Size([255, 5])

# 3. Don't Store Layers in a Plain Python 'list'

In [14]:
# Bad example: the hidden layers are kept in a plain Python list.
class ExampleModel(nn.Module):
    """MLP whose hidden stack is stored in a plain ``list`` (anti-pattern).

    ``nn.Module`` only tracks sub-modules that are assigned as attributes or
    held in a module container such as ``nn.Sequential`` or ``nn.ModuleList``.
    The layers inside ``self.layer_hidden`` are therefore not registered: they
    are left out of ``parameters()`` and ``state_dict()`` and are not moved by
    ``.cuda()`` / ``.to()``.

    Args:
        inp_shape: Number of input features.
        out_shape: Number of output features.
    """

    def __init__(self, inp_shape, out_shape):
        super().__init__()
        hidden_dim = 512  # width of every hidden layer

        # Keep the sizes the model was built with.
        self.inp, self.out = inp_shape, out_shape

        self.layer1 = nn.Linear(inp_shape, hidden_dim)
        self.ac = nn.ReLU()

        # Five Linear+ReLU pairs collected in an ordinary list (the pitfall).
        hidden_stack = []
        for _ in range(5):
            hidden_stack += [nn.Linear(hidden_dim, hidden_dim), nn.ReLU()]
        self.layer_hidden = hidden_stack

        self.layer2 = nn.Linear(hidden_dim, out_shape)

    def forward(self, x):
        """Apply the input layer, every hidden layer in order, then the output layer."""
        x = self.ac(self.layer1(x))

        for hidden_layer in self.layer_hidden:
            x = hidden_layer(x)

        return self.layer2(x)


model = ExampleModel(3, 5).cuda(0)
tensor = torch.ones([255, 3], device='cuda:0')

# The failure is the point of this cell: layers held in a plain Python list are
# not registered on the module, so .cuda(0) leaves them on the CPU and the
# forward pass mixes devices. Catch and display the error instead of letting it
# propagate, so that "Restart Kernel -> Run All" can continue past this cell.
try:
    print(model(tensor).shape)
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)

In [18]:
# Better example: gather the layers in a local list, then unpack it into nn.Sequential.
class ExampleModel(nn.Module):
    """MLP with five hidden Linear+ReLU pairs, all registered via ``nn.Sequential``.

    The layers are collected in a local Python list only while the model is
    being built; unpacking that list into ``nn.Sequential`` registers every
    layer as a sub-module of ``self.layer``.

    Args:
        inp_shape: Number of input features.
        out_shape: Number of output features.
    """

    def __init__(self, inp_shape, out_shape):
        super().__init__()
        hidden_dim = 512  # width of every hidden layer

        # Keep the sizes the model was built with.
        self.inp, self.out = inp_shape, out_shape

        # Input projection, five hidden Linear+ReLU pairs, output projection.
        modules = [nn.Linear(inp_shape, hidden_dim), nn.ReLU()]
        for _ in range(5):
            modules += [nn.Linear(hidden_dim, hidden_dim), nn.ReLU()]
        modules.append(nn.Linear(hidden_dim, out_shape))

        self.layer = nn.Sequential(*modules)  # unpacking registers each layer

    def forward(self, x):
        """Run the input through the sequential stack and return the result."""
        out = self.layer(x)
        return out


# Move the model to GPU 0 and feed it a batch created on the same device.
# Every layer is registered inside nn.Sequential, so .cuda(0) moves them all
# and the forward pass succeeds; the cell displays the output shape.
model = ExampleModel(3, 5).cuda(0)
tensor = torch.ones([5, 3], device='cuda:0')
model(tensor).shape

torch.Size([5, 5])

# 4. Using the 'torch.distributions' Module

In [25]:
# Use the network output as logits: detach() returns a tensor cut off from the
# autograd graph and cpu() copies it to host memory. The rows come out
# identical when `tensor` is the all-ones batch, since every input row is the same.
output = model(tensor).detach().cpu()
output

tensor([[-0.0012, -0.0071, -0.0035, -0.0184,  0.0034],
        [-0.0012, -0.0071, -0.0035, -0.0184,  0.0034],
        [-0.0012, -0.0071, -0.0035, -0.0184,  0.0034],
        [-0.0012, -0.0071, -0.0035, -0.0184,  0.0034],
        [-0.0012, -0.0071, -0.0035, -0.0184,  0.0034]])

In [16]:
from torch.distributions import Categorical
from torch.distributions.kl import kl_divergence

In [28]:
# Wrap the raw logits in a categorical distribution. With a 2-D `output`, each
# row parameterizes its own distribution over the classes in the last dimension.
dist = Categorical(logits = output)
dist

Categorical(logits: torch.Size([5, 5]))

In [29]:
# Normalized class probabilities derived from the logits (each row sums to 1).
dist.probs

tensor([[0.2008, 0.1996, 0.2004, 0.1974, 0.2018],
        [0.2008, 0.1996, 0.2004, 0.1974, 0.2018],
        [0.2008, 0.1996, 0.2004, 0.1974, 0.2018],
        [0.2008, 0.1996, 0.2004, 0.1974, 0.2018],
        [0.2008, 0.1996, 0.2004, 0.1974, 0.2018]])

In [35]:
# Draw one class index per row. Sampling is random, so the values change from
# run to run unless a seed is set beforehand (e.g. with torch.manual_seed).
dist.sample()

tensor([1, 3, 0, 1, 3])

In [36]:
# Build one distribution from row 0 and one from row 1 of the logits, then
# compute KL(dist_1 || dist_2). When the two rows are identical, as they are
# for the all-ones input batch, the divergence is 0.
dist_1 = Categorical(logits=output[0])
dist_2 = Categorical(logits=output[1])

kl_divergence(dist_1, dist_2)

tensor(0.)

# 5. Using 'detach()'

In [37]:
# Setup: a small model with 3 inputs and 2 outputs, five random input batches
# of shape (5, 3), and a mean-squared-error loss.
model = ExampleModel(3, 2)
data_batches = [torch.rand([5, 3]) for _ in range(5)]
# `reduction` is the supported keyword. The legacy `reduce` argument is a
# deprecated boolean flag: passing reduce='mean' only produced mean reduction
# because the string is truthy, and it triggered a deprecation warning.
criterion = nn.MSELoss(reduction='mean')



In [39]:
# Bad example: each stored loss still carries its grad_fn, so the list keeps a
# reference to every batch's autograd graph for as long as it exists.
losses = []
for batch in data_batches:
    output = model(batch)
    target = torch.rand([5, 2])  # random targets, for demonstration only
    loss = criterion(output, target)
    
    losses.append(loss)  # still attached to the autograd graph

print(losses)

[tensor(0.4546, grad_fn=<MseLossBackward0>), tensor(0.3607, grad_fn=<MseLossBackward0>), tensor(0.1973, grad_fn=<MseLossBackward0>), tensor(0.2186, grad_fn=<MseLossBackward0>), tensor(0.4076, grad_fn=<MseLossBackward0>)]


In [41]:
# Better example: detach() returns a tensor with no grad_fn, so storing it for
# logging does not hold on to the autograd graph of the batch.
losses = []
for batch in data_batches:
    output = model(batch)
    target = torch.rand([5, 2])  # random targets, for demonstration only
    loss = criterion(output, target).detach()  # value only, no graph
    
    losses.append(loss)

print(losses)

[tensor(0.4338), tensor(0.3132), tensor(0.6140), tensor(0.3455), tensor(0.1800)]


# 6. Trick to Delete a Model from GPU

In [42]:
import gc

# Step 1: drop this notebook's reference to the model. Any other variable that
# still references the model or its tensors keeps that memory alive.
# NOTE(review): the most recent `model = ExampleModel(3, 2)` is created without
# .cuda(), so it may live on the CPU at this point -- confirm against the
# actual execution order.
del model

# Step 2: run a garbage-collection pass so unreachable objects are freed now.
gc.collect()

# Step 3: release unoccupied cached GPU memory held by PyTorch's caching
# allocator so it becomes available outside this process.
torch.cuda.empty_cache()

# 7. Call 'eval()' Before Testing 

In [43]:
model = ExampleModel(3, 2)
# Training mode: layers such as Dropout and BatchNorm use their training behaviour.
model.train()

# Evaluation mode: call this before validation/testing. eval() returns the
# module itself, which is why this cell displays the model structure.
model.eval()

# => eval() disables Dropout and makes BatchNorm use its running statistics
#    instead of per-batch statistics. It does not turn off gradient tracking;
#    use torch.no_grad() for that. The model built here contains only Linear
#    and ReLU layers, so the mode switch does not change its outputs.

ExampleModel(
  (layer): Sequential(
    (0): Linear(in_features=3, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=512, bias=True)
    (9): ReLU()
    (10): Linear(in_features=512, out_features=512, bias=True)
    (11): ReLU()
    (12): Linear(in_features=512, out_features=2, bias=True)
  )
)