# Check GPU availability

In [1]:
import torch

# Report which PyTorch build is installed and whether CUDA can be used.
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

# Handle the CPU-only case first; otherwise describe device 0.
if not torch.cuda.is_available():
    print("No GPU found - will use CPU")
else:
    print("GPU name:", torch.cuda.get_device_name(0))
    # total_memory is divided by 1e9 so the figure is printed in (decimal) GB.
    print("GPU memory:", torch.cuda.get_device_properties(0).total_memory / 1e9, "GB")

PyTorch version: 2.8.0+cu128
CUDA available: True
GPU name: NVIDIA RTX 4000 Ada Generation
GPU memory: 21.125267456 GB


# Understanding nn.Linear

In [4]:
import torch
import torch.nn as nn

# nn.Linear creates and owns its parameters (a weight matrix and a bias vector);
# both start out at random values when the layer is constructed.
linear = nn.Linear(in_features=4, out_features=3)  # 4 input features -> 3 output features

# Input to the layer: a batch of 2 samples with 4 features each.
x = torch.randn(2, 4)

# Calling the layer computes y = x @ W^T + b; PyTorch hides the arithmetic.
output = linear(x)  # shape: [2, 3]

# Show the result, then the weight matrix: shape torch.Size([3, 4]) and values.
print(output, linear.weight.shape, linear.weight, sep="\n")

# Show the bias vector: shape torch.Size([3]) and values.
print(linear.bias.shape, linear.bias, sep="\n")

tensor([[ 0.4444, -0.4627, -0.2067],
        [-0.8967, -0.1703, -0.2728]], grad_fn=<AddmmBackward0>)
torch.Size([3, 4])
Parameter containing:
tensor([[-0.0066,  0.4653, -0.2662, -0.1063],
        [ 0.3114, -0.1291, -0.2150, -0.2389],
        [-0.3712, -0.4483,  0.2502,  0.0055]], requires_grad=True)
torch.Size([3])
Parameter containing:
tensor([ 0.0478, -0.2054, -0.3932], requires_grad=True)


# Understanding nn.Dropout

In [9]:
import torch
import torch.nn as nn
import math

# A small, easy-to-read input: 2 rows of 4 values.
x = torch.tensor([[1.0, 2.0, 3.0, 4.0],
                  [5.0, 6.0, 7.0, 8.0]])

# Dropout layer with a 10% drop probability. A freshly built module is in
# training mode, so each element is zeroed with probability 0.1 and every
# surviving element is scaled by 1 / (1 - 0.1), i.e. about 1.1111.
dropout = nn.Dropout(p=0.1)

# Pass x through the layer; which elements get dropped is random on each call,
# so a run may happen to drop nothing and show only the rescaled values.
output = dropout(x)

print("Original:", x, sep="\n")
print("\nAfter Dropout:", output, sep="\n")

Original:
tensor([[1., 2., 3., 4.],
        [5., 6., 7., 8.]])

After Dropout:
tensor([[1.1111, 2.2222, 3.3333, 4.4444],
        [5.5556, 6.6667, 7.7778, 8.8889]])


# Understanding forward()

# In PyTorch, `forward()` is the method you override in an `nn.Module` subclass to define how data flows through your neural network; calling the model instance (`model(x)`) invokes it for you.


In [10]:
class MyModel(nn.Module):
    """Minimal module whose forward pass doubles its input."""

    def forward(self, x):
        """Return ``x * 2``; this is the code that runs when the model is called."""
        doubled = x * 2
        return doubled

# Create model (nn.Module supplies the default constructor).
model = MyModel()

# Calling the instance like a function; `x` is whatever tensor is currently
# bound in the kernel (most recently assigned in the nn.Dropout cell).
output = model(x)

# model(x) goes through nn.Module.__call__, which runs any registered hooks
# and then invokes forward(). With no hooks registered the result matches:
# output = model.forward(x)
# The assertion below checks that claim instead of leaving it as a comment.
assert output.equal(model.forward(x)), "model(x) should match model.forward(x)"

In [11]:
# Python's __call__() special method lets an instance be invoked like a function.
class MyCallableClass:
    """Toy callable: reports its argument and returns it doubled."""

    def __call__(self, x):
        """Print the received value, then return ``x * 2``."""
        print(f"I was called with: {x}")
        doubled = x * 2
        return doubled

# Instantiate the callable object.
obj = MyCallableClass()

# Invoking the instance dispatches to its __call__ method with x=5,
# which prints "I was called with: 5" and hands back the doubled value.
result = obj(5)
print(result)  # 10

I was called with: 5
10


In [13]:
# Understanding what .view() does: same elements, new dimensions.
import torch

# Sizes for a toy multi-head attention example
batch, seq_len = 2, 5
d_model, num_heads = 512, 8
head_dim = d_model // num_heads  # 512 // 8 = 64

# Build Q with shape [batch, seq_len, d_model]
Q = torch.randn(batch, seq_len, d_model)
print(f"Before: {Q.shape}")  # [2, 5, 512]

# Split the last dimension (d_model) into num_heads chunks of head_dim each
Q = Q.view(batch, seq_len, num_heads, head_dim)
print(f"After:  {Q.shape}")  # [2, 5, 8, 64]


Before: torch.Size([2, 5, 512])
After:  torch.Size([2, 5, 8, 64])
