In [4]:
!pip install torchviz
import numpy as np
from sklearn.linear_model import LinearRegression

import torch
import torch.optim as optim
import torch.nn as nn
from torchviz import make_dot



In [5]:
# 1 Steps of gradient descent

# linear regression no activation function
# y = wx + b + e


In [6]:
# Synthetic data for the model y = true_b + true_w * x + noise
true_b, true_w = 1, 2
N = 100

# Seed first so both draws below are reproducible; the order of the draws
# (features, then noise) determines the generated values.
np.random.seed(42)
x = np.random.rand(N, 1)
epsilon = np.random.randn(N, 1) * 0.1
y = true_w * x + true_b + epsilon

In [7]:
# Splitting Data
# Shuffle the sample indices, then cut once at the 80% mark: the first part
# indexes the training rows, the rest (test_idx) indexes the held-out rows.
idx = np.arange(N)
np.random.shuffle(idx)

train_idx, test_idx = np.split(idx, [int(0.8*N)])

In [8]:
# Training arrays (rows selected by train_idx)
x_train = x[train_idx]
y_train = y[train_idx]
# Validation arrays (rows selected by test_idx)
x_val = x[test_idx]
y_val = y[test_idx]

In [9]:
# Gradient descent
# step 0 : start from random values for the bias (b) and the weight (w);
# b is drawn first, then w
b, w = np.random.randn(1), np.random.randn(1)
print(b, w)

[-2.02514259] [0.18645431]


In [10]:
# step 1 : Model prediction (forward pass of the linear model)
# b and w are applied to every row of x_train through NumPy broadcasting
yhat = b + w * x_train

In [11]:
# step #2 : Compute the loss
# Residuals between predictions and targets ...
error = yhat - y_train

# ... averaged after squaring: the mean squared error (MSE)
loss = np.mean(error ** 2)
print(loss)

15.444680120636235


In [12]:
# step #3 Compute the gradients
# Partial derivatives of the MSE loss with respect to b and w
b_grad = 2 * np.mean(error)
w_grad = 2 * np.mean(x_train * error)
print(b_grad, w_grad)

-7.781692427768675 -4.010835411595505


In [13]:
# step #4 update the parameters
# Move each parameter against its gradient, scaled by the learning rate
lr = 0.1
b, w = b - lr * b_grad, w - lr * w_grad
print(b, w)

[-1.24697334] [0.58753786]


In [14]:
# Rinse and repeat: run steps 1-4 for a fixed number of epochs
lr = 0.1
# The epoch count has its own name; reusing `epoch` for both the count and
# the loop variable would overwrite the count with the last loop index.
n_epochs = 10000
for epoch in range(n_epochs):
    # step 1: forward pass
    yhat = b + w * x_train
    # step 2: MSE loss
    error = (yhat - y_train)
    loss = (error ** 2).mean()
    # step 3: gradients of the loss w.r.t. b and w
    b_grad = 2*error.mean()
    w_grad = 2*(x_train*error).mean()
    # step 4: gradient-descent update
    b = b - lr * b_grad
    w = w - lr * w_grad


# Learned parameters, followed by the values used to generate the data
print(b, w)
print(true_b, true_w)

[1.02354075] [1.96896447]
1 2


In [15]:
# Cross-check: fit the same training data with scikit-learn and print its
# intercept and coefficient for comparison with the values found above
linr = LinearRegression().fit(x_train, y_train)
print(linr.intercept_, linr.coef_[0])

[1.02354075] [1.96896447]


In [16]:
# Pytorch Tensors
# One example per rank: 0-d scalar, 1-d vector, 2-d matrix, 3-d tensor
scalar = torch.tensor(3.14)
vector = torch.tensor([1, 2, 3])
matrix = torch.ones(2, 3, dtype=torch.float32)
tensor = torch.randn(2, 3, 4, dtype=torch.float32)
print(scalar, vector, matrix, tensor, sep="\n")

tensor(3.1400)
tensor([1, 2, 3])
tensor([[1., 1., 1.],
        [1., 1., 1.]])
tensor([[[-1.1097, -2.2141,  0.8423,  1.0552],
         [-0.3254,  0.5537,  1.0209, -1.7730],
         [ 1.1366, -1.6212,  0.1696,  1.0675]],

        [[-1.1751, -0.6766, -1.0260,  2.7235],
         [ 0.0332, -0.5168,  1.3203, -0.5577],
         [ 0.2482, -0.6728, -0.1930, -0.4676]]])


In [17]:
# size() reports the extent of each dimension as a torch.Size
tensor.size()

torch.Size([2, 3, 4])

In [18]:
# The shape attribute gives the same torch.Size as size()
tensor.shape

torch.Size([2, 3, 4])

In [19]:
# A scalar tensor has no dimensions, so its size is an empty torch.Size
scalar.size()

torch.Size([])

In [20]:
# Same empty torch.Size, read through the shape attribute
scalar.shape

torch.Size([])

In [21]:
# view() exposes the same underlying data with a 1x6 shape; nothing is copied
same_matrix = matrix.view(1,6)

In [22]:
# Writing through the view also changes `matrix`, since both share storage
same_matrix[0,1] = 2

In [23]:
# Show the original and its view, one per line
print(matrix, same_matrix, sep="\n")

tensor([[1., 2., 1.],
        [1., 1., 1.]])
tensor([[1., 2., 1., 1., 1., 1.]])


In [24]:
# new_tensor() copies the data, so the edit below does not reach `matrix`.
# NOTE(review): passing an existing tensor to new_tensor() appears to emit a
# warning in this cell's recorded output; PyTorch's suggested form for copying
# a tensor is clone().detach() - confirm against the PyTorch docs.
different_matrix = matrix.new_tensor(matrix.view(1,6))
different_matrix[0,1] = 3.
print(matrix)
print(different_matrix)

tensor([[1., 2., 1.],
        [1., 1., 1.]])
tensor([[1., 3., 1., 1., 1., 1.]])


  different_matrix = matrix.new_tensor(matrix.view(1,6))


In [25]:
# clone() makes an independent copy of the data and detach() drops any autograd
# history, so the edit below leaves `matrix` unchanged
another_matrix = matrix.view(1,6).clone().detach()
another_matrix[0,1] = 4.
print(matrix)
print(another_matrix)

tensor([[1., 2., 1.],
        [1., 1., 1.]])
tensor([[1., 4., 1., 1., 1., 1.]])


In [26]:
# as_tensor() converts the NumPy array and keeps its dtype (float64 here)
x_train_tensor = torch.as_tensor(x_train)
print(x_train_tensor.dtype)

torch.float64


In [27]:
# to(torch.float) casts to 32-bit floats; the result is a new tensor
float_tensor = x_train_tensor.to(torch.float)
print(float_tensor.dtype)

torch.float32


In [28]:
# numpy() converts the CPU tensor back to an array; show only the first three rows
print(float_tensor.numpy()[:3])

[[0.77127033]
 [0.06355835]
 [0.86310345]]


In [29]:
# Check whether a CUDA-capable GPU is available; fall back to the CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [30]:
# Print the name of every visible CUDA device (prints nothing when there are none)
n_cudas = torch.cuda.device_count()
for gpu_index in range(n_cudas):
    gpu_name = torch.cuda.get_device_name(gpu_index)
    print(gpu_name)

In [31]:
# to(device) places the tensor on the selected device; despite its name,
# gpu_tensor lives on whatever `device` resolved to (possibly the CPU)
gpu_tensor = float_tensor.to(device)
print(gpu_tensor[0])

tensor([0.7713])


In [32]:
# Converting a (possibly GPU-resident) tensor to NumPy: move it to the CPU
# first, because numpy() only works on CPU tensors.
# Uses gpu_tensor, the tensor that was actually sent to `device`.
back_to_numpy = gpu_tensor.cpu().numpy()
print(back_to_numpy.shape)

(80, 1)


In [33]:
# Creating trainable parameters in two stages:
# first place the tensors on the device, then switch on gradient tracking
# in place with requires_grad_()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
b, w = torch.randn(1).to(device), torch.randn(1).to(device)
print(b, w)

# requires_grad_() modifies the tensor in place and returns that same tensor
b, w = b.requires_grad_(), w.requires_grad_()
print(b, w)

tensor([-0.6828]) tensor([1.9209])
tensor([-0.6828], requires_grad=True) tensor([1.9209], requires_grad=True)


In [34]:
# Create the parameters directly on the target device with gradient tracking
# enabled, all in a single call
b = torch.randn(1, requires_grad = True,dtype=torch.float,device=device)
w = torch.randn(1, requires_grad = True,dtype=torch.float,device=device)
print(b, w)

tensor([0.2398], requires_grad=True) tensor([1.4581], requires_grad=True)


In [35]:
# Assign tensors to a device at the moment of their creation to avoid unexpected behavior

In [41]:
import torch
import numpy as np

def train_step(x_train, y_train, w, b, device='cpu'):
    """
    Run one forward/backward pass of the linear model ``b + w * x``.

    The inputs are converted to float32 tensors on ``device``, the mean
    squared error against ``y_train`` is computed, and ``backward()`` is
    called on it so that gradients are stored on ``w.grad`` and ``b.grad``.
    The parameters themselves are not updated here.

    Parameters:
    -----------
    x_train : numpy.ndarray
        Input training data
    y_train : numpy.ndarray
        Target training data
    w : torch.Tensor
        Weight parameter with requires_grad=True
    b : torch.Tensor
        Bias parameter with requires_grad=True
    device : str
        Device to run computations on ('cpu' or 'cuda')

    Returns:
    --------
    loss : torch.Tensor
        Mean squared error loss
    yhat : torch.Tensor
        Model predictions
    """
    # Bring both arrays into PyTorch as float32 on the requested device
    features = torch.as_tensor(x_train, dtype=torch.float32).to(device)
    targets = torch.as_tensor(y_train, dtype=torch.float32).to(device)

    # Forward pass of the linear model
    yhat = w * features + b

    # Mean squared error between predictions and targets
    residual = yhat - targets
    loss = (residual ** 2).mean()

    # Backward pass: fills w.grad and b.grad
    loss.backward()

    return loss, yhat

# Example usage:
# NOTE: inside a notebook __name__ is always "__main__", so this block runs
# with the cell and its assignments land in the notebook's global namespace.
if __name__ == "__main__":
    # Generate sample data under example-specific names, so the notebook's
    # x_train / y_train training split is not overwritten by this demo
    np.random.seed(42)
    x_demo = np.random.rand(100, 1)
    y_demo = 2 * x_demo + 1 + np.random.randn(100, 1) * 0.1

    # Initialize parameters with requires_grad=True
    w = torch.tensor(0.0, requires_grad=True)
    b = torch.tensor(0.0, requires_grad=True)

    # Perform one training step; afterwards w.grad and b.grad hold the gradients
    loss, predictions = train_step(x_demo, y_demo, w, b)

    print(f"Loss: {loss.item():.4f}")
    print(f"Weight gradient: {w.grad.item():.4f}")
    print(f"Bias gradient: {b.grad.item():.4f}")


Loss: 4.1072
Weight gradient: -2.1669
Bias gradient: -3.8805


In [42]:
# gradient accumulation
# This code will be placed _after_ step 4
# (updating the parameters)
# zero_() clears each stored gradient in place
b.grad.zero_()
w.grad.zero_()
print(b.grad, w.grad)

tensor(0.) tensor(0.)


In [43]:
# Updating Parameters
# Full gradient-descent loop driven by PyTorch autograd

# 1. Setup and Initialization
lr = 0.1  # Learning rate
torch.manual_seed(42)  # Set random seed for reproducibility

# Build the training tensors in this cell, straight from the original split
# (x, y, train_idx), so the loop does not rely on tensors that only exist in
# a live kernel session. float32 matches the dtype of the parameters below.
x_train_tensor = torch.as_tensor(x[train_idx], dtype=torch.float, device=device)
y_train_tensor = torch.as_tensor(y[train_idx], dtype=torch.float, device=device)

# Initialize parameters with random values and enable gradient tracking
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)  # bias
w = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)  # weight

# Training loop
n_epochs = 1000
for epoch in range(n_epochs):
    # Forward pass
    yhat = b + w * x_train_tensor  # Linear model: y = wx + b
    error = (yhat - y_train_tensor)  # Calculate error
    loss = (error ** 2).mean()  # Mean squared error loss

    # Backward pass
    loss.backward()  # Compute gradients; results are stored in b.grad / w.grad

    # Parameter updates using gradient descent
    with torch.no_grad():  # Keep the in-place updates out of the autograd graph
        b -= lr * b.grad  # Update bias: b = b - lr * gradient
        w -= lr * w.grad  # Update weight: w = w - lr * gradient

    # Reset gradients to zero; backward() adds to .grad rather than replacing it
    b.grad.zero_()  # Clear bias gradient
    w.grad.zero_()  # Clear weight gradient

print(b, w)  # Print final parameters

"""
Key Concepts Explained:

1. requires_grad=True
   - This tells PyTorch to track operations on these tensors
   - Enables automatic differentiation
   - Creates computational graph for backpropagation

2. loss.backward()
   - Computes gradients for all tensors with requires_grad=True
   - Uses chain rule to calculate derivatives
   - Stores gradients in .grad attribute of tensors

3. with torch.no_grad():
   - Temporarily disables gradient tracking
   - Necessary so that the parameter updates are not recorded in the computational graph
   - Without this, an in-place update (b -= ...) on a leaf tensor that requires grad raises a RuntimeError
   - A reassignment (b = b - ...) would instead replace the parameter with a non-leaf tensor whose .grad is not populated

4. Why zero_grad() is necessary:
   - PyTorch accumulates gradients by default
   - Without zeroing, gradients from previous steps would be added
   - Must reset before next forward/backward pass
   - Alternative: optimizer.zero_grad() if using PyTorch optimizer

Common Pitfalls to Avoid:
1. Forgetting torch.no_grad() for updates
2. Not zeroing gradients between iterations
3. Using inplace operations without care
4. Not handling GPU/CPU device placement consistently

Best Practices:
1. Always use torch.no_grad() for parameter updates
2. Clear gradients after each iteration
3. Set random seed for reproducibility
4. Use appropriate data types and devices
5. Consider using PyTorch's built-in optimizers for production code
"""

tensor([1.0235], requires_grad=True) tensor([1.9690], requires_grad=True)


"\nKey Concepts Explained:\n\n1. requires_grad=True\n   - This tells PyTorch to track operations on these tensors\n   - Enables automatic differentiation\n   - Creates computational graph for backpropagation\n\n2. loss.backward()\n   - Computes gradients for all tensors with requires_grad=True\n   - Uses chain rule to calculate derivatives\n   - Stores gradients in .grad attribute of tensors\n\n3. with torch.no_grad():\n   - Temporarily disables gradient computation\n   - Necessary for parameter updates to prevent tracking\n   - Without this, PyTorch would try to build a computational graph for the updates\n   - Would lead to incorrect gradients and higher memory usage\n\n4. Why zero_grad() is necessary:\n   - PyTorch accumulates gradients by default\n   - Without zeroing, gradients from previous steps would be added\n   - Must reset before next forward/backward pass\n   - Alternative: optimizer.zero_grad() if using PyTorch optimizer\n\nCommon Pitfalls to Avoid:\n1. Forgetting torch.no