<a href="https://colab.research.google.com/github/anshika0601/pytorchz-learn/blob/main/Day10%5Cworking_with_cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
print(torch.cuda.is_available())   # should be True
print(torch.cuda.get_device_name(0))  # should show Tesla T4, A100, etc.



True
Tesla T4


In [2]:
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = nn.Linear(4, 2).to(device)   # model on GPU
x = torch.randn(3, 4)                # stays on CPU by default

out = model(x)   # ❌ RuntimeError: Tensors on different devices


RuntimeError: Expected all tensors to be on the same device, but got mat1 is on cpu, different from other tensors on cuda:0 (when checking argument in method wrapper_CUDA_addmm)

In [4]:
x = x.to(device)
out = model(x)


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import time

# Simple model
class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(1000, 500)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))


# dataset (10,000 samples, 1000 features each)
X = torch.randn(10000, 1000)
y = torch.randint(0, 10, (10000,))

# Loss & optimizer
criterion = nn.CrossEntropyLoss()

def train(device):
    model = SimpleNet().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    X_device = X.to(device)
    y_device = y.to(device)

    start = time.time()
    for epoch in range(5):  # train 5 epochs
        optimizer.zero_grad()
        outputs = model(X_device)
        loss = criterion(outputs, y_device)
        loss.backward()
        optimizer.step()
    elapsed = time.time() - start
    print(f"Training on {device}: {elapsed:.3f} sec")


# Benchmark CPU
train(torch.device("cpu"))

# Benchmark GPU (if available)
if torch.cuda.is_available():
    train(torch.device("cuda"))


Training on cpu: 1.091 sec
Training on cuda: 0.259 sec


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
from torch.cuda.amp import autocast, GradScaler

# -----------------------------
# 1. Check Device
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -----------------------------
# 2. Dummy Dataset
# -----------------------------
X = torch.randn(1000, 20).to(device)   # 1000 samples, 20 features
y = torch.randint(0, 2, (1000,)).to(device)  # binary labels

dataset = torch.utils.data.TensorDataset(X, y)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

# -----------------------------
# 3. Simple Model
# -----------------------------
class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(20, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 2)

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

# -----------------------------
# 4. Training Function
# -----------------------------
def train_model(use_amp=False, epochs=3):
    model = SimpleModel().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scaler = GradScaler(enabled=use_amp)

    start = time.time()

    for epoch in range(epochs):
        for batch_x, batch_y in dataloader:
            optimizer.zero_grad()

            if use_amp:  # ✅ Mixed Precision
                with autocast():
                    outputs = model(batch_x)
                    loss = criterion(outputs, batch_y)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:  # 🚫 Fallback to FP32
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    end = time.time()
    elapsed = end - start
    print(f"Training complete ✅ | Time: {elapsed:.2f}s")
    return elapsed

# -----------------------------
# 5. Run FP32 vs AMP
# -----------------------------
print("\n--- FP32 Training ---")
time_fp32 = train_model(use_amp=False)

if torch.cuda.is_available():
    print("\n--- AMP Training ---")
    time_amp = train_model(use_amp=True)
else:
    time_amp = None

print("\n--- Benchmark Results ---")
print(f"FP32 Time: {time_fp32:.2f}s")
if time_amp:
    print(f"AMP  Time: {time_amp:.2f}s")


Using device: cuda

--- FP32 Training ---


  scaler = GradScaler(enabled=use_amp)


Epoch 1, Loss: 0.6952
Epoch 2, Loss: 0.7196
Epoch 3, Loss: 0.6930
Training complete ✅ | Time: 0.26s

--- AMP Training ---


  with autocast():


Epoch 1, Loss: 0.6854
Epoch 2, Loss: 0.6812
Epoch 3, Loss: 0.6982
Training complete ✅ | Time: 0.37s

--- Benchmark Results ---
FP32 Time: 0.26s
AMP  Time: 0.37s
