In [None]:
# Alternative setup (disabled): install the project straight from GitHub as a package.
# !pip install -U "git+https://github.com/ab7289-tandon-nyu/csgy6953_DeepLearning_Midterm.git"

# Clone the `directml` branch of the project repository, then copy its `src/`
# directory next to the notebook so the `from src...` imports in later cells resolve.
# NOTE(review): the copy reads from /content/..., i.e. it assumes the clone
# landed in /content (the usual Colab working directory) -- confirm before
# running this cell anywhere else.
!git clone -b directml "https://github.com/ab7289-tandon-nyu/csgy6953_DeepLearning_Midterm.git"
!cp -r /content/csgy6953_DeepLearning_Midterm/src/ .

In [1]:
import torch
import torch.nn as nn
import random
import numpy as np

import copy
import time

In [None]:
# Mount Google Drive so checkpoints can be kept on Drive when running in Colab.
# The `google.colab` package only exists inside Colab; when the notebook runs
# anywhere else the import fails, so the mount is skipped instead of aborting
# a "Run All". IN_COLAB records which situation we are in.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
    drive.mount('/content/drive')

In [2]:
SEED = 1234

# Seed every random number generator the notebook touches so runs are repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)  # silently ignored when CUDA is unavailable
torch.backends.cudnn.deterministic = True

# Prefer the DirectML backend. The "dml" device string is only understood by
# pytorch-directml builds; any other PyTorch build raises RuntimeError for it,
# in which case we fall back to CUDA and finally to the CPU.
# NOTE: the recorded output of the training cell shows `aten::argmax` is not
# implemented for the DML backend in the build this notebook was run with.
try:
    device = torch.device("dml")
except RuntimeError:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
from src.transforms import make_transforms
from src.data import make_data_loaders, get_transformed_data

BATCH_SIZE = 128

# Build the train / validation / test datasets with the project's transforms.
# NOTE(review): valid_ratio=0.1 presumably holds out 10% of the training data
# for validation -- confirm against src.data.
train_data, valid_data, test_data = get_transformed_data(
    make_transforms=make_transforms,
    valid_ratio=0.1,
)

# One loader per split, all sharing the same batch size.
train_iterator, valid_iterator, test_iterator = make_data_loaders(
    train_data,
    valid_data,
    test_data,
    batch_size=BATCH_SIZE,
)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


**Define our Model**

In [4]:
from src.model import ResNet, StemConfig
from src.utils import initialize_parameters, epoch_time

# One entry per stage of the network body.
# NOTE(review): judging by the printed model, each entry appears to be
# (number of residual blocks, channels, dropout probability) -- confirm
# against src.model.
model_architecture = (
    ((1, 128, 0.5),)          # a single opening stage
    + ((2, 128, 0.5),) * 3    # three further 128-wide stages
    + ((2, 196, 0.5),) * 2    # two closing 196-wide stages
)

# Settings for the stem convolution that feeds the residual body.
stem_config = StemConfig(
    num_channels=128,
    kernel_size=3,
    stride=1,
    padding=1,
)

# Assemble the network; output_size=10 sets the width of the final classifier layer.
model = ResNet(
    model_architecture,
    stem_config=stem_config,
    output_size=10,
)

In [5]:
from pathlib import Path

# Checkpoint location, relative to the notebook's working directory.
# When working from a mounted Google Drive, point it at the Drive folder instead:
# path = "/content/drive/MyDrive/School/Tandon MSCS/Classes/CS-GY 6953: Deep Learning/midterm/"
# file_path = path + "resnet_alex_48m_dropout.pt"
model_file = Path("resnet_alex_48m_dropout.pt")

# Plain-string form of the same location, used by the save/load calls in later cells.
file_path = str(model_file)

We need to run a dummy batch of data through the model to initialize its lazy modules before we can use torchsummary.

In [6]:
if model_file.exists() and model_file.is_file():
  print("loading saved model")
  # Resume from the previously trained weights. map_location="cpu" makes the
  # checkpoint deserialize onto the CPU regardless of the device it was saved
  # from, so loading does not depend on that device being present here.
  model.load_state_dict(torch.load(model_file, map_location="cpu"))
else:
  print("initializing new model")
  # Start from freshly constructed weights. The optional dummy forward pass
  # and the custom `initialize_parameters` init are currently disabled:
  # inputs = torch.empty((BATCH_SIZE, 3, 32, 32))
  # inputs.normal_()
  # y = model(inputs.to(device))
  # print(y.size())
  # model.apply(initialize_parameters)

# Both branches must leave the model on the training device.
model = model.to(device)

initializing new model


In [7]:
# Print the module tree as a quick check of the assembled architecture.
print(model)

ResNet(
  (stem): Sequential(
    (0): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (classifier): Sequential(
    (0): AdaptiveAvgPool2d(output_size=1)
    (1): Flatten(start_dim=1, end_dim=-1)
    (2): Linear(in_features=196, out_features=10, bias=True)
  )
  (body): Sequential(
    (block_2): Sequential(
      (0): ResidualBlock(
        (dropout): Dropout(p=0.5, inplace=False)
        (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (relu): ReLU(inplace=True)
        (out): ReLU(inplace=True)
        (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
   

In [8]:
# Total number of scalar weights that will receive gradients (trainable parameters).
trainable_params = sum(
    param.numel()
    for param in model.parameters()
    if param.requires_grad
)
print(f"num params: {trainable_params:,}")

num params: 4,835,570


In [9]:
from src.engine import train_one_epoch, evaluate

# Lowest validation loss seen so far; the training loop only saves a
# checkpoint when it beats this value.
best_loss = float('inf')

EPOCHS = 20
learning_rate = 1e-3
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

if model_file.is_file():
  # A saved checkpoint exists, so seed best_loss with its current score;
  # otherwise the first epoch could overwrite the save with a worse model.
  # The score is taken on the validation split because that is the split the
  # training loop compares against best_loss -- the test split stays untouched
  # until the final evaluation.
  loss, acc = evaluate(model.to(device), valid_iterator, criterion, device)
  best_loss = loss
  print(f"Previous best loss: {loss:.4f}, acc: {acc * 100:.2f}%")

In [10]:
# Each epoch: one training pass, one validation pass, then checkpoint the
# weights whenever the validation loss improves on the best seen so far.
for epoch in range(1, EPOCHS + 1):
    tick = time.time()

    print(f"Epoch {epoch}")
    train_loss, train_acc = train_one_epoch(
        model, train_iterator, criterion, optimizer, device
    )
    train_mins, train_secs = epoch_time(tick, time.time())

    print(
        f"\tTrain elapsed: {train_mins}:{train_secs}, "
        f"loss: {train_loss:.4f}, acc: {train_acc * 100:.2f}%"
    )

    tick = time.time()
    val_loss, val_acc = evaluate(model, valid_iterator, criterion, device)
    val_mins, val_secs = epoch_time(tick, time.time())

    print(
        f"\tValidation elapsed: {val_mins}:{val_secs}, "
        f"loss: {val_loss:.4f}, acc: {val_acc * 100:.2f}%"
    )

    improved = val_loss < best_loss
    if improved:
        # New best on the validation split: remember it and persist the weights.
        best_loss = val_loss
        torch.save(model.state_dict(), file_path)

Epoch 1


RuntimeError: Could not run 'aten::argmax' with arguments from the 'DML' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::argmax' is only available for these backends: [CPU, BackendSelect, Named, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradNestedTensor, UNKNOWN_TENSOR_TYPE_ID, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, Tracer, Autocast, Batched, VmapMode].

CPU: registered at D:\a\_work\1\s\pytorch-directml\build\aten\src\ATen\RegisterCPU.cpp:5926 [kernel]
BackendSelect: fallthrough registered at D:\a\_work\1\s\pytorch-directml\aten\src\ATen\core\BackendSelectFallbackKernel.cpp:3 [backend fallback]
Named: registered at D:\a\_work\1\s\pytorch-directml\aten\src\ATen\core\NamedRegistrations.cpp:7 [backend fallback]
AutogradOther: registered at D:\a\_work\1\s\pytorch-directml\torch\csrc\autograd\generated\VariableType_0.cpp:9283 [autograd kernel]
AutogradCPU: registered at D:\a\_work\1\s\pytorch-directml\torch\csrc\autograd\generated\VariableType_0.cpp:9283 [autograd kernel]
AutogradCUDA: registered at D:\a\_work\1\s\pytorch-directml\torch\csrc\autograd\generated\VariableType_0.cpp:9283 [autograd kernel]
AutogradXLA: registered at D:\a\_work\1\s\pytorch-directml\torch\csrc\autograd\generated\VariableType_0.cpp:9283 [autograd kernel]
AutogradNestedTensor: registered at D:\a\_work\1\s\pytorch-directml\torch\csrc\autograd\generated\VariableType_0.cpp:9283 [autograd kernel]
UNKNOWN_TENSOR_TYPE_ID: registered at D:\a\_work\1\s\pytorch-directml\torch\csrc\autograd\generated\VariableType_0.cpp:9283 [autograd kernel]
AutogradPrivateUse1: registered at D:\a\_work\1\s\pytorch-directml\torch\csrc\autograd\generated\VariableType_0.cpp:9283 [autograd kernel]
AutogradPrivateUse2: registered at D:\a\_work\1\s\pytorch-directml\torch\csrc\autograd\generated\VariableType_0.cpp:9283 [autograd kernel]
AutogradPrivateUse3: registered at D:\a\_work\1\s\pytorch-directml\torch\csrc\autograd\generated\VariableType_0.cpp:9283 [autograd kernel]
Tracer: registered at D:\a\_work\1\s\pytorch-directml\torch\csrc\autograd\generated\TraceType_0.cpp:10499 [kernel]
Autocast: fallthrough registered at D:\a\_work\1\s\pytorch-directml\aten\src\ATen\autocast_mode.cpp:250 [backend fallback]
Batched: registered at D:\a\_work\1\s\pytorch-directml\aten\src\ATen\BatchingRegistrations.cpp:1016 [backend fallback]
VmapMode: fallthrough registered at D:\a\_work\1\s\pytorch-directml\aten\src\ATen\VmapModeRegistrations.cpp:33 [backend fallback]


## Evaluate the Model  

In [None]:
# Restore the best checkpoint written during training and score it on the
# held-out test split. map_location="cpu" lets the file deserialize regardless
# of the device it was saved from; the model is then moved to the active device.
model.load_state_dict(torch.load(file_path, map_location="cpu"))
model = model.to(device)
test_loss, test_acc = evaluate(model, test_iterator, criterion, device)
print(f"Test Loss: {test_loss:.4f}\nTest Accuracy: {test_acc * 100:.2f}%")