In [1]:
import torch
import numpy as np
import os
import resnet
import time
import matplotlib.pyplot as plt
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
print(device)

cuda


In [2]:
INIT_LR = 1e-3
BATCH_SIZE = 16 #100
EPOCHS = 10 #10
# define the train and val splits
TRAIN_SPLIT = 0.75
VAL_SPLIT = 1 - TRAIN_SPLIT

In [3]:
class LensTrainDataset(torch.utils.data.Dataset):
    def __init__(self, img_dir, u_length):
        self.img_dir = img_dir
        self.sub_dirs = ['no','sphere','vort']
        self.img_labels = list(range(len(self.sub_dirs)))
        self.u_length = u_length

    def __len__(self):
        return len(self.img_labels)*self.u_length

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, "%s/%s.npy"%(self.sub_dirs[int(idx/self.u_length)],idx+1-int(idx/self.u_length)*self.u_length))   
        image = torch.Tensor(np.load(img_path))
        label = self.img_labels[int(idx/self.u_length)]
        # if self.transform:
        #     image = self.transform(image)
        # if self.target_transform:
        #     label = self.target_transform(label)
        return image, label
    
class LensTestDataset(torch.utils.data.Dataset):
    def __init__(self, img_dir, u_length):
        self.img_dir = img_dir
        self.sub_dirs = ['no','sphere','vort']
        self.img_labels = list(range(len(self.sub_dirs)))
        self.u_length = u_length

    def __len__(self):
        return len(self.img_labels)*self.u_length

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, "%s/%s.npy"%(self.sub_dirs[int(idx/self.u_length)],idx+1-int(idx/self.u_length)*self.u_length))    
        image = torch.Tensor(np.load(img_path))
        label = self.img_labels[int(idx/self.u_length)]
        # if self.transform:
        #     image = self.transform(image)
        # if self.target_transform:
        #     label = self.target_transform(label)
        return image, label

In [4]:
train_dataset = LensTrainDataset('../dataset/train/',10000)
test_dataset = LensTestDataset('../dataset/val/',2500)
labels_length = len(train_dataset.img_labels)
numTrainSamples = int(len(train_dataset) * TRAIN_SPLIT)
numValSamples = int(len(train_dataset) * VAL_SPLIT)
(train_dataset, val_dataset) = torch.utils.data.random_split(train_dataset,
	[numTrainSamples, numValSamples],
	generator=torch.Generator().manual_seed(42))

train_dataloader = torch.utils.data.DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset,batch_size=BATCH_SIZE,shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=True)

trainSteps = len(train_dataloader.dataset) // BATCH_SIZE
valSteps = len(val_dataloader.dataset) // BATCH_SIZE

print("[INFO] initializing the ResNet model...")
model = resnet.ResNet50().to(device)

opt = torch.optim.Adam(model.parameters(), lr=INIT_LR)
lossFn = torch.nn.NLLLoss()

H = {
	"train_loss": [],
	"train_acc": [],
	"val_loss": [],
	"val_acc": []
}
# measure how long training is going to take
print("[INFO] training the network...")
startTime = time.time()
for e in range(0, EPOCHS):
    # set the model in training mode
    model.train()
    # initialize the total training and validation loss
    totalTrainLoss = 0
    totalValLoss = 0
    # initialize the number of correct predictions in the training
    # and validation step
    trainCorrect = 0
    valCorrect = 0
    # loop over the training set
    for (x, y) in train_dataloader:
        # perform a forward pass and calculate the training loss
        (x, y) = (x.to(device), y.to(device))
        pred = model(x)
        loss = lossFn(pred, y)
        # zero out the gradients, perform the backpropagation step,
        # and update the weights
        opt.zero_grad()
        loss.backward()
        opt.step()
        # add the loss to the total training loss so far and
        # calculate the number of correct predictions
        totalTrainLoss += loss
        trainCorrect += (pred.argmax(1) == y).type(torch.float).sum().item()
    with torch.no_grad():
		# set the model in evaluation mode
        model.eval()
        # loop over the validation set
        for (x, y) in val_dataloader:
            (x,y) = (x.to(device), y.to(device))
            # make the predictions and calculate the validation loss
            pred = model(x)
            totalValLoss += lossFn(pred, y)
            # calculate the number of correct predictions
            valCorrect += (pred.argmax(1) == y).type(
                torch.float).sum().item()
    # calculate the average training and validation loss
    avgTrainLoss = totalTrainLoss / trainSteps
    avgValLoss = totalValLoss / valSteps
    # calculate the training and validation accuracy
    trainCorrect = trainCorrect / len(train_dataloader.dataset)
    valCorrect = valCorrect / len(val_dataloader.dataset)
    # update our training history
    H["train_loss"].append(avgTrainLoss.cpu().detach().numpy())
    H["train_acc"].append(trainCorrect) 
    H["val_loss"].append(avgValLoss.cpu().detach().numpy())
    H["val_acc"].append(valCorrect)
    # print the model training and validation information
    print("[INFO] EPOCH: {}/{}".format(e + 1, EPOCHS))
    print("Train loss: {:.6f}, Train accuracy: {:.4f}".format(
        avgTrainLoss, trainCorrect))
    print("Val loss: {:.6f}, Val accuracy: {:.4f}\n".format(
        avgValLoss, valCorrect))

# finish measuring how long training took
endTime = time.time()
print("[INFO] total time taken to train the model: {:.2f}s".format(
	endTime - startTime))



[INFO] initializing the ResNet model...
[INFO] training the network...


../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [2,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [3,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [4,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [6,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [7,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [8,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_f

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# we can now evaluate the network on the test set
print("[INFO] evaluating network...")
# turn off autograd for testing evaluation
with torch.no_grad():
	# set the model in evaluation mode
	model.eval()
	
	# initialize a list to store our predictions
	preds = []
	# loop over the test set
	for (x, y) in test_dataloader:
		x = x.to(device)
		# make the predictions and add them to the list
		pred = model(x)
		preds.extend(pred.argmax(axis=1).cpu().numpy())
# generate a classification report
# print(classification_report(test_dataloader.dataset.targets.cpu().numpy(),
# 	np.array(preds), target_names=test_dataloader.dataset.classes))

# plot the training loss and accuracy


In [None]:
plt.style.use("ggplot")
plt.figure()
plt.plot(H["train_loss"], label="train_loss")
plt.plot(H["val_loss"], label="val_loss")
plt.plot(H["train_acc"], label="train_acc")
plt.plot(H["val_acc"], label="val_acc")
plt.title("Training Loss and Accuracy on Dataset")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend(loc="lower left")
plt.savefig('plot.png')
# serialize the model to disk
torch.save(model, 'model.pth')