Training
===

In [1]:
import sys; sys.path.append('..')
import time
from math import ceil, floor
from os.path import join as opj, dirname

import tqdm
import torch
from torch.utils.data.dataloader import DataLoader
from torch.autograd import Variable
import torchvision.transforms as T

from src.dataset import StatoilIcebergDataset
from src.network import Net
from src.settings import logger
from src.tensorboard_logger import Logger
from src.utils import mkdir_r
import src.torchsample.transforms as TST

  return f(*args, **kwds)


## Define Const

In [2]:
train_data_path = '/home/rlan/datasets/statoil-iceberg/train.json'

BASE_DIR = '/home/rlan/projects/kaggle/kaggle-statoil-iceberg'
LOG_DIR = opj(BASE_DIR, 'log')
CHECKPOINTS_PATH = opj(BASE_DIR, 'checkpoints')
MAX_EPOCH = 30
BATCH_SIZE = 256

## Setup Logger

In [3]:
model_id = str(int(time.time()))
print('model_id: %s' % model_id)
tb_logger = Logger(opj(LOG_DIR, model_id))

model_id: 1514654380


## Transform

In [4]:
transform = T.Compose([T.ToTensor(), T.Lambda(lambda x: (x - x.min()) / (x.max() - x.min())), T.ToPILImage(),
                       T.RandomHorizontalFlip(), T.RandomVerticalFlip(), 
                       T.ColorJitter(brightness=0.7, contrast=0.5, saturation=0.5),
                       T.ToTensor(), TST.RandomRotate(15), TST.RandomShear(15), T.ToPILImage(),
                       T.RandomResizedCrop(size=75, scale=(0.7, 1.0)), T.ToTensor(),
                       T.Lambda(lambda x: x - x.mean())])

## Dataset

In [5]:
dataset = StatoilIcebergDataset(train_data_path, transform=transform)
loader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=8)

## Network

In [6]:
net = Net(input_channel=2).cuda() if torch.cuda.is_available() else Net(input_channel=2)
net.train()

Net(
  (conv1): Conv2d (2, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv1_bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True)
  (conv2): Conv2d (32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2_bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True)
  (conv3): Conv2d (32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3_bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True)
  (conv4): Conv2d (32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4_bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
  (conv5): Conv2d (64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv5_bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
  (conv6): Conv2d (64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv6_bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
  (fc1): Linear(in_features=20736, out_features=120)
  (fc1_bn): BatchNorm1d(120, eps=1e-05, momentum=0.1, affine=True)


## Loss and Optimizer

In [7]:
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
loss_fn = torch.nn.CrossEntropyLoss()

## Train

In [9]:
niter_per_epoch = ceil(len(dataset) / BATCH_SIZE)
pbar = tqdm.tqdm(range(niter_per_epoch * MAX_EPOCH))
for epoch in pbar:
    lr_scheduler.step()
    for i_batch, sampled_batch in enumerate(loader):
        data, target = sampled_batch

        if torch.cuda.is_available():
            data, target = Variable(data).cuda(), Variable(target).cuda()
        else:
            data, target = Variable(data), Variable(target)

        optimizer.zero_grad()
        pred = net(data)
        loss = loss_fn(pred, target.float())
        loss.backward()
        optimizer.step()
        pbar.set_description('Epoch: {:d}, Training loss: {:.4f}'.format(
            floor(pbar.n / niter_per_epoch), 
            loss.data[0]))
        tb_logger.scalar_summary('loss', loss.data[0], epoch * niter_per_epoch + i_batch + 1)

    # (2) Log values and gradients of the parameters (histogram)
    for tag, value in net.named_parameters():
        tag = tag.replace('.', '/')
        tb_logger.histo_summary(tag, value.data.cpu().numpy(), epoch + 1)
        tb_logger.histo_summary(tag+'/grad', value.grad.data.cpu().numpy(), epoch + 1)

    if (epoch + 1) % (niter_per_epoch * 5) == 0:
        cp_path = opj(CHECKPOINTS_PATH, model_id, 'model_%s' % epoch)
        mkdir_r(dirname(cp_path))
        torch.save(net.state_dict(), cp_path)



  0%|          | 0/210 [00:00<?, ?it/s][A








Epoch: 0, Training loss: 1.2062:   0%|          | 0/210 [00:00<?, ?it/s][A
Epoch: 0, Training loss: 0.4381:   0%|          | 0/210 [00:01<?, ?it/s][A
Epoch: 0, Training loss: 0.3809:   0%|          | 0/210 [00:01<?, ?it/s][A
Epoch: 0, Training loss: 0.3534:   0%|          | 0/210 [00:01<?, ?it/s][A
Epoch: 0, Training loss: 0.2801:   0%|          | 0/210 [00:01<?, ?it/s][A
Epoch: 0, Training loss: 0.3040:   0%|          | 0/210 [00:01<?, ?it/s][A
Epoch: 0, Training loss: 0.2894:   0%|          | 0/210 [00:01<?, ?it/s][A
Epoch: 0, Training loss: 0.2894:   0%|          | 1/210 [00:01<06:31,  1.87s/it][A








Exception in thread Thread-5:
Traceback (most recent call last):
  File "/home/rlan/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/rlan/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/home/rlan/a

KeyboardInterrupt: 

Process Process-684:
Traceback (most recent call last):
  File "/home/rlan/anaconda3/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/rlan/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/rlan/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 36, in _worker_loop
    r = index_queue.get()
  File "/home/rlan/anaconda3/lib/python3.6/multiprocessing/queues.py", line 342, in get
    res = self._reader.recv_bytes()
  File "/home/rlan/anaconda3/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/home/rlan/anaconda3/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/home/rlan/anaconda3/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt
