In [None]:
%load_ext autoreload
%autoreload 2

import os
import yaml
import time
import random
import numpy as np
from tqdm import tqdm

import torch
from torch.utils import data
from tensorboardX import SummaryWriter

# Datasets
from data_loader.mnist import NoisyMNISTDataLoader

# Models
from models.autoencoder import SmallAutoEncoder
# Loss
from loss.cross_entropy_2d import CrossEntropy2d
# Metrics
from utils.metrics import MetricsComp, AverageComp

def train(config, data_set):
    """Train a SmallAutoEncoder on ``data_set`` and validate after every epoch.

    Each epoch runs one pass over the training loader with gradient
    accumulation, then a capped validation pass whose loss and metrics are
    written to a tensorboard log. Optionally a checkpoint is saved per epoch.

    Args:
        config: dict of run settings. Required keys: "seed", "batch_sz",
            "num_workers", "epochs", "save_ckpoint". Optional keys:
            "exp_name" (defaults to "") and "resume_ckpoint" (path of a
            checkpoint previously written by this function).
        data_set: callable invoked as ``data_set(split="training")`` and
            ``data_set(split="validation")``; the training instance must
            expose ``name`` and may expose ``class_imbalance_weight``.

    Raises:
        FileNotFoundError: if ``config["resume_ckpoint"]`` is set but does
            not point to an existing file.
    """
    # Number of samples whose gradients are accumulated before one optimizer step.
    grad_step_samples = 100
    # Validation is capped at this many batches per epoch.
    max_val_batches = 11
    n_classes = 2

    # setup seeds (Python's own RNG included, so every imported RNG is seeded)
    seed = config["seed"]
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.empty_cache()

    # setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # setup dataloader
    train_data_set = data_set(split="training")
    train_data = data.DataLoader(train_data_set, batch_size=config["batch_sz"],
                                 num_workers=config["num_workers"], shuffle=True)
    val_data = data.DataLoader(data_set(split="validation"), batch_size=config["batch_sz"],
                               num_workers=config["num_workers"])

    # setup model
    model = SmallAutoEncoder().to(device)

    # setup optimizer
    optimizer = torch.optim.Adam(model.parameters())

    # setup loss: class weights are optional on the dataset; only a missing
    # (or None) attribute falls back to an unweighted loss, other errors surface.
    loss_fn_weight = getattr(train_data_set, "class_imbalance_weight", None)
    if loss_fn_weight is not None:
        loss_fn_weight = loss_fn_weight.to(device)
    loss_fn = CrossEntropy2d(weight=loss_fn_weight)

    # setup tensorboard writer & checkpoint dir
    exp_name = config.get("exp_name", "")
    uniq_name = f"{train_data_set.name}_{model.name}_{exp_name}_logs"
    writer = SummaryWriter(log_dir="checkpoints/" + uniq_name)

    # The writer is closed on every exit path so buffered events are not lost.
    try:
        i = 0
        # Load a saved checkpoint
        resume_path = config.get("resume_ckpoint")
        if resume_path is not None:
            if not os.path.isfile(resume_path):
                raise FileNotFoundError("Unable to load saved checkpoint!, Quitting")
            print(f"Loading checkpoint: {resume_path}")
            # map_location lets a checkpoint saved on GPU be resumed on CPU (and vice versa).
            ckpoint = torch.load(resume_path, map_location=device)
            model.load_state_dict(ckpoint["model_state"])
            optimizer.load_state_dict(ckpoint["optimizer_state"])
            i = ckpoint["epoch"]
            print(f"Saved epoch: {i}, loss: {ckpoint['epoch_loss']}, time: {ckpoint['epoch_time']}")

        while i < config["epochs"]:
            i += 1
            epoch_loss = 0
            epoch_time = 0
            # Prepare for training
            model.train()
            optimizer.zero_grad()
            acc_gradients_batch = 0

            print(f"Starting Epoch: {i}")
            for images, labels in train_data:
                # Timing starts once the batch is available, so time spent
                # waiting on the loader is not part of epoch_time.
                start_ts = time.time()
                images = images.to(device)
                labels = labels.to(device)

                out = model(images) # [batch_sz, 2, H=28, W=28]
                acc_gradients_batch += out.shape[0]

                loss = loss_fn(out, labels)
                loss.backward()

                # See if it is time to take a gradient step.
                if acc_gradients_batch >= grad_step_samples:
                    optimizer.step()
                    optimizer.zero_grad()
                    acc_gradients_batch = 0

                epoch_loss += loss.item()
                epoch_time += time.time() - start_ts

            # Apply gradients still pending from the last batches; otherwise the
            # next epoch's zero_grad() would silently throw them away.
            if acc_gradients_batch > 0:
                optimizer.step()
                optimizer.zero_grad()

            print(f"Epoch Loss: {epoch_loss}")
            writer.add_scalar("epoch_loss", epoch_loss, i)

            # Fresh accumulators every epoch so the logged validation numbers
            # describe this epoch only (assuming update() accumulates, which the
            # steadily growing scores of earlier runs suggest).
            metrics_comp = MetricsComp(n_classes)
            val_loss_avg_comp = AverageComp()

            # Run through validation
            model.eval()
            with torch.no_grad():
                for val_idx, (images_val, labels_val) in tqdm(enumerate(val_data)):
                    if val_idx >= max_val_batches:
                        break
                    images_val = images_val.to(device)
                    labels_val = labels_val.to(device)
                    out = model(images_val) # [batch_sz, 2, H=28, W=28]
                    val_loss = loss_fn(out, labels_val)
                    val_loss_avg_comp.update(val_loss.item())

                    # Predicted class = index of the largest value along dim 1.
                    _, pred = out.max(1)

                    metrics_comp.update(label_trues=labels_val.cpu().numpy(),
                                        label_preds=pred.cpu().numpy())

            # Add validation results to tensorboard writer
            writer.add_scalar("val_loss", val_loss_avg_comp.avg, i)
            overall_scores, class_iou = metrics_comp.get_results()
            print(f"Scores after epoch {i}: {overall_scores}")
            for k, v in overall_scores.items():
                writer.add_scalar(f"val_metrics/{k}", v, i)
            for k, v in class_iou.items():
                writer.add_scalar(f"val_metrics/cls_iou_{k}", v, i)

            if config["save_ckpoint"]:
                # Save the model checkpoint
                ckpoint = {"epoch": i,
                           "model_state": model.state_dict(),
                           "optimizer_state": optimizer.state_dict(),
                           "epoch_loss": epoch_loss,
                           "epoch_time": epoch_time,
                          }
                # i % 3 cycles through three file names, so only the three
                # most recent epochs are kept on disk.
                ckpoint_name = f"checkpoints/{uniq_name}_{i%3}.pkl"
                torch.save(ckpoint, ckpoint_name)
    finally:
        writer.close()
    
# Run settings handed to train() for the denoising experiment.
# NOTE(review): a previous remark here read "TMP batch size reduced to 8->4",
# which does not match batch_sz below -- confirm the intended value.
config = dict(
    exp_name="denoise",
    batch_sz=256,
    epochs=500,
    seed=3642,
    num_workers=1,
    save_ckpoint=True,
    # To resume a run, also pass: resume_ckpoint="checkpoints/fcn32s_3.pkl"
)

train(config, NoisyMNISTDataLoader)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Generating 60000 images
Generating 10000 images
Loss weight: None
Starting Epoch: 1


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 71.61345940828323


11it [00:00, 18.70it/s]


Scores after epoch 1: {'Overall Acc: ': 0.8765237273886828, 'FreqW Acc : ': 1681150.1773176598, 'Mean IoU : ': 967570.0}
Starting Epoch: 2


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 52.347816959023476


11it [00:00, 22.08it/s]


Scores after epoch 2: {'Overall Acc: ': 0.8862062358679267, 'FreqW Acc : ': 3328788.988586992, 'Mean IoU : ': 1956516.5}
Starting Epoch: 3


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 42.60454726219177


11it [00:00, 21.34it/s]


Scores after epoch 3: {'Overall Acc: ': 0.8970804586038961, 'FreqW Acc : ': 4958815.580483969, 'Mean IoU : ': 2970786.0}
Starting Epoch: 4


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 37.23239415884018


11it [00:00, 22.07it/s]


Scores after epoch 4: {'Overall Acc: ': 0.9049821899640538, 'FreqW Acc : ': 6607596.367039838, 'Mean IoU : ': 3995938.0}
Starting Epoch: 5


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 34.195771247148514


11it [00:00, 22.15it/s]


Scores after epoch 5: {'Overall Acc: ': 0.9112340017683209, 'FreqW Acc : ': 8260222.852051235, 'Mean IoU : ': 5029428.5}
Starting Epoch: 6


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 31.972687289118767


11it [00:00, 21.85it/s]


Scores after epoch 6: {'Overall Acc: ': 0.9157770707714904, 'FreqW Acc : ': 9919958.915438566, 'Mean IoU : ': 6065404.0}
Starting Epoch: 7


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 30.677599228918552


11it [00:00, 21.69it/s]


Scores after epoch 7: {'Overall Acc: ': 0.9195685731679035, 'FreqW Acc : ': 11579014.704916874, 'Mean IoU : ': 7105602.0}
Starting Epoch: 8


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 29.5351674631238


11it [00:00, 22.16it/s]


Scores after epoch 8: {'Overall Acc: ': 0.9226802337589866, 'FreqW Acc : ': 13240378.176049398, 'Mean IoU : ': 8148167.0}
Starting Epoch: 9


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 28.688684955239296


11it [00:00, 18.70it/s]


Scores after epoch 9: {'Overall Acc: ': 0.9253074631841373, 'FreqW Acc : ': 14901733.017191306, 'Mean IoU : ': 9192789.0}
Starting Epoch: 10


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 28.046926364302635


11it [00:00, 18.68it/s]


Scores after epoch 10: {'Overall Acc: ': 0.9275160978809137, 'FreqW Acc : ': 16563258.571022274, 'Mean IoU : ': 10238590.5}
Starting Epoch: 11


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 27.356624789536


11it [00:00, 22.04it/s]


Scores after epoch 11: {'Overall Acc: ': 0.9294425358276058, 'FreqW Acc : ': 18231716.066647675, 'Mean IoU : ': 11285841.5}
Starting Epoch: 12


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 26.853881739079952


11it [00:00, 21.99it/s]


Scores after epoch 12: {'Overall Acc: ': 0.9309971325177798, 'FreqW Acc : ': 19887701.59208495, 'Mean IoU : ': 12332420.0}
Starting Epoch: 13


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 26.41144237667322


11it [00:00, 21.92it/s]


Scores after epoch 13: {'Overall Acc: ': 0.9324938105978843, 'FreqW Acc : ': 21562709.538058307, 'Mean IoU : ': 13381599.5}
Starting Epoch: 14


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 26.01795619726181


11it [00:00, 21.60it/s]


Scores after epoch 14: {'Overall Acc: ': 0.9337890042634343, 'FreqW Acc : ': 23225558.880563598, 'Mean IoU : ': 14430969.5}
Starting Epoch: 15


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 25.60328460484743


11it [00:00, 21.10it/s]


Scores after epoch 15: {'Overall Acc: ': 0.9350014313253711, 'FreqW Acc : ': 24900320.7328943, 'Mean IoU : ': 15481828.5}
Starting Epoch: 16


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 25.218156442046165


11it [00:00, 21.35it/s]


Scores after epoch 16: {'Overall Acc: ': 0.9361051088803775, 'FreqW Acc : ': 26570186.072477154, 'Mean IoU : ': 16533443.5}
Starting Epoch: 17


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 24.871228240430355


11it [00:00, 21.94it/s]


Scores after epoch 17: {'Overall Acc: ': 0.936934876933728, 'FreqW Acc : ': 28225953.15818048, 'Mean IoU : ': 17582355.0}
Starting Epoch: 18


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 24.66270024329424


11it [00:00, 22.22it/s]


Scores after epoch 18: {'Overall Acc: ': 0.9378759997133324, 'FreqW Acc : ': 29896814.850665655, 'Mean IoU : ': 18635311.0}
Starting Epoch: 19


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 24.334876403212547


11it [00:00, 21.91it/s]


Scores after epoch 19: {'Overall Acc: ': 0.9387059226165048, 'FreqW Acc : ': 31578603.551331587, 'Mean IoU : ': 19688012.5}
Starting Epoch: 20


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 24.103244625031948


11it [00:00, 21.28it/s]


Scores after epoch 20: {'Overall Acc: ': 0.939504217880334, 'FreqW Acc : ': 33249856.30604545, 'Mean IoU : ': 20741848.0}
Starting Epoch: 21


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 23.88468150794506


11it [00:00, 21.01it/s]


Scores after epoch 21: {'Overall Acc: ': 0.9402439775626159, 'FreqW Acc : ': 34925158.88469949, 'Mean IoU : ': 21796089.0}
Starting Epoch: 22


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 23.657935604453087


11it [00:00, 21.29it/s]


Scores after epoch 22: {'Overall Acc: ': 0.9409301572514336, 'FreqW Acc : ': 36597853.826344, 'Mean IoU : ': 22850662.0}
Starting Epoch: 23


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 23.506035789847374


11it [00:00, 22.17it/s]


Scores after epoch 23: {'Overall Acc: ': 0.941569725948062, 'FreqW Acc : ': 38270001.38034618, 'Mean IoU : ': 23905566.5}
Starting Epoch: 24


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 23.258119739592075


11it [00:00, 22.23it/s]


Scores after epoch 24: {'Overall Acc: ': 0.94219159165797, 'FreqW Acc : ': 39948913.391751945, 'Mean IoU : ': 24961414.0}
Starting Epoch: 25


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 23.187062554061413


11it [00:00, 21.84it/s]


Scores after epoch 25: {'Overall Acc: ': 0.9427582183441559, 'FreqW Acc : ': 41625354.105318375, 'Mean IoU : ': 26017110.0}
Starting Epoch: 26


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 23.02402748912573


11it [00:00, 22.23it/s]


Scores after epoch 26: {'Overall Acc: ': 0.9432975994429678, 'FreqW Acc : ': 43300367.88441504, 'Mean IoU : ': 27073275.0}
Starting Epoch: 27


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 22.8247454687953


11it [00:00, 18.31it/s]


Scores after epoch 27: {'Overall Acc: ': 0.9438140707671958, 'FreqW Acc : ': 44979414.22446262, 'Mean IoU : ': 28129948.0}
Starting Epoch: 28


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 22.665088079869747


11it [00:00, 22.07it/s]


Scores after epoch 28: {'Overall Acc: ': 0.9442807745307945, 'FreqW Acc : ': 46659071.727253705, 'Mean IoU : ': 29186223.0}
Starting Epoch: 29


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 22.530371502041817


11it [00:00, 18.72it/s]


Scores after epoch 29: {'Overall Acc: ': 0.944727911981799, 'FreqW Acc : ': 48334356.97691128, 'Mean IoU : ': 30242902.0}
Starting Epoch: 30


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 22.42612673342228


11it [00:00, 18.84it/s]


Scores after epoch 30: {'Overall Acc: ': 0.9451700770862322, 'FreqW Acc : ': 50014795.341243826, 'Mean IoU : ': 31300403.5}
Starting Epoch: 31


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 22.30680575966835


11it [00:00, 18.60it/s]


Scores after epoch 31: {'Overall Acc: ': 0.9455851473195583, 'FreqW Acc : ': 51695114.51178579, 'Mean IoU : ': 32357954.0}
Starting Epoch: 32


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 22.168194212019444


11it [00:00, 21.71it/s]


Scores after epoch 32: {'Overall Acc: ': 0.945979102423107, 'FreqW Acc : ': 53375585.03526677, 'Mean IoU : ': 33415675.0}
Starting Epoch: 33


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 22.060218684375286


11it [00:00, 21.89it/s]


Scores after epoch 33: {'Overall Acc: ': 0.9463554541457595, 'FreqW Acc : ': 55052216.9977357, 'Mean IoU : ': 34473624.5}
Starting Epoch: 34


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


Epoch Loss: 21.952831253409386


11it [00:00, 21.98it/s]


Scores after epoch 34: {'Overall Acc: ': 0.9467172744452895, 'FreqW Acc : ': 56732657.89485783, 'Mean IoU : ': 35531859.5}
Starting Epoch: 35


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4f4423c290>
Traceback (most recent call last):
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/google/home/ammarh/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process
