In [None]:
%precision %.5g
from tqdm import tqdm_notebook

# Set to True to skip the training cell and only run the inference cells.
INFERENCE_ONLY = False

import os
# expanduser('~') returns $HOME when it is set and still resolves a home
# directory when it is not, whereas os.environ['HOME'] raises KeyError.
root = os.path.expanduser('~') + '/'
competition_name = "tgs-salt-identification-challenge"
short_comp_name = "salt"
# Directory paths. SAVE_PATH, SUBMISSIONS_PATH and SRC_PATH end with '/' because
# later code appends file and directory names to them by string concatenation.
DATASET_PATH     = root + "datasets/competitions/" + competition_name
SAVE_PATH        = root + "models/" + short_comp_name + '/'
SUBMISSIONS_PATH = root + "submissions/" + short_comp_name + '/'
SRC_PATH         = root + "code/kaggle_salt/code_gazay/"

# Source directories that are put on sys.path / handed to the trainer.
COMP_SRC_PATH    = SRC_PATH + short_comp_name
LENIN_SRC_PATH   = SRC_PATH + "lenin"
SCRIPTS_PATH     = SRC_PATH + "scripts"

# Architecture name used to build the model name for a training run.
ARCH_NAME = 'albunet34'

# Debug
from IPython.core.debugger import set_trace

# stdlib
import sys
import numpy as np
import pandas as pd
import subprocess

# Lenin straight from repo
sys.path.insert(0, LENIN_SRC_PATH)
import lenin
from lenin import test
from lenin.datasets.salt import Dataset

# Torch
import torch
from torch import nn

# Plots in IPython mode
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-white')
import seaborn as sns
sns.set_style("white")

# Utils from competition directory
sys.path.insert(0, COMP_SRC_PATH)
from src.utils import id_generator
from src.utils.metrics import old_iou_lb, old_iou_numpy, intersection_over_union, intersection_over_union_thresholds
from src.utils.split_data import split_data
from src.utils.randomization import set_random_seed, base_model_name
from src.utils.train_wrapper import Trainer
from src.utils.plot import show_images
from src.utils.rle import rle_encoding

# Model architecture
from src.models.albunet34 import new_model, load_model

# Images resizing
from src.utils.processors import Resizers, Processors

# Data splitting
from src.utils.split_data import split_data

In [None]:
# Randomization: set the random seed and keep the value returned by the helper.
# RANDOM_SEED is later passed to split_data and stored under 'random_seed' in the
# training options. (A `global` statement is not needed at notebook top level.)
RANDOM_SEED = set_random_seed()

In [None]:
# Build the dataset wrapper for the competition data and attach the pre- and
# post-processing pipelines produced by Processors in 'resize_pad_crop' mode.
dataset = Dataset(DATASET_PATH)
procs = Processors(aug_type='resize_pad_crop')
dataset.preprocessors = procs.pre()
dataset.postprocessors = procs.post()
# Debug helpers, currently disabled: restrict training to the first 500 records /
# preview the dataset.
#dataset.train = dataset.train[0:500]
#show_images(dataset)
# Later cells apply this to every predicted map before thresholding or scoring.
# Presumably it maps network-sized outputs back to the original image size - TODO confirm.
downsample_fn = procs.resizers.downsampler

In [None]:
# Training cell: everything in this block runs only when INFERENCE_ONLY is False.
if not INFERENCE_ONLY:
    # Name for this training run, built from the architecture name by the project helper.
    MODEL_NAME = base_model_name(ARCH_NAME)
    print(MODEL_NAME)
    
    # Dependencies needed only for training.
    import torchbearer
    from torchbearer import metrics
    from lenin import train

    # NOTE(review): COMP_SRC_PATH was already inserted into sys.path in the setup cell,
    # so this adds a duplicate entry; most names below are also already imported there
    # (cross_validate is the only new one).
    sys.path.insert(0, COMP_SRC_PATH)
    from src.utils import id_generator
    from src.utils.metrics import intersection_over_union, intersection_over_union_thresholds
    from src.utils.split_data import split_data, cross_validate
    from src.utils.randomization import set_random_seed, base_model_name
    from src.utils.train_wrapper import Trainer

    def get_iou_vector(y_t, y_p):
        """Mean thresholded-IoU score over a batch of masks (NumPy version).

        For every item along the first axis, both arrays are binarised with
        ``> 0``, the intersection-over-union of the two masks is computed
        (1e-10 is added to numerator and denominator, so two empty masks give
        an IoU of 1), and the item's score is the fraction of cut-offs in
        ``np.arange(0.5, 1, 0.05)`` that the IoU strictly exceeds.

        Args:
            y_t: ground-truth array, indexed by batch item on the first axis.
            y_p: prediction array with the same layout as ``y_t``.

        Returns:
            The mean of the per-item scores.
        """
        cutoffs = np.arange(0.5, 1, 0.05)
        per_item_scores = []
        for idx in range(y_t.shape[0]):
            truth_mask = y_t[idx] > 0
            pred_mask = y_p[idx] > 0

            overlap = np.sum(np.logical_and(truth_mask, pred_mask) > 0)
            combined = np.sum(np.logical_or(truth_mask, pred_mask) > 0)
            ratio = (overlap + 1e-10) / (combined + 1e-10)

            # Fraction of cut-offs this item's IoU clears.
            per_item_scores.append(np.mean([ratio > cutoff for cutoff in cutoffs]))

        return np.mean(per_item_scores)

    # Metric
    # Metric
    @metrics.default_for_key('acc')
    @metrics.running_mean
    @metrics.std
    @metrics.mean
    @metrics.lambda_metric('acc', on_epoch=False)
    def iou_pytorch(y_pred: torch.Tensor, y_true: torch.Tensor, prob_thres=0):
        """Mean thresholded-IoU score over a batch, computed with torch ops.

        Registered through the torchbearer decorators above under the key 'acc'.

        For every item along the first axis the ground truth is binarised with
        ``> 0`` and the prediction with ``> prob_thres``. The item's score is the
        fraction of cut-offs in ``arange(0.5, 1, 0.05)`` that its
        intersection-over-union strictly exceeds (1e-10 is added to numerator
        and denominator, so two empty masks give an IoU of 1).

        Args:
            y_pred: predicted values, batch on the first axis.
            y_true: ground-truth masks with the same layout, on the same device.
            prob_thres: cut-off applied to ``y_pred``; the default 0 matches the
                previously hard-coded ``> 0``.

        Returns:
            A scalar tensor with the mean score (NaN for an empty batch).
        """
        batch_size = y_true.shape[0]
        # Work on whatever device the inputs live on instead of forcing CUDA,
        # so the metric also runs on CPU-only machines.
        device = y_pred.device
        thresholds = torch.arange(0.5, 1, 0.05, device=device).float()

        scores = []
        for idx in range(batch_size):
            t, p = y_true[idx] > 0, y_pred[idx] > prob_thres

            intersection = torch.sum(t & p).float()
            union = torch.sum(t | p).float()
            iou = (intersection + 1e-10) / (union + 1e-10)
            scores.append((iou > thresholds).float().mean())

        if not scores:
            # Mean over an empty batch is undefined; keep the NaN result.
            return torch.tensor(float('nan'), device=device)
        return torch.stack(scores).mean()
    
    # Hyperparams
    # Hyperparams
    val_size = 0.10
    stratify = True
    orig_options = {
        'random_seed': RANDOM_SEED,
        # For debug purposes the number of workers is set to 0.
        # (Loader options tried earlier and now disabled:
        #  'pin_memory': True, 'worker_init_fn': _init_fn)
        'preload': dict(num_workers=0),

        # Augmentations applied to the ('image', 'mask') pair.
        'augment': {
            ('image', 'mask'): [
                {'type': aug_name}
                for aug_name in ('HorizontalFlip', 'ShiftScaleRotate', 'Blur', 'GaussNoise')
                # Currently disabled: RandomGamma, OpticalDistortion, GridDistortion,
                # ElasticTransform, HueSaturationValue, RandomBrightness, RandomContrast,
                # MotionBlur, MedianBlur, CLAHE, JpegCompression.
            ],
        },
        'batch_size': 96,
        'optimizer': ('adam', dict(lr=1e-4)),
        'epochs': 10,
        'loss': 'bce_xloss',
        'metrics': ['loss', 'acc'],
        'val_size': val_size,
        'stratify_split': stratify,
        # The train/validation split is computed once here and shared through the options.
        'split': split_data(dataset, RANDOM_SEED, stratify=stratify, val_size=val_size),
    }

    # Training schedule handed to trainer.sequence together with the base options.
    # Step 0 lists no settings of its own; steps 1 and 2 use the 'lovasz' loss for
    # 100 epochs each, with learning rates 1e-4 and 1e-5.
    steps = [
        dict(step=0),
        dict(step=1, optimizer=('adam', dict(lr=1e-4)), loss='lovasz', epochs=100),
        dict(step=2, optimizer=('adam', dict(lr=1e-5)), loss='lovasz', epochs=100),
    ]
    
    # Name under which this run is saved: "jupyter_" plus an id built from MODEL_NAME
    # by the project helper (exact format is defined in src.utils.id_generator).
    model_name = "jupyter_" + id_generator(MODEL_NAME, with_folds=False)
    # Alternative kept for reference: reuse an existing run's base name with a
    # per-step / epoch / val_loss / val_acc file-name template.
    #model_base = '2018-10-09-162719_b3b5b9e_seed-4621_albunet34'
    #model_name = 'st%i_' + model_base + '_--_{epoch:02d}_{val_loss:.4f}_{val_acc:.4f}.pt'

    # Wire the project Trainer with the save/code locations, the dataset, the
    # albunet34 model constructors and lenin's train function.
    trainer = Trainer(save_path=SAVE_PATH,
                      code_path=SCRIPTS_PATH,
                      dataset=dataset,
                      model_name=model_name,
                      model_load_fn=load_model,
                      model_new_fn=new_model,
                      train_fn=train)

    # Run the schedule. Presumably each step's keys override the matching keys of
    # orig_options - TODO confirm in Trainer.sequence.
    trainer.sequence(steps, **orig_options)

In [None]:
# NOTE(review): leftover post-mortem debugging magic; it opens an interactive
# debugger on the last traceback. Consider removing it before sharing the notebook.
%debug

In [None]:
# Scratch check of torch.cat on 1-D long tensors.
# `t` is created here so the cell runs on a fresh kernel and gives the same
# result on every re-run (it previously depended on a `t` left over in the kernel).
t = torch.tensor([]).long()
t = torch.cat((t, torch.tensor([1]).long()))
t

# INFERENCE

In [None]:
# Default choice: the last entry `ls` prints for SAVE_PATH. Index [-2] is used
# because splitting on "\n" leaves an empty string after the trailing newline.
_DIR = subprocess.check_output(['ls', SAVE_PATH]).decode('utf8').split("\n")[-2] + '/'
# NOTE(review): this hard-coded run directory overrides the value computed above,
# so the listing result is discarded.
_DIR = 'folds_2018-10-10_1c1844b_seed-4621_albunet34/'

# Show all saved runs, then the contents of the chosen run and its step-0 parameters file.
!ls {SAVE_PATH}
print('----')
!ls {SAVE_PATH + _DIR}
!cat {SAVE_PATH + _DIR + 'params_st0.json'}

In [None]:
# Load the saved test predictions of each fold.
# Only the first four entries that `ls` prints for the run directory are visited;
# in each one, the first file whose name ends with 'test_preds.npy' is loaded.
_dir = SAVE_PATH + _DIR
fold_entries = subprocess.check_output(['ls', _dir]).decode('utf8').split("\n")[:4]

arrs = []
for _d in fold_entries:
    _dir_d = _dir + _d + '/'
    fold_listing = subprocess.check_output(['ls', _dir_d]).decode('utf8').split("\n")
    arr = next(x for x in fold_listing if x.endswith('test_preds.npy'))
    arrs.append(np.load(_dir_d + arr))

In [None]:
# Element-wise sum of the per-fold prediction arrays.
# Start from a copy and leave `arrs` untouched so that re-running this cell gives
# the same result: popping from `arrs` and adding in place used to drop one fold
# and modify the loaded data on every re-run.
predictions = arrs[0].copy()
for arr in arrs[1:]:
    predictions += arr

In [None]:
import copy
# Keep an independent copy of the summed fold predictions; `predictions` is
# rebound by the single-model inference cells further down.
fold_predictions = copy.deepcopy(predictions)

In [None]:
# Submission from the summed fold predictions: binarise at 0, run-length encode,
# and write an (id, rle_mask) CSV named after the folds run.
threshold_best = 0
binary_predictions = (predictions > threshold_best).astype(int)
all_masks = rle_encoding(binary_predictions)

model_name = 'folds_2018-10-10-223412_1c1844b_seed-4621_albunet34_thres0.pt'

# The submission id is the second '-'-separated field of each test record.
record_ids = [rec.split('-')[1] for rec in dataset.test]
submit = pd.DataFrame([record_ids, all_masks]).T
submit.columns = ['id', 'rle_mask']

csv_name = model_name.split('/')[-1].replace('.pt', '.csv')
submission_path = SUBMISSIONS_PATH + csv_name
submit.to_csv(submission_path, index=False)

In [None]:
# Default choice: the last entry `ls` prints for the run directory ([-2] skips the
# empty string left after the trailing newline).
model_name = subprocess.check_output(['ls', SAVE_PATH + _DIR]).decode('utf8').split("\n")[-2]
# NOTE(review): this hard-coded checkpoint overrides the value computed above.
model_name = 'st0_2018-10-10-045239_1f5e39f_seed-4621_albunet34_--_53_0.0247_0.8361.pt'

# The last '_'-separated field of the checkpoint name is "<val_acc>.pt".
# Parse it as a number rather than slicing the digits after the decimal point,
# so an accuracy of 1.0000 is reported as 100.00 instead of 00.00.
acc = float(os.path.splitext(model_name.split('_')[-1])[0])
print("Accuracy: {:.2f}".format(acc * 100))

model = load_model(SAVE_PATH + _DIR + model_name)

In [None]:
threshold_best = 0

# TODO: implement the metric (translated from the original note).
# Recreate the validation split of the loaded checkpoint from the seed embedded
# in its file name ("..._seed-<N>_...").
seed = int(model_name.split('seed-')[-1].split('_')[0])
RANDOM_SEED = set_random_seed(seed)
val_size = 0.1
stratify_split = True

# NOTE(review): `train` here rebinds the name of lenin's `train` function imported in
# the training cell; that cell re-imports it, but a distinct name would be safer.
train, val = split_data(dataset, random_seed=seed, val_size=val_size, stratify=stratify_split)
# A second Dataset whose "test" records are the validation records, so that
# `test` can be used to predict on the validation set.
val_dataset = Dataset(DATASET_PATH)
val_dataset.preprocessors = procs.pre()
val_dataset.postprocessors = procs.post()
val_dataset.test = val
val_preds = test(model, val_dataset, batch_size=24)
# Take channel 0 of every prediction and pass it through the downsampler.
val_preds = np.array([downsample_fn(pred) for pred in val_preds.data.cpu().numpy()[:, 0, :, :]])
val_truth = np.array([dataset.mask(record) for record in val_dataset.test])
assert val_preds.shape == val_truth.shape

## Scoring for last model, choose threshold by validation data
thresholds = np.linspace(-2, 2, 100)
# Reverse sigmoid function: Use code below because the  sigmoid activation was removed
#thresholds = np.log(thresholds_ori/(1-thresholds_ori))

# `iou_numpy` is not defined anywhere in this notebook (NameError on a fresh kernel);
# the metric imported from src.utils.metrics is named `old_iou_numpy`.
# NOTE(review): assumes old_iou_numpy accepts (preds, truth, prob_thres=...) - confirm.
ious = np.array([old_iou_numpy(val_preds, val_truth, prob_thres=threshold) for threshold in tqdm_notebook(thresholds)])
print(ious)

# instead of using default 0 as threshold, use validation data to find the best threshold.
threshold_best_index = np.argmax(ious)
iou_best = ious[threshold_best_index]
threshold_best = thresholds[threshold_best_index]

plt.plot(thresholds, ious)
plt.plot(threshold_best, iou_best, "xr", label="Best threshold")
plt.xlabel("Threshold")
plt.ylabel("IoU")
plt.title("Threshold vs IoU ({}, {})".format(threshold_best, iou_best))
plt.legend()

In [None]:
# Visual check using the loaded model and the threshold selected on validation data.
# Presumably plots dataset samples next to the thresholded predictions - TODO confirm
# in src.utils.plot.show_images.
show_images(dataset, unprocess_fn=downsample_fn, model=model, threshold=threshold_best)

In [None]:
# Test inference, part 1: the test records are cut at index 9000 and the first
# part is predicted on its own.
# NOTE: this cell overwrites dataset.test, so it must be run exactly once per pass;
# a later cell restores the full record list.
half1, half2 = dataset.test[:9000], dataset.test[9000:]
dataset.test = half1

# Channel 0 of every prediction, pushed through the downsampler.
predictions = test(model, dataset, batch_size=8).data.cpu().numpy()[:, 0, :, :]
predictions = np.array(list(map(downsample_fn, predictions)))
binary_predictions1 = (predictions > threshold_best).astype(int)

# Drop the reference to the float predictions; only the binary masks are kept.
predictions = None

In [None]:
# Test inference, part 2: same procedure for the remaining records (`half2`).
dataset.test = half2

# Channel 0 of every prediction, pushed through the downsampler.
predictions = test(model, dataset, batch_size=8).data.cpu().numpy()[:, 0, :, :]
predictions = np.array(list(map(downsample_fn, predictions)))
binary_predictions2 = (predictions > threshold_best).astype(int)

# Drop the reference to the float predictions; only the binary masks are kept.
predictions = None

In [None]:
# Join the binary masks of both parts, first part first.
binary_predictions = np.concatenate((binary_predictions1, binary_predictions2))
# Put the full test record list back on the dataset, in the same order.
# NOTE(review): np.concatenate returns an ndarray, so dataset.test may change type
# here compared with its original value - confirm Dataset accepts that.
dataset.test = np.concatenate((half1, half2))

In [None]:
# Run-length encode the binary masks and write the (id, rle_mask) submission CSV,
# named after the checkpoint file with '.pt' replaced by '.csv'.
all_masks = rle_encoding(binary_predictions)

# The submission id is the second '-'-separated field of each test record.
record_ids = [rec.split('-')[1] for rec in dataset.test]
submit = pd.DataFrame([record_ids, all_masks]).T
submit.columns = ['id', 'rle_mask']

csv_name = model_name.split('/')[-1].replace('.pt', '.csv')
submission_path = SUBMISSIONS_PATH + csv_name
submit.to_csv(submission_path, index=False)

In [None]:
m = submission_path.replace(':', '\:').replace('(', '\(').replace(')', '\)')
!kaggle c submit -f {m} -m '{submission_path}' -c tgs-salt-identification-challenge