In [2]:
import os 
import time 
import json 
import torch 
import random 
import warnings
import torchvision
import numpy as np 
import pandas as pd 
import sys 
import shutil
import pathlib

from tqdm import tqdm 
from datetime import datetime
from torch import nn,optim
from collections import OrderedDict
from torch.autograd import Variable
from torch.utils.data import Dataset,DataLoader
from torch.optim import lr_scheduler
from sklearn.model_selection import train_test_split
from timeit import default_timer as timer
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from torchvision import models
from pretrainedmodels.models import bninception
from collections import OrderedDict
import torch.nn.functional as F
from torchsummary import summary
from PIL import Image
from torchvision import transforms as T
from imgaug import augmenters as iaa

In [None]:
# 1. set random seed
# One shared seed for every RNG the notebook touches (Python, NumPy, torch CPU and CUDA).
SEED = 2050
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(SEED)

# NOTE(review): cudnn benchmark mode is left enabled, as before; it may trade
# bit-exact reproducibility for speed - confirm this is the intended trade-off.
torch.backends.cudnn.benchmark = True

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Make sure the log directory exists before any logger is opened.
# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs("./logs/", exist_ok=True)

def setup_logger(fold):
    """Open the per-fold training log in append mode and write the table header.

    Args:
        fold: fold identifier; it is embedded in the log file name.

    Returns:
        The opened ``Logger`` instance.
    """
    log = Logger()
    log_path = "logs/%s_fold_%s_log_train.txt"%(config.model_name,str(fold))
    log.open(log_path,mode="a")

    started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    header_lines = (
        "\n----------------------------------------------- [START %s] %s\n\n" % (started_at, '-' * 51),
        '                           |------------ Train -------------|----------- Valid -------------|----------Best Results---------|------------|\n',
        'mode     iter     epoch    |         loss   f1_macro        |         loss   f1_macro       |         loss   f1_macro       | time       |\n',
        '-------------------------------------------------------------------------------------------------------------------------------\n',
    )
    for line in header_lines:
        log.write(line)
    return log

In [None]:
class DefaultConfigs(object):
    """Notebook-wide settings: data/output locations, hyper-parameters and run flags."""

    # DIRECTORIES
    data_root = "../input/human-protein-atlas-image-classification/data/"  # data root directory
    train_data = data_root + "train/"                                      # train data directory
    test_data = data_root + "test/"                                        # test data directory
    weights = "./checkpoints/"                                             # saved models' directory
    best_models = weights + "best_models/"                                 # best models' directory
    submit = "./submit/"                                                   # submission file directory

    model_name = "bninception_bcelog4"

    # PARAMETERS
    num_classes = 28
    img_width = img_height = 512
    channels = 4
    lr = 0.03
    batch_size = 32
    epochs = 50
    curr_fold = 0

    # FLAGS
    first_layer_pretrained = True
    oversample = True

    train = True
    retrain = True  # train flag should be True for retrain to work
    test = True
    ensemble = False


# Single shared instance read by the rest of the notebook.
config = DefaultConfigs()

# Data Class

In [None]:
class HumanDataset(Dataset):
    """Dataset that assembles a multi-channel image from per-colour PNG files.

    Every row of ``images_df`` names one sample through its ``Id`` column; the
    channels are read from ``<base_path>/<Id>_<colour>.png``. In any mode other
    than ``"test"`` the ``Target`` column (whitespace-separated class indices)
    is turned into a multi-hot vector of length ``config.num_classes``.
    """

    def __init__(self,images_df,base_path,augument=True,mode="train"):
        """
        Args:
            images_df: DataFrame with an ``Id`` column (and ``Target`` unless mode is "test").
                       It is copied, so the caller's frame is left untouched.
            base_path: directory holding the PNG files (str or pathlib.Path).
            augument:  when True, ``augumentor`` is applied to every image.
            mode:      "test" makes ``__getitem__`` return the file path instead of labels.
        """
        if not isinstance(base_path, pathlib.Path):
            base_path = pathlib.Path(base_path)
        self.images_df = images_df.copy()
        self.augument = augument
        # Store each Id as a path below base_path.
        self.images_df["Id"] = self.images_df["Id"].apply(lambda x:base_path / x)
        self.mlb = MultiLabelBinarizer(classes = np.arange(0,config.num_classes))
        self.mlb.fit(np.arange(0,config.num_classes))
        self.mode = mode
        # Built once instead of once per item. Per-channel normalisation is not applied;
        # the statistics that were used at some point are kept here for reference:
        # mean [0.08069, 0.05258, 0.05487, 0.08282], std [0.13704, 0.10145, 0.15313, 0.13814].
        self._to_tensor = T.Compose([T.ToPILImage(),T.ToTensor()])

    def __len__(self):
        """Number of samples (rows of the wrapped DataFrame)."""
        return len(self.images_df)

    def __getitem__(self,index):
        """Return ``(image_tensor, y)`` for the row at position ``index``.

        ``y`` is a float64 multi-hot label vector, or in "test" mode the absolute
        path prefix of the sample as a string.
        """
        X = self.read_images(index)
        if not self.mode == "test":
            labels = np.array(list(map(int, self.images_df.iloc[index].Target.split())))
            # np.float was removed from NumPy; float64 is the type it used to alias.
            y  = np.eye(config.num_classes,dtype=np.float64)[labels].sum(axis=0)
        else:
            y = str(self.images_df.iloc[index].Id.absolute())

        if self.augument:
            X = self.augumentor(X)

        X = self._to_tensor(X)

        return X.float(),y

    def _load_channel(self, path):
        """Read one PNG as a uint8 array, resized to the configured size when it differs."""
        target_size = (config.img_width, config.img_height)  # PIL sizes are (width, height)
        with Image.open(path) as img:
            if img.size != target_size:
                img = img.resize(target_size, Image.BILINEAR)
            # np.array forces the pixel data to load before the file is closed.
            return np.array(img).astype(np.uint8)

    def read_images(self,index):
        """Load the colour channels of one sample as a uint8 array of shape (H, W, C).

        Red, green and blue are always read; yellow is appended only when
        ``config.channels == 4``.
        """
        row = self.images_df.iloc[index]
        filename = str(row.Id.absolute())
        colours = ["red", "green", "blue"]
        if config.channels == 4:
            colours.append("yellow")
        channels = [self._load_channel("%s_%s.png" % (filename, colour)) for colour in colours]
        return np.stack(channels, axis=-1)

    def augumentor(self,image):
        """Apply exactly one randomly chosen geometric augmentation to ``image``."""
        augment_img = iaa.Sequential([
            iaa.OneOf([
                iaa.Affine(rotate=90),
                iaa.Affine(rotate=180),
                iaa.Affine(rotate=270),
                iaa.Affine(shear=(-16, 16)),
                iaa.Fliplr(0.5),
                iaa.Flipud(0.5),
            ])], random_order=True)

        image_aug = augment_img.augment_image(image)
        return image_aug

# Utils

In [None]:
# save best model
def save_checkpoint(state, is_best_loss,is_best_f1,fold):
    """Write ``state`` to the fold's rolling checkpoint and copy it to the best-model slots.

    Args:
        state:        object handed to ``torch.save``.
        is_best_loss: when truthy, the checkpoint is also copied as the best-loss model.
        is_best_f1:   when truthy, the checkpoint is also copied as the best-f1 model.
        fold:         fold identifier used in the directory and file names.
    """
    fold_tag = str(fold)
    filename = os.sep.join((config.weights + config.model_name, fold_tag, "checkpoint.pth.tar"))
    torch.save(state, filename)

    best_slots = (
        (is_best_loss, "%s/%s_fold_%s_model_best_loss.pth.tar"),
        (is_best_f1, "%s/%s_fold_%s_model_best_f1.pth.tar"),
    )
    for improved, template in best_slots:
        if improved:
            shutil.copyfile(filename, template%(config.best_models,config.model_name,fold_tag))

# evaluate meters
class AverageMeter(object):
    """Keeps the most recent value of a metric together with its weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the running mean, the weighted sum and the weight total."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        self.val = val
        self.sum, self.count = self.sum + val * n, self.count + n
        self.avg = self.sum / self.count

# print logger
class Logger(object):
    """Writes messages to the terminal and, once ``open`` has been called, to a log file.

    ``self.terminal`` is whatever ``sys.stdout`` was when the logger was created.
    """

    def __init__(self):
        self.terminal = sys.stdout  #stdout
        self.file = None

    def open(self, file, mode=None):
        """Open ``file`` for logging (default mode ``'w'``), closing any file opened earlier."""
        if mode is None: mode ='w'
        self.close()
        self.file = open(file, mode)

    def write(self, message, is_terminal=1, is_file=1 ):
        """Emit ``message`` to the terminal and/or the log file, flushing after each write.

        Messages containing a carriage return (in-place progress updates) are never
        written to the file. File output is skipped when no file is open.
        """
        if '\r' in message: is_file=0

        if is_terminal == 1:
            self.terminal.write(message)
            self.terminal.flush()

        if is_file == 1 and self.file is not None:
            self.file.write(message)
            self.file.flush()

    def flush(self):
        # this flush method is needed for python 3 compatibility.
        # write() already flushes both targets, so there is nothing left to do here.
        pass

    def close(self):
        """Close the log file if one is open; safe to call more than once."""
        if self.file is not None:
            self.file.close()
            self.file = None

class Oversampling:
    """Looks up how many copies of a training image should exist, based on its labels."""

    def __init__(self, df):
        """Index ``df`` by ``Id`` and parse each ``Target`` string into a list of ints."""
        indexed = df.set_index('Id')
        indexed['Target'] = [list(map(int, targets.split())) for targets in indexed['Target']]
        self.train_labels = indexed
        # set the minimum number of duplicates for each class
        self.multi = [1, 1, 1, 1, 1, 1, 1, 1, 8, 8,
                      8, 1, 1, 1, 1, 8, 1, 2, 1, 1,
                      4, 1, 1, 1, 2, 1, 2, 8]
        # TODO : different oversampling? https://www.kaggle.com/wordroid/inceptionresnetv2-resize256-f1loss-lb0-419

    def get(self, image_id):
        """Return the largest per-class multiplier among the image's labels (1 if the id is unknown)."""
        if image_id not in self.train_labels.index:
            return 1
        labels = self.train_labels.loc[image_id, 'Target']
        return max([1] + [self.multi[label] for label in labels])
        
def get_learning_rate(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    The ``'lr'`` entry of every group is read, but only the first one is reported.
    """
    rates = [group['lr'] for group in optimizer.param_groups]
    return rates[0]

def time_to_str(t, mode='min'):
    """Format an elapsed time given in seconds.

    Args:
        t:    elapsed seconds; fractional seconds are dropped.
        mode: ``'min'`` gives ``'H hr MM min'``, ``'sec'`` gives ``'M min SS sec'``.

    Raises:
        NotImplementedError: for any other ``mode``.
    """
    if mode == 'min':
        whole_minutes = int(t) // 60
        return '%2d hr %02d min' % divmod(whole_minutes, 60)

    if mode == 'sec':
        return '%2d min %02d sec' % divmod(int(t), 60)

    raise NotImplementedError

def get_best_thres(val_loader,model,num_split=100):
    """Search, per class, the decision threshold that maximises F1 on ``val_loader``.

    Args:
        val_loader: iterable of ``(images, target)`` batches; ``target`` is the
                    multi-hot label matrix of the batch.
        model:      network producing one logit per class; it is moved to CUDA
                    and put in eval mode.
        num_split:  number of evenly spaced candidate thresholds in [0, 1).

    Returns:
        dict mapping class index -> best threshold. On ties the highest
        candidate threshold wins.
    """
    y_true_all, y_pred_all = [], []
    model.cuda()
    model.eval()
    with torch.no_grad():
        for (images,target) in tqdm(val_loader):
            images_var = images.cuda(non_blocking=True)
            target = np.array(target)
            output = model(images_var).sigmoid().cpu().numpy()
            y_true_all.extend(target)
            y_pred_all.extend(output)

    cate2th = {}
    y_true_all, y_pred_all = np.array(y_true_all), np.array(y_pred_all)

    # Take the number of classes from the collected labels rather than a fixed 28.
    for c in range(y_true_all.shape[1]):
        y_true = y_true_all[:,c]
        y_pred = y_pred_all[:,c]
        best_th = 0
        best_f1 = -1

        for th in np.linspace(0,1,num_split,endpoint=False):
            f1 = f1_score(y_true,(y_pred > th).astype(int))
            # "<=" keeps the last (highest) threshold among equally good ones.
            if best_f1 <= f1:
                best_f1 = f1
                best_th = th

        cate2th[c] = best_th

    return cate2th

# Model

In [None]:
def get_net(show_summary=0):
    '''Build the BN-Inception classifier used by this notebook.

    The ImageNet-pretrained network gets an adaptive average pool, a new first
    convolution taking ``config.channels`` input channels, and a new head
    (BatchNorm1d -> Dropout -> Linear) with ``config.num_classes`` outputs.

    show_summary flag is for printing summary of the model
        0 --> prints nothing
        1 --> keras type model summary using torchsummary library
        2 --> prints the model in pyTorch fashion

    Raises ValueError when ``config.first_layer_pretrained`` is set and
    ``config.channels`` is neither 3 nor 4.
    '''

    model = bninception(pretrained="imagenet")
    model.global_pool = nn.AdaptiveAvgPool2d(1)
    old_weight = model.conv1_7x7_s2.weight
    model.conv1_7x7_s2 = nn.Conv2d(config.channels, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))

    if config.first_layer_pretrained:
        # using pretrained imagenet weight for RGB channels & G channel weight for Y channel
        # (only the weight is carried over; the new layer's bias is left as created)
        if config.channels == 3:
            model.conv1_7x7_s2.weight = old_weight
        elif config.channels == 4:
            green_weight = old_weight[:, 1:2, :, :]  # slice keeps the channel axis
            new_weight = torch.cat((old_weight, green_weight), dim=1).detach()
            model.conv1_7x7_s2.weight = torch.nn.Parameter(new_weight)
        else:
            # Any other count would leave a weight that does not match the new layer.
            raise ValueError(
                "first_layer_pretrained supports 3 or 4 input channels, got %r" % (config.channels,))

    model.last_linear = nn.Sequential(
                nn.BatchNorm1d(1024),
                nn.Dropout(0.5),
                nn.Linear(1024, config.num_classes),
            )

    if show_summary == 1:
        # summary() does its own printing; the input size is (channels, height, width).
        summary(model.to("cuda:0"), (config.channels, config.img_height, config.img_width))
    elif show_summary == 2:
        print(model)
    return model

# Train, Validate, Test

In [None]:
def train(train_loader,model,criterion,optimizer,epoch,valid_loss,best_results,start,log):
    """Run one training epoch and return ``[mean_loss, mean_f1]``.

    Args:
        train_loader: iterable of ``(images, target)`` batches.
        model:        network to optimise (expected to live on the GPU already).
        criterion:    loss called as ``criterion(output, target)``.
        optimizer:    optimizer stepped once per batch.
        epoch:        epoch number, used only in the progress line.
        valid_loss:   ``[loss, f1]`` of the previous validation pass, for display.
        best_results: ``[best_loss, best_f1]`` so far, for display.
        start:        ``timer()`` value taken when the run started.
        log:          logger; receives the newline that ends the progress line.

    The reported F1 is the macro F1 of each batch at a fixed 0.15 threshold,
    averaged over batches weighted by batch size.
    """
    losses = AverageMeter()
    f1 = AverageMeter()
    model.train()
    for i,(images,target) in enumerate(train_loader):
        images = images.cuda(non_blocking=True)
        # Keep a host copy of the labels: sklearn needs NumPy input, and a CUDA
        # tensor cannot be converted to NumPy implicitly.
        target_np = np.array(target)
        target = torch.from_numpy(target_np).float().cuda(non_blocking=True)
        # compute output
        output = model(images)
        loss = criterion(output,target)
        losses.update(loss.item(),images.size(0))

        predictions = output.detach().sigmoid().cpu().numpy() > 0.15
        f1_batch = f1_score(target_np,predictions,average='macro')
        f1.update(f1_batch,images.size(0))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # '\r' rewinds to the line start so the progress line is redrawn in place.
        print('\r',end='',flush=True)
        message = '%s %5.1f %6.1f         |         %0.3f  %0.3f           |         %0.3f  %0.4f         |         %s  %s    | %s' % (\
                "train", i/len(train_loader), epoch,
                losses.avg, f1.avg, 
                valid_loss[0], valid_loss[1], 
                str(best_results[0])[:8],str(best_results[1])[:8],
                time_to_str((timer() - start),'min'))
        print(message , end='',flush=True)
    log.write("\n")
    return [losses.avg,f1.avg]

def evaluate(val_loader,model,criterion,epoch,train_loss,best_results,start,log):
    """Run one validation pass without gradients and return ``[mean_loss, mean_f1]``.

    Args:
        val_loader:   iterable of ``(images, target)`` batches.
        model:        network to evaluate; moved to CUDA and switched to eval mode.
        criterion:    loss called as ``criterion(output, target)``.
        epoch:        epoch number, used only in the progress line.
        train_loss:   ``[loss, f1]`` of the preceding training epoch, for display.
        best_results: ``[best_loss, best_f1]`` so far, for display.
        start:        ``timer()`` value taken when the run started.
        log:          logger; receives the newline that ends the progress line.

    The reported F1 is the macro F1 of each batch at a fixed 0.15 threshold,
    averaged over batches weighted by batch size.
    """
    # only meter loss and f1 score
    losses = AverageMeter()
    f1 = AverageMeter()
    model.cuda()
    model.eval() # switch mode for evaluation

    with torch.no_grad():
        for i, (images,target) in enumerate(val_loader):
            images_var = images.cuda(non_blocking=True)
            # Keep a host copy of the labels: sklearn needs NumPy input, and a CUDA
            # tensor cannot be converted to NumPy implicitly.
            target_np = np.array(target)
            target = torch.from_numpy(target_np).float().cuda(non_blocking=True)
            output = model(images_var)
            loss = criterion(output,target)
            losses.update(loss.item(),images_var.size(0))
            predictions = output.sigmoid().cpu().numpy() > 0.15
            f1_batch = f1_score(target_np,predictions,average='macro')
            f1.update(f1_batch,images_var.size(0))
            # '\r' rewinds to the line start so the progress line is redrawn in place.
            print('\r',end='',flush=True)
            message = '%s   %5.1f %6.1f         |         %0.3f  %0.3f           |         %0.3f  %0.4f         |         %s  %s    | %s' % (\
                    "val", i/len(val_loader), epoch,                    
                    train_loss[0], train_loss[1], 
                    losses.avg, f1.avg,
                    str(best_results[0])[:8],str(best_results[1])[:8],
                    time_to_str((timer() - start),'min'))

            print(message, end='',flush=True)
        log.write("\n")
    return [losses.avg,f1.avg]

def test(test_loader,model,fold,test_files,metric,log,th_dic=None):
    """Predict labels for the test set and write a submission CSV.

    Args:
        test_loader: iterable of ``(images, filepath)`` batches; only the images are used.
        model:       network to run; moved to CUDA and switched to eval mode.
        fold:        fold identifier, used in the output file name.
        test_files:  submission DataFrame. Its ``Predicted`` column is overwritten
                     in place, with rows assumed to be in loader order.
        metric:      tag naming the evaluated checkpoint, used in the file name.
        log:         unused; kept for signature compatibility.
        th_dic:      optional mapping of per-class thresholds. Values are used in
                     the dict's iteration order, which must therefore match the
                     class order. Without it every class uses 0.15.

    The file is written to ``config.submit`` as
    ``<model_name>_<metric>_fold_<fold>_submission.csv``.
    """
    sample_submission_df = test_files
    model.cuda() # confirm the model converted to cuda
    model.eval()

    # Resolve the threshold(s) once instead of once per batch.
    if th_dic is not None:
        thresholds = np.array(list(th_dic.values()))
    else:
        thresholds = 0.15

    labels = []
    with torch.no_grad():
        for images, _ in tqdm(test_loader):
            image_var = images.cuda(non_blocking=True)
            probabilities = model(image_var).sigmoid().cpu().numpy()
            labels.append(probabilities > thresholds)

    # One space-separated string of predicted class indices per sample.
    submissions = [' '.join(str(i) for i in np.nonzero(row)[0]) for row in np.concatenate(labels)]
    sample_submission_df['Predicted'] = submissions

    # Write below config.submit so the location follows the configuration.
    os.makedirs(config.submit, exist_ok=True)
    out_name = '%s_%s_fold_%s_submission.csv'%(config.model_name,metric,str(fold))
    sample_submission_df.to_csv(os.path.join(config.submit, out_name), index=False)

# Main function

In [None]:
def _split_fold(all_files, fold, n_splits=5):
    """Return the ``(train, validation)`` rows of ``all_files`` for one stratified fold."""
    if not 0 <= fold < n_splits:
        # Without this check an unknown fold would silently fall through to the last split.
        raise ValueError("fold must be in [0, %d), got %r" % (n_splits, fold))

    labels = [[int(i) for i in target.split()] for target in all_files.Target.tolist()]
    Y = MultiLabelBinarizer().fit_transform(labels)
    X = np.arange(len(Y))
    # NOTE(review): random_state without shuffle=True may be rejected by newer
    # scikit-learn releases; left as is because changing it would change the folds.
    mskf = MultilabelStratifiedKFold(n_splits=n_splits, random_state=0)

    for n_fold, (train_index, test_index) in enumerate(mskf.split(X, Y)):
        print('Fold %s'%str(n_fold))
        if n_fold == fold:
            return all_files.iloc[train_index], all_files.iloc[test_index]
    raise ValueError("fold %r was not produced by the splitter" % (fold,))


def _oversample(train_df):
    """Return ``train_df`` followed by the extra copies requested by ``Oversampling``.

    A row whose multiplier is ``m`` ends up ``m`` times in the result: once in
    its original position and ``m - 1`` times after all original rows.
    """
    sampler = Oversampling(train_df)
    multipliers = np.array([sampler.get(image_id) for image_id in train_df.Id], dtype=int)
    # Positions of the rows to duplicate, each repeated (multiplier - 1) times.
    extra_positions = np.repeat(np.arange(len(train_df)), multipliers - 1)
    # A single concat replaces repeated DataFrame.append calls.
    return pd.concat([train_df, train_df.iloc[extra_positions]], ignore_index=True)


def _run_fold(fold, oversample, log):
    """Build model and loaders for ``fold``, then resume / train / test as configured."""
    # mkdirs
    fold_dir = config.weights + config.model_name + os.sep + str(fold)
    for directory in (config.submit, fold_dir, config.best_models):
        os.makedirs(directory, exist_ok=True)

    # get model
    model = get_net()
    model.cuda()

    # criterion
    optimizer = optim.SGD(model.parameters(),lr = config.lr,momentum=0.9,weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss().cuda()

    start_epoch = 0
    best_results = [np.inf,0]   # [lowest validation loss, highest validation f1]
    val_metrics = [np.inf,0]
    all_files = pd.read_csv(config.data_root+"train.csv")
    test_files = pd.read_csv(config.data_root+"sample_submission.csv")

    # Stratify
    train_data_list, val_data_list = _split_fold(all_files, fold)

    # Oversample
    if oversample:
        train_data_list = _oversample(train_data_list)

    # load dataset
    train_gen = HumanDataset(train_data_list,config.train_data,mode="train")
    train_loader = DataLoader(train_gen,batch_size=config.batch_size,shuffle=True,pin_memory=True,num_workers=4)

    val_gen = HumanDataset(val_data_list,config.train_data,augument=False,mode="train")
    val_loader = DataLoader(val_gen,batch_size=config.batch_size,shuffle=False,pin_memory=True,num_workers=4)

    test_gen = HumanDataset(test_files,config.test_data,augument=False,mode="test")
    test_loader = DataLoader(test_gen,1,shuffle=False,pin_memory=True,num_workers=4)

    start = timer()
    checkpoint_path = "%s/%s/%s/checkpoint.pth.tar"%(config.weights,config.model_name,str(fold))

    if config.retrain: # retrain
        if os.path.isfile(checkpoint_path):
            saved_model = torch.load(checkpoint_path)
            model.load_state_dict(saved_model["state_dict"])
            optimizer.load_state_dict(saved_model["optimizer"])
            start_epoch = saved_model["epoch"]
            best_results = [saved_model["best_loss"],saved_model["best_f1"]]
            del saved_model
        else:
            # First run for this fold: nothing to resume from.
            log.write("No checkpoint found at %s; training from scratch.\n" % checkpoint_path)

    # Created after the optional resume so it wraps the restored optimizer state.
    scheduler = lr_scheduler.StepLR(optimizer,step_size=10,gamma=0.1)

    if config.train: # train
        for epoch in range(start_epoch,config.epochs):
            # The explicit epoch makes the schedule follow the (possibly resumed) epoch counter.
            scheduler.step(epoch)
            # train
            train_metrics = train(train_loader,model,criterion,optimizer,epoch,val_metrics,best_results,start,log)
            # validate
            val_metrics = evaluate(val_loader,model,criterion,epoch,train_metrics,best_results,start,log)
            # check results
            is_best_loss = val_metrics[0] < best_results[0]
            best_results[0] = min(val_metrics[0],best_results[0])
            is_best_f1 = val_metrics[1] > best_results[1]
            best_results[1] = max(val_metrics[1],best_results[1])
            # save model
            save_checkpoint({
                        "epoch":epoch + 1,
                        "model_name":config.model_name,
                        "state_dict":model.state_dict(),
                        # plain Python floats keep the checkpoint free of NumPy scalar objects
                        "best_loss":float(best_results[0]),
                        "optimizer":optimizer.state_dict(),
                        "fold":fold,
                        "best_f1":float(best_results[1]),
            },is_best_loss,is_best_f1,fold)
            # print logs
            print('\r',end='',flush=True)
            log.write('%s  %5.1f %6.1f         |         %0.3f  %0.3f           |         %0.3f  %0.4f         |         %s  %s    | %s' % (\
                    "best", epoch, epoch,                    
                    train_metrics[0], train_metrics[1], 
                    val_metrics[0], val_metrics[1],
                    str(best_results[0])[:8],str(best_results[1])[:8],
                    time_to_str((timer() - start),'min'))
                )
            log.write("\n")
            time.sleep(0.01)

    if config.test: # test
        # One submission per saved model: best loss, best f1 and the last epoch.
        evaluated_models = (
            ("%s/%s_fold_%s_model_best_loss.pth.tar"%(config.best_models,config.model_name,str(fold)), "best_loss"),
            ("%s/%s_fold_%s_model_best_f1.pth.tar"%(config.best_models,config.model_name,str(fold)), "best_f1"),
            (checkpoint_path, "last_epoch"),
        )
        for model_path, metric in evaluated_models:
            best_model = torch.load(model_path)
            model.load_state_dict(best_model["state_dict"])
            test(test_loader,model,fold,test_files,metric,log)


def main(fold, oversample=False):
    """Train and/or test one cross-validation fold according to ``config``.

    Args:
        fold:       index of the stratified fold to hold out for validation (0-4).
        oversample: when True, rows with rare labels are duplicated in the training split.

    Raises:
        ValueError: if ``fold`` is outside the available folds.

    Checkpoints, submissions and the training log are written below the
    directories named in ``config``. When ``config.retrain`` is set and a
    checkpoint for the fold exists, training resumes from it; otherwise it
    starts from scratch.
    """
    # The logger opens its file below ./logs/, so the directory must exist first.
    os.makedirs("./logs/", exist_ok=True)
    log = setup_logger(fold)
    try:
        _run_fold(fold, oversample, log)
    finally:
        # Release the log file even when training or testing fails.
        if log.file is not None:
            log.file.close()

In [None]:
if __name__ == "__main__":
    # Run the fold and oversampling choice selected in the notebook configuration.
    main(fold=config.curr_fold, oversample=config.oversample)