# Import / Setting

In [2]:
import numpy as np
import pandas as pd
import random, os
import gc, psutil
from datetime import datetime
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import albumentations as A
from albumentations import (Compose, OneOf,
                            Resize, RandomSizedCrop, CenterCrop,
                            RandomRotate90, Flip, Transpose, 
                            IAAAdditiveGaussianNoise, GaussNoise,
                            Normalize)
from albumentations.pytorch import ToTensor

from PIL import Image
import cv2
import timm

# Use the first CUDA GPU when torch can see one; otherwise fall back to the CPU.
DEVICE = 'cpu' if not torch.cuda.is_available() else 'cuda:0'

def NMAE(true, pred):
    """Normalized mean absolute error.

    Returns mean(|true - pred|) divided by mean(|true|). Both arguments must
    support element-wise subtraction (e.g. numpy arrays of matching shape).
    """
    abs_error = np.abs(true - pred)
    normalizer = np.mean(np.abs(true))
    return np.mean(abs_error) / normalizer

def seed_everything(seed = 42):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: integer seed applied to `random`, `numpy.random`, `torch` and
            `torch.cuda`; also exported as the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Every RNG the notebook touches gets the same seed.
    for set_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        set_seed(seed)
    # Disable cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Global seed: applied here and reused as `random_state` for the train/validation split.
SEED = 42    
seed_everything(SEED)

# Load Data

In [3]:
# Load the submission template; its `leaf_weight` column is overwritten with predictions later.
submission = pd.read_csv('sample_submission.csv')
submission

Unnamed: 0,img_name,leaf_weight
0,001.png,0
1,002.png,0
2,003.png,0
3,004.png,0
4,005.png,0
...,...,...
455,456.png,0
456,457.png,0
457,458.png,0
458,459.png,0


In [4]:
# Labels, images and meta files are paired purely by row position, so every
# listing is sorted: glob() returns paths in arbitrary, filesystem-dependent
# order and an unsorted listing can silently pair an image with the wrong label.
# NOTE(review): this still assumes the rows inside each label CSV follow the
# sorted file-name order — confirm against the data.
label_paths = sorted(glob('train/*/*.csv'))
train_df = pd.concat([pd.read_csv(label_path) for label_path in label_paths]).reset_index(drop = True)
# Characters 4-5 of names such as 'CASE01_02.png' hold the case number.
train_df['case'] = train_df['img_name'].str[4:6].astype(int)
train_df['img_path'] = sorted(glob('train/*/image/*'))
train_df['meta_path'] = sorted(glob('train/*/meta/*'))
# NOTE(review): row 883 is dropped by position with no reason recorded in the
# notebook — confirm which sample this is and why it is excluded.
train_df = train_df.drop([883]).reset_index(drop = True)
train_df

Unnamed: 0,img_name,leaf_weight,case,img_path,meta_path
0,CASE01_01.png,49.193,1,train\CASE01\image\CASE01_01.png,train\CASE01\meta\CASE01_01.csv
1,CASE01_02.png,59.764,1,train\CASE01\image\CASE01_02.png,train\CASE01\meta\CASE01_02.csv
2,CASE01_03.png,72.209,1,train\CASE01\image\CASE01_03.png,train\CASE01\meta\CASE01_03.csv
3,CASE01_04.png,85.737,1,train\CASE01\image\CASE01_04.png,train\CASE01\meta\CASE01_04.csv
4,CASE01_05.png,102.537,1,train\CASE01\image\CASE01_05.png,train\CASE01\meta\CASE01_05.csv
...,...,...,...,...,...
1586,CASE74_25.png,211.497,74,train\CASE74\image\CASE74_25.png,train\CASE74\meta\CASE74_25.csv
1587,CASE74_26.png,214.116,74,train\CASE74\image\CASE74_26.png,train\CASE74\meta\CASE74_26.csv
1588,CASE74_27.png,214.293,74,train\CASE74\image\CASE74_27.png,train\CASE74\meta\CASE74_27.csv
1589,CASE75_01.png,210.872,75,train\CASE75\image\CASE75_01.png,train\CASE75\meta\CASE75_01.csv


In [5]:
def concat_shifted(train_df):
    """Attach the previous row's `leaf_weight` (within the same case) as `real_weight`.

    For every distinct value of `case`, the rows of that case are copied and a
    `real_weight` column is added holding `leaf_weight` shifted down by one row.
    The per-case frames are concatenated in order of first appearance and every
    row containing a NaN (including the first row of each case) is dropped.

    Args:
        train_df: DataFrame with at least `case` and `leaf_weight` columns.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    def _with_previous_weight(case):
        rows = train_df[train_df['case'] == case].copy()
        rows['real_weight'] = rows['leaf_weight'].shift(1)
        return rows

    stacked = pd.concat([_with_previous_weight(case) for case in train_df['case'].unique()])
    return stacked.dropna()

# weight_delta = leaf_weight minus real_weight (the previous row's leaf_weight in the same case).
train_df_shifted = concat_shifted(train_df).assign(
    weight_delta = lambda frame: frame['leaf_weight'] - frame['real_weight'])
train_df_shifted

Unnamed: 0,img_name,leaf_weight,case,img_path,meta_path,real_weight,weight_delta
1,CASE01_02.png,59.764,1,train\CASE01\image\CASE01_02.png,train\CASE01\meta\CASE01_02.csv,49.193,10.571
2,CASE01_03.png,72.209,1,train\CASE01\image\CASE01_03.png,train\CASE01\meta\CASE01_03.csv,59.764,12.445
3,CASE01_04.png,85.737,1,train\CASE01\image\CASE01_04.png,train\CASE01\meta\CASE01_04.csv,72.209,13.528
4,CASE01_05.png,102.537,1,train\CASE01\image\CASE01_05.png,train\CASE01\meta\CASE01_05.csv,85.737,16.800
5,CASE01_06.png,123.359,1,train\CASE01\image\CASE01_06.png,train\CASE01\meta\CASE01_06.csv,102.537,20.822
...,...,...,...,...,...,...,...
1585,CASE74_24.png,211.002,74,train\CASE74\image\CASE74_24.png,train\CASE74\meta\CASE74_24.csv,206.349,4.653
1586,CASE74_25.png,211.497,74,train\CASE74\image\CASE74_25.png,train\CASE74\meta\CASE74_25.csv,211.002,0.495
1587,CASE74_26.png,214.116,74,train\CASE74\image\CASE74_26.png,train\CASE74\meta\CASE74_26.csv,211.497,2.619
1588,CASE74_27.png,214.293,74,train\CASE74\image\CASE74_27.png,train\CASE74\meta\CASE74_27.csv,214.116,0.177


In [6]:
# Exclude every row that belongs to case 59 from the training table.
train_df_shifted = train_df_shifted.loc[train_df_shifted['case'].ne(59)].copy()
train_df_shifted

Unnamed: 0,img_name,leaf_weight,case,img_path,meta_path,real_weight,weight_delta
1,CASE01_02.png,59.764,1,train\CASE01\image\CASE01_02.png,train\CASE01\meta\CASE01_02.csv,49.193,10.571
2,CASE01_03.png,72.209,1,train\CASE01\image\CASE01_03.png,train\CASE01\meta\CASE01_03.csv,59.764,12.445
3,CASE01_04.png,85.737,1,train\CASE01\image\CASE01_04.png,train\CASE01\meta\CASE01_04.csv,72.209,13.528
4,CASE01_05.png,102.537,1,train\CASE01\image\CASE01_05.png,train\CASE01\meta\CASE01_05.csv,85.737,16.800
5,CASE01_06.png,123.359,1,train\CASE01\image\CASE01_06.png,train\CASE01\meta\CASE01_06.csv,102.537,20.822
...,...,...,...,...,...,...,...
1585,CASE74_24.png,211.002,74,train\CASE74\image\CASE74_24.png,train\CASE74\meta\CASE74_24.csv,206.349,4.653
1586,CASE74_25.png,211.497,74,train\CASE74\image\CASE74_25.png,train\CASE74\meta\CASE74_25.csv,211.002,0.495
1587,CASE74_26.png,214.116,74,train\CASE74\image\CASE74_26.png,train\CASE74\meta\CASE74_26.csv,211.497,2.619
1588,CASE74_27.png,214.293,74,train\CASE74\image\CASE74_27.png,train\CASE74\meta\CASE74_27.csv,214.116,0.177


In [7]:
# Renumber rows 0..N-1; the dropna / case filtering above left gaps in the index.
train_df_shifted = train_df_shifted.reset_index(drop = True)
train_df_shifted

Unnamed: 0,img_name,leaf_weight,case,img_path,meta_path,real_weight,weight_delta
0,CASE01_02.png,59.764,1,train\CASE01\image\CASE01_02.png,train\CASE01\meta\CASE01_02.csv,49.193,10.571
1,CASE01_03.png,72.209,1,train\CASE01\image\CASE01_03.png,train\CASE01\meta\CASE01_03.csv,59.764,12.445
2,CASE01_04.png,85.737,1,train\CASE01\image\CASE01_04.png,train\CASE01\meta\CASE01_04.csv,72.209,13.528
3,CASE01_05.png,102.537,1,train\CASE01\image\CASE01_05.png,train\CASE01\meta\CASE01_05.csv,85.737,16.800
4,CASE01_06.png,123.359,1,train\CASE01\image\CASE01_06.png,train\CASE01\meta\CASE01_06.csv,102.537,20.822
...,...,...,...,...,...,...,...
1479,CASE74_24.png,211.002,74,train\CASE74\image\CASE74_24.png,train\CASE74\meta\CASE74_24.csv,206.349,4.653
1480,CASE74_25.png,211.497,74,train\CASE74\image\CASE74_25.png,train\CASE74\meta\CASE74_25.csv,211.002,0.495
1481,CASE74_26.png,214.116,74,train\CASE74\image\CASE74_26.png,train\CASE74\meta\CASE74_26.csv,211.497,2.619
1482,CASE74_27.png,214.293,74,train\CASE74\image\CASE74_27.png,train\CASE74\meta\CASE74_27.csv,214.116,0.177


In [8]:
# Predictions are written into the submission by row position, so the test
# listings must be in a deterministic (sorted) order; plain glob() order is
# filesystem-dependent.
test_imgs = sorted(glob('test/image/*'))
test_metas = sorted(glob('test/meta/*'))

# Preprocessing

In [9]:
# Target transform: standardize `real_weight`, then apply log1p. The inverse
# used elsewhere in the notebook is expm1 followed by SCALER.inverse_transform.
# NOTE: this cell overwrites `real_weight` in place, so running it twice without
# rebuilding train_df_shifted applies the transform twice.
SCALER = StandardScaler()
real_weight_std = SCALER.fit_transform(train_df_shifted[['real_weight']]).ravel()
# log1p is -inf at -1 and NaN below it; fail loudly instead of training on
# non-finite targets.
if (real_weight_std <= -1).any():
    raise ValueError('standardized real_weight contains values <= -1; '
                     'log1p would produce -inf/NaN targets')
train_df_shifted['real_weight'] = np.log1p(real_weight_std)
train_df_shifted

Unnamed: 0,img_name,leaf_weight,case,img_path,meta_path,real_weight,weight_delta
0,CASE01_02.png,59.764,1,train\CASE01\image\CASE01_02.png,train\CASE01\meta\CASE01_02.csv,-0.297934,10.571
1,CASE01_03.png,72.209,1,train\CASE01\image\CASE01_03.png,train\CASE01\meta\CASE01_03.csv,-0.159954,12.445
2,CASE01_04.png,85.737,1,train\CASE01\image\CASE01_04.png,train\CASE01\meta\CASE01_04.csv,-0.018687,13.528
3,CASE01_05.png,102.537,1,train\CASE01\image\CASE01_05.png,train\CASE01\meta\CASE01_05.csv,0.115151,16.800
4,CASE01_06.png,123.359,1,train\CASE01\image\CASE01_06.png,train\CASE01\meta\CASE01_06.csv,0.259742,20.822
...,...,...,...,...,...,...,...
1479,CASE74_24.png,211.002,74,train\CASE74\image\CASE74_24.png,train\CASE74\meta\CASE74_24.csv,0.865084,4.653
1480,CASE74_25.png,211.497,74,train\CASE74\image\CASE74_25.png,train\CASE74\meta\CASE74_25.csv,0.885233,0.495
1481,CASE74_26.png,214.116,74,train\CASE74\image\CASE74_26.png,train\CASE74\meta\CASE74_26.csv,0.887353,2.619
1482,CASE74_27.png,214.293,74,train\CASE74\image\CASE74_27.png,train\CASE74\meta\CASE74_27.csv,0.898495,0.177


# Dataloader

In [10]:
class train_dataset(Dataset):
    """Dataset yielding (image, target) pairs from a DataFrame.

    Reads the `img_path` column for the image file and the `real_weight`
    column for the regression target.

    Args:
        train_df: DataFrame with `img_path` and `real_weight` columns.
        transform: optional callable invoked as `transform(image = img)` that
            returns a mapping with an `'image'` entry.
    """
    def __init__(self, train_df, transform = None):
        super(train_dataset, self).__init__()
        self.train_df = train_df
        self.transform = transform

    def __len__(self):
        return len(self.train_df)

    def __getitem__(self, idx):
        # Positional (.iloc) lookup: DataLoader indices are 0..len-1, which only
        # coincide with index labels when the frame has been reset.
        img_path = self.train_df['img_path'].iloc[idx]
        weight = self.train_df['real_weight'].iloc[idx]

        # Force 3 channels so grayscale / RGBA files match the RGB pipeline.
        img = np.array(Image.open(img_path).convert('RGB'))

        if self.transform:
            img = self.transform(image = img)['image']

        weight = torch.tensor(weight, dtype = torch.float32)
        return img, weight

In [11]:
class test_dataset(Dataset):
    """Dataset yielding images only (no targets) from a list of file paths.

    Args:
        imgs: sequence of image file paths.
        transform: optional callable invoked as `transform(image = img)` that
            returns a mapping with an `'image'` entry.
    """
    def __init__(self, imgs, transform = None):
        super(test_dataset, self).__init__()
        self.imgs = imgs
        self.transform = transform

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        # Force 3 channels so grayscale / RGBA files match the RGB pipeline.
        img = np.array(Image.open(self.imgs[idx]).convert('RGB'))

        if self.transform:
            img = self.transform(image = img)['image']
        return img

# Model

In [None]:
class ConvNet(nn.Module):
    """timm backbone followed by a small regression head (backbone logits -> 256 -> 1).

    Args:
        model_name: name passed to `timm.create_model`.
        pretrained: whether to load pretrained backbone weights.
    """
    def __init__(self, model_name, pretrained = True):
        super(ConvNet, self).__init__()
        self.conv = timm.create_model(model_name, pretrained = pretrained)
        # Size the head from the backbone's classifier width rather than
        # assuming 1000 logits; fall back to 1000 if the attribute is absent.
        backbone_out = getattr(self.conv, 'num_classes', 1000)
        self.fc1 = nn.Linear(backbone_out, 256)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, 1)

    def forward(self, inputs):
        """Return one regression value per input sample, shape (batch, 1)."""
        X = self.conv(inputs)
        X = self.fc1(X)
        X = self.relu(X)
        outputs = self.fc2(X)
        return outputs

# Train / Validation

In [None]:
# Training configuration shared by the cells below.
TEST_SIZE = 0.2  # fraction of rows held out for validation by train_test_split
N_EPOCH = 30  # training epochs per seed
LR = 1e-3  # initial AdamW learning rate
BATCH_SIZE = 32  # batch size for the train, validation and test loaders
MODEL_NAME = 'densenet121'  # timm model name; other candidates noted: 'densenet201', 'efficientnet_b0'
IMAGE_SIZE = 256  # images are resized to IMAGE_SIZE x IMAGE_SIZE

In [None]:
# Training-time pipeline: resize, random horizontal flip, then three optional
# groups (noise, blur, contrast/sharpening) each applied with its own
# probability, followed by normalization and tensor conversion.
# NOTE(review): the IAA* transforms and albumentations.pytorch.ToTensor appear
# to exist only in older albumentations releases — confirm the pinned version.
# NOTE(review): presumably the `p` values inside a OneOf act as relative
# selection weights rather than independent probabilities — verify.
aug_transform = Compose([
    Resize(IMAGE_SIZE, IMAGE_SIZE),
    A.HorizontalFlip(),
    # Noise group: at most one of the two, group applied with p = 0.2.
    OneOf([
        A.IAAAdditiveGaussianNoise(),
        A.GaussNoise(),
        ], p = 0.2),
    # Blur group: at most one of the three, group applied with p = 0.2.
    OneOf([
        A.MotionBlur(blur_limit = 3, p = 0.2),
        A.MedianBlur(blur_limit = 3, p = 0.1),
        A.Blur(blur_limit = 3, p = 0.1),
        ], p = 0.2),
    # Geometric augmentations below are disabled.
#     A.ShiftScaleRotate(rotate_limit = 15),
#     OneOf([
#         A.OpticalDistortion(p = 0.3),
#         A.GridDistortion(p = 0.1),
#         A.IAAPiecewiseAffine(p = 0.3),
#         ], p = 0.2),
    # Contrast / sharpening group: at most one of the four, group applied with p = 0.3.
    OneOf([
        A.CLAHE(clip_limit = 2),
        A.IAASharpen(),
        A.IAAEmboss(),
        A.RandomBrightnessContrast(),
        ], p = 0.3),
    # Hue/saturation jitter is disabled.
#     A.HueSaturationValue(p = 0.3),
    Normalize(),
    ToTensor(),
])

# Deterministic pipeline for validation and test: resize, normalize, to tensor.
resize_transform = Compose([
    Resize(IMAGE_SIZE, IMAGE_SIZE),
    Normalize(),
    ToTensor(),
])

def train_models(seeds, df, train_transform, val_transform):
    """Train one ConvNet per seed and return each model at its best checkpoint.

    For every seed the RNGs are re-seeded, `df` is split into train/validation
    parts, a fresh ConvNet is trained for N_EPOCH epochs with MSE loss on the
    transformed `real_weight` target, and the weights with the lowest
    validation NMAE (computed in original weight units) are saved to disk and
    reloaded at the end.

    Args:
        seeds: iterable of integer seeds; one model is trained per seed.
        df: DataFrame with `img_path` and the transformed `real_weight` column.
        train_transform: augmentation pipeline for the training split.
        val_transform: deterministic pipeline for the validation split.

    Returns:
        List of trained models, in the same order as `seeds`.
    """
    models = []
    for seed in seeds:
        seed_everything(seed)

        weights_path = f'{MODEL_NAME}_stdscaled_drop59_size_{IMAGE_SIZE}_nomask_best_seed{seed}.pt'

        # The split uses the global SEED (not the loop seed), so every model
        # sees the same train/validation partition.
        train_data, val_data = train_test_split(df, test_size = TEST_SIZE, random_state = SEED,)
        train_data = train_data.reset_index(drop = True)
        val_data = val_data.reset_index(drop = True)

        train_loader = DataLoader(train_dataset(train_data, transform = train_transform),
                                  shuffle = True, batch_size = BATCH_SIZE)
        val_loader = DataLoader(train_dataset(val_data, transform = val_transform),
                                shuffle = False, batch_size = BATCH_SIZE)

        # Ground truth in original units. The model is trained on the
        # transformed `real_weight`, so that column (not raw `leaf_weight`) is
        # what must be passed through expm1 + inverse_transform.
        val_true = SCALER.inverse_transform(
            np.expm1(val_data['real_weight'].values).reshape(-1, 1))

        model = ConvNet(MODEL_NAME).to(DEVICE)

        criterion = nn.MSELoss().to(DEVICE)
        optimizer = torch.optim.AdamW(model.parameters(), lr = LR)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', patience = 2, factor = 0.5, min_lr = 5e-5)

        total_batch = len(train_loader)

        val_score = 99999999

        for epoch in range(N_EPOCH):
            avg_cost = 0.0
            pbar = tqdm(train_loader)

            model.train()
            for img, y in pbar:
                img = img.to(DEVICE)
                # Targets arrive as shape (batch,); add a trailing axis to match the (batch, 1) output.
                y = y[..., None].to(DEVICE)
                optimizer.zero_grad()
                hypothesis = model(img)
                cost = criterion(hypothesis, y)
                cost.backward()
                optimizer.step()

                # .item() detaches the scalar so the running mean keeps no autograd graph alive.
                avg_cost += cost.item() / total_batch

            model.eval()
            preds = []
            with torch.no_grad():
                for img, _ in val_loader:
                    preds.append(model(img.to(DEVICE)).cpu().numpy())

            # Undo log1p, then the standardization, before scoring.
            val_pred = SCALER.inverse_transform(
                np.expm1(np.concatenate(preds)).reshape(-1, 1))
            score = NMAE(val_true, val_pred)

            scheduler.step(score)

            if val_score > score:
                print(f'val_score improved {val_score:.4f} to {score:.4f}')
                val_score = score
                torch.save(model.state_dict(), weights_path)

            print(f'[Epoch {epoch + 1} / {N_EPOCH}] cost = {avg_cost:.4f}, val_score = {score:.4f}')

        # Restore the best-scoring weights for this seed.
        model.load_state_dict(torch.load(weights_path, map_location = DEVICE))
        models.append(model)
    return models

In [None]:
# One model is trained per seed; the resulting list forms the ensemble.
seeds = [42, 77, 777, 7777, 77777,]

models = train_models(seeds, df = train_df_shifted, train_transform = aug_transform, val_transform = resize_transform)

# Inference

In [None]:
# test_dataset accepts only (imgs, transform); passing any other keyword
# (such as n_tta) raises TypeError. shuffle=False keeps predictions in the
# same order as test_imgs.
test_loader = DataLoader(test_dataset(test_imgs, transform = resize_transform),
                         shuffle=False, batch_size = BATCH_SIZE)

def predict_models(models, log_transform = True, inverse_transform = True, loader = None):
    """Average the predictions of several models over a loader of image batches.

    Args:
        models: iterable of models; each is switched to eval mode and run
            without gradient tracking.
        log_transform: if True, apply `np.expm1` to the raw model outputs.
        inverse_transform: if True, pass the outputs through
            `SCALER.inverse_transform` (after the optional expm1).
        loader: iterable of image batches; defaults to the module-level
            `test_loader` when omitted.

    Returns:
        Array with the element-wise mean over models of the per-model
        predictions (one row per sample).
    """
    if loader is None:
        loader = test_loader

    all_pred = []

    for model in models:
        model.eval()
        preds = []
        with torch.no_grad():
            for img in tqdm(loader):
                img = img.to(DEVICE)
                preds.append(model(img).cpu().numpy())

        # Concatenate the list directly: the last batch may be shorter, and
        # wrapping ragged batches in np.array() fails on recent NumPy.
        preds = np.concatenate(preds)

        if log_transform:
            preds = np.expm1(preds)

        if inverse_transform:
            preds = SCALER.inverse_transform(preds)

        all_pred.append(preds)

    avg_pred = np.array(all_pred).mean(axis = 0)

    return avg_pred

# Ensemble prediction for the test set; the defaults undo log1p and the standardization.
avg_pred = predict_models(models)
avg_pred.shape

# Submission

In [None]:
# Floor the ensemble prediction at zero, write it into the template and save.
save_path = 'shifted_stdscaled_log_5seeds_aug_30epoch_drop59_size256_densenet121.csv'
submission['leaf_weight'] = np.maximum(avg_pred, 0)
submission.to_csv(save_path, index = False)
pd.read_csv(save_path)

In [None]:
# Post-processing variant: reload the saved submission, add a constant offset
# to every prediction, floor at zero and save under a name that records the offset.
save_path = 'shifted_stdscaled_log_5seeds_aug_30epoch_drop59_size256_densenet121.csv'
submission = pd.read_csv(save_path)
# NOTE(review): 0.025 is a magic number with no rationale recorded in the notebook — confirm its origin.
shift = 0.025
submission['leaf_weight'] = np.clip(submission['leaf_weight'] + shift, 0, None)
save_path = f'shifted_stdscaled_log_5seeds_aug_30epoch_drop59_size256_densenet121+{shift}.csv'
submission.to_csv(save_path, index = False)
pd.read_csv(save_path)

In [None]:
# Build a feature table for the test set: the predicted weight plus, for each
# meta file, the per-column sum of the frame returned by meta_loader.
# Column names (Korean), in order: internal temperature obs., external
# temperature obs., internal humidity obs., CO2 obs., EC obs., cooling
# temperature, cooling load, heating temperature, heating load, total estimated
# light, white-light estimated light, red-light estimated light.
# NOTE(review): `meta_loader` is not defined anywhere in the visible notebook,
# so this cell fails on a fresh kernel — confirm where it comes from.
# NOTE(review): the row-wise fill assumes meta_loader returns exactly the 12
# meta columns in this order, and that test_metas is ordered like the
# submission rows — verify.
test_delta = pd.DataFrame(columns = ['real_weight', '내부온도관측치', '외부온도관측치', '내부습도관측치', 'CO2관측치', 
                                   'EC관측치', '냉방온도', '냉방부하', '난방온도', '난방부하', '총추정광량', 
                                   '백색광추정광량', '적색광추정광량', 'weight_delta'])
test_delta['real_weight'] = submission['leaf_weight']
for i in range(len(test_metas)):
    temp = meta_loader(test_metas[i], 1, return_df = True)
    # Fill every column between `real_weight` and `weight_delta` for row i.
    test_delta.iloc[i, 1:-1] = [temp[col].sum() for col in temp.columns]
# weight_delta is the quantity to be predicted, so it is removed from the features.
test_delta = test_delta.drop('weight_delta', axis = 1)
test_delta

In [None]:
# Add the delta predicted from the meta features to the image-based weight and save.
# NOTE(review): `delta_model` is not defined anywhere in the visible notebook,
# so this cell fails on a fresh kernel — confirm where it is trained.
# NOTE(review): the output file name mentions 'masked', '20epoch' and
# 'densenet192', which do not match the configuration used above — confirm the
# intended name.
submission['leaf_weight'] += delta_model.predict(test_delta)
save_path = 'shifted_masked_stdscaled_log_5seeds_aug5_20epoch_densenet192+rfsumdelta.csv'
submission.to_csv(save_path, index = False)
pd.read_csv(save_path)