In [None]:
import os
import re
import warnings

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from feature_engine.outliers import ArbitraryOutlierCapper
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler

# Escalate every warning to an exception so that problems reported as
# warnings (for example NumPy runtime warnings) stop the notebook instead of
# passing silently.
warnings.simplefilter("error")

In [None]:
# Random seed handed to train_test_split so the train/validation split is
# reproducible between runs.
seed = 48

In [None]:
# Trait identifiers, in the column order used for every target matrix below.
trait_ids = ['X4', 'X11', 'X18', 'X50', 'X26', 'X3112']

# Target columns (trait means) and their matching standard-deviation columns.
tar_features = [f'{trait}_mean' for trait in trait_ids]
tar_sd_features = [f'{trait}_sd' for trait in trait_ids]

# Targets that are log10-transformed before scaling: every mean except X4.
log_features = [name for name in tar_features if name != 'X4_mean']

In [None]:
# Training table indexed by sample id; missing values are replaced with 0.
train = pd.read_csv('./data/train.csv', index_col='id')
train = train.fillna(0)

# Sample submission, indexed by id; only inspected further down.
sub = pd.read_csv('./data/sample_submission.csv', index_col='id')
# train = train + 1e-5

In [None]:
# Hold out 20% of the rows for validation; seeded so the split is repeatable.
train_xs, valid_xs = train_test_split(
    train,
    test_size=0.2,
    random_state=seed,
)
(train_xs.shape, valid_xs.shape)

In [None]:
# Summary statistics of the target columns in the training split (one row per target).
train_xs.loc[:, tar_features].describe().transpose()

In [None]:
# Summary statistics of the sample submission (one row per column).
sub.describe().transpose()

In [None]:
# Extremes and tail quantiles of the sample submission columns.
sub_quantile_levels = [0, 0.01, 0.025, 0.975, 0.99, 1]
sub.quantile(sub_quantile_levels).transpose()

In [None]:
# Extremes and tail quantiles of the training targets (one row per target).
target_quantile_levels = [0, 0.003, 0.01, 0.025, 0.975, 0.98, 0.99, 0.997, 0.998, 0.999, 1]
train_xs.loc[:, tar_features].quantile(target_quantile_levels).transpose()

In [None]:
# Per-target lower and upper bounds. A row whose target falls strictly outside
# its bounds is flagged here so it can be dropped from the target tables.
# NOTE(review): the bounds look like they were read off the quantile table
# above -- confirm which quantiles were used.
min_caps = dict(
    X4_mean=0.208652,
    X11_mean=3.178135,
    X18_mean=0.052039,
    X50_mean=0.493695,
    X26_mean=0.013635,
    X3112_mean=12.100823,
)
max_caps = dict(
    X4_mean=0.886177,
    X11_mean=50.773743,
    X18_mean=35.106055,
    X50_mean=4.402759,
    X26_mean=905.397713,
    X3112_mean=16007.248293,
)

# Independent copies of the target columns for each split.
targets = train_xs.loc[:, tar_features].copy()
val_targets = valid_xs.loc[:, tar_features].copy()

# Row labels that violate at least one cap; a label appears once per violation.
idxs, idxs_val = [], []

for col, lower in min_caps.items():
    upper = max_caps[col]
    for frame, flagged in ((train_xs, idxs), (valid_xs, idxs_val)):
        values = frame[col]
        flagged.extend(frame.index[(values > upper).to_numpy()])
        flagged.extend(frame.index[(values < lower).to_numpy()])

# Number of distinct flagged rows in the training and validation splits.
print(len(set(idxs)))
print(len(set(idxs_val)))


In [None]:
# Drop the flagged rows. Both tables are rebuilt from the untouched splits
# rather than from their own previous value, so re-running this cell gives the
# same result instead of raising KeyError on labels that are already gone.
targets = train_xs[tar_features].drop(index=idxs)
val_targets = valid_xs[tar_features].drop(index=idxs_val)

In [None]:
# Display the validation targets left after the flagged rows were dropped.
val_targets

In [None]:
# targets = trimmer_r.transform(targets)

In [None]:
# Summary statistics of the training targets after the flagged rows were dropped.
targets.describe().transpose()

In [None]:
# Attach the standard-deviation columns to the remaining target rows.
# Selecting `tar_features` first makes the cell safe to re-run: without it, a
# second execution would try to join sd columns that are already present and
# fail on the overlapping column names.
targets = targets[tar_features].join(train_xs[tar_sd_features], how='inner')
val_targets = val_targets[tar_features].join(valid_xs[tar_sd_features], how='inner')

targets.shape, val_targets.shape

In [None]:
# for mean, sd in zip(tar_features, tar_sd_features):
#     print(mean, sd)
#     targets[mean] = targets[mean] + targets[sd] * np.random.normal(0, 0.1, len(targets))

In [None]:
# Build the training target matrix (one column per entry of `tar_features`):
# targets listed in `log_features` are log10-transformed, the rest are used
# as-is. The matrix is stored as float32 and then min-max scaled; the scaler
# is fitted here, on the training targets.
scaler = MinMaxScaler()

train_columns = [
    np.log10(targets[name].values) if name in log_features else targets[name].values
    for name in tar_features
]
y_train = np.column_stack(train_columns).astype(np.float32)

y_train = scaler.fit_transform(y_train)

In [None]:
# Build the validation target matrix the same way as the training one
# (log10 for the targets in `log_features`, float32 storage) and scale it with
# the already-fitted `scaler` -- no refitting on validation data.
valid_columns = [
    np.log10(val_targets[name].values) if name in log_features else val_targets[name].values
    for name in tar_features
]
y_val = np.column_stack(valid_columns).astype(np.float32)

y_val = scaler.transform(y_val)

In [None]:
# Wrap the scaled arrays in DataFrames that carry the original row labels and
# target column names.
y_train = pd.DataFrame(y_train, index=targets.index, columns=tar_features)
y_val = pd.DataFrame(y_val, index=val_targets.index, columns=tar_features)

In [None]:
# Display the scaled training targets.
y_train

In [None]:
# Display the scaled validation targets.
y_val

In [None]:
# Table with a `box` column per image, indexed by id.
boxes_csv = './data/boxes.csv'
train_boxes = pd.read_csv(boxes_csv, index_col='id')

In [None]:
# Characters stripped from the stored text before the numbers are parsed.
_BOX_NOISE = str.maketrans('', '', '\n[]')


def _parse_box(text):
    """Turn the text form of a box (bracketed, space-separated numbers) into a float array."""
    cleaned = text.translate(_BOX_NOISE).replace('  ', ' ')
    return np.fromstring(cleaned, sep=' ')


# Replace the text column with parsed NumPy arrays.
train_boxes['box'] = train_boxes['box'].apply(_parse_box)

In [None]:
# Keep only the rows that have both scaled targets and a box (inner join on id).
full_train, full_valid = (
    labels.join(train_boxes, how='inner') for labels in (y_train, y_val)
)

In [None]:
# Preview five random training rows; seeded so the preview is the same on every run.
full_train.sample(5, random_state=seed)

In [None]:
# Preview five random validation rows; seeded so the preview is the same on every run.
full_valid.sample(5, random_state=seed)

In [None]:
# Visual sanity check: crop one validation image to its box and show it.
# NOTE(review): the id is hard-coded and must be present in `full_valid`;
# a different seed or split would require picking another id.
idx = 195356035
img_path = f'./data/train_images/{idx}.jpeg'

# cv2.imread does not raise on a missing or unreadable file, it returns None;
# fail here with a clear message instead of an opaque error inside cvtColor.
img = cv2.imread(img_path)
if img is None:
    raise FileNotFoundError(f'Could not read image: {img_path}')
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; matplotlib expects RGB

# box[1]:box[3] selects image rows and box[0]:box[2] selects columns.
box = full_valid.loc[idx, 'box']
patch = img[int(box[1]):int(box[3]), int(box[0]):int(box[2])]
plt.imshow(patch)

In [None]:
# Persist the processed targets (with boxes) and the fitted target scaler.
# The output directory is created first so the cell also works on a fresh
# checkout where ./data/processed does not exist yet.
os.makedirs('./data/processed', exist_ok=True)

full_train.to_csv('./data/processed/train.csv')
full_valid.to_csv('./data/processed/valid.csv')
dump(scaler, './data/processed/scaler.joblib')

In [None]:
def _read_file_bytes(file_path):
    """Return the raw bytes of `file_path`, closing the file handle afterwards."""
    with open(file_path, 'rb') as handle:
        return handle.read()


# Bundle the test table with the raw JPEG bytes of every test image and
# pickle the result. Each file is read inside a `with` block so no file
# handle is left open (the previous `open(...).read()` never closed them).
test = pd.read_csv('./data/test.csv')
test['file_path'] = test['id'].apply(lambda s: f'./data/test_images/{s}.jpeg')
test['jpeg_bytes'] = test['file_path'].apply(_read_file_bytes)
test.to_pickle('./data/test.pkl')

In [None]:
# Input feature names: every column of `train` except the last 12.
# NOTE(review): presumably the trailing 12 columns are the six *_mean and six
# *_sd target columns -- confirm against the CSV column order.
x_feature = list(train.columns[:-12])

In [None]:
# Min-max scale the input features.
#
# The scaler is fitted only on the cleaned training rows (`targets.index`),
# mirroring how the target scaler is fitted on training targets only. Fitting
# on the whole table would let validation rows influence the scaling
# parameters (data leakage). All rows are still transformed, so `x_train`
# keeps one row per row of `train`; values of rows outside the fitting set may
# fall slightly outside [0, 1].
train_scaler = MinMaxScaler()

# Single float32 cast of the whole feature block instead of a per-column copy loop.
x_all = train[x_feature].to_numpy(dtype=np.float32)

fit_rows = train.index.isin(targets.index)
train_scaler.fit(x_all[fit_rows])

x_train = train_scaler.transform(x_all)

In [None]:
# Wrap the scaled feature matrix in a DataFrame labelled like `train`.
x_train = pd.DataFrame(x_train, index=train.index, columns=x_feature)

In [None]:
# Display the scaled feature table.
x_train

In [None]:
# Pick the scaled feature rows that match the retained training and
# validation target rows (same labels, same order).
x_train_feats = x_train.loc[targets.index]
x_val_feats = x_train.loc[val_targets.index]

In [None]:
# Write the scaled feature tables next to the processed targets.
for frame, csv_path in (
    (x_train_feats, './data/processed/train_x.csv'),
    (x_val_feats, './data/processed/valid_x.csv'),
):
    frame.to_csv(csv_path)

In [None]:
# Persist the fitted feature scaler with joblib so the same scaling can be reapplied later.
dump(train_scaler, './data/processed/scaler_x.joblib')