In [4]:
import os
import cv2
import shutil
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from feature_engine.outliers import OutlierTrimmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [16]:
# Random seed; passed as `random_state` to train_test_split further down
# so the train/validation partition is repeatable.
seed = 48

In [2]:
# Load the per-image training table (one row per picture) and preview it.
train_csv_path = './data/2023/data_train_mean.csv'
train_data = pd.read_csv(train_csv_path)
train_data.head(5)

Unnamed: 0,species,uniqID,bio2,bio3,bio4,bio5,bio6,bio7,bio8,bio9,...,trait_224,trait_237,trait_281,trait_282,trait_289,trait_1080,trait_3112,trait_3113,trait_3114,trait_3120
0,Abarema macradenia,2,108,76,731,331,189,142,248,257,...,2.751126,8.051283,44.170849,840.303455,1493.103154,938.320226,9664.950673,521.472828,1269.832446,1.722072
1,Abarema macradenia,3,108,76,731,331,189,142,248,257,...,2.751126,8.051283,44.170849,840.303455,1493.103154,938.320226,9664.950673,521.472828,1269.832446,1.722072
2,Abelia chinensis,66,69,42,3370,225,61,164,188,104,...,5.106266,6.37212,22.973991,584.809757,1078.055613,4382.060727,1109.400814,1236.507079,1419.837554,3.571199
3,Abelia chinensis,71,71,43,3389,161,-2,163,121,59,...,5.106266,6.37212,22.973991,584.809757,1078.055613,4382.060727,1109.400814,1236.507079,1419.837554,3.571199
4,Abelmoschus esculentus,112,80,68,1212,276,159,117,203,226,...,3.311457,4.388532,70.026823,577.398989,915.531609,1167.873857,9090.440861,6503.515479,16327.996459,3.693042


In [6]:
# Scan every training image once and record the ones OpenCV cannot load.
#
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept here only because other cells may still refer to it.
dir = './data/2023/01_data_train'

# Maps the row's index label in `train_data` -> file name of the bad image.
# Keying by index label (not by loop position) keeps this dict valid as input
# to `train_data.drop(...)` even when the frame's index is not 0..n-1.
errors = {}

for row_label, file in train_data['pic_name'].items():
    # cv2.imread signals an unreadable/missing file by returning None
    # instead of raising, so check for that explicitly.
    img = cv2.imread(f'{dir}/{file}')
    if img is None:
        errors[row_label] = file
        continue

    try:
        # Conversion is only used as a sanity check that the decoded array
        # is a usable colour image.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    except cv2.error:
        # Only OpenCV failures mark the file as bad; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        errors[row_label] = file

print(errors)

{6189: 'train_2869277.jpg', 6218: 'train_2895197.jpg', 20539: 'train_3366733.jpg', 20541: 'train_3369511.jpg', 20557: 'train_3408054.jpg', 20567: 'train_3419342.jpg', 30866: 'train_4075601.jpg', 30886: 'train_4096439.jpg'}


In [8]:
# Remove the rows whose image could not be loaded. `errors` is keyed by row
# label. errors='ignore' makes the cell safe to re-run: labels that were
# already removed by a previous execution are skipped instead of raising
# KeyError.
train_data = train_data.drop(index=list(errors.keys()), errors='ignore')

In [9]:
# Columns carried through the rest of the notebook. Element 0 is the image
# file name; every later element is a trait column (selected as aux_feat[1:]).
aux_feat = [
    'pic_name',
    # the six traits that also get hand-entered caps further down
    'trait_4', 'trait_11', 'trait_18', 'trait_50', 'trait_26', 'trait_3112',
    # remaining traits
    'trait_13', 'trait_14', 'trait_15', 'trait_21', 'trait_27',
    'trait_46', 'trait_47', 'trait_55', 'trait_95',
    'trait_144', 'trait_145', 'trait_146', 'trait_163', 'trait_169',
    'trait_223', 'trait_224', 'trait_237', 'trait_281', 'trait_282',
    'trait_289', 'trait_1080', 'trait_3113', 'trait_3114', 'trait_3120',
]

# Traits that are log10-transformed before scaling. Everything in aux_feat[1:]
# except trait_4, trait_46 and trait_47 is listed here.
log_feat = [
    'trait_3114', 'trait_1080', 'trait_3112', 'trait_3113',
    'trait_26', 'trait_55', 'trait_289', 'trait_282',
    'trait_169', 'trait_13', 'trait_144', 'trait_281',
    'trait_95', 'trait_146', 'trait_223', 'trait_11',
    'trait_14', 'trait_224', 'trait_18', 'trait_237',
    'trait_27', 'trait_145', 'trait_3120', 'trait_163',
    'trait_15', 'trait_50', 'trait_21',
]

In [10]:
# Summary statistics for the first six trait columns, one row per trait.
first_six_traits = aux_feat[1:7]
train_data.loc[:, first_six_traits].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
trait_4,36252.0,0.502361,0.15722,-1.623941,0.392464,0.486149,0.603254,2.889469
trait_11,36252.0,62.911118,7918.879919,0.001437,10.668779,15.568725,21.304281,1504254.0
trait_18,36252.0,7.204428,400.63941,8e-06,0.330964,0.787158,5.46381,75967.59
trait_50,36252.0,1.676285,7.543198,9.7e-05,1.08147,1.427027,1.891408,1263.594
trait_26,36252.0,139.854735,959.647442,0.000227,0.561029,2.555346,17.342279,37836.25
trait_3112,36252.0,6091.972594,299141.811907,6.1e-05,261.730082,805.189101,2331.610029,28429970.0


In [11]:
# Low / high quantiles of the first six trait columns, one row per trait.
tail_probs = [0.01, 0.95, 0.97, 0.99]
train_data.loc[:, aux_feat[1:7]].quantile(q=tail_probs).transpose()

Unnamed: 0,0.01,0.95,0.97,0.99
trait_4,0.2089,0.779683,0.813282,0.886177
trait_11,3.177824,34.739678,39.680866,50.777375
trait_18,0.052039,21.914292,26.398965,35.106055
trait_50,0.493695,3.00819,3.401438,4.402759
trait_26,0.013635,393.578394,905.397713,3155.242127
trait_3112,12.100823,10540.449061,16007.248293,32257.383272


In [12]:
# Extremes and tail quantiles for every trait column, one row per trait.
tail_probs_all = [0, 0.01, 0.05, 0.95, 0.99, 1]
trait_columns = aux_feat[1:]
train_data.loc[:, trait_columns].quantile(q=tail_probs_all).transpose()

Unnamed: 0,0.00,0.01,0.05,0.95,0.99,1.00
trait_4,-1.623941,0.2089,0.287281,0.779683,0.886177,2.889469
trait_11,0.001436645,3.177824,4.977883,34.739678,50.777375,1504254.0
trait_18,8.14e-06,0.052039,0.116504,21.914292,35.106055,75967.59
trait_50,9.68e-05,0.493695,0.706616,3.00819,4.402759,1263.594
trait_26,0.00022736,0.013635,0.062208,393.578394,3155.242127,37836.25
trait_3112,6.05e-05,12.100823,39.63415,10540.449061,32257.383272,28429970.0
trait_13,93.94884,374.143876,403.25732,499.333572,525.158723,1022.154
trait_14,0.7281234,6.700712,10.027895,35.317903,44.356969,2639.664
trait_15,9.2575e-06,0.25951,0.543335,3.059211,4.578983,138897.1
trait_21,2.885e-10,0.00124,0.002978,0.425348,1.096296,81637070.0


In [13]:
# Quantile-based outlier trimmer. `tail` is spelled out at its default value:
# only the upper tail is trimmed, i.e. rows above a column's 0.99 quantile
# (fold=0.01) are removed; low values are left in place.
trimmer = OutlierTrimmer(
    capping_method='quantiles',
    tail='right',
    fold=0.01,
)

In [14]:
# Fit the trimmer on the selected columns and drop the rows it flags.
#
# The second positional argument of `fit_transform` is the target `y`, not a
# list of variables, so the `aux_feat[1:]` that used to be passed there had no
# effect on which columns were trimmed. It is removed to avoid implying
# otherwise. When the trimmer is built without `variables` it selects the
# numeric columns itself — presumably every column except `pic_name`; verify.
train_sample = trimmer.fit_transform(train_data[aux_feat])

In [17]:
# Hold out 20% of the trimmed rows for validation; the split is seeded so it
# is repeatable. The trailing tuple displays both resulting shapes.
split_frames = train_test_split(
    train_sample,
    test_size=0.2,
    random_state=seed,
)
train_xs, valid_xs = split_frames
(train_xs.shape, valid_xs.shape)

((24036, 31), (6009, 31))

In [18]:
# Hand-entered lower / upper bounds for the six inspected traits.
# NOTE(review): a few of these (e.g. trait_11) differ slightly from the
# 1% / 99% quantiles displayed earlier in the notebook — confirm which data
# they were computed on.
min_caps = {
    'trait_4': 0.208652,
    'trait_11': 3.178135,
    'trait_18': 0.052039,
    'trait_50': 0.493695,
    'trait_26': 0.013635,
    'trait_3112': 12.100823,
}
max_caps = {
    'trait_4': 0.886177,
    'trait_11': 50.773743,
    'trait_18': 35.106055,
    'trait_50': 4.402759,
    'trait_26': 3155.242127,
    'trait_3112': 32257.383272,
}


def _labels_outside_caps(frame, lower, upper):
    """Return the index labels of rows of `frame` that violate a cap.

    Columns are visited in the key order of `lower`; for each column the
    labels above the upper cap are appended first, then those below the lower
    cap. A row that violates several bounds is listed once per violation.
    """
    labels = []
    for name, low in lower.items():
        high = upper[name]
        labels.extend(frame.index[(frame[name] > high).to_numpy()])
        labels.extend(frame.index[(frame[name] < low).to_numpy()])
    return labels


# Independent copies of the selected columns; the flagged rows are removed
# from these in a later cell.
targets = train_xs[aux_feat].copy()
val_targets = valid_xs[aux_feat].copy()

idxs = _labels_outside_caps(train_xs, min_caps, max_caps)
idxs_val = _labels_outside_caps(valid_xs, min_caps, max_caps)

# Number of distinct rows flagged in the training / validation split.
print(len(set(idxs)))
print(len(set(idxs_val)))

936
224


In [None]:
# Remove the rows flagged as lying outside the hand-entered caps.
# errors='ignore' makes the cell safe to re-run: labels already removed by an
# earlier execution are skipped instead of raising KeyError.
targets = targets.drop(index=idxs, errors='ignore')
val_targets = val_targets.drop(index=idxs_val, errors='ignore')

In [21]:
# Summary statistics of the training targets after the cap filter,
# one row per trait.
kept_trait_columns = aux_feat[1:]
targets.loc[:, kept_trait_columns].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
trait_4,23100.0,0.501635,0.140286,0.209118,0.39358,0.485673,0.603546,0.886088
trait_11,23100.0,16.916931,8.027348,3.204518,11.378021,15.789493,21.181707,50.755128
trait_18,23100.0,3.918257,6.417485,0.052164,0.336301,0.756986,4.520747,35.106055
trait_50,23100.0,1.535087,0.623789,0.493695,1.087353,1.417006,1.837036,4.378196
trait_26,23100.0,58.449669,239.196743,0.013667,0.583727,2.473299,14.709484,3136.327163
trait_3112,23100.0,1962.81849,3202.650043,12.100823,293.319686,805.567518,2176.823902,31959.09973
trait_13,23100.0,451.783748,26.66305,286.152502,436.973342,451.569822,468.868477,525.158723
trait_14,23100.0,21.1861,6.997589,3.73602,16.458973,20.433213,25.116793,44.356969
trait_15,23100.0,1.606581,0.691887,0.124495,1.140357,1.532381,1.976587,4.578983
trait_21,23100.0,0.084385,0.137001,0.000329,0.00934,0.027526,0.1049,1.089238


In [22]:
# Build the training target matrix: one column per trait in aux_feat[1:],
# log10-transformed when the trait is listed in log_feat, stored as float32,
# then fit the min-max scaler on it and scale.
scaler = MinMaxScaler()

train_columns = []
for trait_name in aux_feat[1:]:
    column = targets[trait_name].values
    train_columns.append(np.log10(column) if trait_name in log_feat else column)

y_train = np.column_stack(train_columns).astype(np.float32)
y_train = scaler.fit_transform(y_train)

In [23]:
# Wrap the scaled matrix in a DataFrame that shares the row labels of
# `targets`, then attach the image file name of each row.
y_train = (
    pd.DataFrame(y_train, index=targets.index)
    .assign(pic_name=targets['pic_name'])
)

In [24]:
# Build the validation target matrix with the same per-trait log10 rule as
# the training matrix, then scale it with the scaler fitted on the training
# targets (transform only — no refit).
val_columns = []
for trait_name in aux_feat[1:]:
    column = val_targets[trait_name].values
    val_columns.append(np.log10(column) if trait_name in log_feat else column)

y_val = np.column_stack(val_columns).astype(np.float32)
y_val = scaler.transform(y_val)

In [25]:
# Wrap the scaled matrix in a DataFrame that shares the row labels of
# `val_targets`, then attach the image file name of each row.
y_val = (
    pd.DataFrame(y_val, index=val_targets.index)
    .assign(pic_name=val_targets['pic_name'])
)

In [26]:
# Display the scaled validation targets: integer-named columns hold the
# scaled traits in aux_feat[1:] order, followed by `pic_name`.
y_val

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,pic_name
17346,0.146870,0.701715,0.269894,0.263682,0.290760,0.500740,0.700837,0.604985,0.691422,0.345333,...,0.289648,0.407680,0.945077,0.655185,0.550339,0.730934,0.543144,0.551009,0.424438,train_1336146.jpg
27072,0.458461,0.495912,0.727737,0.597717,0.799695,0.740916,0.841887,0.747066,0.667284,0.699029,...,0.311493,0.735531,0.861929,0.587568,0.662830,0.451848,0.731611,0.823098,0.451281,train_508116.jpg
26326,0.441159,0.569103,0.787902,0.362773,0.327795,0.912708,0.911258,0.580048,0.562157,0.666247,...,0.266442,0.324182,0.740882,0.580151,0.614944,0.788281,0.959736,0.873792,0.544085,train_7674095.jpg
6399,0.425788,0.548796,0.751564,0.598817,0.618651,0.748983,0.794907,0.765785,0.639619,0.772026,...,0.415866,0.587438,0.769512,0.707441,0.631741,0.498428,0.771504,0.798319,0.423245,train_2982086.jpg
30360,0.065651,0.992974,0.406608,0.121098,0.455146,0.748693,0.655189,0.871136,0.947494,0.147386,...,0.427023,0.540199,0.681310,0.395606,0.408158,0.907711,0.798576,0.776798,0.855969,train_3552439.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27724,0.275077,0.680603,0.447287,0.413871,0.166235,0.572480,0.644684,0.747110,0.692307,0.503113,...,0.398890,0.409855,0.513081,0.435434,0.454196,0.698356,0.516569,0.697842,0.379425,train_1112251.jpg
5139,0.309086,0.850214,0.272575,0.180240,0.456381,0.400510,0.687251,0.677584,0.784533,0.480293,...,0.595917,0.532324,0.986828,0.775914,0.801559,0.889218,0.321384,0.564175,0.489608,train_2402293.jpg
35736,0.413698,0.599759,0.241346,0.423046,0.296247,0.392229,0.795117,0.667148,0.753452,0.282301,...,0.358236,0.356016,0.748520,0.394747,0.306807,0.780234,0.407771,0.458470,0.558428,train_4209144.jpg
14701,0.335039,0.601823,0.220426,0.422102,0.207585,0.280508,0.804646,0.647103,0.735996,0.463889,...,0.291781,0.307257,0.767636,0.728859,0.620882,0.747455,0.388056,0.469258,0.419116,train_7499898.jpg


In [27]:
# Persist the scaled targets. The output directory is created first so a run
# on a fresh checkout does not fail at the very last step.
os.makedirs('./data/2023/processed', exist_ok=True)

# index=False: the row labels are not written; `pic_name` stays in the file
# as a regular column and identifies each row.
y_train.to_csv('./data/2023/processed/train.csv', index=False)
y_val.to_csv('./data/2023/processed/val.csv', index=False)

In [28]:
# Save the fitted min-max scaler next to the exported CSVs.
joblib.dump(
    value=scaler,
    filename='./data/2023/processed/scaler_23.joblib',
)

['./data/2023/processed/scaler_23.joblib']