In [1]:
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from feature_engine.outliers import OutlierTrimmer
import warnings

# Turn every warning into an exception for the rest of the session, so that
# anything a library merely warns about in the cells below stops execution
# instead of passing silently.
warnings.filterwarnings("error")

In [2]:
# Random seed for reproducibility; used as the random_state of the
# train/validation split.
seed: int = 48

In [3]:
# Trait identifiers in the column order used throughout the notebook.
_TRAIT_IDS = ('X4', 'X11', 'X18', 'X50', 'X26', 'X3112')

# "<trait>_mean" columns are the regression targets; "<trait>_sd" columns are
# their companion standard-deviation columns, in the same order.
tar_features = [f'{trait}_mean' for trait in _TRAIT_IDS]
tar_sd_features = [f'{trait}_sd' for trait in _TRAIT_IDS]

# Every target except X4_mean.
log_features = [f'{trait}_mean' for trait in _TRAIT_IDS if trait != 'X4']

In [4]:
# Load the training table indexed by sample id and replace every missing
# value with 0.
train = (
    pd.read_csv('./data/train.csv', index_col='id')
    .fillna(0)
)

In [5]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition repeatable across runs.
split_frames = train_test_split(train, random_state=seed, test_size=0.2)
train_xs, valid_xs = split_frames
(train_xs.shape, valid_xs.shape)

((44391, 175), (11098, 175))

In [10]:
# Jitter each target mean by Gaussian noise scaled with that target's standard
# deviation column: mean <- mean + sd * N(0, 0.1).
#
# The noise now comes from a generator seeded with `seed`, so a fresh run
# reproduces the same perturbation (the original drew from NumPy's unseeded
# global state).
#
# NOTE(review): this overwrites columns of `train` in place, so re-running the
# cell stacks another round of noise on top of the previous one.
# NOTE(review): as positioned in this notebook the train/validation split has
# already been taken, so `train_xs` / `valid_xs` presumably do not see this
# noise - confirm the intended cell order.
noise_rng = np.random.default_rng(seed)

for mean, sd in zip(tar_features, tar_sd_features):
    print(mean, sd)
    train[mean] = train[mean] + train[sd] * noise_rng.normal(0, 0.1, len(train))

X4_mean X4_sd
X11_mean X11_sd
X18_mean X18_sd
X50_mean X50_sd
X26_mean X26_sd
X3112_mean X3112_sd


In [6]:
# Summary statistics of the training-split targets, one row per target column.
train_target_stats = train_xs[tar_features].describe()
train_target_stats.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
X4_mean,44391.0,0.522885,0.1758845,-2.431157,0.411067,0.508919,0.62276,4.475172
X11_mean,44391.0,159.569755,14287.73,6.78e-05,10.626319,15.11627,19.680132,1504254.0
X18_mean,44391.0,24578.633284,2582359.0,2.33e-08,0.313851,0.720825,3.635854,272049400.0
X50_mean,44391.0,12.802032,1313.423,9.68e-05,1.172768,1.478612,1.92601,159759.9
X26_mean,44391.0,3597.055443,262306.9,5.5e-07,0.57246,2.539143,15.073373,31065550.0
X3112_mean,44391.0,493868.68102,102327000.0,7.69e-08,257.437398,726.808928,2146.463677,21559110000.0


In [8]:
# Selected quantiles (including both extremes) of each training-split target,
# one row per target column.
quantile_levels = [0, 0.01, 0.025, 0.1, 0.5, 0.9, 0.975, 0.99, 1]
train_xs[tar_features].quantile(quantile_levels).T

Unnamed: 0,0.000,0.010,0.025,0.100,0.500,0.900,0.975,0.990,1.000
X4_mean,-2.431157,0.227783,0.273426,0.338843,0.508919,0.724688,0.841314,0.92031,4.475172
X11_mean,6.78e-05,3.057649,3.967288,6.421923,15.11627,25.246879,38.378333,50.777775,1504254.0
X18_mean,2.33e-08,0.04279,0.064821,0.163245,0.720825,11.39747,21.751872,28.564947,272049400.0
X50_mean,9.68e-05,0.492535,0.642466,0.925622,1.478612,2.491684,3.51541,4.444901,159759.9
X26_mean,5.5e-07,0.010774,0.021041,0.120801,2.539143,80.104639,537.145107,1454.708615,31065550.0
X3112_mean,7.69e-08,11.708036,21.933505,91.904776,726.808928,4844.990136,13641.992446,24493.44628,21559110000.0


In [9]:
# Quantile-based outlier trimmer for the target columns, applied to both tails
# with fold=0.01. The limits are learned from the training split only.
trimmer = OutlierTrimmer(
    capping_method='quantiles',
    tail='both',
    fold=0.01,
)
trimmer.fit(train_xs[tar_features])

In [10]:
# Apply the fitted trimmer to the training targets. In the recorded outputs
# this reduces the training split from 44391 to 40388 rows.
train_target_frame = train_xs[tar_features]
targets = trimmer.transform(train_target_frame)
targets

Unnamed: 0_level_0,X4_mean,X11_mean,X18_mean,X50_mean,X26_mean,X3112_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
196351112,0.345967,18.729455,6.097319,0.983611,0.110296,8217.136933
194199029,0.788713,22.499436,0.079323,2.344849,22.024522,675.009501
130253467,0.662078,12.631095,0.753300,2.763913,15.101902,265.980344
195132552,0.616988,43.241915,0.802360,1.039350,1.873556,1107.025060
188664637,0.458969,10.782423,0.100319,2.669681,2.557434,94.730622
...,...,...,...,...,...,...
191833808,0.656199,12.897281,1.526435,2.695139,18.236375,320.115284
120363935,0.739116,3.233058,5.884583,3.885075,8.067971,144.071709
195878907,0.556858,14.144586,0.722631,3.464295,3.489664,837.110254
196192062,0.500775,5.970475,15.395812,2.026018,26.704722,159.347682


In [11]:
# Apply the same trimmer (fitted on the training split) to the validation
# targets.
valid_target_frame = valid_xs[tar_features]
val_targets = trimmer.transform(valid_target_frame)
val_targets

Unnamed: 0_level_0,X4_mean,X11_mean,X18_mean,X50_mean,X26_mean,X3112_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
194891447,0.680353,8.897225,21.364594,3.129161,50.941272,1080.646170
191770603,0.476163,16.691173,0.473854,0.958628,0.078599,798.094415
196541825,0.613250,12.979986,4.563588,1.313577,3.191662,3213.977560
194014257,0.546773,19.176414,0.121865,1.329584,0.332680,209.330816
18647018,0.484643,8.985343,7.918931,2.050727,100.963345,2930.197690
...,...,...,...,...,...,...
28547380,0.529505,15.555081,0.498518,1.327774,12.110630,1822.060834
149717313,0.436301,12.386403,12.460742,1.689838,45.965060,5263.455160
196471504,0.441506,21.160422,4.785422,1.398361,4.658262,3875.831403
33314622,0.614109,11.815616,0.275037,1.362108,0.063458,308.092495


In [12]:
# Build the scaled training targets: log10 of every trimmed target column,
# stored as float32, then min-max scaled with a scaler fitted on these rows.
#
# NOTE(review): the previous per-column loop guarded the log with
# `if target in tar_features` while iterating over `tar_features`, so the guard
# was always true and every column (X4_mean included) was logged; `log_features`
# was never consulted. That behaviour is kept here unchanged. If only
# `log_features` were meant to be logged, the validation targets must receive
# the identical change and previously saved scalers/CSVs must be regenerated.
scaler = MinMaxScaler()

# log10 is computed in float64 and then narrowed to float32, the same cast the
# old column-by-column copy into a float32 buffer performed.
y_train = np.log10(targets[tar_features].to_numpy()).astype(np.float32)

y_train = scaler.fit_transform(y_train)

In [13]:
# Build the scaled validation targets with exactly the transform used for the
# training targets: log10 of every trimmed target column, stored as float32,
# then scaled with the already-fitted `scaler` (transform only, no refit).
#
# NOTE(review): the previous per-column loop guarded the log with
# `if target in tar_features` while iterating over `tar_features`, so the guard
# was always true and every column was logged; that behaviour is kept. Any
# change to which columns are logged must be mirrored on the training targets.
y_val = np.log10(val_targets[tar_features].to_numpy()).astype(np.float32)

y_val = scaler.transform(y_val)

In [14]:
# Wrap the scaled arrays back into DataFrames, restoring the target column
# names and the sample-id index of the trimmed frames they came from.
y_train = pd.DataFrame(y_train, index=targets.index, columns=tar_features)
y_val = pd.DataFrame(y_val, index=val_targets.index, columns=tar_features)

In [15]:
# Sanity check: after fit_transform every training target should span [0, 1].
y_train_stats = y_train.describe()
y_train_stats.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
X4_mean,40388.0,0.56471,0.197946,0.0,0.422461,0.571524,0.71691,1.0
X11_mean,40388.0,0.544537,0.177004,0.0,0.452947,0.570665,0.659388,1.0
X18_mean,40388.0,0.485226,0.232014,0.0,0.310841,0.433217,0.671404,1.0
X50_mean,40388.0,0.506821,0.167966,0.0,0.399613,0.499479,0.61491,1.0
X26_mean,40388.0,0.472991,0.1966,0.0,0.340808,0.462115,0.606895,1.0
X3112_mean,40388.0,0.539117,0.189768,0.0,0.412666,0.541558,0.67857,1.0


In [16]:
# Summary statistics of the scaled validation targets, one row per target.
y_val_stats = y_val.describe()
y_val_stats.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
X4_mean,10089.0,0.564453,0.199627,0.004619,0.422203,0.574591,0.71691,0.996655
X11_mean,10089.0,0.545319,0.176271,0.001127,0.454027,0.569394,0.659051,1.0
X18_mean,10089.0,0.481044,0.232343,0.000179,0.304625,0.428856,0.663703,1.0
X50_mean,10089.0,0.509069,0.17086,0.001012,0.402606,0.502125,0.616215,0.999341
X26_mean,10089.0,0.469911,0.197601,0.000361,0.336701,0.460686,0.606628,0.998885
X3112_mean,10089.0,0.537091,0.192288,0.000256,0.409339,0.539785,0.679687,0.999821


In [17]:
# Persist the scaled targets as CSV (index included), then the fitted target
# scaler.
for scaled_frame, csv_path in (
    (y_train, './data/processed/train.csv'),
    (y_val, './data/processed/valid.csv'),
):
    scaled_frame.to_csv(csv_path)

dump(scaler, './data/processed/scaler.joblib')

['./data/processed/scaler.joblib']

In [4]:
def _read_jpeg_bytes(file_path):
    """Return the full contents of the file at `file_path` as bytes.

    The file is opened inside a `with` block so the handle is closed as soon
    as the read finishes, instead of being left for the garbage collector
    (one leaked handle per image in the previous `open(fp, 'rb').read()`).
    """
    with open(file_path, 'rb') as handle:
        return handle.read()


# Build the test table: one row per id, with the path to its JPEG and the raw
# image bytes embedded, then pickle the whole frame.
test = pd.read_csv('./data/test.csv')
test['file_path'] = test['id'].apply(lambda s: f'./data/test_images/{s}.jpeg')
test['jpeg_bytes'] = test['file_path'].apply(_read_jpeg_bytes)
test.to_pickle('./data/test.pkl')

In [7]:
# Scale the input features: take the `x_feature` columns of `train` as a
# float32 matrix and min-max scale them with a scaler fitted on those rows.
#
# NOTE(review): `x_feature` is not defined in any cell shown in this notebook;
# it must come from a cell that is missing here - confirm before Run All.
# NOTE(review): the recorded output of the display cell further down contains
# negative values and values above 1, which a default MinMaxScaler fitted on
# this same data would not produce; the outputs look stale or were produced
# with a different scaler - confirm which is intended.
# NOTE(review): the scaler is fitted on all of `train`, i.e. including the rows
# that end up in the validation split.
train_scaler = MinMaxScaler()

# One vectorised conversion replaces the former per-column copy into a
# pre-allocated float32 buffer; the resulting matrix is the same.
x_train = train[x_feature].to_numpy(dtype=np.float32)

x_train = train_scaler.fit_transform(x_train)

In [10]:
# Wrap the scaled feature matrix back into a DataFrame with the original
# feature names and the sample-id index of `train`.
x_train = pd.DataFrame(x_train, index=train.index, columns=x_feature)

In [11]:
# Display the scaled feature frame (pandas truncates the rendering).
x_train

Unnamed: 0_level_0,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,SOIL_bdod_30.60cm_mean_0.01_deg,...,VOD_X_1997_2018_multiyear_mean_m03,VOD_X_1997_2018_multiyear_mean_m04,VOD_X_1997_2018_multiyear_mean_m05,VOD_X_1997_2018_multiyear_mean_m06,VOD_X_1997_2018_multiyear_mean_m07,VOD_X_1997_2018_multiyear_mean_m08,VOD_X_1997_2018_multiyear_mean_m09,VOD_X_1997_2018_multiyear_mean_m10,VOD_X_1997_2018_multiyear_mean_m11,VOD_X_1997_2018_multiyear_mean_m12
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
192027691,-0.308725,-0.889802,-0.554512,0.490021,0.805016,0.879735,0.460272,0.579818,0.487341,0.633997,...,0.638743,0.927605,1.040217,0.817117,0.347126,0.213268,0.384477,0.644727,0.544614,0.236286
195542235,0.355134,-1.259414,-0.936992,-0.583946,1.075379,1.615380,0.403478,0.248144,0.612104,0.505870,...,-0.768723,-0.881663,-0.942067,-1.069347,-1.022200,-1.020704,-0.940234,-0.787698,-0.648572,-0.492379
196639184,-0.042541,-0.203696,-0.648946,-1.225982,-0.412910,-0.264452,-0.562008,-0.481537,-0.573140,-0.583211,...,0.488973,0.413616,0.405373,0.406568,0.344944,0.294744,0.386658,0.516051,0.623689,0.651859
195728812,0.541083,0.539962,0.182802,-0.369732,-0.434778,-0.523488,0.176305,-0.614206,-0.198853,-0.262893,...,-0.321266,-0.438934,-0.559986,-0.635205,-0.534976,-0.460638,-0.527141,-0.260902,-0.219068,-0.193552
195251545,-1.833272,-0.687428,-0.640040,-0.583637,2.541932,2.258165,-1.470702,0.380814,-0.510759,0.185552,...,-0.128926,0.260018,0.553307,1.199778,1.516921,1.560907,1.258588,0.767458,0.616531,0.594171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190558785,0.645423,-1.058427,-0.726033,0.326497,-0.145290,0.276630,0.062718,-0.017194,0.175435,0.057425,...,-0.721367,-0.690451,-0.596856,-0.525751,-0.424501,-0.321623,-0.282681,-0.401587,-0.571184,-0.637861
194523231,-0.112469,0.508838,0.176645,-0.428751,0.423243,0.160330,0.460272,0.248144,0.424960,0.377743,...,-0.646642,-0.603014,-0.496135,-0.497384,-0.377590,-0.402016,-0.473746,-0.457392,-0.553931,-0.624018
195888987,0.021632,-0.620095,-0.212108,1.026921,-0.036127,0.180030,0.062718,0.977825,0.487341,0.698061,...,-1.581710,-1.633348,-1.668236,-1.719123,-1.754660,-1.738011,-1.750808,-1.761985,-1.816742,-1.792120
135487319,0.200102,0.158816,-0.425108,-1.067075,-0.557545,-0.861202,-0.448422,-0.680541,-0.697903,-0.711338,...,-0.203950,-0.314379,-0.454435,-0.532779,-0.507220,-0.423425,-0.390915,-0.284126,-0.262354,-0.252165


In [12]:
# Persist the scaled features as CSV (index included), then the fitted feature
# scaler.
scaled_features_csv = './data/processed/train_x.csv'
x_train.to_csv(scaled_features_csv)

dump(train_scaler, './data/processed/scaler_x.joblib')

['./data/processed/scaler_x.joblib']