In [1]:
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.preprocessing import StandardScaler

In [5]:
# Trait identifiers shared by the target-mean and target-sd column names.
_TRAIT_IDS = ('X4', 'X11', 'X18', 'X50', 'X26', 'X3112')

# Columns holding the per-sample trait means (the regression targets).
tar_features = [f'{trait}_mean' for trait in _TRAIT_IDS]
# Matching per-sample standard-deviation columns, in the same order.
tar_sd_features = [f'{trait}_sd' for trait in _TRAIT_IDS]
# Targets that are log10-transformed before scaling; X4_mean stays linear.
log_features = [name for name in tar_features if name != 'X4_mean']

In [4]:
# Read the training table keyed by `id`, replace missing values with 0 and
# shift every column by 1e-5.
# NOTE(review): the offset presumably keeps exact zeros away from the later
# log10 transform; it is applied to feature columns as well -- confirm intent.
train = (
    pd.read_csv('./data/train.csv', index_col='id')
    .fillna(0)
    .add(1e-5)
)

In [4]:
# Predictor columns are everything except the trailing block of columns.
# NOTE(review): the last 12 columns are presumably the 6 `*_mean` targets and
# their 6 `*_sd` companions -- confirm against the CSV header.
n_trailing_non_features = 12
x_feature = train.columns[:-n_trailing_non_features]
x_feature.tolist()

['WORLDCLIM_BIO1_annual_mean_temperature',
 'WORLDCLIM_BIO12_annual_precipitation',
 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month',
 'WORLDCLIM_BIO15_precipitation_seasonality',
 'WORLDCLIM_BIO4_temperature_seasonality',
 'WORLDCLIM_BIO7_temperature_annual_range',
 'SOIL_bdod_0.5cm_mean_0.01_deg',
 'SOIL_bdod_100.200cm_mean_0.01_deg',
 'SOIL_bdod_15.30cm_mean_0.01_deg',
 'SOIL_bdod_30.60cm_mean_0.01_deg',
 'SOIL_bdod_5.15cm_mean_0.01_deg',
 'SOIL_bdod_60.100cm_mean_0.01_deg',
 'SOIL_cec_0.5cm_mean_0.01_deg',
 'SOIL_cec_100.200cm_mean_0.01_deg',
 'SOIL_cec_15.30cm_mean_0.01_deg',
 'SOIL_cec_30.60cm_mean_0.01_deg',
 'SOIL_cec_5.15cm_mean_0.01_deg',
 'SOIL_cec_60.100cm_mean_0.01_deg',
 'SOIL_cfvo_0.5cm_mean_0.01_deg',
 'SOIL_cfvo_100.200cm_mean_0.01_deg',
 'SOIL_cfvo_15.30cm_mean_0.01_deg',
 'SOIL_cfvo_30.60cm_mean_0.01_deg',
 'SOIL_cfvo_5.15cm_mean_0.01_deg',
 'SOIL_cfvo_60.100cm_mean_0.01_deg',
 'SOIL_clay_0.5cm_mean_0.01_deg',
 'SOIL_clay_100.200cm_mean_0.01_d

In [14]:
# Preview of the jitter applied to the targets: one Gaussian draw per row
# (mean 0, std 0.1), scaled by the row's X11 standard deviation.
x11_noise = np.random.normal(0, 0.1, len(train))
train['X11_mean'] + train['X11_sd'] * x11_noise

id
192027691    11.929230
195542235    15.759774
196639184     5.291263
195728812     9.097583
195251545    14.457712
               ...    
190558785    11.572787
194523231     6.117681
195888987     5.533388
135487319     7.024228
146608105    13.169106
Length: 55489, dtype: float64

In [15]:
# Jitter every target mean with Gaussian noise proportional to its per-row
# standard deviation: mean <- mean + sd * N(0, 0.1).
#
# A dedicated, seeded generator makes the jitter (and therefore the processed
# targets and the fitted scaler written out later) reproducible across runs.
# WARNING: this cell overwrites the `*_mean` columns of `train` in place, so
# re-running it without reloading `train` stacks another round of noise.
rng = np.random.default_rng(42)

for mean, sd in zip(tar_features, tar_sd_features):
    print(mean, sd)
    train[mean] = train[mean] + train[sd] * rng.normal(0, 0.1, len(train))

X4_mean X4_sd
X11_mean X11_sd
X18_mean X18_sd
X50_mean X50_sd
X26_mean X26_sd
X3112_mean X3112_sd


In [17]:
# Inspect the jittered target means.
train.loc[:, tar_features]

Unnamed: 0_level_0,X4_mean,X11_mean,X18_mean,X50_mean,X26_mean,X3112_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
192027691,0.400071,11.887680,0.115138,1.873856,1.230122,48.097164
195542235,0.479893,15.767041,0.389362,1.352385,0.644923,572.837821
196639184,0.796927,5.291262,8.552917,2.343163,0.395249,1130.096740
195728812,0.526728,9.532661,1.100133,1.187957,0.153521,1042.075608
195251545,0.411670,14.596682,0.659322,2.243411,10.920314,2392.235761
...,...,...,...,...,...,...
190558785,0.337252,11.572789,0.233701,1.608350,1.783202,969.547842
194523231,0.423531,6.147388,1.007841,2.412141,13.455754,1617.548862
195888987,0.641634,5.513693,2.743660,2.729061,10.255837,606.133478
135487319,0.774651,7.024228,4.429667,3.251750,9.372180,244.387180


In [18]:
# Assemble the float32 target matrix (one column per entry of `tar_features`),
# log10-transforming the columns listed in `log_features`, then standardize
# each column with a freshly fitted StandardScaler.
# NOTE(review): log10 yields NaN / -inf for values <= 0 -- confirm the jittered
# means stay strictly positive.
scaler = StandardScaler()
y_train = np.zeros_like(train[tar_features], dtype=np.float32)

for col, name in enumerate(tar_features):
    raw = train[name].to_numpy()
    y_train[:, col] = np.log10(raw) if name in log_features else raw

y_train = scaler.fit_transform(y_train)

In [19]:
# Wrap the standardized targets in a DataFrame that carries the original
# `id` index and the target column names, then display it.
y_train = pd.DataFrame(y_train, index=train.index, columns=tar_features)
y_train

Unnamed: 0_level_0,X4_mean,X11_mean,X18_mean,X50_mean,X26_mean,X3112_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
192027691,-0.700685,-0.259984,-1.317309,0.447518,-0.330649,-1.629578
195542235,-0.244191,0.186301,-0.577431,-0.212735,-0.584730,-0.123722
196639184,1.568909,-1.539090,1.298727,0.900003,-0.777380,0.289280
195728812,0.023656,-0.608864,0.053322,-0.475183,-1.149479,0.239991
195251545,-0.634354,0.064422,-0.257583,0.811929,0.528504,0.745117
...,...,...,...,...,...,...
190558785,-1.059944,-0.302407,-0.887421,0.138193,-0.184554,0.196141
194523231,-0.566523,-1.302103,0.000113,0.958740,0.610655,0.507258
195888987,0.680800,-1.474019,0.608278,1.208652,0.503803,-0.089380
135487319,1.441513,-1.091399,0.899178,1.563420,0.468350,-0.641511


In [20]:
# Persist the standardized targets and the scaler fitted on them, so that
# predictions can later be mapped back with `scaler.inverse_transform`.
y_train.to_csv(path_or_buf='./data/processed/train.csv')
dump(value=scaler, filename='./data/processed/scaler.joblib')

['./data/processed/scaler.joblib']

In [4]:
def _read_file_bytes(file_path):
    """Return the full binary contents of ``file_path``.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as the bytes are read, instead of being left for the garbage collector.
    """
    with open(file_path, 'rb') as handle:
        return handle.read()


# Build the test table: derive each image's path from its id, load the raw
# JPEG bytes into a column, and cache the whole frame as a pickle.
test = pd.read_csv('./data/test.csv')
test['file_path'] = test['id'].apply(lambda s: f'./data/test_images/{s}.jpeg')
test['jpeg_bytes'] = test['file_path'].apply(_read_file_bytes)
test.to_pickle('./data/test.pkl')

In [7]:
# Copy the predictor columns into a float32 matrix (no per-column transform is
# applied) and standardize each column with its own StandardScaler.
train_scaler = StandardScaler()
x_train = np.zeros_like(train[x_feature], dtype=np.float32)

# Fill the preallocated buffer in one assignment; values are cast to float32.
x_train[...] = train[x_feature].to_numpy()

x_train = train_scaler.fit_transform(x_train)

In [10]:
# Wrap the standardized features in a DataFrame that carries the original
# `id` index and the feature column names.
x_train = pd.DataFrame(x_train, index=train.index, columns=x_feature)

In [11]:
# Display the standardized feature frame (pandas truncates the rendering).
x_train

Unnamed: 0_level_0,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,SOIL_bdod_30.60cm_mean_0.01_deg,...,VOD_X_1997_2018_multiyear_mean_m03,VOD_X_1997_2018_multiyear_mean_m04,VOD_X_1997_2018_multiyear_mean_m05,VOD_X_1997_2018_multiyear_mean_m06,VOD_X_1997_2018_multiyear_mean_m07,VOD_X_1997_2018_multiyear_mean_m08,VOD_X_1997_2018_multiyear_mean_m09,VOD_X_1997_2018_multiyear_mean_m10,VOD_X_1997_2018_multiyear_mean_m11,VOD_X_1997_2018_multiyear_mean_m12
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
192027691,-0.308725,-0.889802,-0.554512,0.490021,0.805016,0.879735,0.460272,0.579818,0.487341,0.633997,...,0.638743,0.927605,1.040217,0.817117,0.347126,0.213268,0.384477,0.644727,0.544614,0.236286
195542235,0.355134,-1.259414,-0.936992,-0.583946,1.075379,1.615380,0.403478,0.248144,0.612104,0.505870,...,-0.768723,-0.881663,-0.942067,-1.069347,-1.022200,-1.020704,-0.940234,-0.787698,-0.648572,-0.492379
196639184,-0.042541,-0.203696,-0.648946,-1.225982,-0.412910,-0.264452,-0.562008,-0.481537,-0.573140,-0.583211,...,0.488973,0.413616,0.405373,0.406568,0.344944,0.294744,0.386658,0.516051,0.623689,0.651859
195728812,0.541083,0.539962,0.182802,-0.369732,-0.434778,-0.523488,0.176305,-0.614206,-0.198853,-0.262893,...,-0.321266,-0.438934,-0.559986,-0.635205,-0.534976,-0.460638,-0.527141,-0.260902,-0.219068,-0.193552
195251545,-1.833272,-0.687428,-0.640040,-0.583637,2.541932,2.258165,-1.470702,0.380814,-0.510759,0.185552,...,-0.128926,0.260018,0.553307,1.199778,1.516921,1.560907,1.258588,0.767458,0.616531,0.594171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190558785,0.645423,-1.058427,-0.726033,0.326497,-0.145290,0.276630,0.062718,-0.017194,0.175435,0.057425,...,-0.721367,-0.690451,-0.596856,-0.525751,-0.424501,-0.321623,-0.282681,-0.401587,-0.571184,-0.637861
194523231,-0.112469,0.508838,0.176645,-0.428751,0.423243,0.160330,0.460272,0.248144,0.424960,0.377743,...,-0.646642,-0.603014,-0.496135,-0.497384,-0.377590,-0.402016,-0.473746,-0.457392,-0.553931,-0.624018
195888987,0.021632,-0.620095,-0.212108,1.026921,-0.036127,0.180030,0.062718,0.977825,0.487341,0.698061,...,-1.581710,-1.633348,-1.668236,-1.719123,-1.754660,-1.738011,-1.750808,-1.761985,-1.816742,-1.792120
135487319,0.200102,0.158816,-0.425108,-1.067075,-0.557545,-0.861202,-0.448422,-0.680541,-0.697903,-0.711338,...,-0.203950,-0.314379,-0.454435,-0.532779,-0.507220,-0.423425,-0.390915,-0.284126,-0.262354,-0.252165


In [12]:
# Persist the standardized features together with the scaler fitted on them.
x_train.to_csv(path_or_buf='./data/processed/train_x.csv')
dump(value=train_scaler, filename='./data/processed/scaler_x.joblib')

['./data/processed/scaler_x.joblib']