In [13]:
import numpy as np
import pandas as pd
import pickle

train = pd.read_csv('./raws/train_data.csv')
test = pd.read_csv('./raws/test_data.csv')

train = train.loc[:, 'lat':'MIN_WVP']
test = test.loc[:, 'lat':'MIN_WVP']

train = train.drop(columns=['Landsat_StartTime', 'PRODUCT_ID'])
test = test.drop(columns=['Landsat_StartTime', 'PRODUCT_ID'])

pickle.dump(train, open('./processed/shrinked_train.pkl', 'wb'))
pickle.dump(test, open('./processed/shrinked_test.pkl', 'wb'))

# 以下本体

In [16]:
def featurecreation(df, istrain):
    # 切り出し
    # df = df.loc[:, 'lat':'sst_diff']
    df = df.loc[:, 'lat':'MIN_WVP']

    # # 海岸の比率
    df['beach_rate'] = df['beach_length'] / df['coast_length']
    # df['aicial_rate'] = df['aicial_length'] / df['coast_length']

    # 透明度推定
    # df['SS_est'] = (df['Green']-df['SWIR2']) / (df['Red']-df['SWIR2'])
    df['Chla_est'] = (df['Blue']-df['SWIR2']) / (df['Red']-df['SWIR2'])

    # depthの処理
    # df.loc[df['depth']==0, 'depth'] = np.nan
    # df['dfdiff'] = df['depth_original'] - df['depth']
    # df.loc[~(df['depth_original'].isna()), 'depth'] = np.nan

    # depth_original推定
    # if istrain == 1:
    #     global domodel
    #     domodel = depth_original_estimation(df)
    # for i in range(4):
    #     df['dopred'+str(i)] = domodel[i].predict(df, predict_disable_shape_check=True)
    # df['depth_pred'] = df[['dopred'+str(i) for i in range(4)]].mean(axis=1)
    # df = df.drop(columns=['dopred'+str(i) for i in range(4)])
    
    # 文献の分類
    # df['YMDstyle'] = [str(str(i).count('-')) +'_'+ str(str(i).count('.')) for i in list(df['YMD'])]
    # df['YMDstyle'].replace(['0_4', '1_2', '2_12', '0_2'], np.nan, inplace=True)
    # df['YMDstyle'].replace(['0_0', '0_1', '0_2', '0_4', '1_2', '1_4', '2_12'], [2, 1, 3, np.nan, np.nan, 4, np.nan], inplace=True)

    # 必要な列の抽出
    col = ['Chla_est']
    df = pd.concat([df.loc[:, 'lat':'sst_diff'], df.loc[:, col]], axis=1)
    col = ['year', 'month', 'YMD', 'area', 'lon', 'lat', 'cliff_length', 'Salinity_annual', 'river_area', 'sst_diff']
    df.drop(columns=col, inplace=True)
    # df = df.loc[:, ['cover', 'depth_original','area','YMDstyle','warm_sst','fetch','depth','hist_warm_sst','coastal_dist','Salinity_annual','coast_length','aicial_length','sst_diff']]

    return df

In [26]:
def depth_original_estimation(df):
    print('depth_original feature estimating...')
    print('===========================================================\n')

    train_p = df.loc[~(df['depth_original'].isna()), :]

    x = train_p.drop(columns=['cover', 'depth_original'], errors='ignore')
    y = train_p['depth_original']

    kf = KFold(n_splits=4, shuffle=True, random_state=234)
    c = 0
    model = []
    
    for idx_tr, idx_va in kf.split(train_p):
        tr_x, va_x = x.iloc[idx_tr], x.iloc[idx_va]
        tr_y, va_y = y.iloc[idx_tr], y.iloc[idx_va]

        lgb_train = lgb.Dataset(tr_x, tr_y)
        lgb_val = lgb.Dataset(va_x, va_y)

        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'verbose': -1,
            'random_state': 254,
            # 'min_data_in_leaf': 200,
            # 'bagging_fraction': 0.5,
            # 'bagging_freq': 1,
            'lambda_l1': 0.5,
            'lambda_l2': 0.5,
        }

        model.append(lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            valid_names=['train', 'val'],
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(stopping_rounds=3, verbose=True)]
        ))

    print('\n===========================================================\n')
    
    return model

In [53]:
def make_nn_data(df, istrain):
    # depth_original推定
    if istrain == 1:
        global domodel
        domodel = depth_original_estimation(df)
    for i in range(4):
        df['dopred'+str(i)] = domodel[i].predict(df, predict_disable_shape_check=True)
    df['depth_pred'] = df[['dopred'+str(i) for i in range(4)]].mean(axis=1)
    df['depth_original'].fillna(df['depth_pred'], inplace=True)
    df = df.drop(columns=['dopred'+str(i) for i in range(4)]+['depth_pred'])

    # 穴埋め
    for col in df.columns:
        df[col].fillna(df[col].mean(numeric_only=True), inplace=True)
    
    #標準化
    if istrain == 1:
        global mea
        global std
        mea = df.drop(columns='cover').mean()
        std = df.drop(columns='cover').std()
        cover = df['cover']
        df = (df - mea) / std
        df['cover'] = cover
    else:
        df = (df - mea) / std
    
    return df

In [50]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import pickle
from sklearn.model_selection import KFold

# train = pd.read_csv('./raws/train_data.csv')
# test = pd.read_csv('./raws/test_data.csv')
train = pickle.load(open('./processed/shrinked_train.pkl', 'rb'))
test = pickle.load(open('./processed/shrinked_test.pkl', 'rb'))

In [54]:
train_ = featurecreation(train, 1)
test_ = featurecreation(test, 0)

pickle.dump(train_, open('./processed/lgb_train.pkl', 'wb'))
pickle.dump(test_, open('./processed/lgb_test.pkl', 'wb'))

train_ = make_nn_data(train_, 1)
test_ = make_nn_data(test_, 0)

pickle.dump(train_, open('./processed/nn_train.pkl', 'wb'))
pickle.dump(test_, open('./processed/nn_test.pkl', 'wb'))

depth_original feature estimating...

Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[108]	train's rmse: 0.844813	val's rmse: 1.00773
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[99]	train's rmse: 0.783659	val's rmse: 1.34252
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[115]	train's rmse: 0.813404	val's rmse: 0.988994
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[90]	train's rmse: 0.803479	val's rmse: 1.28713




In [47]:
test_.clip(0, 2, inplace=True)

In [56]:
test_

Unnamed: 0,depth_original,aicial_length,beach_length,coast_length,coastal_dist,cold_sst,depth,fetch,hist_cold_sst,hist_warm_sst,river_dist,warm_sst,sst_annual,sst_ymd,Chla_est
0,-1.526583,2.308732,-0.472764,0.767202,-0.447847,0.977763,-0.924419,-0.817028,0.936851,1.011987,-0.798753,0.624851,0.966570,-0.636261,-0.095732
1,-0.428186,-0.382668,2.094515,1.508481,-0.486178,0.984934,1.090731,-1.774321,0.935334,0.406273,-0.649615,1.223371,0.965129,1.230732,-0.095732
2,0.058741,-0.083733,1.088644,0.969124,-0.573294,-1.104054,-0.377011,-0.452031,-0.665990,0.272542,-0.646942,-0.386732,-1.054618,-2.191642,-0.095732
3,-0.493328,-0.382668,-1.111069,-1.145003,-0.601171,-0.583171,0.101931,-0.821432,0.134596,-0.297053,0.006226,-1.621510,-0.621779,0.985235,2.215099
4,-1.645780,-0.253607,-0.320310,-0.424078,-0.137714,0.531860,-0.924419,-1.970255,0.731376,-0.791884,1.930360,0.163593,0.563853,0.095865,-1.320249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4034,-1.538503,-0.306280,0.159317,-0.054140,-0.378154,-1.160246,0.541098,-0.313202,-0.996104,-0.315032,-0.679310,-1.303190,-1.211069,0.573896,-0.275556
4035,-0.603355,-0.382668,-1.030628,-1.078417,-0.043628,0.997756,0.686703,0.779535,0.946537,1.120150,-0.739830,0.540590,0.989261,-0.478101,0.550241
4036,-1.574262,0.111113,2.054972,1.875415,-0.594202,-1.109379,-0.924419,-0.889888,-0.688279,0.261927,-0.645451,-0.360262,-1.062047,-2.195682,-0.095732
4037,-0.467375,-0.382668,0.399136,0.245577,-0.566324,1.381217,-0.021358,0.039867,0.969969,1.652443,-0.236762,0.972528,1.411566,-1.291243,-1.639349
