In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb

from tsfresh.examples import load_robot_execution_failures
from tsfresh import extract_features, select_features
import optuna

from common import EP

import types

Using TensorFlow backend.


In [2]:
%%HTML
<style>
   /* Override the widths of the notebook, menubar and toolbar container divs. */
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
# Allow up to 2000 rows of a frame to be rendered before pandas truncates the display.
pd.options.display.max_rows = 2000

In [4]:
# Load the train and test feature frames from their pickle files under ../feats.
# NOTE: unpickling executes code embedded in the file - only load pickles you produced yourself.
df_train, df_test = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('../feats/df_train.pkl', '../feats/df_test.pkl')
)

In [5]:
# `label`: `y` truncated to an integer, with every value that is not below 15
# collapsed into the single bucket 15.
df_train['label'] = df_train['y'].apply(lambda x: int(x) if x < 15 else 15)

# `group`: the `season` id with season 17 merged into season 1.
# Work on an explicit copy: `.values` may return a view of the frame's own buffer
# (so the write below would silently edit `season` too) or, with pandas
# Copy-on-Write enabled, a read-only array (so the write would raise).
group = df_train['season'].values.copy()
group[group == 17] = 1
df_train['group'] = group

# `season` is fully represented by `group` from here on.
# NOTE: this makes the cell single-shot - re-running it needs the frame reloaded first.
df_train = df_train.drop(columns=['season'])

In [7]:
# mytrial = []
# Read the stored trial table and turn it into a list holding one dict per stored row;
# `mytrial` is the list later handed to EP as `trial`.
mytrial = [
    record
    for record in pd.read_pickle('../trial/lgbm.pkl').T.to_dict().values()
]
df_trial = pd.DataFrame(mytrial)
len(mytrial)

2025

In [8]:
# Feature matrix (everything except target / bookkeeping columns) and target,
# both re-indexed by the `index` column before being handed to tsfresh.
test_X = df_train.drop(columns=['y', 'index', 'group', 'label']).copy()
test_y = df_train['y'].copy()
test_X.index = test_y.index = df_train['index']

# Keep only the names of the columns that tsfresh's select_features returns.
tsfresh_columns = select_features(test_X, test_y).columns.tolist()

In [9]:
# Number of columns returned by tsfresh's select_features.
len(tsfresh_columns)

1071

In [10]:
# Every column of df_train except the target and bookkeeping columns.
original_columns = (
    df_train
    .drop(columns=['index', 'y', 'label', 'group'])
    .columns
    .tolist()
)

In [11]:
#check feature_importances
# df_feature_importances = df_trial.loc[1398]['df_feature_importances']
# if type(df_feature_importances)==pd.DataFrame:
#     sorted_columns = EP.evaluate(df_feature_importances, key='average_permutation_weight')
# else:
#     sorted_columns = df_trial.loc[1398]['param']['columns']
# df_feature_importances.sort_values(by=['average_permutation_weight'], ascending=False)
# len(sorted_columns)

In [13]:
# Experiment configuration passed to EP.process / EP.select_features_:
# the feature columns, the k-fold setup, the scaler and the LightGBM regressor settings.
param = dict(
    # NOTE(review): 'spkt_welch_densitycoeff_2' lacks the '__' separator its sibling
    # names carry - presumably that is the real column name; confirm against df_train.
    columns=[
        'q25_roll_std_100',
        'abs_q01_4',
        'q05_roll_std_1000',
        'abs_q25_5',
        'spkt_welch_densitycoeff_2',
        "number_peaks{'n': 10}",
        'spkt_welch_density__coeff_3',
        'iqr_6',
        "change_quantiles{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'var'}",
        'abs_q75_7',
        '5000clearance_factor_quantile25',
        '5000smoothness_std_',
        'q05_roll_std_100',
        'spkt_welch_density__coeff_25',
        'abs_q75_6',
        'abs_max_1',
        'median__roll_std',
        'q05_roll_std_10',
        'abs_q05_7',
        "autocorrelation{'lag': 5}",
        'abs_max_7',
        'FFT_Mag_95q0',
        'abs_max_8',
        '5000smoothness_entropy_',
        'spkt_welch_density__coeff_42',
    ],
    kfold=dict(n_splits=3, random_state=1985, shuffle=True, type='stratified'),
    scaler=dict(cls='StandardScaler'),
    algorithm=dict(
        cls='lgb.LGBMRegressor',
        # Keyword arguments for the regressor's constructor.
        init=dict(
            learning_rate=0.17076106120259138,
            feature_fraction=0.6842101917408698,
            bagging_fraction=0.8986268312800509,
            min_data_in_leaf=243,
            lambda_l1=4.612300279009062,
            lambda_l2=97.21686371760525,
            max_bin=28,
            num_leaves=11,
            random_state=6805,
            n_jobs=32,
        ),
        # Keyword arguments for the fit call.
        fit=dict(eval_metric='mae', verbose=False, early_stopping_rounds=200),
    ),
)

In [14]:
# Run EP.process once with `param`; `mytrial` is passed as the trial log
# (presumably the run is appended to it - confirm in common.EP).
(
    df_his,
    df_feature_importances,
    df_valid_pred,
    df_test_pred,
) = EP.process(df_train, param, df_test=df_test, trial=mytrial)

In [104]:
# Feature selection driven by the 'average_permutation_weight' importance key.
EP.select_features_(
    df_train,
    param,
    mytrial,
    nfeats_best=20,
    nfeats_removed_per_try=5,
    key='average_permutation_weight',
)

In [17]:
# tune hyperparameters
def objective(trial):
    """Optuna objective: evaluate one sampled LightGBM configuration.

    The regressor's constructor arguments are sampled from ``trial``; the
    feature list (a copy of ``param['columns']``), the k-fold setup, the scaler
    and the fit arguments are fixed. The configuration is run through
    ``EP.process`` with ``mytrial`` as trial log and the remark ``'tune 2025'``.

    Args:
        trial: the optuna trial to sample from. The mean of ``df_his.valid``
            (``val_mae``), the mean of ``df_his.train`` (``train_mae``), their
            difference (``mae_diff``) and the variance of ``df_his.valid``
            (``val_mae_var``) are stored on it as user attributes.

    Returns:
        ``abs(val_mae - train_mae) * val_mae`` - the train/validation gap
        weighted by the validation error, so both a large gap and a large
        validation error are penalised.
    """
    # The suggest_* calls are issued in the same order as the dict keys below.
    sampled_init = {
        'learning_rate': trial.suggest_uniform('learning_rate', .01, .5),
        'feature_fraction': trial.suggest_uniform('feature_fraction', .6, 1),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 200, 800),
        'lambda_l1': trial.suggest_uniform('lambda_l1', 1e-6, 1e2),
        'lambda_l2': trial.suggest_uniform('lambda_l2', 1e-6, 1e2),
        'max_bin': trial.suggest_int('max_bin', 10, 100),
        # 'max_depth': trial.suggest_int('max_depth', 2, 6),
        'num_leaves': trial.suggest_int('num_leaves', 4, 128),
        'random_state': trial.suggest_int('random_state', 1, 9999),
        'n_jobs': 32,
    }

    args = {
        # Copy so that nothing downstream can alter the shared feature list.
        'columns': param['columns'].copy(),
        'kfold': {
            'n_splits': 3,
            'random_state': 1985,
            'shuffle': True,
            'type': 'stratified',
        },
        'scaler': {
            'cls': 'StandardScaler',
        },
        'algorithm': {
            'cls': 'lgb.LGBMRegressor',
            'init': sampled_init,
            'fit': {
                'eval_metric': 'mae',
                'verbose': False,
                'early_stopping_rounds': 200,
            },
        },
    }

    df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
        df_train, args, df_test=df_test, trial=mytrial, remark='tune 2025')
    val_mae_mean = np.mean(df_his.valid)
    val_mae_var = np.var(df_his.valid)
    train_mae_mean = np.mean(df_his.train)

    trial.set_user_attr('val_mae', val_mae_mean)
    trial.set_user_attr('train_mae', train_mae_mean)
    trial.set_user_attr('mae_diff', val_mae_mean - train_mae_mean)
    trial.set_user_attr('val_mae_var', val_mae_var)

    return np.abs(val_mae_mean - train_mae_mean) * val_mae_mean

# Create a study with optuna's default settings and run 200 trials of `objective`.
# The log printed below shows the "current best value" only ever decreasing,
# i.e. the objective is being minimised.
study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-05-16 22:03:16,568] Finished a trial resulted in value: 0.20447753566082247. Current best value is 0.20447753566082247 with parameters: {'learning_rate': 0.21352232075045977, 'feature_fraction': 0.9225103592909893, 'bagging_fraction': 0.8842505462091309, 'min_data_in_leaf': 594, 'lambda_l1': 85.1321729953592, 'lambda_l2': 99.17793120175925, 'max_bin': 56, 'num_leaves': 41, 'random_state': 3403}.
[I 2019-05-16 22:03:25,520] Finished a trial resulted in value: 0.04672591392970322. Current best value is 0.04672591392970322 with parameters: {'learning_rate': 0.02986614678323516, 'feature_fraction': 0.9412080752956502, 'bagging_fraction': 0.6826312360921137, 'min_data_in_leaf': 707, 'lambda_l1': 81.08671117957908, 'lambda_l2': 67.6600155541113, 'max_bin': 67, 'num_leaves': 80, 'random_state': 9458}.
[I 2019-05-16 22:03:30,661] Finished a trial resulted in value: 0.10773923491803553. Current best value is 0.04672591392970322 with parameters: {'learning_rate': 0.02986614678323516, 'fe

[I 2019-05-16 22:08:08,643] Finished a trial resulted in value: 0.44383746376107197. Current best value is 0.025357023193622836 with parameters: {'learning_rate': 0.014292401680709698, 'feature_fraction': 0.744299545597998, 'bagging_fraction': 0.6020554873480467, 'min_data_in_leaf': 760, 'lambda_l1': 69.98147454809539, 'lambda_l2': 18.684145154794525, 'max_bin': 86, 'num_leaves': 112, 'random_state': 8504}.
[I 2019-05-16 22:08:22,188] Finished a trial resulted in value: 0.20854551743044986. Current best value is 0.025357023193622836 with parameters: {'learning_rate': 0.014292401680709698, 'feature_fraction': 0.744299545597998, 'bagging_fraction': 0.6020554873480467, 'min_data_in_leaf': 760, 'lambda_l1': 69.98147454809539, 'lambda_l2': 18.684145154794525, 'max_bin': 86, 'num_leaves': 112, 'random_state': 8504}.
[I 2019-05-16 22:08:46,841] Finished a trial resulted in value: 0.16178986069882179. Current best value is 0.025357023193622836 with parameters: {'learning_rate': 0.0142924016807

[I 2019-05-16 22:12:44,675] Finished a trial resulted in value: 0.06045011778460817. Current best value is 0.025357023193622836 with parameters: {'learning_rate': 0.014292401680709698, 'feature_fraction': 0.744299545597998, 'bagging_fraction': 0.6020554873480467, 'min_data_in_leaf': 760, 'lambda_l1': 69.98147454809539, 'lambda_l2': 18.684145154794525, 'max_bin': 86, 'num_leaves': 112, 'random_state': 8504}.
[I 2019-05-16 22:13:02,240] Finished a trial resulted in value: 0.24845911157911438. Current best value is 0.025357023193622836 with parameters: {'learning_rate': 0.014292401680709698, 'feature_fraction': 0.744299545597998, 'bagging_fraction': 0.6020554873480467, 'min_data_in_leaf': 760, 'lambda_l1': 69.98147454809539, 'lambda_l2': 18.684145154794525, 'max_bin': 86, 'num_leaves': 112, 'random_state': 8504}.
[I 2019-05-16 22:13:18,022] Finished a trial resulted in value: 0.04452818309232615. Current best value is 0.025357023193622836 with parameters: {'learning_rate': 0.0142924016807

[I 2019-05-16 22:17:55,671] Finished a trial resulted in value: 0.298333696455116. Current best value is 0.025357023193622836 with parameters: {'learning_rate': 0.014292401680709698, 'feature_fraction': 0.744299545597998, 'bagging_fraction': 0.6020554873480467, 'min_data_in_leaf': 760, 'lambda_l1': 69.98147454809539, 'lambda_l2': 18.684145154794525, 'max_bin': 86, 'num_leaves': 112, 'random_state': 8504}.
[I 2019-05-16 22:18:07,462] Finished a trial resulted in value: 0.13110075322816167. Current best value is 0.025357023193622836 with parameters: {'learning_rate': 0.014292401680709698, 'feature_fraction': 0.744299545597998, 'bagging_fraction': 0.6020554873480467, 'min_data_in_leaf': 760, 'lambda_l1': 69.98147454809539, 'lambda_l2': 18.684145154794525, 'max_bin': 86, 'num_leaves': 112, 'random_state': 8504}.
[I 2019-05-16 22:18:17,723] Finished a trial resulted in value: 0.07085661572157108. Current best value is 0.025357023193622836 with parameters: {'learning_rate': 0.014292401680709

[I 2019-05-16 22:22:29,729] Finished a trial resulted in value: 0.17444675599861362. Current best value is 0.013551428266739454 with parameters: {'learning_rate': 0.010021483468460352, 'feature_fraction': 0.7254795915754325, 'bagging_fraction': 0.6909981504802117, 'min_data_in_leaf': 511, 'lambda_l1': 8.082997362767607, 'lambda_l2': 90.15473065232199, 'max_bin': 24, 'num_leaves': 11, 'random_state': 6842}.
[I 2019-05-16 22:22:35,829] Finished a trial resulted in value: 0.043654645131644056. Current best value is 0.013551428266739454 with parameters: {'learning_rate': 0.010021483468460352, 'feature_fraction': 0.7254795915754325, 'bagging_fraction': 0.6909981504802117, 'min_data_in_leaf': 511, 'lambda_l1': 8.082997362767607, 'lambda_l2': 90.15473065232199, 'max_bin': 24, 'num_leaves': 11, 'random_state': 6842}.
[I 2019-05-16 22:22:40,826] Finished a trial resulted in value: 0.05393559712339996. Current best value is 0.013551428266739454 with parameters: {'learning_rate': 0.01002148346846

[I 2019-05-16 22:25:36,342] Finished a trial resulted in value: 0.21687056960972728. Current best value is 0.013551428266739454 with parameters: {'learning_rate': 0.010021483468460352, 'feature_fraction': 0.7254795915754325, 'bagging_fraction': 0.6909981504802117, 'min_data_in_leaf': 511, 'lambda_l1': 8.082997362767607, 'lambda_l2': 90.15473065232199, 'max_bin': 24, 'num_leaves': 11, 'random_state': 6842}.
[I 2019-05-16 22:25:46,831] Finished a trial resulted in value: 0.05441385018689909. Current best value is 0.013551428266739454 with parameters: {'learning_rate': 0.010021483468460352, 'feature_fraction': 0.7254795915754325, 'bagging_fraction': 0.6909981504802117, 'min_data_in_leaf': 511, 'lambda_l1': 8.082997362767607, 'lambda_l2': 90.15473065232199, 'max_bin': 24, 'num_leaves': 11, 'random_state': 6842}.
[I 2019-05-16 22:25:58,182] Finished a trial resulted in value: 0.26611495141662284. Current best value is 0.013551428266739454 with parameters: {'learning_rate': 0.010021483468460

[I 2019-05-16 22:28:39,523] Finished a trial resulted in value: 0.11296894486365405. Current best value is 0.012481270559635353 with parameters: {'learning_rate': 0.010785490766701936, 'feature_fraction': 0.6368415841627292, 'bagging_fraction': 0.6383422608689294, 'min_data_in_leaf': 739, 'lambda_l1': 55.7147268038884, 'lambda_l2': 98.71878646370965, 'max_bin': 19, 'num_leaves': 29, 'random_state': 7677}.
[I 2019-05-16 22:28:44,168] Finished a trial resulted in value: 0.017381158463224453. Current best value is 0.012481270559635353 with parameters: {'learning_rate': 0.010785490766701936, 'feature_fraction': 0.6368415841627292, 'bagging_fraction': 0.6383422608689294, 'min_data_in_leaf': 739, 'lambda_l1': 55.7147268038884, 'lambda_l2': 98.71878646370965, 'max_bin': 19, 'num_leaves': 29, 'random_state': 7677}.
[I 2019-05-16 22:28:47,108] Finished a trial resulted in value: 0.014979850905453517. Current best value is 0.012481270559635353 with parameters: {'learning_rate': 0.010785490766701

[I 2019-05-16 22:31:28,805] Finished a trial resulted in value: 0.059810720689779114. Current best value is 0.012481270559635353 with parameters: {'learning_rate': 0.010785490766701936, 'feature_fraction': 0.6368415841627292, 'bagging_fraction': 0.6383422608689294, 'min_data_in_leaf': 739, 'lambda_l1': 55.7147268038884, 'lambda_l2': 98.71878646370965, 'max_bin': 19, 'num_leaves': 29, 'random_state': 7677}.
[I 2019-05-16 22:31:40,963] Finished a trial resulted in value: 0.048717854866197605. Current best value is 0.012481270559635353 with parameters: {'learning_rate': 0.010785490766701936, 'feature_fraction': 0.6368415841627292, 'bagging_fraction': 0.6383422608689294, 'min_data_in_leaf': 739, 'lambda_l1': 55.7147268038884, 'lambda_l2': 98.71878646370965, 'max_bin': 19, 'num_leaves': 29, 'random_state': 7677}.
[I 2019-05-16 22:31:53,029] Finished a trial resulted in value: 0.07895098051606378. Current best value is 0.012481270559635353 with parameters: {'learning_rate': 0.010785490766701

[I 2019-05-16 22:34:18,704] Finished a trial resulted in value: 0.08781861861178589. Current best value is 0.012481270559635353 with parameters: {'learning_rate': 0.010785490766701936, 'feature_fraction': 0.6368415841627292, 'bagging_fraction': 0.6383422608689294, 'min_data_in_leaf': 739, 'lambda_l1': 55.7147268038884, 'lambda_l2': 98.71878646370965, 'max_bin': 19, 'num_leaves': 29, 'random_state': 7677}.
[I 2019-05-16 22:34:21,673] Finished a trial resulted in value: 0.024221850109798465. Current best value is 0.012481270559635353 with parameters: {'learning_rate': 0.010785490766701936, 'feature_fraction': 0.6368415841627292, 'bagging_fraction': 0.6383422608689294, 'min_data_in_leaf': 739, 'lambda_l1': 55.7147268038884, 'lambda_l2': 98.71878646370965, 'max_bin': 19, 'num_leaves': 29, 'random_state': 7677}.
[I 2019-05-16 22:34:28,340] Finished a trial resulted in value: 0.02421600797321164. Current best value is 0.012481270559635353 with parameters: {'learning_rate': 0.0107854907667019

[I 2019-05-16 22:37:41,897] Finished a trial resulted in value: 0.1550280116025227. Current best value is 0.011424480439599309 with parameters: {'learning_rate': 0.01044263831919312, 'feature_fraction': 0.6210416133580647, 'bagging_fraction': 0.7463082818235665, 'min_data_in_leaf': 799, 'lambda_l1': 48.574794946151535, 'lambda_l2': 69.48892316992044, 'max_bin': 12, 'num_leaves': 22, 'random_state': 5671}.
[I 2019-05-16 22:37:52,954] Finished a trial resulted in value: 0.1511613326941024. Current best value is 0.011424480439599309 with parameters: {'learning_rate': 0.01044263831919312, 'feature_fraction': 0.6210416133580647, 'bagging_fraction': 0.7463082818235665, 'min_data_in_leaf': 799, 'lambda_l1': 48.574794946151535, 'lambda_l2': 69.48892316992044, 'max_bin': 12, 'num_leaves': 22, 'random_state': 5671}.
[I 2019-05-16 22:38:03,980] Finished a trial resulted in value: 0.16272860644029288. Current best value is 0.011424480439599309 with parameters: {'learning_rate': 0.01044263831919312

[I 2019-05-16 22:41:51,684] Finished a trial resulted in value: 0.11165421323706763. Current best value is 0.011424480439599309 with parameters: {'learning_rate': 0.01044263831919312, 'feature_fraction': 0.6210416133580647, 'bagging_fraction': 0.7463082818235665, 'min_data_in_leaf': 799, 'lambda_l1': 48.574794946151535, 'lambda_l2': 69.48892316992044, 'max_bin': 12, 'num_leaves': 22, 'random_state': 5671}.
[I 2019-05-16 22:42:10,895] Finished a trial resulted in value: 0.18877763153119337. Current best value is 0.011424480439599309 with parameters: {'learning_rate': 0.01044263831919312, 'feature_fraction': 0.6210416133580647, 'bagging_fraction': 0.7463082818235665, 'min_data_in_leaf': 799, 'lambda_l1': 48.574794946151535, 'lambda_l2': 69.48892316992044, 'max_bin': 12, 'num_leaves': 22, 'random_state': 5671}.
[I 2019-05-16 22:42:26,242] Finished a trial resulted in value: 0.4053858813345861. Current best value is 0.011424480439599309 with parameters: {'learning_rate': 0.0104426383191931

In [36]:
# Summary of the 10 most recent rows of the trial table.
# Requires the 'kfold-type' column, which is only added to df_trial by a cell further
# down in the notebook (that cell carries the lower execution count, so it ran first).
df_trial[['datetime','nfeatures', 'kfold-type', 'train_mae','train_mae_var','val_mae','val_mae_var','mae_diff','remark']].tail(10)

Unnamed: 0,datetime,nfeatures,kfold-type,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark
2216,2019-05-16 22:40:59.409286,25,stratified,1.64386,1.9e-05,1.835343,0.000251,0.191483,tune 2025
2217,2019-05-16 22:41:17.433079,25,stratified,1.786154,4.5e-05,1.889711,0.000168,0.103556,tune 2025
2218,2019-05-16 22:41:28.781384,25,stratified,1.875471,3.1e-05,1.938784,0.000166,0.063314,tune 2025
2219,2019-05-16 22:41:41.252631,25,stratified,1.78456,3.7e-05,1.890141,0.000216,0.105581,tune 2025
2220,2019-05-16 22:41:51.655222,25,stratified,1.878189,9e-06,1.935866,0.000191,0.057677,tune 2025
2221,2019-05-16 22:42:10.866030,25,stratified,1.801695,8e-06,1.901,2.6e-05,0.099304,tune 2025
2222,2019-05-16 22:42:26.211561,25,stratified,1.583112,9e-06,1.807404,0.00033,0.224292,tune 2025
2223,2019-05-16 22:42:39.807151,25,stratified,2.027536,6e-06,2.050687,9.4e-05,0.02315,tune 2025
2224,2019-05-16 22:42:51.797308,25,stratified,1.911244,4.7e-05,1.960035,0.000108,0.048792,tune 2025
2225,2019-05-16 22:43:14.971232,25,stratified,1.693313,8e-06,1.846207,0.000143,0.152895,tune 2025


In [19]:
# Rebuild the trial table from the current contents of `mytrial`, then expose
# the nested k-fold type of every trial's parameters as a flat column.
df_trial = pd.DataFrame(mytrial)
df_trial['kfold-type'] = [cfg['kfold']['type'] for cfg in df_trial['param']]

# Show the summary columns of the most recent row.
df_trial.loc[
    :,
    ['datetime', 'nfeatures', 'kfold-type', 'train_mae', 'train_mae_var',
     'val_mae', 'val_mae_var', 'mae_diff', 'remark'],
].tail(1)

Unnamed: 0,datetime,nfeatures,kfold-type,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark
2225,2019-05-16 22:43:14.971232,25,stratified,1.693313,8e-06,1.846207,0.000143,0.152895,tune 2025


In [22]:
# 'tune 2025' trials whose mae_diff is below 0.05, ordered by ascending val_mae.
(
    df_trial[(df_trial['mae_diff'] < .05) & (df_trial['remark'] == 'tune 2025')]
    .sort_values(by=['val_mae'])
    .loc[
        :,
        ['datetime', 'nfeatures', 'kfold-type', 'train_mae', 'train_mae_var',
         'val_mae', 'val_mae_var', 'mae_diff', 'remark'],
    ]
)

Unnamed: 0,datetime,nfeatures,kfold-type,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff,remark
2156,2019-05-16 22:30:28.865774,25,stratified,1.911171,1.970566e-05,1.952897,0.000129,0.041726,tune 2025
2197,2019-05-16 22:36:59.856389,25,stratified,1.906177,3.362777e-05,1.953383,0.000159,0.047206,tune 2025
2114,2019-05-16 22:24:01.463351,25,stratified,1.906127,3.37165e-05,1.954325,9e-05,0.048197,tune 2025
2168,2019-05-16 22:32:40.935405,25,stratified,1.916184,3.821382e-05,1.955712,0.000202,0.039527,tune 2025
2096,2019-05-16 22:20:55.069361,25,stratified,1.908899,3.934967e-05,1.956906,0.00017,0.048007,tune 2025
2210,2019-05-16 22:39:36.342538,25,stratified,1.910199,3.149966e-05,1.957272,0.000117,0.047073,tune 2025
2151,2019-05-16 22:29:52.988914,25,stratified,1.913572,5.040441e-05,1.958958,0.000155,0.045387,tune 2025
2181,2019-05-16 22:34:18.680562,25,stratified,1.914324,2.330145e-05,1.959149,0.00015,0.044825,tune 2025
2224,2019-05-16 22:42:51.797308,25,stratified,1.911244,4.711916e-05,1.960035,0.000108,0.048792,tune 2025
2054,2019-05-16 22:10:12.185291,25,stratified,1.916025,3.429129e-05,1.960056,9.3e-05,0.044031,tune 2025


In [25]:
# Write the trial table back to the same pickle path the history was loaded from,
# with whatever columns df_trial holds at this point.
df_trial.to_pickle('../trial/lgbm.pkl')

In [174]:
# Feature lists of earlier trials, looked up by their row label in df_trial.
columns112 = df_trial.loc[112, 'param']['columns']
columns464 = df_trial.loc[464, 'param']['columns']
columns691 = df_trial.loc[691, 'param']['columns']  # 1398
columns939 = df_trial.loc[939, 'param']['columns']  # 1619

# Hand-copied feature list (22 names).
xgboost_columns146 = [
    'q25_roll_std_100',
    'spkt_welch_density__coeff_3',
    'spkt_welch_densitycoeff_2',
    'abs_q01_4',
    'abs_q95_2',
    'abs_q25_5',
    'spkt_welch_density__coeff_4',
    'q05_roll_std_100',
    'q05_roll_std_1000',
    'q05_5',
    "number_peaks{'n': 10}",
    'median__roll_std',
    'abs_q75_6',
    '5000crest_factor_quantile75',
    'spkt_welch_density__coeff_28',
    'iqr_6',
    'q75_roll_mean_10',
    "number_crossing_m{'m': 1}",
    'abs_q75_7',
    'abs_q05_2',
    "number_peaks{'n': 5}",
    'abs_max_roll_mean_1000',
]  # 1959


In [23]:
# Summary columns of the row labelled 2156 (label slice, so the result stays a frame).
df_trial.loc[
    2156:2156,
    ['datetime', 'nfeatures', 'remark', 'kfold-type', 'train_mae',
     'train_mae_var', 'val_mae', 'val_mae_var', 'mae_diff'],
]

Unnamed: 0,datetime,nfeatures,remark,kfold-type,train_mae,train_mae_var,val_mae,val_mae_var,mae_diff
2156,2019-05-16 22:30:28.865774,25,tune 2025,stratified,1.911171,2e-05,1.952897,0.000129,0.041726


In [20]:
# Test-set predictions stored with the trial labelled 2024.
df_test_pred = df_trial.loc[2024, 'df_test_pred']

In [21]:
# Build the submission: the prediction is the row-wise mean over every column of
# df_test_pred except 'index'; the 'index' column becomes the segment id.
df_submit = pd.DataFrame()
df_submit['time_to_failure'] = (
    df_test_pred
    .drop(columns=['index'])
    .values
    .mean(axis=1)
)
df_submit['seg_id'] = df_test_pred['index']
df_submit.to_csv('submission_maylgbm_2024.csv', index=False)

In [23]:
import random
from deap import creator, base, tools, algorithms
import warnings
warnings.filterwarnings("default")
#https://deap.readthedocs.io/en/master/api/tools.html

class GAFeatureSelection(object):
    """Genetic-algorithm search over fixed-size feature subsets, built on DEAP.

    An individual is a list of ``n_component`` integers, each an index into
    ``features``. Subclasses must override :meth:`evaluate` to score an
    individual; the score is minimised (fitness weights are ``(-1.0,)``).
    """

    def __init__(self, df, features, n_component, n_population=None, tournsize=None, cxpb=.5, mutpb=.2, indpb=.05, ngen=100, rndseed=1985):
        """Seed the RNG, register the DEAP operators and draw the initial population.

        Args:
            df: data frame stored on ``self.df`` for use by subclasses.
            features: sequence of candidate feature names; genes index into it.
            n_component: number of genes (feature indices) per individual.
            n_population: population size; when ``None`` it is estimated with
                :meth:`__evalute_n_population__`.
            tournsize: tournament size for selection; when ``None`` it is 1% of
                the population, but at least 1.
            cxpb: crossover probability passed to ``algorithms.varAnd``.
            mutpb: mutation probability passed to ``algorithms.varAnd``.
            indpb: per-gene mutation probability.
            ngen: number of generations :meth:`run` executes.
            rndseed: seed for the ``random`` module.
        """
        random.seed(rndseed)
        self.df = df
        self.features = features
        self.n_component = n_component
        self.n_population = n_population
        if self.n_population is None:
            self.n_population = self.__evalute_n_population__(self.n_component, len(self.features))
        self.tournsize = tournsize
        if self.tournsize is None:
            # 1% of the population, floored at 1: for populations below 100 the
            # plain int() would give 0 and a tournament needs at least one entrant.
            self.tournsize = max(1, int(self.n_population * .01))
        self.cxpb = cxpb
        self.mutpb = mutpb
        self.indpb = indpb
        self.ngen = ngen

        # Minimising fitness; use weights=(1.0,) instead for a score to maximise.
        creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMin)

        last_feature = len(self.features) - 1
        self.toolbox = base.Toolbox()
        self.toolbox.register("attr_gen", random.randint, 0, last_feature)
        self.toolbox.register("individual", tools.initRepeat, creator.Individual, self.toolbox.attr_gen, n=self.n_component)
        self.toolbox.register("population", tools.initRepeat, list, self.toolbox.individual)
        self.toolbox.register("evaluate", self.evaluate)
        self.toolbox.register("mate", tools.cxTwoPoint)
        # Genes are integer feature indices, so a mutated gene must be redrawn from
        # the full index range. (mutFlipBit would replace it with int(not gene),
        # i.e. collapse every mutated gene to feature 0 or 1.)
        self.toolbox.register("mutate", tools.mutUniformInt, low=0, up=last_feature, indpb=self.indpb)
        self.toolbox.register("select", tools.selTournament, tournsize=self.tournsize)
        self.population = self.toolbox.population(n=self.n_population)
        return

    def evaluate(self, individual):
        """Score ``individual``; must be overridden and return a 1-tuple fitness.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('no inheritance')

    def run(self, ntop=1):
        """Evolve the population for ``self.ngen`` generations.

        Each generation applies crossover/mutation via ``algorithms.varAnd``,
        evaluates every offspring and tournament-selects a new population of
        the same size.

        Args:
            ntop: how many of the best individuals to return.

        Returns:
            The ``ntop`` best individuals of the final population
            (``tools.selBest``).
        """
        for gen in range(self.ngen):
            offspring = algorithms.varAnd(self.population, self.toolbox, cxpb=self.cxpb, mutpb=self.mutpb)
            fits = self.toolbox.map(self.toolbox.evaluate, offspring)
            for fit, ind in zip(fits, offspring):
                ind.fitness.values = fit
            self.population = self.toolbox.select(offspring, k=len(self.population))
        return tools.selBest(self.population, k=ntop)

    def __evalute_n_population__(self, n_component, n_features, init_n_population=100, feature_adoption_rate=.9):
        """Estimate a population size whose random individuals cover most features.

        Starting from ``init_n_population`` the size is grown by 20% per round;
        each round draws ``n_population * n_component`` random feature indices
        and stops once the share of distinct indices drawn reaches
        ``feature_adoption_rate``.

        Args:
            n_component: genes per individual (must be >= 1).
            n_features: number of candidate features (must be >= 1).
            init_n_population: size the growth starts from.
            feature_adoption_rate: required share of distinct features, at most 1.

        Returns:
            The first population size that reached the required coverage.

        Raises:
            ValueError: if the arguments make the coverage target unreachable.
        """
        if n_features < 1 or n_component < 1:
            raise ValueError('n_features and n_component must both be at least 1')
        if feature_adoption_rate > 1:
            raise ValueError('feature_adoption_rate cannot exceed 1')

        n_population = init_n_population
        selected_n_feature = 0
        while selected_n_feature / n_features < feature_adoption_rate:
            # Grow by 20%; the max() guarantees progress when int() would round a
            # very small size back to itself.
            n_population = max(n_population + 1, int(n_population * 1.2))
            # One randint call per gene of the trial population.
            drawn = {random.randint(0, n_features - 1) for _ in range(n_population * n_component)}
            selected_n_feature = len(drawn)
        return n_population
    

In [39]:
# Trial log passed to EP.process by the GA evaluations.
ga_trial = list()

# Features that MyGAFS.evaluate prepends to every candidate subset.
fixed_columns = [
    'abs_q01_4',
    'spkt_welch_density__coeff_3',
    'abs_q75_7',
    'q05_roll_std_1000',
    'q25_roll_std_100',
    "number_peaks{'n': 10}",
    'spkt_welch_densitycoeff_2',
    'abs_max_7',
]

class MyGAFS(GAFeatureSelection):
    """GA feature search whose fitness comes from a LightGBM run via EP.process."""

    def evaluate(self, individual):
        """Score one individual.

        The genes are mapped to feature names, appended to ``fixed_columns``
        and evaluated with a fixed LightGBM / group-k-fold configuration
        through ``EP.process`` (trial log: ``ga_trial``).

        Args:
            individual: sequence of integer indices into ``self.features``.

        Returns:
            A 1-tuple ``(abs(val - train) * val,)`` built from the means of
            ``df_his.valid`` and ``df_his.train``. If the run fails, the error
            is printed and ``(inf,)`` is returned, so the candidate is the
            worst possible one under the minimising fitness.
        """
        select_i = [self.features[chromosome] for chromosome in individual]
        columns = fixed_columns + select_i

        ga_param = {
            'algorithm': {
                'cls': 'lgb.LGBMRegressor',
                'fit': {
                    'early_stopping_rounds': 200,
                    'eval_metric': 'mae',
                    'verbose': False
                },
                'init': {
                    'bagging_fraction': 0.9629636521622223,
                    'feature_fraction': 0.917549020490175,
                    'lambda_l1': 75.11011819901437,
                    'lambda_l2': 84.85130517060821,
                    'learning_rate': 0.1921537121698339,
                    'max_bin': 36,
                    'max_depth': 5,
                    'min_data_in_leaf': 687,
                    'n_jobs': 16,
                    'random_state': 9950
                }
            },
            'columns': columns,
            'feature_importance': {
                'is_output': False,
                'permutation_feature_importance': False,
                'permutation_random_state': 1
            },
            'kfold': {
                'n_splits': 8,
                'random_state': 1985,
                'shuffle': True,
                'type': 'group'
            },
            'scaler': {
                'cls': 'StandardScaler'
            }
        }

        # NOTE(review): this evaluates on the notebook-level `df_train`, not on the
        # frame stored in self.df - confirm that is intended.
        try:
            df_his, df_feature_importances, df_valid_pred, df_test_pred = EP.process(
                df_train, ga_param, df_test=None, trial=ga_trial, remark=None)
            val_mae_mean = np.mean(df_his.valid)
            train_mae_mean = np.mean(df_his.train)
        except Exception as e:
            print(str(e))
            # A failed run must never win: the fitness is minimised, so a plain 0
            # would be the best possible score (and is not a fitness tuple at all).
            return (float('inf'),)
        fitness = np.abs(val_mae_mean - train_mae_mean) * val_mae_mean
        return fitness,

In [24]:
# gafs = MyGAFS(df = df_train,  features = df_train[tsfresh_columns].columns.drop(fixed_columns).tolist(),  n_component = 17,  ngen = 10,  n_population = 10000)
# top10 = gafs.run(10)