# LGBM + CV + stacking

## Load parameters

In [None]:
import glob

import warnings


def trial_row_to_params(row):
    """Convert one row of tuned hyper-parameters into LGBM keyword arguments.

    Parameters
    ----------
    row : pandas.Series
        One trial restricted to its ``params_*`` columns. It must contain
        ``params_bagging_freq``, ``params_bagging_fraction``,
        ``params_min_child_samples``, ``params_num_leaves`` and
        ``params_max_depth``.

    Returns
    -------
    dict
        The row's values keyed without the ``params_`` prefix, with the
        integer-valued parameters cast to ``int`` and the fixed training
        settings below applied on top.

    Raises
    ------
    KeyError
        If one of the required ``params_*`` entries is missing.
    """
    prefix_len = len('params_')
    params = {name[prefix_len:]: value for name, value in row.items()}

    # These are read back from CSV alongside other columns, so they may arrive
    # as floats; cast them to int before handing them to LightGBM.
    for int_param in ('bagging_freq', 'min_child_samples', 'num_leaves', 'max_depth'):
        params[int_param] = int(params[int_param])

    # A missing value (NaN after the CSV round-trip) is passed on as None.
    if pd.isna(params['bagging_fraction']):
        params['bagging_fraction'] = None

    # Fixed settings shared by every model. `n_estimators` and `learning_rate`
    # deliberately override whatever the trial itself recorded.
    params.update({
        'objective': 'binary',
        'metric': 'none',
        'force_col_wise': False,
        'early_stopping_rounds': 50,
        'verbose': -1,
        'max_bin': 255,
        'n_estimators': 3000,
        'learning_rate': 0.06733232950390658,
        'is_unbalance': True,
        'class_weight': 'balanced',
    })
    return params


param_list = glob.glob("optuna_lgbm*.csv")
models = []
best_params = []

# Read every trial file once and concatenate a single time afterwards, instead
# of re-copying the accumulated frame on each iteration.
trial_frames = []
for f in param_list:
    # Third '_'-separated token of the file name with its last four characters
    # (".csv") removed -- presumably files are named
    # optuna_lgbm_<boosting type>.csv; TODO confirm.
    gb_type = f.split('_')[2][:-4]
    tmp = pd.read_csv(f, index_col='Unnamed: 0')
    tmp['params_boosting_type'] = gb_type
    trial_frames.append(tmp)

if trial_frames:
    # Keep the 20 trials with the smallest objective value.
    lgbm_params = pd.concat(trial_frames).sort_values('value').head(20)
else:
    # Without this guard the cell dies on sort_values with a KeyError that
    # does not mention the real cause (no files matched the pattern).
    warnings.warn("No files match 'optuna_lgbm*.csv'; best_params is left empty.")
    lgbm_params = pd.DataFrame()

param_cols = [c for c in lgbm_params.columns if c.startswith('params_')]
lgbm_params = lgbm_params[param_cols]

best_params.extend(trial_row_to_params(row) for _, row in lgbm_params.iterrows())

## Stacking with the Cross Validation

In [None]:
# Number of cross-validation splits used by the stacking run.
n_folds = 10


def bll_metric(y_true, y_pred):
    """Evaluation callback reporting the balanced log loss.

    Returns a ``(name, value, is_higher_better)`` triple; the last element is
    ``False`` because a smaller loss is better.
    """
    score = balanced_log_loss(y_true, y_pred)
    higher_is_better = False
    return 'balanced_log_loss', score, higher_is_better

def lgbm_training():
    """Fit every configuration in `best_params` on each CV fold and collect
    the out-of-fold predictions for the second-level (stacking) model.

    Reads the module-level names `train_df`, `features`, `greeks`,
    `best_params` and `n_folds`. Folds are stratified on the columns at
    positions 1 and 2 of `greeks`.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(train_df), len(best_params) + 1)``. Column ``i``
        holds the out-of-fold positive-class probability produced by
        ``best_params[i]``; the last column holds the target
        (``train_df.Class``).
    """
    X, y = train_df[features], train_df.Class

    kf = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=8062023+20)

    n_models = len(best_params)
    oof_level2 = np.zeros([y.shape[0], n_models + 1])
    oof_level2[:, n_models] = y

    print(f"Training with {blu}{X.shape[1]}{res} features")

    fold_splits = enumerate(kf.split(X=train_df, y=greeks.iloc[:, 1:3]), start=1)
    for fold, (fit_idx, val_idx) in tqdm(fold_splits, total=n_folds):

        # Split the dataset according to the fold indexes.
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        for i, params in enumerate(best_params):
            # The fitted model is used only for this fold's predictions and is
            # not kept: nothing downstream reads the models, and retaining
            # n_folds * len(best_params) boosters only costs memory.
            clf = lgb.LGBMClassifier(**params)
            # NOTE(review): the `verbose` keyword of `fit` was removed in
            # LightGBM 4.0 -- confirm against the installed version.
            clf.fit(X_fit, y_fit, eval_set=[(X_val, y_val)],
                    eval_metric=bll_metric, verbose=-1)

            val_preds = clf.predict_proba(X_val)[:, 1]
            oof_level2[val_idx, i] = val_preds

            val_score = balanced_log_loss(y_val, val_preds)
            best_iter = clf.best_iteration_

            print(f'Fold: {blu}{fold:>3}{res}| bll_metric: {blu}{val_score:.5f}{res}'
                  f' | Best iteration: {blu}{best_iter:>4}{res}')

    return oof_level2

oof_level2 = lgbm_training()

## Stacking with Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Every column of `oof_level2` except the last is a base-model prediction;
# the last column is the target.
X, y = oof_level2[:, :-1], oof_level2[:, -1]

# Baseline: balanced log loss of the plain average of the base predictions.
mean_blend = X.mean(axis=1)
print(balanced_log_loss(y, mean_blend))

# Second-level model fitted on the out-of-fold predictions.
lr = LogisticRegression(class_weight='balanced').fit(X, y)

# Balanced log loss of the stacker, scored on the same rows it was fitted on.
pred = lr.predict_proba(X)[:, 1]
print(balanced_log_loss(y, pred))

# One coefficient per base model.
weights_ = lr.coef_[0]

# LGBM + TSS + Stacking

### Create metafeatures for the train set

In [None]:
# Date blocks whose rows make up the second-level (stacking) training set.
dates_level2 = [28, 29, 30, 31, 32, 33]

# Date block of every training row.
dates_train = X_train.date_block_num
block_of_row = dates_train.values

# Positional mask of the training rows that belong to the second-level set,
# and the date block of each of those rows (same row order as `X_train`).
level2_mask = dates_train.isin(dates_level2).values
level2_blocks = block_of_row[level2_mask]

# Target for the second-level dataset.
y_train_level2 = y_train[level2_mask]

# Second-level feature matrix: one column per base model plus a final column
# with the ground truth. Initialised with zeros and filled in below.
X_train_level2 = np.zeros([y_train_level2.shape[0], len(best_params)+1])
X_train_level2[:, len(best_params)] = y_train_level2

# Fill `X_train_level2` with meta-features: for each second-level block, fit
# on all strictly earlier blocks and predict the block itself.
for cur_block_num in tqdm_notebook(dates_level2):

    fit_mask = block_of_row < cur_block_num
    val_mask = block_of_row == cur_block_num

    X_train_l2 = X_train.loc[fit_mask]
    X_test_l2 = X_train.loc[val_mask]
    y_train_l2 = y_train[fit_mask]

    # Rows of the second-level matrix that correspond to this block. Writing
    # through a mask (rather than a running begin/end counter) keeps every
    # prediction on the same row as its ground truth even when `X_train` is
    # not sorted by date block.
    meta_rows = level2_blocks == cur_block_num

    # NOTE(review): `best_params` entries appear to be built for a binary
    # classifier with early stopping -- confirm they are valid for
    # LGBMRegressor fitted without an eval_set.
    for i, params in enumerate(tqdm_notebook(best_params)):
        # Deliberately not named `lgb`: that is the LightGBM module alias used
        # elsewhere in this file, and rebinding and then `del`-ing it would
        # remove the module from the namespace.
        model = LGBMRegressor(**params)
        model.fit(X_train_l2, y_train_l2)
        X_train_level2[meta_rows, i] = model.predict(X_test_l2)

        del model
        gc.collect()

X_train_level2 = pd.DataFrame(X_train_level2, columns=[f'lgbr_{i+1}' for i in range(len(best_params))]+['gt'])
X_train_level2.to_pickle('LGBM_X_train_level2.pkl')

### Create metafeatures for the test set

In [None]:
# Second-level features for the test set: one column per base model, each
# holding the predictions of that model fitted on the full training set.
X_test_level2 = np.zeros([X_test.shape[0], len(best_params)])

for i, params in enumerate(tqdm_notebook(best_params)):
    # Deliberately not named `lgb`: that is the LightGBM module alias used
    # elsewhere in this file, and rebinding and then `del`-ing it would remove
    # the module from the namespace.
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train)
    X_test_level2[:, i] = model.predict(X_test)

    # Release the fitted model before training the next one.
    del model
    gc.collect()

X_test_level2 = pd.DataFrame(X_test_level2, columns=[f'lgbr_{i+1}' for i in range(len(best_params))])
X_test_level2.to_pickle('LGBM_X_test_level2.pkl')

### Stacking with Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Split the second-level frame into base-model predictions (every column
# except 'gt') and the ground-truth column.
is_meta_feature = X_train_level2.columns != 'gt'
X = X_train_level2.loc[:, is_meta_feature]
y = X_train_level2['gt']

# Fit the linear stacker on the train meta-features ...
lr = LinearRegression().fit(X, y)

# ... and apply it to the test meta-features.
pred = lr.predict(X_test_level2)