## Imports

In [None]:
import lightgbm as lgb

In [None]:
from sklearn.model_selection import StratifiedKFold

## Config

In [None]:
# Identifier of the feature list produced by this notebook; it is embedded in
# the names of the output files written at the end (X_train_<id>.names,
# X_train_<id>.pickle, X_test_<id>.pickle).
feature_list_id = 'oofp_manual_lightgbm'

In [None]:
# Single seed reused for NumPy, the cross-validation splitter and LightGBM's
# bagging / feature-fraction sampling so that runs are repeatable.
RANDOM_SEED = 42

In [None]:
# Seed NumPy's global random number generator.
np.random.seed(RANDOM_SEED)

## Read Data

In [None]:
# Names of the feature lists handed to load_feature_lists(); together they
# form the input matrix for the LightGBM model trained below.
feature_lists = [
    'simple_summaries',
    'jaccard_ngrams',
    'fuzzy',
    'jellyfish',
    'tfidf_distances',
    'embedding_mean',
    'embedding_normalized_sum',
    'wmd',
    'wordnet_similarity',
    'dasolmar_whq',
    'magic_jturkewitz',
]

In [None]:
# Load the train and test feature frames for the selected feature lists.
# load_feature_lists() is defined outside this file; its third return value
# is not needed here and is discarded.
df_train, df_test, _ = load_feature_lists(feature_lists)

In [None]:
# Work with the raw value arrays so rows can be selected by the positional
# indices that the CV splitter yields.
X_train, X_test = df_train.values, df_test.values

In [None]:
# Training labels, read from the features data folder.
# load() and features_data_folder are defined outside this file.
y_train = load(features_data_folder + 'y_train.pickle')

## Train Models & Compute Out-of-Fold Predictions

In [None]:
# Number of cross-validation folds; also the number of models trained and the
# number of per-fold test prediction columns.
NUM_FOLDS = 5

In [None]:
# Shuffled, stratified splitter; the fixed random_state keeps the fold
# assignment identical between runs.
kfold = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_SEED)

In [None]:
# Out-of-fold predictions for the training rows, same shape as y_train.
# Each entry is filled in by the fold in which that row is held out for
# validation. The dtype is forced to float64 so predicted probabilities are
# stored as floats regardless of the label dtype.
y_train_oofp = np.zeros_like(y_train, dtype='float64')

In [None]:
# Test-set predictions, one column per fold model; the columns are averaged
# before being saved.
y_test_oofp = np.zeros((len(X_test), NUM_FOLDS))

In [None]:
# Hyperparameters are the same for every fold, so they are defined once here
# rather than rebuilt on each iteration.
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'device': 'cpu',
    'num_leaves': 64,
    'lambda_l2': 3.0,
    'learning_rate': 0.1,
    'num_boost_round': 1000,
    'early_stopping_rounds': 5,
    'verbose': 1,
    'bagging_fraction_seed': RANDOM_SEED,
    'feature_fraction_seed': RANDOM_SEED,
}

# Read the round limits up front. Depending on the LightGBM version,
# lgb.train() may consume these keys from the dict it receives, which is also
# why every fold below is given its own copy of the parameters.
num_boost_round = lgb_params['num_boost_round']
early_stopping_rounds = lgb_params['early_stopping_rounds']

# Train one model per fold. Each model predicts (a) its held-out validation
# rows, which become the out-of-fold feature for the training set, and (b) the
# whole test set, stored in that fold's column of y_test_oofp.
for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train, y_train)):
    X_fold_train, X_fold_val = X_train[ix_train], X_train[ix_val]
    y_fold_train, y_fold_val = y_train[ix_train], y_train[ix_val]

    print()
    print(f'Fitting fold {fold_num + 1} of {kfold.n_splits}')
    print()

    lgb_data_train = lgb.Dataset(X_fold_train, y_fold_train)
    # Tie the validation set to the training set so both share the same
    # feature binning.
    lgb_data_val = lgb.Dataset(
        X_fold_val,
        y_fold_val,
        reference=lgb_data_train,
    )

    # NOTE(review): LightGBM >= 4.0 no longer accepts the
    # `early_stopping_rounds` keyword here; on those versions use
    # `callbacks=[lgb.early_stopping(early_stopping_rounds)]` instead.
    model = lgb.train(
        dict(lgb_params),
        lgb_data_train,
        valid_sets=[lgb_data_val],
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
    )

    # Predict with the best iteration found by early stopping. Being explicit
    # keeps the trees grown after the best round out of the predictions on
    # LightGBM versions whose predict() defaults to using all iterations.
    # A non-positive best_iteration makes LightGBM use every iteration.
    best_iteration = model.best_iteration
    y_pred_oofp = model.predict(
        X_fold_val,
        num_iteration=best_iteration,
    ).reshape(-1)
    y_test_oofp[:, fold_num] = model.predict(
        X_test,
        num_iteration=best_iteration,
    ).reshape(-1)

    # Store the held-out predictions at the positions of the validation rows.
    y_train_oofp[ix_val] = y_pred_oofp

## Save Feature Names

In [None]:
# This notebook produces exactly one feature column: the out-of-fold
# LightGBM prediction.
feature_names = [
    'oofp_manual_lightgbm',
]

In [None]:
# Write the feature names to X_train_<feature_list_id>.names in the features
# data folder. save_lines() is defined outside this file.
save_lines(feature_names, features_data_folder + f'X_train_{feature_list_id}.names')

## Save Train Features

In [None]:
# Reshape the out-of-fold predictions into a single-column 2-D array before
# saving them as a feature matrix.
y_train_oofp = y_train_oofp.reshape((-1, 1))

In [None]:
# Persist the train-set feature as X_train_<feature_list_id>.pickle.
# save() is defined outside this file.
save(y_train_oofp, features_data_folder + f'X_train_{feature_list_id}.pickle')

## Save Test Features

In [None]:
# Average the per-fold columns into one prediction per test row and keep the
# result as a single-column 2-D array, matching the train feature layout.
y_test_oofp_mean = y_test_oofp.mean(axis=1).reshape((-1, 1))

In [None]:
# Persist the fold-averaged test-set feature as
# X_test_<feature_list_id>.pickle. save() is defined outside this file.
save(y_test_oofp_mean, features_data_folder + f'X_test_{feature_list_id}.pickle')