In [1]:
import os
import sys
sys.path.append("..")
from sklearn.model_selection import StratifiedKFold
import numpy as np

In [2]:
import lightgbm as lgbm

In [3]:
import pandas as pd

# --- Load data -------------------------------------------------------------
# Read both CSVs and pull the label / id columns out of the feature frames.
train = pd.read_csv("./train.csv")
train_label = train.pop('target')
train_id = train.pop('id')

test = pd.read_csv("./test.csv")
test_id = test.pop('id')

# --- Row-level features ----------------------------------------------------
# 'missing': per-row count of cells equal to -1, stored as float.
for frame in (train, test):
    frame['missing'] = (frame == -1).sum(axis=1).astype(float)

# 'bin_sum': per-row sum over every column whose name contains 'bin'.
bin_features = [col for col in train.columns if 'bin' in col]
for frame in (train, test):
    frame['bin_sum'] = frame[bin_features].sum(axis=1)

# Columns that receive a per-fold target-mean encoding in the training loop.
features = [
    'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin',
    'ps_ind_12_bin', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin',
    'ps_ind_04_cat', 'ps_ind_05_cat',
    'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
    'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
    'ps_car_11_cat',
    'ps_ind_01', 'ps_ind_03', 'ps_ind_15', 'ps_car_11',
]

# --- LightGBM configuration ------------------------------------------------
# Round limit handed to lgbm.train; early stopping decides the actual count.
num_boost_round = 10000

# NOTE(review): 'drop_rate' and 'max_drop' are documented by LightGBM as DART
# options; with boosting_type 'gbdt' they are presumably ignored - confirm.
# NOTE(review): per the LightGBM docs, 'subsample' only takes effect together
# with a non-zero bagging frequency, and none is set here - confirm intent.
params = {
    "objective": "binary",
    "boosting_type": "gbdt",
    "learning_rate": 0.1,
    "num_leaves": 15,
    "max_bin": 256,
    "feature_fraction": 0.6,
    "verbosity": 0,
    "drop_rate": 0.1,
    "is_unbalance": False,
    "max_drop": 50,
    "min_child_samples": 10,
    "min_child_weight": 150,
    "min_split_gain": 0,
    "subsample": 0.9,
    "seed": 2018,
    "force_row_wise": True,
}


In [4]:
# Cross-validation setup: stratified folds plus the accumulators that the
# training loop fills in.
NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)
# split() returns a one-shot generator. Materialise it so the fold indices are
# still available if the training cell is run again; otherwise a second run
# would silently iterate over zero folds.
kf = list(kfold.split(train, train_label))

cv_train = np.zeros(len(train_label))  # out-of-fold predictions, one per training row
cv_pred = np.zeros(len(test_id))       # running sum of per-fold test predictions
best_trees = []                        # best iteration reported by each fold
fold_scores = []                       # validation Gini of each fold

In [5]:
def Gini(y_true, y_pred):
    """Return the Gini score of ``y_pred`` normalised by that of a perfect ranking.

    The labels are ordered twice, once by descending label and once by
    descending prediction. For each ordering the cumulative share of the label
    total is compared with the evenly spaced diagonal ``1/n, 2/n, ..., 1`` and
    the differences are summed. The result is the prediction sum divided by
    the label sum.

    Args:
        y_true: 1-D array-like of labels exposing ``.shape``.
        y_pred: 1-D array-like of scores with the same shape as ``y_true``.

    Returns:
        The ratio of the two summed gaps as a float. There is no guard for
        labels that sum to zero.

    Raises:
        AssertionError: if the two inputs differ in shape.
    """
    assert y_true.shape == y_pred.shape
    count = y_true.shape[0]

    # Column 0 carries the labels, column 1 the predictions.
    paired = np.array([y_true, y_pred]).T
    by_label = np.argsort(paired[:, 0])
    by_score = np.argsort(paired[:, 1])
    labels_best_order = paired[by_label][::-1, 0]
    labels_model_order = paired[by_score][::-1, 0]

    diagonal = np.linspace(1 / count, 1, count)

    def summed_gap(ordered_labels):
        # Cumulative share of the label total, measured against the diagonal.
        share = np.cumsum(ordered_labels) * 1. / np.sum(ordered_labels)
        return np.sum(diagonal - share)

    gap_best = summed_gap(labels_best_order)
    gap_model = summed_gap(labels_model_order)

    return gap_model * 1. / gap_best

def evalerror(preds, dtrain):
    """Custom evaluation callback reporting the normalised Gini score.

    Args:
        preds: predictions for the rows of ``dtrain``.
        dtrain: dataset object exposing ``get_label()``.

    Returns:
        A ``('gini', score, True)`` tuple. The trailing ``True`` is the
        higher-is-better flag of LightGBM's ``feval`` convention.
    """
    score = Gini(dtrain.get_label(), preds)
    return 'gini', score, True

In [6]:
# Per-fold training: target-encode on the fold's training rows, fit LightGBM
# with early stopping on the custom Gini metric, and accumulate out-of-fold
# and test predictions.

# Models are saved under ./model; create it so a fresh checkout can run.
os.makedirs('./model', exist_ok=True)

verbose_eval = 0  # passed to log_evaluation below

for i, (train_fold, validate) in enumerate(kf):
    # The splitter yields positional indices, so select with .iloc. Copy the
    # slices so the encoding columns below are written to independent frames
    # rather than to views of `train`.
    X_train = train.iloc[train_fold].copy()
    X_validate = train.iloc[validate].copy()
    label_train = train_label.iloc[train_fold]
    label_validate = train_label.iloc[validate]

    # Target (mean) encoding fitted on this fold's training rows only.
    # Values not seen in the training rows fall back to 0.
    for feature in features:
        target_mean = label_train.groupby(X_train[feature]).mean()
        encoded = feature + '_target_enc'  # same column name in all three frames
        X_train[encoded] = X_train[feature].map(target_mean).fillna(0)
        X_validate[encoded] = X_validate[feature].map(target_mean).fillna(0)
        test[encoded] = test[feature].map(target_mean).fillna(0)

    dtrain = lgbm.Dataset(X_train, label_train)
    dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)

    # NOTE(review): the logged output shows both binary_logloss and gini being
    # evaluated, so early stopping presumably watches both - confirm intended.
    bst = lgbm.train(
        params,
        dtrain,
        num_boost_round,
        valid_sets=[dvalid],
        feval=evalerror,
        callbacks=[
            lgbm.early_stopping(stopping_rounds=100, verbose=True),
            lgbm.log_evaluation(verbose_eval),
        ],
    )

    bst.save_model('./model/baseline_model_{}.txt'.format(i))
    best_trees.append(bst.best_iteration)

    # Predict with the early-stopped iteration in both places. Align the test
    # columns with the training frame so features line up by name and order.
    cv_pred += bst.predict(test[X_train.columns], num_iteration=bst.best_iteration)
    cv_train[validate] += bst.predict(X_validate, num_iteration=bst.best_iteration)

    score = Gini(label_validate, cv_train[validate])
    print(score)
    fold_scores.append(score)

# Turn the summed test predictions into the fold average.
cv_pred /= NFOLDS

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[204]	valid_0's binary_logloss: 0.151538	valid_0's gini: 0.29314
0.29314009182258344
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[83]	valid_0's binary_logloss: 0.152349	valid_0's gini: 0.265321
0.2653211692773015
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[125]	valid_0's binary_logloss: 0.152092	valid_0's gini: 0.273312
0.2733117835288082
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[201]	valid_0's binary_logloss: 0.151972	valid_0's gini: 0.280085
0.28008456208849136
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[99]	valid_0's binary_logloss: 0.151568	valid_0's gini: 0.286495
0.2864950254393784


In [7]:
# Cross-validation summary: overall out-of-fold Gini, the per-fold scores,
# and the best iteration of each fold together with their mean.
print("cv score:")
overall_gini = Gini(train_label, cv_train)
print(overall_gini)
print(fold_scores)
mean_best_iteration = np.mean(best_trees)
print(best_trees, mean_best_iteration)

# Write the fold-averaged test predictions as the submission file.
submission = pd.DataFrame({'id': test_id, 'target': cv_pred})
submission.to_csv('./lgbm_baseline.csv', index=False)


cv score:
0.27972510469464895
[0.29314009182258344, 0.2653211692773015, 0.2733117835288082, 0.28008456208849136, 0.2864950254393784]
[204, 83, 125, 201, 99] 142.4
