[2nd Place Lightgbm Solution](https://www.kaggle.com/xiaozhouwang/2nd-place-lightgbm-solution/script)

In [1]:
import lightgbm as lgbm
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
def Gini(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # sort rows on prediction column
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:, 0].argsort()][::-1, 0]
    pred_order = arr[arr[:, 1].argsort()][::-1, 0]

    # get Lorenz curves
    L_true = np.cumsum(true_order) * 1. / np.sum(true_order)
    L_pred = np.cumsum(pred_order) * 1. / np.sum(pred_order)
    L_ones = np.linspace(1 / n_samples, 1, n_samples)

    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # normalize to true Gini coefficient
    return G_pred * 1. / G_true

In [3]:
cv_only = True
save_cv = True
full_train = False

In [4]:
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', Gini(labels, preds), True

In [142]:
train = pd.read_csv('./data/train.csv', nrows=50000)
train_label = train['target']
train_id = train['id']
test = pd.read_csv('./data/test.csv', nrows=50000)
test_id = test['id']

In [143]:
NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)

+ 같은 분포를 가지도록 해서 fold 나눠주는 imbalance한 상황에 아주 좋다.

In [144]:
y=train['target'].values
drop_feature = ['id', 'target']

In [145]:
X = train.drop(drop_feature, axis=1)
feature_names = X.columns.tolist()

In [146]:
cat_features = [c for c in feature_names if ('cat' in c and 'count' not in c)]
num_features = [c for c in feature_names if ('cat' not in c and 'calc' not in c)]

In [147]:
cat_features

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [148]:
num_features

['ps_ind_01',
 'ps_ind_03',
 'ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_14',
 'ps_ind_15',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin',
 'ps_reg_01',
 'ps_reg_02',
 'ps_reg_03',
 'ps_car_11',
 'ps_car_12',
 'ps_car_13',
 'ps_car_14',
 'ps_car_15']

+ calc 는 다 뺌 쓸모가 없다는 사람들의 consensus

+ competition할 때 discussion을 계속 봐야 한다.

+ best single model 올리면서 간을 본다. 너희들은 어떻니 결과 공유

# feature engineering

In [149]:
train['missing'] = (train==-1).sum(axis=1).astype(float)
test['missing'] = (test==-1).sum(axis=1).astype(float)
num_features.append('missing')

In [150]:
train==-1

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin,missing
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
49996,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
49997,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
49998,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [151]:
(train==-1).sum(axis=1)

0        1
1        2
2        3
3        0
4        2
        ..
49995    1
49996    1
49997    3
49998    2
49999    3
Length: 50000, dtype: int64

In [152]:
np.unique((train==-1))

array([False,  True])

In [153]:
(train==-1).sum(axis=1).value_counts()

2    21344
1    13167
0    10573
3     4681
4      229
7        3
6        3
dtype: int64

In [154]:
train['missing'].unique()

array([1., 2., 3., 0., 4., 7., 6.])

In [155]:
pd.concat([(train==-1).sum(axis=1), train['target']], axis=1).groupby(0).mean()

Unnamed: 0_level_0,target
0,Unnamed: 1_level_1
0,0.046912
1,0.037062
2,0.033452
3,0.026276
4,0.039301
6,0.666667
7,0.333333


In [156]:
for c in cat_features:
    le = LabelEncoder()
    le.fit(train[c])
    train[c] = le.transform(train[c])
    test[c] = le.transform(test[c])

In [157]:
enc = OneHotEncoder()
enc.fit(train[cat_features])

OneHotEncoder()

In [158]:
X_cat = enc.transform(train[cat_features])
X_t_cat = enc.transform(test[cat_features])

In [159]:
ind_features = [c for c in feature_names if 'ind' in c]

In [160]:
count = 0
for c in ind_features:
    if count == 0:
        train['new_ind'] = train[c].astype(str)+ '_'
        test['new_ind'] = test[c].astype(str)+'_'
        count += 1
    else:
        train['new_ind'] += train[c].astype(str)+'_'
        test['new_ind'] += test[c].astype(str)+'_'

In [161]:
train[c]

0        0
1        1
2        0
3        0
4        0
        ..
49995    0
49996    0
49997    0
49998    0
49999    0
Name: ps_ind_18_bin, Length: 50000, dtype: int64

In [162]:
train[c].astype(str)

0        0
1        1
2        0
3        0
4        0
        ..
49995    0
49996    0
49997    0
49998    0
49999    0
Name: ps_ind_18_bin, Length: 50000, dtype: object

In [163]:
train[c].astype(str)+'_'

0        0_
1        1_
2        0_
3        0_
4        0_
         ..
49995    0_
49996    0_
49997    0_
49998    0_
49999    0_
Name: ps_ind_18_bin, Length: 50000, dtype: object

In [164]:
(train[ind_features[0]].astype(str)+'_') + (train[ind_features[1]].astype(str)+'_')

0        2_2_
1        1_1_
2        5_4_
3        0_1_
4        0_2_
         ... 
49995    2_2_
49996    0_1_
49997    0_1_
49998    4_1_
49999    1_1_
Length: 50000, dtype: object

In [165]:
ind_features

['ps_ind_01',
 'ps_ind_02_cat',
 'ps_ind_03',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_14',
 'ps_ind_15',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin']

In [166]:
train['new_ind']

0        2_2_5_2_1_0_1_0_0_0_0_0_0_0_11_0_1_0_
1         1_1_7_1_1_0_0_1_0_0_0_0_0_0_3_0_0_1_
2        5_4_9_2_1_0_0_1_0_0_0_0_0_0_12_1_0_0_
3         0_1_2_1_1_1_0_0_0_0_0_0_0_0_8_1_0_0_
4         0_2_0_2_1_1_0_0_0_0_0_0_0_0_9_1_0_0_
                         ...                  
49995     2_2_1_1_1_0_0_1_0_0_0_0_0_0_8_1_0_0_
49996     0_1_5_1_1_1_0_0_0_0_0_0_0_0_0_0_1_0_
49997     0_1_6_2_1_0_1_0_0_0_0_0_0_0_7_1_0_0_
49998     4_1_3_2_1_0_0_0_1_0_0_0_0_0_3_1_0_0_
49999    1_1_9_2_1_0_0_1_0_0_0_0_0_0_10_0_0_0_
Name: new_ind, Length: 50000, dtype: object

In [167]:
train['new_ind'].value_counts()

0_2_1_1_1_1_0_0_0_0_0_0_0_0_7_1_0_0_     102
0_1_2_1_1_1_0_0_0_0_0_0_0_0_8_1_0_0_      97
0_1_2_1_1_1_0_0_0_0_0_0_0_0_7_1_0_0_      88
0_1_3_1_1_1_0_0_0_0_0_0_0_0_6_1_0_0_      82
0_1_3_1_1_1_0_0_0_0_0_0_0_0_7_1_0_0_      79
                                        ... 
7_3_9_1_5_0_0_1_0_0_0_0_0_0_7_1_0_0_       1
6_1_5_1_7_0_0_1_0_0_0_0_0_0_3_0_0_1_       1
0_1_1_2_0_0_1_0_0_0_0_0_0_0_8_1_0_0_       1
7_1_7_2_1_1_0_0_0_0_0_0_0_0_9_1_0_0_       1
1_1_9_2_1_0_0_1_0_0_0_0_0_0_10_0_0_0_      1
Name: new_ind, Length: 22454, dtype: int64

In [168]:
train['new_ind'].value_counts().shape

(22454,)

In [169]:
cat_count_features = []
for c in cat_features + ['new_ind']:
    break

In [170]:
cat_features + ['new_ind']

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat',
 'new_ind']

In [171]:
pd.concat([train[c], test[c]]).value_counts().to_dict()

{1: 72719, 2: 20717, 3: 4645, 4: 1892, 0: 27}

+ 얼마나 많이 나왔나

In [172]:
d = pd.concat([train[c], test[c]]).value_counts().to_dict()

In [173]:
d

{1: 72719, 2: 20717, 3: 4645, 4: 1892, 0: 27}

In [174]:
x=1
d.get(x, 0)

72719

In [175]:
train[c].apply(lambda x: d.get(x, 0))

0        20717
1        72719
2         1892
3        72719
4        20717
         ...  
49995    20717
49996    72719
49997    72719
49998    72719
49999    72719
Name: ps_ind_02_cat, Length: 50000, dtype: int64

In [176]:
train[c]

0        2
1        1
2        4
3        1
4        2
        ..
49995    2
49996    1
49997    1
49998    1
49999    1
Name: ps_ind_02_cat, Length: 50000, dtype: int64

+ ind features를 하나로 묶어서 새로운 카테고리를 만들어 냄
+ 경우의 수가 2x2x3x4 가 되는 것
  + 타이타닉에서 남녀 선실(1,2,3) 묶으면 6개의 카테고리가 나오는데 그것을 한 것.
  + cardinality가 많아지니까 카테고리가 몇개씩 나오는지를 가지고 인코딩 한 것

In [177]:
train[num_features + cat_count_features]

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,...,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,missing
0,2,5,0,1,0,0,0,0,0,0,...,0,0.7,0.2,0.718070,2,0.400000,0.883679,0.370810,3.605551,1.0
1,1,7,0,0,1,0,0,0,0,0,...,1,0.8,0.4,0.766078,3,0.316228,0.618817,0.388716,2.449490,2.0
2,5,9,0,0,1,0,0,0,0,0,...,0,0.0,0.0,-1.000000,1,0.316228,0.641586,0.347275,3.316625,3.0
3,0,2,1,0,0,0,0,0,0,0,...,0,0.9,0.2,0.580948,1,0.374166,0.542949,0.294958,2.000000,0.0
4,0,0,1,0,0,0,0,0,0,0,...,0,0.7,0.6,0.840759,3,0.316070,0.565832,0.365103,2.000000,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,1,0,0,1,0,0,0,0,0,...,0,0.5,0.4,0.633936,3,0.399500,0.605536,0.398497,2.236068,1.0
49996,0,5,1,0,0,0,0,0,0,0,...,0,0.9,0.3,0.646142,2,0.316228,0.673280,0.368782,3.316625,1.0
49997,0,6,0,1,0,0,0,0,0,0,...,0,0.1,0.2,-1.000000,3,0.446990,1.034777,0.452769,3.464102,3.0
49998,4,3,0,0,0,1,0,0,0,0,...,0,0.9,0.2,-1.000000,2,0.316228,0.708415,0.374833,3.464102,2.0


+ cat_count_features로 만들어서 

In [178]:
train_list = [train[num_features + cat_count_features].values, X_cat, ]
test_list = [test[num_features + cat_count_features].values, X_t_cat, ]

In [179]:
train_list

[array([[2.        , 5.        , 0.        , ..., 0.37080992, 3.60555128,
         1.        ],
        [1.        , 7.        , 0.        , ..., 0.38871583, 2.44948974,
         2.        ],
        [5.        , 9.        , 0.        , ..., 0.34727511, 3.31662479,
         3.        ],
        ...,
        [0.        , 6.        , 0.        , ..., 0.45276926, 3.46410162,
         3.        ],
        [4.        , 3.        , 0.        , ..., 0.3748333 , 3.46410162,
         2.        ],
        [1.        , 9.        , 0.        , ..., 0.40743098, 3.46410162,
         3.        ]]),
 <50000x183 sparse matrix of type '<class 'numpy.float64'>'
 	with 700000 stored elements in Compressed Sparse Row format>]

In [180]:
ssp.hstack(train_list)

<50000x207 sparse matrix of type '<class 'numpy.float64'>'
	with 1354785 stored elements in COOrdinate format>

In [181]:
ssp.hstack(train_list).tocsr()

<50000x207 sparse matrix of type '<class 'numpy.float64'>'
	with 1354785 stored elements in Compressed Sparse Row format>

In [182]:
X = ssp.hstack(train_list).tocsr()

In [183]:
X.shape

(50000, 207)

+ sparse matrix 풀어냄

In [184]:
X_test = ssp.hstack(test_list).tocsr()

# model development

In [215]:
learning_rate = 0.1
num_leaves = 15
min_data_in_leaf = 2000
feature_fraction = 0.6
num_boost_round = 10000

params = {'objective': 'binary',
         'boosting_type' : 'gbdt',
         'learning_rate': learning_rate,
         'num_leaves': num_leaves,
         'max_bin': 256,
         'feature_fraction': feature_fraction,
         'verbosity': 0,
         'drop_rate': 0.1,
         'is_unbalance': False,
         'max_drop': 50,
         'min_child_samples': 10,
         'min_child_weight': 150,
         'min_split_gain': 0,
         'subsample': 0.9}

In [186]:
x_score = []
final_cv_train = np.zeros(len(train_label))
final_cv_pred = np.zeros(len(test_id))

In [187]:
for s in np.arange(16):
    break

In [188]:
cv_train = np.zeros(len(train_label))
cv_pred = np.zeros(len(test_id))

In [189]:
cv_only

True

In [190]:
params['seed'] = s

In [191]:
kf = kfold.split(X, train_label)

In [192]:
best_trees = []
fold_scores = []

In [193]:
for i, (train_fold, validate) in enumerate(kf):
    break

In [194]:
X_train, label_train = X[train_fold, :], train_label[train_fold]
X_validate, label_validate = X[validate, :], train_label[validate]

In [195]:
dtrain = lgbm.Dataset(X_train, label_train)
dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)

In [196]:
bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=evalerror, 
           verbose_eval=100, early_stopping_rounds=100)

You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.153398	valid_0's gini: 0.257506
Early stopping, best iteration is:
[73]	valid_0's binary_logloss: 0.153275	valid_0's gini: 0.259739


In [197]:
bst.best_iteration

73

In [199]:
import warnings
warnings.filterwarnings("ignore")

In [200]:
best_trees.append(bst.best_iteration)

In [201]:
cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)

In [202]:
cv_train[validate] += bst.predict(X_validate)

In [203]:
score = Gini(label_validate, cv_train[validate])

In [204]:
print(score)

0.25973867597795147


In [205]:
fold_scores.append(score)

In [207]:
for i, (train_fold, validate) in enumerate(kf):
    print('#'*30, '{} of {}'.format(i+1, 5))
    X_train, X_validate, label_train, label_validate = X[train_fold, :], X[validate, :], train_label[train_fold], train_label[validate]
    dtrain = lgbm.Dataset(X_train, label_train)
    dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
    bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=evalerror, verbose_eval=100,
                    early_stopping_rounds=100)
    best_trees.append(bst.best_iteration)
    cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
    cv_train[validate] += bst.predict(X_validate)

    score = Gini(label_validate, cv_train[validate])
    print (score)
    fold_scores.append(score)

In [208]:
fold_scores

[0.25973867597795147,
 0.2603966371378236,
 0.2488592941328217,
 0.3039299795689828,
 0.2582202810445814]

In [209]:
final_cv_train += cv_train
final_cv_pred += cv_pred

In [210]:
print('cv score')
print(Gini(train_label, cv_train))

cv score
0.26533117621745883


In [211]:
print('current score', Gini(train_label, final_cv_train/(s + 1.)))

current score 0.26533117621745883


cross validation stacking을 16번 하는 것

In [214]:
for s in range(16):
    print('#'*30, 'random number outer iteration: {}'.format(s))
    cv_train = np.zeros(len(train_label))
    cv_pred = np.zeros(len(test_id))

    params['seed'] = s

    if cv_only:
        kf = kfold.split(X, train_label)

        best_trees = []
        fold_scores = []

        for i, (train_fold, validate) in enumerate(kf):
            print('#'*10, 'inner cross validation system: {}'.format(i))
            X_train, X_validate, label_train, label_validate = \
                X[train_fold, :], X[validate, :], train_label[train_fold], train_label[validate]
            dtrain = lgbm.Dataset(X_train, label_train)
            dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
            bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=evalerror, verbose_eval=100,
                            early_stopping_rounds=100)
            best_trees.append(bst.best_iteration)
            cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
            cv_train[validate] += bst.predict(X_validate)

            score = Gini(label_validate, cv_train[validate])
            print (score)
            fold_scores.append(score)

        cv_pred /= NFOLDS
        final_cv_train += cv_train
        final_cv_pred += cv_pred

        print("cv score:")
        print (Gini(train_label, cv_train))
        print ("current score:", Gini(train_label, final_cv_train / (s + 1.)), s+1)
        print(fold_scores)
        print(best_trees, np.mean(best_trees))

        x_score.append(Gini(train_label, cv_train))

############################## random number outer iteration: 0
########## inner cross validation system: 0
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.153398	valid_0's gini: 0.257506
Early stopping, best iteration is:
[73]	valid_0's binary_logloss: 0.153275	valid_0's gini: 0.259739
0.25973867597795147
########## inner cross validation system: 1
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.153164	valid_0's gini: 0.257127
Early stopping, best iteration is:
[84]	valid_0's binary_logloss: 0.15303	valid_0's gini: 0.260397
0.2603966371378236
########## inner cross validation system: 2
You can set `force_row_wise=true` to remove the overhead.
And if m

In [None]:
print(x_score)
pd.DataFrame({'id': test_id, 'target': final_cv_pred / 16.}).to_csv('../model/lgbm3_pred_avg.csv', index=False)
pd.DataFrame({'id': train_id, 'target': final_cv_train / 16.}).to_csv('../model/lgbm3_cv_avg.csv', index=False)