In [1]:
# Mount Google Drive inside the Colab runtime at /content/gdrive so that the
# data files stored on Drive can be read through ordinary file paths.
# Running this cell prompts for an authorization code.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


# 1. Library Import & Data

In [1]:
import lightgbm as lgbm
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Run-mode switches. Among the cells shown here only `cv_only` is read (by the
# training loop); `save_cv` and `full_train` are defined but not consulted.
cv_only, save_cv, full_train = True, True, False

# Folder on the mounted Drive that contains train.csv and test.csv.
path = "/content/gdrive/My Drive/보아즈미니플젝/"

# Load the training and test tables.
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

# Keep the row identifiers and the labels before they are removed from the features.
train_id, test_id = train['id'], test['id']
train_label = train['target']

# The same labels as a plain NumPy array.
y = train_label.values

# Drop id and target so the training features have the same kind of columns
# as the test data.
drop_feature = ['id', 'target']
X = train.drop(columns=drop_feature)

## Function 정의

In [2]:
# Scoring function: normalized Gini coefficient
def Gini(y_true, y_pred):
    """Return the Gini of `y_pred`'s ranking normalized by the ideal ranking's Gini.

    Parameters
    ----------
    y_true : array-like with a ``shape`` attribute
        Ground-truth values.
    y_pred : array-like with a ``shape`` attribute
        Predicted scores; must have the same shape as `y_true`.

    Returns
    -------
    float
        ``G_pred / G_true``: the Gini sum obtained when the labels are ordered
        by prediction, divided by the one obtained when they are ordered by
        the labels themselves.

    Raises
    ------
    AssertionError
        If the two inputs do not have the same shape.
    """
    # The labels and the predictions must line up one-to-one.
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # Row 0 holds the labels, row 1 the predictions (common dtype after stacking).
    stacked = np.array([y_true, y_pred])
    labels, scores = stacked[0], stacked[1]

    # Labels in DESCENDING order of the label itself / of the prediction
    # (argsort is ascending, hence the reversal).
    true_order = labels[np.argsort(labels)][::-1]
    pred_order = labels[np.argsort(scores)][::-1]

    def lorenz(ordered):
        # Cumulative share of the label total along the given ordering.
        return np.cumsum(ordered) * 1. / np.sum(ordered)

    # Cumulative share of an even distribution: 1/n, 2/n, ..., 1.
    L_ones = np.linspace(1 / n_samples, 1, n_samples)

    # Gini sums: accumulated gap between the even line and each Lorenz curve.
    G_true = np.sum(L_ones - lorenz(true_order))
    G_pred = np.sum(L_ones - lorenz(pred_order))

    # Normalize by the value of the perfect ordering.
    return G_pred * 1. / G_true

# Evaluation function used while training the LightGBM model
def evalerror(preds, dtrain):
    """Custom evaluation metric for `lgbm.train` (passed as `feval`).

    Parameters
    ----------
    preds : array-like
        Predictions for the rows of `dtrain`.
    dtrain : object exposing ``get_label()``
        Dataset being evaluated; its labels are the ground truth.

    Returns
    -------
    tuple
        ``('gini', value, True)`` where `value` is ``Gini(labels, preds)``.
        The trailing ``True`` is presumably LightGBM's "higher is better"
        flag -- confirm against the `feval` documentation.
    """
    y_eval = dtrain.get_label()
    gini_value = Gini(y_eval, preds)
    return 'gini', gini_value, True

# 2. Feature Engineering

- 파생변수 1 : 결측값 개수
- 파생변수 2 : categorical data -> binary data
- 파생변수 3 : 이름에 ind가 포함된 변수들의 값을 조합한 new_ind 생성

모델 학습에 사용한 변수들
- num_features (파생변수 1 `missing` 포함), cat_features와 new_ind의 빈도(count) 변수
- 파생변수 2 (cat_features의 원핫 인코딩), 파생변수 3 (new_ind는 빈도 변수 형태로만 사용)

In [3]:
# All feature column names (X is the training frame without id/target).
feature_names = list(X.columns)

# Categorical features: name contains 'cat' and does not contain 'count'.
cat_features = [name for name in feature_names
                if 'cat' in name and not 'count' in name]
# Numerical features: name contains neither 'cat' nor 'calc'
# (so the 'calc' columns are left out of the model inputs).
num_features = [name for name in feature_names
                if not ('cat' in name or 'calc' in name)]

In [4]:
# Derived feature 1: number of missing values per row.
# Entries equal to -1 are counted as missing; the count is stored as float.
train['missing'] = (train == -1).sum(axis=1).astype(float)
test['missing'] = (test == -1).sum(axis=1).astype(float)

# Register the new column as a numerical feature. The membership check keeps
# the list free of duplicates when this cell is executed more than once
# (a duplicate entry would put the same column into the model matrix twice).
if 'missing' not in num_features:
    num_features.append('missing')


# Derived feature 2: turn every categorical column into integer codes.
# Each LabelEncoder is fitted on the training column only and then applied to
# both frames, so a value that occurs only in test would raise here.
for c in cat_features:
    le = LabelEncoder().fit(train[c])
    train[c], test[c] = le.transform(train[c]), le.transform(test[c])

# Then one-hot encode the integer codes: one 0/1 column per distinct value
# seen in the training data.
enc = OneHotEncoder().fit(train[cat_features])
X_cat, X_t_cat = (enc.transform(frame[cat_features]) for frame in (train, test))

# Derived feature 3: `new_ind`, a string key that joins the values of all
# columns whose name contains 'ind'; every value is followed by '_'.
# e.g. ps_ind_01 = 1 and ps_ind_02 = 0 give a key that starts with "1_0_".
ind_features = [c for c in feature_names if 'ind' in c]

# Start from an empty string so that every column is handled by the same
# statement (no first-iteration special case) and so that `new_ind` exists
# even when no 'ind' column is present. Re-assigning here also makes the cell
# safe to re-run.
train['new_ind'] = ''
test['new_ind'] = ''
for c in ind_features:
    train['new_ind'] += train[c].astype(str) + '_'
    test['new_ind'] += test[c].astype(str) + '_'

# Frequency features: for every categorical column and for `new_ind`, add a
# '<column>_count' column holding how often the row's value occurs in
# train and test combined.
cat_count_features = []
for col in cat_features + ['new_ind']:
    count_col = '%s_count' % col
    # value -> number of occurrences over both frames
    freq = pd.concat([train[col], test[col]]).value_counts().to_dict()
    train[count_col] = train[col].map(lambda value: freq.get(value, 0))
    test[count_col] = test[col].map(lambda value: freq.get(value, 0))
    cat_count_features.append(count_col)

In [5]:
# Model inputs: the numerical features, the frequency (count) features and the
# one-hot encoded categoricals. No other column is used.
model_columns = num_features + cat_count_features
train_list = [train[model_columns].values, X_cat]
test_list = [test[model_columns].values, X_t_cat]

# Stack the dense and one-hot parts side by side and keep the result as a
# CSR sparse matrix (intended to save memory and speed up training).
X = ssp.hstack(train_list).tocsr()
X_test = ssp.hstack(test_list).tocsr()

# 3. Modeling

In [6]:
# LightGBM hyper-parameters
learning_rate = 0.1
num_leaves = 15
# NOTE(review): `min_data_in_leaf` is defined here but never placed into
# `params`; the dictionary sets "min_child_samples": 10 instead -- confirm
# which value is intended.
min_data_in_leaf = 2000
feature_fraction = 0.6
num_boost_round = 10000  # upper bound on boosting rounds passed to lgbm.train

params = dict(
    objective="binary",
    boosting_type="gbdt",  # idea from the author: also try "dart"
    learning_rate=learning_rate,
    num_leaves=num_leaves,
    max_bin=256,
    feature_fraction=feature_fraction,
    verbosity=0,
    # NOTE(review): drop_rate / max_drop look like DART settings and presumably
    # have no effect while boosting_type is "gbdt" -- confirm.
    drop_rate=0.1,
    is_unbalance=False,
    max_drop=50,
    min_child_samples=10,
    min_child_weight=150,
    min_split_gain=0,
    # NOTE(review): no bagging/subsample frequency is set, so this may be
    # ignored by LightGBM -- confirm against the parameter documentation.
    subsample=0.9,
)

# Accumulators filled by the training loop:
#   x_score        - one overall CV Gini per seed
#   final_cv_train - sum over seeds of the out-of-fold train predictions
#   final_cv_pred  - sum over seeds of the fold-averaged test predictions
x_score = []
final_cv_train, final_cv_pred = np.zeros(len(train_label)), np.zeros(len(test_id))

In [7]:
# Stratified 5-fold cross-validation, repeated for several LightGBM seeds.
NFOLDS = 5
N_SEEDS = 16  # number of seeds whose predictions are averaged
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)

# Reset the accumulators in this cell as well, so that re-running it starts
# from zero instead of adding to totals left over from an earlier run.
x_score = []                                  # overall CV Gini of every seed
final_cv_train = np.zeros(len(train_label))   # summed out-of-fold predictions
final_cv_pred = np.zeros(len(test_id))        # summed test predictions

# The final prediction is the mean over N_SEEDS runs. More seeds reduce the
# variance that comes from randomness, but training takes proportionally longer.
for s in range(N_SEEDS):
    cv_train = np.zeros(len(train_label))  # out-of-fold predictions, this seed
    cv_pred = np.zeros(len(test_id))       # test predictions summed over folds

    params['seed'] = s

    if cv_only:
        best_trees = []
        fold_scores = []

        for train_fold, validate in kfold.split(X, train_label):
            X_train, X_validate = X[train_fold, :], X[validate, :]
            # The splitter yields positional row numbers, so the labels are
            # selected by position (.iloc) rather than by index label.
            label_train = train_label.iloc[train_fold]
            label_validate = train_label.iloc[validate]

            dtrain = lgbm.Dataset(X_train, label_train)
            dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)

            # NOTE(review): the training log shows both binary_logloss and gini
            # being evaluated on the validation set; presumably either one can
            # trigger early stopping -- confirm, and restrict the metrics if
            # only gini should decide.
            # NOTE(review): verbose_eval / early_stopping_rounds are keyword
            # arguments of the LightGBM version this notebook was run with;
            # newer releases may require callbacks instead -- confirm.
            bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid,
                             feval=evalerror, verbose_eval=100,
                             early_stopping_rounds=100)
            best_trees.append(bst.best_iteration)

            # Predict test and validation rows with the same (best) iteration.
            cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
            cv_train[validate] += bst.predict(X_validate,
                                              num_iteration=bst.best_iteration)

            score = Gini(label_validate, cv_train[validate])
            print(score)
            fold_scores.append(score)

        # Every fold model predicted the full test set: average them.
        cv_pred /= NFOLDS
        final_cv_train += cv_train
        final_cv_pred += cv_pred

        # Overall out-of-fold Gini for this seed (computed once, reused below).
        cv_score = Gini(train_label, cv_train)
        print("cv score:")
        print(cv_score)
        # Gini of the running average over the seeds finished so far.
        print("current score:", Gini(train_label, final_cv_train / (s + 1.)), s+1)
        print(fold_scores)
        print(best_trees, np.mean(best_trees))

        x_score.append(cv_score)

print(x_score)
# Write the seed-averaged test predictions and the seed-averaged out-of-fold
# train predictions to Drive.
pd.DataFrame({'id': test_id, 'target': final_cv_pred / N_SEEDS}).to_csv('/content/gdrive/My Drive/보아즈미니플젝/lgbm0819_1.csv', index=False)
pd.DataFrame({'id': train_id, 'target': final_cv_train / N_SEEDS}).to_csv('/content/gdrive/My Drive/보아즈미니플젝/lgbm0819_train.csv', index=False)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.15159	valid_0's gini: 0.291863
[200]	valid_0's binary_logloss: 0.151468	valid_0's gini: 0.29491
Early stopping, best iteration is:
[189]	valid_0's binary_logloss: 0.151457	valid_0's gini: 0.295075
0.2950745960037473
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.152123	valid_0's gini: 0.272679
[200]	valid_0's binary_logloss: 0.152038	valid_0's gini: 0.275581
[300]	valid_0's binary_logloss: 0.152089	valid_0's gini: 0.276195
Early stopping, best iteration is:
[224]	valid_0's binary_logloss: 0.152015	valid_0's gini: 0.276851
0.27685121636649
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151974	valid_0's gini: 0.277978
[200]	valid_0's binary_logloss: 0.151926	valid_0's gini: 0.280307
Early stopping, best iteration is:
[139]	valid_0's binary_logloss: 0.151902	valid_0's gini: 0.280395
0.2803950421