In [93]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import os, sys, gc, warnings, random

import datetime
import dateutil.relativedelta

# Data manipulation
import pandas as pd 
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import quantile_transform
import scipy.stats as stats


import lightgbm as lgb
import optuna

from catboost import CatBoostClassifier, Pool
from tqdm.notebook import trange, tqdm

from IPython.display import display

%matplotlib inline

# Raise the pandas display limits so wide/long frames are shown in full in cell output.
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', 1000)

In [94]:
# Load the raw training transactions; 'order_date' is parsed to datetime because
# later cells use the .dt accessor and date comparisons on it.
df = pd.read_csv('./input' + '/train.csv', parse_dates=['order_date'])

In [95]:
def seed_everything(seed=0):
    """Seed Python's and NumPy's global random generators.

    Also stores the seed as a string in the PYTHONHASHSEED environment variable.

    Args:
        seed: Integer seed applied to `random` and `numpy.random`.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Global seed; also referenced by the cross-validation splitters and the model parameter dicts below.
SEED = 42
seed_everything(SEED)

In [96]:
# Monthly spend threshold used to derive the binary label.
TOTAL_THRES = 300


def generate_label(df, year_month, total_thres=TOTAL_THRES, print_log=False):
    """Build a per-customer binary label for one calendar month.

    For every customer who has at least one row dated before `year_month`,
    sums the `total` column over rows falling in `year_month` and flags
    whether that sum exceeds `total_thres`.

    Args:
        df: DataFrame with a datetime `order_date` column plus `customer_id`
            and `total` columns. It is not modified.
        year_month: Target month as a 'YYYY-MM' string.
        total_thres: Spend threshold; label is 1 when the monthly sum is
            strictly greater than this value.
        print_log: When True, print the shape of the resulting frame.

    Returns:
        DataFrame sorted by `customer_id` with columns
        `customer_id`, `year_month`, `total`, `label`.
    """
    df = df.copy()

    # 'YYYY-MM' strings compare lexicographically in chronological order,
    # so plain string comparison is enough to split months.
    df['year_month'] = df['order_date'].dt.strftime('%Y-%m')
    df = df.reset_index(drop=True)

    # Customers seen in any month before the target month
    cust = df[df['year_month'] < year_month]['customer_id'].unique()
    # Rows belonging to the target month
    df = df[df['year_month'] == year_month]

    # One label row per previously-seen customer
    label = pd.DataFrame({'customer_id': cust})
    label['year_month'] = year_month

    # Total spend per customer within the target month
    grped = df.groupby(['customer_id', 'year_month'], as_index=False)[['total']].sum()

    # Customers without purchases in the target month get a total of 0.
    label = label.merge(grped, on=['customer_id', 'year_month'], how='left')
    # Assign the filled column back instead of a chained in-place fillna,
    # which does not update the frame under pandas Copy-on-Write.
    label['total'] = label['total'].fillna(0.0)
    label['label'] = (label['total'] > total_thres).astype(int)

    # Sort by customer ID
    label = label.sort_values('customer_id').reset_index(drop=True)
    if print_log:
        print(f'{year_month} - final label shape: {label.shape}')

    return label

In [97]:
def print_score(label, pred, prob_thres=0.5):
    """Print precision, recall, F1 and ROC AUC for a set of predictions.

    Args:
        label: Ground-truth binary labels.
        pred: Predicted scores; compared with `prob_thres` for the
            threshold-based metrics and used as-is for ROC AUC.
        prob_thres: Scores strictly greater than this count as positive.
    """
    hard_pred = pred > prob_thres
    thresholded_metrics = (
        ('Precision: {:.5f}', precision_score),
        ('Recall: {:.5f}', recall_score),
        ('F1 Score: {:.5f}', f1_score),
    )
    for template, metric in thresholded_metrics:
        print(template.format(metric(label, hard_pred)))
    print('ROC AUC Score: {:.5f}'.format(roc_auc_score(label, pred)))

In [98]:
def qt(df):
    """Return a copy of `df` with every float64 column quantile-transformed.

    Each float64 column is passed on its own through sklearn's
    `quantile_transform` (100 quantiles, normal output distribution,
    random_state=0). Columns of other dtypes are left as they are.
    """
    result = df.copy()

    for name in result.columns:
        if result[name].dtype.name != 'float64':
            continue
        # quantile_transform expects a 2-D array: one column, n rows
        column_2d = np.array(result[name]).astype("float").reshape(-1, 1)
        transformed = quantile_transform(
            column_2d,
            n_quantiles=100,
            random_state=0,
            copy=True,
            output_distribution="normal",
        )
        result[name] = transformed.reshape(-1)
    return result

In [99]:
def feature_preprocessing1(train, test, features, do_imputing=True):
    """Label-encode object columns and optionally median-impute the features.

    Args:
        train: Training DataFrame (not modified; a copy is processed).
        test: Test DataFrame (not modified; a copy is processed).
        features: Column names to preprocess.
        do_imputing: When True, fill missing values in `features` with the
            per-column median fitted on the training copy.

    Returns:
        Tuple `(x_tr, x_te)` of the processed train and test copies.
    """
    x_tr, x_te = train.copy(), test.copy()

    # Names of the columns that were label-encoded
    cate_cols = []

    for col in features:
        # Only object (string) columns are label-encoded
        if x_tr[col].dtype.name != 'object':
            continue
        cate_cols.append(col)

        tr_values = list(x_tr[col].values)
        te_values = list(x_te[col].values)

        # Fit on train + test together so both share one encoding
        encoder = LabelEncoder()
        encoder.fit(tr_values + te_values)

        x_tr[col] = encoder.transform(tr_values)
        x_te[col] = encoder.transform(te_values)

    print('categorical feature:', cate_cols)

    if do_imputing:
        # Median imputation: fitted on train, applied to both frames
        imputer = SimpleImputer(strategy='median')
        x_tr[features] = imputer.fit_transform(x_tr[features])
        x_te[features] = imputer.transform(x_te[features])

    return x_tr, x_te

In [109]:
def feature_engineering1(df, year_month):
    """Build per-customer aggregate features for training and prediction.

    The month before `year_month` is used as the training target month and
    `year_month` itself as the test target month. Features for each side are
    group-by aggregations over all rows dated before the respective target
    month.

    Args:
        df: Transaction DataFrame with `order_date` (datetime), `customer_id`,
            `product_id`, `order_id`, `quantity`, `price` and `total` columns.
            It is not modified.
        year_month: Test target month as a 'YYYY-MM' string.

    Returns:
        Tuple `(x_tr, x_te, y, features)`: the preprocessed train and test
        frames, the training label Series, and the Index of feature column
        names.
    """
    df = df.copy()

    # Time-series difference features: change versus the customer's previous row
    df['order_ts'] = df['order_date'].astype(np.int64) // 1e9
    for col in ('order_ts', 'quantity', 'price', 'total'):
        df[f'{col}_diff'] = df.groupby(['customer_id'])[col].diff()

    # Running sums per customer / product / order.
    # NOTE(review): diffs and running sums follow the row order of `df`;
    # presumably the rows are in chronological order - confirm.
    for key, tag in (('customer_id', 'cust_id'), ('product_id', 'prod_id'), ('order_id', 'order_id')):
        for col in ('total', 'quantity', 'price'):
            df[f'cumsum_{col}_by_{tag}'] = df.groupby([key])[col].cumsum()

    # Month preceding year_month
    d = datetime.datetime.strptime(year_month, "%Y-%m")
    prev_ym = d - dateutil.relativedelta.relativedelta(months=1)
    prev_ym = prev_ym.strftime('%Y-%m')

    # Rows available as feature history for the train and test sides
    train = df[df['order_date'] < prev_ym]
    test = df[df['order_date'] < year_month]

    # Labels for the train (previous month) and test (year_month) targets
    train_label = generate_label(df, prev_ym)[['customer_id', 'year_month', 'label']]
    test_label = generate_label(df, year_month)[['customer_id', 'year_month', 'label']]

    # Aggregations applied per customer
    agg_func = ['mean', 'max', 'min', 'sum', 'count', 'std', 'skew']
    agg_dict = {
        'quantity': agg_func,
        'price': agg_func,
        'total': agg_func,
        'cumsum_total_by_cust_id': agg_func,
        'cumsum_quantity_by_cust_id': agg_func,
        'cumsum_price_by_cust_id': agg_func,
        'cumsum_total_by_prod_id': agg_func,
        'cumsum_quantity_by_prod_id': agg_func,
        'cumsum_price_by_prod_id': agg_func,
        'cumsum_total_by_order_id': agg_func,
        'cumsum_quantity_by_order_id': agg_func,
        'cumsum_price_by_order_id': agg_func,
        'order_ts': agg_func,
        'order_ts_diff': agg_func,
        'quantity_diff': agg_func,
        'price_diff': agg_func,
        'total_diff': agg_func,
        'order_id': ['nunique'],
        'product_id': ['nunique'],
    }

    # Flat single-level names for the multi-level columns produced by agg().
    # They depend only on agg_dict, so they are built once, outside the loop,
    # and shared by the train and test sides.
    new_cols = [
        f'{col}-{stat}' if type(stat) is str else f'{col}-mode'
        for col, stats in agg_dict.items()
        for stat in stats
    ]

    monthly_frames = []
    for tr_ym in train_label['year_month'].unique():
        # Train-side features from rows dated before the target month
        train_agg = train.loc[train['order_date'] < tr_ym].groupby(['customer_id']).agg(agg_dict)
        train_agg.columns = new_cols
        train_agg.reset_index(inplace=True)
        train_agg['year_month'] = tr_ym
        monthly_frames.append(train_agg)

    # DataFrame.append was removed in pandas 2.0; concatenate the collected frames once.
    all_train_data = pd.concat(monthly_frames)

    all_train_data = train_label.merge(all_train_data, on=['customer_id', 'year_month'], how='left')

    features = all_train_data.drop(columns=['customer_id', 'label', 'year_month']).columns

    # Test-side features from all rows dated before year_month
    test_agg = test.groupby(['customer_id']).agg(agg_dict)
    test_agg.columns = new_cols

    test_data = test_label.merge(test_agg, on=['customer_id'], how='left')

    # Shared preprocessing (label encoding / imputation) for train and test
    x_tr, x_te = feature_preprocessing1(all_train_data, test_data, features)

    print('x_tr.shape', x_tr.shape, ', x_te.shape', x_te.shape)

    return x_tr, x_te, all_train_data['label'], features

In [110]:
def make_lgb_oof_prediction(train, y, test, features, categorical_features='auto', model_params=None, folds=10):
    """Train LightGBM with stratified K-fold CV and return OOF / test predictions.

    Args:
        train: Training DataFrame containing the `features` columns.
        y: Binary labels aligned row-by-row with `train` (Series or array).
        test: Test DataFrame containing the `features` columns.
        features: Feature column names.
        categorical_features: Forwarded to `lgb.train` as `categorical_feature`.
        model_params: LightGBM parameter dict passed to `lgb.train`.
        folds: Number of stratified folds.

    Returns:
        Tuple `(y_oof, test_preds, fi)`: out-of-fold predictions for the
        training rows, fold-averaged test predictions, and a DataFrame of
        per-fold feature importances with their mean in `importance`.
    """
    x_train = train[features]
    x_test = test[features]

    # The splitter yields positional indices, so labels are indexed as a plain
    # array and rows with .iloc; this stays correct for any DataFrame index.
    y_values = np.asarray(y)

    # Fold-averaged test predictions
    test_preds = np.zeros(x_test.shape[0])

    # Out-of-fold validation predictions
    y_oof = np.zeros(x_train.shape[0])

    # Mean validation score across folds
    score = 0

    # Per-fold feature importances
    fi = pd.DataFrame()
    fi['feature'] = features

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

    # `verbose_eval` is not accepted by newer LightGBM releases; use the
    # log_evaluation callback when this LightGBM provides it.
    if hasattr(lgb, 'log_evaluation'):
        log_kwargs = {'callbacks': [lgb.log_evaluation(period=200)]}
    else:
        log_kwargs = {'verbose_eval': 200}

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y_values)):
        x_tr, x_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_values[tr_idx], y_values[val_idx]

        print(f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}')

        dtrain = lgb.Dataset(x_tr, label=y_tr)
        dvalid = lgb.Dataset(x_val, label=y_val)

        clf = lgb.train(
            model_params,
            dtrain,
            valid_sets=[dtrain, dvalid],  # evaluated on both train and validation sets
            categorical_feature=categorical_features,
            **log_kwargs
        )

        # Validation predictions go into their out-of-fold slots
        val_preds = clf.predict(x_val)
        y_oof[val_idx] = val_preds

        fold_auc = roc_auc_score(y_val, val_preds)
        print(f"Fold {fold + 1} | AUC: {fold_auc}")
        print('-'*80)

        score += fold_auc / folds

        # Average the test predictions over the folds
        test_preds += clf.predict(x_test) / folds

        fi[f'fold_{fold+1}'] = clf.feature_importance()

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean AUC = {score}")  # mean of the per-fold validation scores
    print(f"OOF AUC = {roc_auc_score(y_values, y_oof)}")  # score over all out-of-fold predictions

    # Mean importance across folds
    fi_cols = [col for col in fi.columns if 'fold_' in col]
    fi['importance'] = fi[fi_cols].mean(axis=1)

    return y_oof, test_preds, fi

In [111]:
def make_cat_oof_prediction(train, y, test, features, categorical_features=None, model_params=None, folds = 10):
    """Train CatBoost with stratified K-fold CV and return OOF / test predictions.

    Args:
        train: Training DataFrame containing the `features` columns.
        y: Binary labels aligned row-by-row with `train` (Series or array).
        test: Test DataFrame containing the `features` columns.
        features: Feature column names.
        categorical_features: Forwarded to `fit` as `cat_features`.
        model_params: Keyword arguments for `CatBoostClassifier`.
        folds: Number of stratified folds.

    Returns:
        Tuple `(y_oof, test_preds, fi)`: out-of-fold positive-class
        probabilities for the training rows, fold-averaged test probabilities,
        and a DataFrame of per-fold feature importances with their mean in
        `importance`.
    """
    x_train = train[features]
    x_test = test[features]

    # The splitter yields positional indices, so labels are indexed as a plain
    # array and rows with .iloc; this stays correct for any DataFrame index.
    y_values = np.asarray(y)

    test_preds = np.zeros(x_test.shape[0])
    y_oof = np.zeros(x_train.shape[0])

    # Mean validation score across folds
    score = 0

    fi = pd.DataFrame()
    fi['feature'] = features

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y_values)):
        x_tr, x_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_values[tr_idx], y_values[val_idx]

        print(f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}')

        clf = CatBoostClassifier(**model_params)
        clf.fit(x_tr, y_tr,
                eval_set=(x_val, y_val),
                cat_features=categorical_features,
                use_best_model=True,
                verbose=True)

        # Column 1 of predict_proba is the positive-class probability
        val_preds = clf.predict_proba(x_val)[:,1]
        y_oof[val_idx] = val_preds

        fold_auc = roc_auc_score(y_val, val_preds)
        print(f'Fold {fold+1} | AUC:{fold_auc}')
        print('-'*80)

        score += fold_auc / folds

        # Average the test predictions over the folds
        test_preds += clf.predict_proba(x_test)[:,1] / folds

        fi[f'fold_{fold+1}'] = clf.feature_importances_

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f'\nMean AUC = {score}')
    print(f'OOF AUC = {roc_auc_score(y_values, y_oof)}')

    # Mean importance across folds
    fi_cols = [col for col in fi.columns if 'fold_' in col]
    fi['importance'] = fi[fi_cols].mean(axis = 1)

    return y_oof, test_preds, fi
        

In [112]:
def plot_feature_importances(df, n=20, color='blue', figsize=(12,8)):
    """Plot the `n` most important features as a horizontal bar chart.

    Args:
        df: DataFrame with `feature` and `importance` columns.
        n: Number of top features to plot.
        color: Bar color.
        figsize: Figure size passed to the pandas plot call.

    Returns:
        A copy of `df` sorted by descending importance, with added
        `importance_normalized` and `cumulative_importance` columns.
    """
    # Sort by importance, highest first
    df = df.sort_values('importance', ascending = False).reset_index(drop = True)

    # Normalized importance and its running total
    df['importance_normalized'] = df['importance'] / df['importance'].sum()
    df['cumulative_importance'] = np.cumsum(df['importance_normalized'])

    # NOTE(review): the style sheet is applied after font.size is set and may
    # override it - confirm the intended order.
    plt.rcParams['font.size'] = 12
    plt.style.use('fivethirtyeight')

    # Positional slice: exactly n rows. Label-based `.loc[:n]` includes the
    # end label and would draw n + 1 bars under a "Top n" title.
    df.iloc[:n].plot.barh(y='importance_normalized',
                          x='feature', color=color,
                          edgecolor='k', figsize=figsize,
                          legend=False)

    plt.xlabel('Normalized Importance', size=18); plt.ylabel('');
    plt.title(f'Top {n} Most Important Features', size=18)
    # Put the most important feature at the top of the chart
    plt.gca().invert_yaxis()

    return df

In [113]:
# Baseline LightGBM parameters.
lgb_params = {
    'objective': 'binary', # binary classification
    'boosting_type': 'gbdt',
    'metric': 'auc', # evaluation metric
    'feature_fraction': 0.8, # feature sampling ratio
    'bagging_fraction': 0.8, # data (row) sampling ratio
    'bagging_freq': 1,
    'n_estimators': 10000, # number of trees
    'early_stopping_rounds': 100,
    'seed': SEED,
    'verbose': -1,
    'n_jobs': -1,    
}

# LightGBM parameters with tuned values.
# NOTE(review): presumably taken from an Optuna study like the one defined below - confirm.
tuned_lgb_params = {
    'objective': 'binary', # binary classification
    'boosting_type': 'gbdt',
    'metric': 'auc', # evaluation metric
    'feature_fraction': 0.9068781382922049, # feature sampling ratio
    'bagging_fraction': 0.5427572227580095, # data (row) sampling ratio
    'bagging_freq': 4,
    'n_estimators': 10000, # number of trees
    'early_stopping_rounds': 100,
    'seed': SEED,
    'verbose': -1,
    'n_jobs': -1, 
    'num_leaves': 159,
    'max_bin': 135,
    'min_data_in_leaf': 28,
    'lambda_l1': 2.531979160239369,
    'lambda_l2': 1.645404534997536e-05,
}

# Baseline CatBoost parameters.
cat_params = {
    'n_estimators': 10000, # number of trees
    'learning_rate': 0.07, # learning rate
    'eval_metric': 'AUC', # evaluation metric
    'loss_function': 'Logloss', # loss function
    'random_seed': SEED,
    'metric_period': 100,
    'od_wait': 100, # early stopping rounds
    'depth': 6, # maximum tree depth
    'rsm': 0.8, # feature sampling ratio
}

# CatBoost parameters with tuned values.
# NOTE(review): presumably taken from an Optuna study like the one defined below - confirm.
tuned_cat_params = {
    #'n_estimators': 10000, # number of trees (disabled; 'iterations' is set below)
    'learning_rate': 0.06173991466879898, # learning rate
    'eval_metric': 'AUC', # evaluation metric
    'loss_function': 'Logloss', # loss function
    'random_seed': SEED,
    'metric_period': 100,
    'od_wait': 100, # early stopping rounds
    'depth': 5, # maximum tree depth
    'rsm': 0.8, # feature sampling ratio
    'iterations': 180,
    'random_strength': 65,
    'bagging_temperature': 0.43432569253026015,
    'od_type': 'Iter'
}

In [114]:
# Ground-truth labels for 2011-11; used as the default `label` of the Optuna objectives below.
label_2011_11 = generate_label(df, '2011-11')['label']

In [115]:
# Build features with 2011-11 as the test target month (training target: 2011-10).
train, test, y, features = feature_engineering1(df, '2011-11')
#y_oof, test_preds_2011_11, fi = make_lgb_oof_prediction(train, y, test, features, model_params=lgb_params)
#print_score(label_2011_11, test_preds_2011_11)

categorical feature: []
x_tr.shape (5501, 124) , x_te.shape (5722, 124)


In [41]:
# Preview the first rows of the engineered training frame.
train.head()

Unnamed: 0,customer_id,year_month,label,cumsum_total_by_cust_id-mean,cumsum_total_by_cust_id-max,cumsum_total_by_cust_id-min,cumsum_total_by_cust_id-sum,cumsum_total_by_cust_id-count,cumsum_total_by_cust_id-std,cumsum_total_by_cust_id-skew,cumsum_quantity_by_cust_id-mean,cumsum_quantity_by_cust_id-max,cumsum_quantity_by_cust_id-min,cumsum_quantity_by_cust_id-sum,cumsum_quantity_by_cust_id-count,cumsum_quantity_by_cust_id-std,cumsum_quantity_by_cust_id-skew,cumsum_price_by_cust_id-mean,cumsum_price_by_cust_id-max,cumsum_price_by_cust_id-min,cumsum_price_by_cust_id-sum,cumsum_price_by_cust_id-count,cumsum_price_by_cust_id-std,cumsum_price_by_cust_id-skew,cumsum_total_by_prod_id-mean,cumsum_total_by_prod_id-max,cumsum_total_by_prod_id-min,cumsum_total_by_prod_id-sum,cumsum_total_by_prod_id-count,cumsum_total_by_prod_id-std,cumsum_total_by_prod_id-skew,cumsum_quantity_by_prod_id-mean,cumsum_quantity_by_prod_id-max,cumsum_quantity_by_prod_id-min,cumsum_quantity_by_prod_id-sum,cumsum_quantity_by_prod_id-count,cumsum_quantity_by_prod_id-std,cumsum_quantity_by_prod_id-skew,cumsum_price_by_prod_id-mean,cumsum_price_by_prod_id-max,cumsum_price_by_prod_id-min,cumsum_price_by_prod_id-sum,cumsum_price_by_prod_id-count,cumsum_price_by_prod_id-std,cumsum_price_by_prod_id-skew,cumsum_total_by_order_id-mean,cumsum_total_by_order_id-max,cumsum_total_by_order_id-min,cumsum_total_by_order_id-sum,cumsum_total_by_order_id-count,cumsum_total_by_order_id-std,cumsum_total_by_order_id-skew,cumsum_quantity_by_order_id-mean,cumsum_quantity_by_order_id-max,cumsum_quantity_by_order_id-min,cumsum_quantity_by_order_id-sum,cumsum_quantity_by_order_id-count,cumsum_quantity_by_order_id-std,cumsum_quantity_by_order_id-skew,cumsum_price_by_order_id-mean,cumsum_price_by_order_id-max,cumsum_price_by_order_id-min,cumsum_price_by_order_id-sum,cumsum_price_by_order_id-count,cumsum_price_by_order_id-std,cumsum_price_by_order_id-skew,order_ts-mean,order_ts-max,order_ts-min,order_ts-sum,order_ts-count,order_ts
-std,order_ts-skew,order_ts_diff-mean,order_ts_diff-max,order_ts_diff-min,order_ts_diff-sum,order_ts_diff-count,order_ts_diff-std,order_ts_diff-skew,quantity_diff-mean,quantity_diff-max,quantity_diff-min,quantity_diff-sum,quantity_diff-count,quantity_diff-std,quantity_diff-skew,price_diff-mean,price_diff-max,price_diff-min,price_diff-sum,price_diff-count,price_diff-std,price_diff-skew,total_diff-mean,total_diff-max,total_diff-min,total_diff-sum,total_diff-count,total_diff-std,total_diff-skew,order_id-nunique,product_id-nunique
0,12346,2011-10,0,2953.304457,127267.569,-85.371,138805.3095,47.0,18527.811314,6.855058,1631.06383,74268.0,10.0,76660.0,47.0,10825.525671,6.855636,400.641415,936.375,7.425,18830.1465,47.0,262.090504,0.44016,-18933.867255,127352.94,-145015.2,-889891.8,47.0,64968.44567,-1.131166,2354.212766,74215.0,-1644.0,110648.0,47.0,10767.59839,6.742285,63650.017691,325951.263,1.65,2991551.0,47.0,128903.599171,1.619409,24.297128,127352.94,-127352.94,1141.965,47.0,26555.274233,-0.002867,3.914894,74215.0,-74215.0,184.0,47.0,15474.899161,-0.000793,80.73766,412.599,1.65,3794.67,47.0,87.485676,1.486909,1274484000.0,1295346000.0,1260780000.0,59900760000.0,47.0,8367102.0,0.056381,751438.695652,10197900.0,0.0,34566180.0,46.0,2323281.0,3.439852,-1613.586957,74216.0,-148430.0,-74225.0,46.0,24684.548109,-4.214331,-0.1241087,182.3745,-233.1615,-5.709,46.0,56.879462,-0.840908,-2770.156304,127360.365,-254705.88,-127427.19,46.0,42359.030338,-4.214047,17.0,30.0
1,12347,2011-10,1,2546.704976,5613.9435,29.205,417659.616,164.0,1636.394946,0.120206,1031.786585,2099.0,6.0,169213.0,164.0,606.713512,0.118385,317.244293,693.4455,4.8675,52028.064,164.0,207.325091,0.111363,14046.581799,363807.0975,156.3375,2303639.0,164.0,43323.609176,6.372546,4248.969512,45875.0,56.0,696831.0,164.0,7357.64638,3.02885,1940.714909,61460.19,8.25,318277.2,164.0,7252.527668,6.633689,470.958796,1174.4535,21.78,77237.2425,164.0,291.682227,0.353722,199.012195,509.0,4.0,32638.0,164.0,134.23777,0.480564,57.845881,147.8235,0.9075,9486.7245,164.0,35.377236,0.300351,1297744000.0,1312275000.0,1288535000.0,212830100000.0,164.0,8387982.0,0.513752,145644.662577,6120780.0,0.0,23740080.0,163.0,839729.2,5.854925,0.03681,234.0,-228.0,6.0,163.0,27.372576,0.288404,0.05263804,20.13,-16.17,8.58,163.0,4.74731,0.69366,0.810828,382.635,-387.09,132.165,163.0,50.079192,0.000551,6.0,107.0
2,12348,2011-10,0,1096.986,3332.01,21.78,55946.286,51.0,1000.019165,0.812596,1069.45098,2714.0,24.0,54542.0,51.0,902.451734,0.493378,66.215794,318.615,0.9075,3377.0055,51.0,79.524917,1.597336,17500.335471,146857.4085,471.24,892517.1,51.0,27894.026806,3.120231,11485.372549,35932.0,336.0,585754.0,51.0,11056.078952,0.673806,5104.847,95332.3635,25.1955,260347.2,51.0,16507.639799,4.106796,375.299294,1473.12,21.78,19140.264,51.0,300.618991,1.63919,388.392157,1254.0,24.0,19808.0,51.0,333.951917,1.339273,14.115588,81.84,0.4785,719.895,51.0,18.844181,2.593528,1292579000.0,1316956000.0,1285600000.0,65921530000.0,51.0,8085985.0,1.617142,627136.8,14955960.0,0.0,31356840.0,50.0,2475659.0,4.710186,-0.46,138.0,-143.0,-23.0,50.0,63.018692,0.06654,1.30185,65.5215,-65.5215,65.0925,50.0,24.419082,0.251831,0.8844,312.84,-327.096,44.22,50.0,83.855397,-0.069972,5.0,25.0
3,12349,2011-10,0,2014.247369,4367.5335,-39.8475,215524.4685,107.0,1402.86625,0.089354,467.682243,988.0,-5.0,50042.0,107.0,273.727436,-0.077728,615.014523,1484.1585,6.1875,65806.554,107.0,484.322565,0.447134,8876.528131,86587.3965,74.25,949788.5,107.0,12932.316422,3.313619,2352.869159,33089.0,12.0,251757.0,107.0,4244.373963,4.872195,1555.892636,30169.821,24.255,166480.5,107.0,3418.294904,6.236526,959.884724,2314.323,-39.8475,102707.6655,107.0,703.447214,0.321784,224.383178,519.0,-5.0,24009.0,107.0,135.943082,0.24465,239.373056,793.254,2.7225,25612.917,107.0,248.595111,1.326011,1280047000.0,1288254000.0,1259931000.0,136965000000.0,107.0,8875333.0,-0.411039,267200.377358,14077560.0,0.0,28323240.0,106.0,1832081.0,7.131391,0.103774,36.0,-36.0,11.0,106.0,8.632088,-0.091287,-0.03268868,399.3825,-411.0975,-3.465,106.0,71.657823,-0.213999,0.315212,386.265,-395.67,33.4125,106.0,75.363508,-0.089535,4.0,92.0
4,12350,2011-10,0,281.849118,551.76,41.58,4791.435,17.0,161.97922,0.200043,107.941176,197.0,12.0,1835.0,17.0,57.957388,-0.168685,42.133235,107.745,3.465,716.265,17.0,40.745863,0.879343,12878.018029,88027.863,292.05,218926.3,17.0,21329.32616,3.050607,3653.352941,16276.0,60.0,62107.0,17.0,4338.504177,1.810547,3051.218824,41075.826,77.88,51870.72,17.0,9814.784063,4.100398,281.849118,551.76,41.58,4791.435,17.0,161.97922,0.200043,107.941176,197.0,12.0,1835.0,17.0,57.957388,-0.168685,42.133235,107.745,3.465,716.265,17.0,40.745863,0.879343,1296662000.0,1296662000.0,1296662000.0,22043260000.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,12.0,-18.0,0.0,16.0,7.089899,-0.831202,4.440892e-16,63.6075,-62.535,7.105427e-15,16.0,23.067842,0.079587,0.0,37.29,-27.555,0.0,16.0,15.735384,0.404341,1.0,17.0


In [34]:
# Rebuild features with 2011-12 as the test target month (training target: 2011-11),
# then fit LightGBM. This rebinds the globals train/test/y/features used by later cells.
train, test, y, features = feature_engineering1(df, '2011-12')
y_oof, test_preds, fi = make_lgb_oof_prediction(train, y, test, features, model_params=lgb_params)

categorical feature: []
x_tr.shape (5722, 124) , x_te.shape (5914, 124)
fold: 1, x_tr.shape: (5149, 121), x_val.shape: (573, 121)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[32]	training's auc: 0.94303	valid_1's auc: 0.811554
Fold 1 | AUC: 0.8115536963684152
--------------------------------------------------------------------------------
fold: 2, x_tr.shape: (5149, 121), x_val.shape: (573, 121)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[20]	training's auc: 0.914591	valid_1's auc: 0.796653
Fold 2 | AUC: 0.7966534631473736
--------------------------------------------------------------------------------
fold: 3, x_tr.shape: (5150, 121), x_val.shape: (572, 121)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[23]	training's auc: 0.919155	valid_1's auc: 0.839524
Fold 3 | AUC: 0.8395243784066304
-----------------------------------------

In [167]:
#fi = plot_feature_importances(fi)

In [35]:
# Optuna objective functions
def objective(trial, label=label_2011_11):
    """Optuna objective: out-of-fold ROC AUC of LightGBM for one trial.

    Reads the notebook globals `train`, `y`, `test` and `features`, so those
    must be populated by `feature_engineering1` before a study is run.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        label: Ground-truth labels scored against the out-of-fold predictions.
            They must align row-by-row with the global `train`.

    Returns:
        ROC AUC of the out-of-fold predictions against `label`.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform
    # (log=True gives the log-uniform variant).
    lgb_params = {
        'objective' : 'binary',
        'boosting_type' : 'gbdt',
        'num_leaves' : trial.suggest_int('num_leaves', 2, 256),
        'max_depth' : -1,
        'max_bin' : trial.suggest_int('max_bin', 128, 256),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 10, 40),
        'feature_fraction' : trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction' : trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq' : trial.suggest_int('bagging_freq', 1, 7),
        'n_estimators' : 10000,
        'early_stopping_rounds' : 100,
        'lambda_l1' : trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2' : trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'seed' : SEED,
        'verbose' : -1,
        'n_jobs' : -1,
    }

    y_oof, test_preds, fi = make_lgb_oof_prediction(train, y, test, features, model_params=lgb_params)

    val_auc = roc_auc_score(label, y_oof)

    return val_auc

def objective_cat(trial, label=label_2011_11):
    """Optuna objective: out-of-fold ROC AUC of CatBoost for one trial.

    Reads the notebook globals `train`, `y`, `test` and `features`, so those
    must be populated by `feature_engineering1` before a study is run.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        label: Ground-truth labels scored against the out-of-fold predictions.
            They must align row-by-row with the global `train`.

    Returns:
        ROC AUC of the out-of-fold predictions against `label`.
    """
    # 'learning_rate' used to be listed twice with different ranges. Optuna keeps
    # the first suggestion for a repeated parameter name within a trial, so only
    # the first range (0.01 - 0.3) is kept here.
    # suggest_float(log=True) replaces the deprecated suggest_loguniform.
    cat_params = {
        'iterations' : trial.suggest_int('iterations', 50, 300),
        'depth' : trial.suggest_int('depth', 4, 10),
        'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'random_strength' : trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature' : trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter'])
    }

    y_oof, test_preds, fi = make_cat_oof_prediction(train, y, test, features, model_params=cat_params)

    val_auc = roc_auc_score(label, y_oof)

    return val_auc

In [36]:
#study = optuna.create_study(direction = 'maximize')
#study.optimize(objective, n_trials=10)

In [37]:
#study.best_params

In [38]:
#study.best_value

In [39]:
def submission(test_preds, sub_time):
    """Write a submission CSV built from the sample submission file.

    Reads 'input/sample_submission.csv', replaces its `probability` column
    with `test_preds` and writes the result to '../output/<sub_time>'.

    Args:
        test_preds: Predicted probabilities, one per row of the sample
            submission file.
        sub_time: File name of the CSV to write inside '../output'.
    """
    sub = pd.read_csv('input' + '/sample_submission.csv')
    sub['probability'] = test_preds

    # Create the output directory first so the write does not fail when it is missing.
    out_dir = '../output'
    os.makedirs(out_dir, exist_ok=True)
    sub.to_csv(os.path.join(out_dir, sub_time), index=False)

In [40]:
# Write the latest fold-averaged test predictions (global `test_preds`) to ../output/sub17.csv.
submission(test_preds, 'sub17.csv')