In [1]:
!git clone https://github.com/DShomin/boostcamp_source.git

Cloning into 'boostcamp_source'...
remote: Enumerating objects: 76, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 76 (delta 25), reused 59 (delta 15), pack-reused 0[K
Unpacking objects: 100% (76/76), done.


In [2]:
!pip install pytorch-tabnet



In [3]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import os, sys, gc, warnings, random

import datetime
import dateutil.relativedelta

# Data manipulation
import pandas as pd 
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, GroupKFold
from sklearn.ensemble import RandomForestClassifier

import lightgbm as lgb

from tqdm.notebook import trange, tqdm

from IPython.display import display

%matplotlib inline

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 100

data_dir = '../input' # os.environ['SM_CHANNEL_TRAIN']
model_dir = '../model' # os.environ['SM_MODEL_DIR']
output_dir = '../output' # os.environ['SM_OUTPUT_DATA_DIR']

In [4]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
SEED = 42
seed_everything(SEED)

In [5]:
TOTAL_THRES = 300

def generate_label(df, year_month, total_thres=TOTAL_THRES, print_log=False):
    df = df.copy()
    
    # year_month에 해당하는 label 데이터 생성
    df['year_month'] = df['order_date'].dt.strftime('%Y-%m')
    df.reset_index(drop=True, inplace=True)

    # year_month 이전 월의 고객 ID 추출
    cust = df[df['year_month']<year_month]['customer_id'].unique()
    # year_month에 해당하는 데이터 선택
    df = df[df['year_month']==year_month]
    
    # label 데이터프레임 생성
    label = pd.DataFrame({'customer_id':cust})
    label['year_month'] = year_month
    
    # year_month에 해당하는 고객 ID의 구매액의 합 계산
    grped = df.groupby(['customer_id','year_month'], as_index=False)[['total']].sum()
    
    # label 데이터프레임과 merge하고 구매액 임계값을 넘었는지 여부로 label 생성
    label = label.merge(grped, on=['customer_id','year_month'], how='left')
    label['total'].fillna(0.0, inplace=True)
    label['label'] = (label['total'] > total_thres).astype(int)

    # 고객 ID로 정렬
    label = label.sort_values('customer_id').reset_index(drop=True)
    if print_log: print(f'{year_month} - final label shape: {label.shape}')
    
    return label

In [6]:
def print_score(label, pred, prob_thres=0.5):
    print('Precision: {:.5f}'.format(precision_score(label, pred>prob_thres)))
    print('Recall: {:.5f}'.format(recall_score(label, pred>prob_thres)))
    print('F1 Score: {:.5f}'.format(f1_score(label, pred>prob_thres)))
    print('ROC AUC Score: {:.5f}'.format(roc_auc_score(label, pred)))

In [7]:
def feature_preprocessing(train, test, features, do_imputing=True):
    x_tr = train.copy()
    x_te = test.copy()
    
    # 범주형 피처 이름을 저장할 변수
    cate_cols = []

    # 레이블 인코딩
    for f in features:
        if x_tr[f].dtype.name == 'object': # 데이터 타입이 object(str)이면 레이블 인코딩
            cate_cols.append(f)
            le = LabelEncoder()
            # train + test 데이터를 합쳐서 레이블 인코딩 함수에 fit
            le.fit(list(x_tr[f].values) + list(x_te[f].values))
            
            # train 데이터 레이블 인코딩 변환 수행
            x_tr[f] = le.transform(list(x_tr[f].values))
            
            # test 데이터 레이블 인코딩 변환 수행
            x_te[f] = le.transform(list(x_te[f].values))

    print('categorical feature:', cate_cols)

    if do_imputing:
        # 중위값으로 결측치 채우기
        imputer = SimpleImputer(strategy='median')

        x_tr[features] = imputer.fit_transform(x_tr[features])
        x_te[features] = imputer.transform(x_te[features])
    
    return x_tr, x_te

In [8]:

def make_time_series_data(df, Input, year_month, stand):
    # 기준을 잡습니다. 기준은 여기서 %Y-%m 입니다.
    standard = ['customer_id'] + [stand]
    data = Input.copy()
    df = df.copy()

    data[stand] = pd.to_datetime(df['order_date']).dt.strftime(stand)
    data.order_date = pd.to_datetime(data['order_date'])

    # 월단위의 틀을 만들어주고, 기준으로 aggregation을 해준 다음에 merge를 해줄 것입니다
    times = pd.date_range('2009-12-01', periods=(data.order_date.max() - data.order_date.min()).days + 1, freq='1d')
    customerid_frame = np.repeat(data.customer_id.unique(), len(times))
    date_frame = np.tile(times, len(data.customer_id.unique()))

    frame = pd.DataFrame({'customer_id': customerid_frame, 'order_date': date_frame})
    frame[stand] = pd.to_datetime(frame.order_date).dt.strftime(stand)

    # group by
    data_group = data.groupby(standard).sum().reset_index()
    frame_group = frame.groupby(standard).count().reset_index().drop(['order_date'], axis=1)

    # merge
    merge = pd.merge(frame_group, data_group, on=standard, how='left').fillna(0)
    merge = merge.rename(columns={stand: 'standard'})

    merge_test = merge[merge['standard'] == year_month].drop(columns=['standard', 'quantity', 'price']) #.drop(merge.columns.tolist() - ['customer_id', 'total'])
    return merge_test

def add_trend(df, year_month):
    df = df.copy()
    df['year_month'] = df['order_date'].dt.strftime('%Y-%m')
    # year_month 이전 월 계산
    d = datetime.datetime.strptime(year_month, "%Y-%m")
    prev_ym = d - dateutil.relativedelta.relativedelta(months=1)
    # train과 test 데이터 생성
    train = df[df['order_date'] < prev_ym]  # 2009-12부터 2011-10 데이터 추출
    test = df[df['order_date'] < year_month]  # 2009-12부터 2011-11 데이터 추출
    train_window_ym = []
    test_window_ym = []
    for month_back in [1, 2, 3, 5, 7, 12, 20, 23]:  # 1개월, 2개월, ... 20개월, 23개월 전 year_month 파악
        train_window_ym.append((prev_ym - dateutil.relativedelta.relativedelta(months=month_back)).strftime('%Y-%m'))
        test_window_ym.append((d - dateutil.relativedelta.relativedelta(months=month_back)).strftime('%Y-%m'))
    # aggregation 함수 선언
    agg_func = ['max', 'min', 'sum', 'mean', 'count', 'std', 'skew']
    # group by aggregation with Dictionary
    agg_dict = {
        'quantity': agg_func,
        'price': agg_func,
        'total': agg_func,
    }
    # general statistics for train data with time series trend
    for i, tr_ym in enumerate(train_window_ym):
        # group by aggretation 함수로 train 데이터 피처 생성
        train_agg = train.loc[train['year_month'] >= tr_ym].groupby(['customer_id']).agg(
            agg_dict)  # 해당 year_month 이후부터 모든 데이터에 대한 aggregation을 실시
        # 멀티 레벨 컬럼을 사용하기 쉽게 1 레벨 컬럼명으로 변경
        new_cols = []
        for level1, level2 in train_agg.columns:
            new_cols.append(f'{level1}-{level2}-{i}')
        train_agg.columns = new_cols
        train_agg.reset_index(inplace=True)

        if i == 0:
            train_data = train_agg
        else:
            train_data = train_data.merge(train_agg, on=['customer_id'], how='right')
    # general statistics for test data with time series trend
    for i, tr_ym in enumerate(test_window_ym):
        # group by aggretation 함수로 test 데이터 피처 생성
        test_agg = test.loc[test['year_month'] >= tr_ym].groupby(['customer_id']).agg(agg_dict)
        # 멀티 레벨 컬럼을 사용하기 쉽게 1 레벨 컬럼명으로 변경
        new_cols = []
        for level1, level2 in test_agg.columns:
            new_cols.append(f'{level1}-{level2}-{i}')
        test_agg.columns = new_cols
        test_agg.reset_index(inplace=True)

        if i == 0:
            test_data = test_agg
        else:
            test_data = test_data.merge(test_agg, on=['customer_id'], how='right')
    return train_data, test_data


def add_seasonality(df, year_month):
    df = df.copy()
    df['year_month'] = df['order_date'].dt.strftime('%Y-%m')
    # year_month 이전 월 계산
    d = datetime.datetime.strptime(year_month, "%Y-%m")
    prev_ym = d - dateutil.relativedelta.relativedelta(months=1)
    # train과 test 데이터 생성
    train = df[df['order_date'] < prev_ym]  # 2009-12부터 2011-10 데이터 추출
    test = df[df['order_date'] < year_month]  # 2009-12부터 2011-11 데이터 추출
    train_window_ym = []
    test_window_ym = []
    for month_back in [1, 6, 12, 18]:  # 각 주기성을 파악하고 싶은 구간을 생성
        train_window_ym.append(
            (
                (prev_ym - dateutil.relativedelta.relativedelta(months=month_back)).strftime('%Y-%m'),
                (prev_ym - dateutil.relativedelta.relativedelta(months=month_back + 2)).strftime('%Y-%m')
            # 1~3, 6~8, 12~14, 18~20 Pair를 만들어준다
            )
        )
        test_window_ym.append(
            (
                (d - dateutil.relativedelta.relativedelta(months=month_back)).strftime('%Y-%m'),
                (d - dateutil.relativedelta.relativedelta(months=month_back + 2)).strftime('%Y-%m')
            )
        )

    # aggregation 함수 선언
    agg_func = ['max', 'min', 'sum', 'mean', 'count', 'std', 'skew']
    # group by aggregation with Dictionary
    agg_dict = {
        'quantity': agg_func,
        'price': agg_func,
        'total': agg_func,
    }
    # seasonality for train data with time series
    for i, (tr_ym, tr_ym_3) in enumerate(train_window_ym):
        # group by aggretation 함수로 train 데이터 피처 생성
        # 구간 사이에 존재하는 월들에 대해서 aggregation을 진행
        train_agg = train.loc[(train['year_month'] >= tr_ym_3) & (train['year_month'] <= tr_ym)].groupby(
            ['customer_id']).agg(agg_dict)
        # 멀티 레벨 컬럼을 사용하기 쉽게 1 레벨 컬럼명으로 변경
        new_cols = []
        for level1, level2 in train_agg.columns:
            new_cols.append(f'{level1}-{level2}-season{i}')
        train_agg.columns = new_cols
        train_agg.reset_index(inplace=True)

        if i == 0:
            train_data = train_agg
        else:
            train_data = train_data.merge(train_agg, on=['customer_id'], how='right')
    # seasonality for test data with time series
    for i, (tr_ym, tr_ym_3) in enumerate(test_window_ym):
        # group by aggretation 함수로 train 데이터 피처 생성
        test_agg = test.loc[(test['year_month'] >= tr_ym_3) & (test['year_month'] <= tr_ym)].groupby(
            ['customer_id']).agg(agg_dict)
        # 멀티 레벨 컬럼을 사용하기 쉽게 1 레벨 컬럼명으로 변경
        new_cols = []
        for level1, level2 in test_agg.columns:
            new_cols.append(f'{level1}-{level2}-season{i}')
        test_agg.columns = new_cols
        test_agg.reset_index(inplace=True)

        if i == 0:
            test_data = test_agg
        else:
            test_data = test_data.merge(test_agg, on=['customer_id'], how='right')

    return train_data, test_data






def feature_engineering2(df, year_month):
    df = df.copy()

    # customer_id 기준으로 pandas group by 후 total, quantity, price 누적합 계산
    df['cumsum_total_by_cust_id'] = df.groupby(['customer_id'])['total'].cumsum()
    df['cumsum_quantity_by_cust_id'] = df.groupby(['customer_id'])['quantity'].cumsum()
    df['cumsum_price_by_cust_id'] = df.groupby(['customer_id'])['price'].cumsum()

    # product_id 기준으로 pandas group by 후 total, quantity, price 누적합 계산
    df['cumsum_total_by_prod_id'] = df.groupby(['product_id'])['total'].cumsum()
    df['cumsum_quantity_by_prod_id'] = df.groupby(['product_id'])['quantity'].cumsum()
    df['cumsum_price_by_prod_id'] = df.groupby(['product_id'])['price'].cumsum()

    # order_id 기준으로 pandas group by 후 total, quantity, price 누적합 계산
    df['cumsum_total_by_order_id'] = df.groupby(['order_id'])['total'].cumsum()
    df['cumsum_quantity_by_order_id'] = df.groupby(['order_id'])['quantity'].cumsum()
    df['cumsum_price_by_order_id'] = df.groupby(['order_id'])['price'].cumsum()

    # oredr_ts
    df['order_ts'] = df['order_date'].astype(np.int64)//1e9
    df['order_ts_diff'] = df.groupby(['customer_id'])['order_ts'].diff()
    df['quantity_diff'] = df.groupby(['customer_id'])['quantity'].diff()
    df['price_diff'] = df.groupby(['customer_id'])['price'].diff()
    df['total_diff'] = df.groupby(['customer_id'])['total'].diff()

    # mode
    df['month-mode'] = df['order_date'].dt.month
    df['year_month-mode'] = df['order_date'].dt.strftime('%Y-%m')

    # oredr_ts_plus ===
    df['order_ts_plus'] = df[df['total'] > 0]['order_date'].astype(np.int64) // 1e9
    df['order_ts_plus_diff'] = df[df['total'] > 0].groupby(['customer_id'])['order_ts'].diff()
    df['order_ts_plus'] = df['order_ts_plus'].fillna(0)
    df['order_ts_plus_diff'] = df['order_ts_plus_diff'].fillna(0)
    # df[~(df.order_id.str.contains('C'))].groupby(['customer_id'])['order_date'].last().astype(np.int64) // 1e9

    # ================================================================================================
    # year_month 이전 월 계산
    d = datetime.datetime.strptime(year_month, "%Y-%m")
    prev_ym = d - dateutil.relativedelta.relativedelta(months=1)
    prev_ym = prev_ym.strftime('%Y-%m')

    # train, test 데이터 선택
    train = df[df['order_date'] < prev_ym]
    test = df[df['order_date'] < year_month]

    # train, test 레이블 데이터 생성
    train_label = generate_label(df, prev_ym)[['customer_id', 'year_month', 'label']]
    test_label = generate_label(df, year_month)[['customer_id', 'year_month', 'label']]




    # ================================================================================================
    # 연월 피처 생성
    target = datetime.datetime.strptime('2011-11', "%Y-%m")  # 타겟 연월
    prev = target - dateutil.relativedelta.relativedelta(years=1)  # 전년 연월
    prev = prev.strftime('%Y-%m')  # 문자열로 변환
    groupby = train.groupby(['customer_id', 'year_month-mode'])['total'].sum()  # 고객별, 월별 total 합
    groupby = groupby.unstack()  # 월별을 컬럼으로 변환
    prev_pprev_total = groupby.loc[:, [prev]]  # 전년, 전전년 데이터만 추출
    prev_pprev_total = prev_pprev_total.fillna(0)

    train_1224 = (prev_pprev_total['2010-11']) / 2


    target = datetime.datetime.strptime('2011-12', "%Y-%m")  # 타겟 연월
    prev = target - dateutil.relativedelta.relativedelta(years=1)  # 전년 연월
    pprev = prev - dateutil.relativedelta.relativedelta(years=1)  # 전전년 연월
    prev, pprev = prev.strftime('%Y-%m'), pprev.strftime('%Y-%m')  # 문자열로 변환
    groupby = test.groupby(['customer_id', 'year_month-mode'])['total'].sum()  # 고객별, 월별 total 합
    groupby = groupby.unstack()  # 월별을 컬럼으로 변환
    prev_pprev_total = groupby.loc[:, [prev, pprev]]  # 전년, 전전년 데이터만 추출
    prev_pprev_total = prev_pprev_total.fillna(0)

    test_1224 = (prev_pprev_total['2010-12'] + prev_pprev_total['2009-12']) / 2


    # ================================================================================================

    # lambda 식
    mode_f = lambda x: x.value_counts().index[0]

    # group by aggregation 함수 선언
    agg_func = ['mean', 'max', 'min', 'sum', 'count', 'std', 'skew']
    # agg_func = ['mean', 'max'] # , 'min', 'sum', 'count', 'std', 'skew']
    agg_dict = {
        'order_ts': ['first', 'last'],
        'order_ts_diff': agg_func,
        'order_ts_plus': ['first', 'last'],
        'order_ts_plus_diff': agg_func,
        'quantity_diff': agg_func,
        'price_diff': agg_func,
        'total_diff': agg_func,
        'quantity': agg_func,
        'price': agg_func,
        'total': agg_func,
        'cumsum_total_by_cust_id': agg_func,
        'cumsum_quantity_by_cust_id': agg_func,
        'cumsum_price_by_cust_id': agg_func,
        'cumsum_total_by_prod_id': agg_func,
        'cumsum_quantity_by_prod_id': agg_func,
        'cumsum_price_by_prod_id': agg_func,
        'cumsum_total_by_order_id': agg_func,
        'cumsum_quantity_by_order_id': agg_func,
        'cumsum_price_by_order_id': agg_func,
        'order_id': ['nunique'],
        'product_id': ['nunique'],
        'month-mode': [mode_f],
        'year_month-mode': [mode_f],
    }
    all_train_data = pd.DataFrame()

    for i, tr_ym in enumerate(train_label['year_month'].unique()):
        # group by aggretation 함수로 train 데이터 피처 생성
        train_agg = train.loc[train['order_date'] < tr_ym].groupby(['customer_id']).agg(agg_dict)

        new_cols = []
        for col in agg_dict.keys():
            for stat in agg_dict[col]:
                if type(stat) is str:
                    new_cols.append(f'{col}-{stat}')
                else:
                    new_cols.append(f'{col}-mode')
        train_agg.columns = new_cols
        train_agg.reset_index(inplace=True)

        train_agg['year_month'] = tr_ym

        all_train_data = all_train_data.append(train_agg)

    all_train_data = train_label.merge(all_train_data, on=['customer_id', 'year_month'], how='left')
    all_train_data['cycle_1224'] = train_1224.to_numpy()

    # ================================================================================================

    data = pd.read_csv("/opt/ml/code/input/train.csv", parse_dates=["order_date"])
    # # baseline feature engineering
    # train, test, y, features = feature_engineering(data, '2011-12')
    # trend
    train_t, test_t = add_trend(data, year_month='2011-12')
    # seasonality
    train_s, test_s = add_seasonality(data, year_month='2011-12')
    # train 데이터 병합
    all_train_data = all_train_data.merge(train_t, on=['customer_id'], how='left')
    all_train_data = all_train_data.merge(train_s, on=['customer_id'], how='left')
    all_train_data = all_train_data.fillna(0)

    # ================================================================================================

    features = all_train_data.drop(columns=['customer_id', 'label', 'year_month']).columns
    print(features.shape)

    import csv
    with open("../output/feature.csv", 'w', newline='') as f:
        writer = csv.writer(f)
        for items in features.tolist():
            print(items)
            writer.writerow([items])

    test_agg = test.groupby(['customer_id']).agg(agg_dict)
    test_agg.columns = new_cols
    test_agg['cycle_1224'] = test_1224

    test_data = test_label.merge(test_agg, on=['customer_id'], how='left')

    # test 데이터 병합 ===================================================================================
    test_data = test_data.merge(test_t, on=['customer_id'], how='left')
    test_data = test_data.merge(test_s, on=['customer_id'], how='left')
    test_data = test_data.fillna(0)

    # train, test 데이터 전처리
    print(all_train_data.shape)
    print(test_data.shape)
    x_tr, x_te = feature_preprocessing(all_train_data, test_data, features)
    print('x_tr.shape', x_tr.shape, ', x_te.shape', x_te.shape)

    return x_tr, x_te, all_train_data['label'], features


In [9]:
def make_lgb_oof_prediction(train, y, test, features, categorical_features='auto', model_params=None, folds=10):
    x_train = train[features]
    x_test = test[features]
    
    # 테스트 데이터 예측값을 저장할 변수
    test_preds = np.zeros(x_test.shape[0])
    
    # Out Of Fold Validation 예측 데이터를 저장할 변수
    y_oof = np.zeros(x_train.shape[0])
    
    # 폴드별 평균 Validation 스코어를 저장할 변수
    score = 0
    
    # 피처 중요도를 저장할 데이터 프레임 선언
    fi = pd.DataFrame()
    fi['feature'] = features
    
    # Stratified K Fold 선언
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        # train index, validation index로 train 데이터를 나눔
        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]
        
        print(f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}')

        # LightGBM 데이터셋 선언
        dtrain = lgb.Dataset(x_tr, label=y_tr)
        dvalid = lgb.Dataset(x_val, label=y_val)
        
        # LightGBM 모델 훈련
        clf = lgb.train(
            model_params,
            dtrain,
            valid_sets=[dtrain, dvalid], # Validation 성능을 측정할 수 있도록 설정
            categorical_feature=categorical_features,
            verbose_eval=200
        )

        # Validation 데이터 예측
        val_preds = clf.predict(x_val)
        
        # Validation index에 예측값 저장 
        y_oof[val_idx] = val_preds
        
        # 폴드별 Validation 스코어 측정
        print(f"Fold {fold + 1} | AUC: {roc_auc_score(y_val, val_preds)}")
        print('-'*80)

        # score 변수에 폴드별 평균 Validation 스코어 저장
        score += roc_auc_score(y_val, val_preds) / folds
        
        # 테스트 데이터 예측하고 평균해서 저장
        test_preds += clf.predict(x_test) / folds
        
        # 폴드별 피처 중요도 저장
        fi[f'fold_{fold+1}'] = clf.feature_importance()

        del x_tr, x_val, y_tr, y_val
        gc.collect()
        
    print(f"\nMean AUC = {score}") # 폴드별 Validation 스코어 출력
    print(f"OOF AUC = {roc_auc_score(y, y_oof)}") # Out Of Fold Validation 스코어 출력
        
    # 폴드별 피처 중요도 평균값 계산해서 저장 
    fi_cols = [col for col in fi.columns if 'fold_' in col]
    fi['importance'] = fi[fi_cols].mean(axis=1)
    
    return y_oof, test_preds, fi

In [10]:
def plot_feature_importances(df, n=20, color='blue', figsize=(12,8)):
    # 피처 중요도 순으로 내림차순 정렬
    df = df.sort_values('importance', ascending = False).reset_index(drop = True)
    
    # 피처 중요도 정규화 및 누적 중요도 계산
    df['importance_normalized'] = df['importance'] / df['importance'].sum()
    df['cumulative_importance'] = np.cumsum(df['importance_normalized'])
    
    plt.rcParams['font.size'] = 12
    plt.style.use('fivethirtyeight')
    # 피처 중요도 순으로 n개까지 바플롯으로 그리기
    df.loc[:n, :].plot.barh(y='importance_normalized', 
                            x='feature', color=color, 
                            edgecolor='k', figsize=figsize,
                            legend=False)

    plt.xlabel('Normalized Importance', size=18); plt.ylabel(''); 
    plt.title(f'Top {n} Most Important Features', size=18)
    plt.gca().invert_yaxis()
    
    return df

In [11]:
# label_2011_11 = generate_label(data, '2011-11')['label']


# # ## Label 데이터 분포 플롯
# # sns.countplot(label_2011_11);
# # label_2011_11.value_counts()

In [12]:
model_params = {
    'objective': 'binary', # 이진 분류
    'boosting_type': 'gbdt',
    'metric': 'auc', # 평가 지표 설정
    'feature_fraction': 0.8, # 피처 샘플링 비율
    'bagging_fraction': 0.8, # 데이터 샘플링 비율
    'bagging_freq': 1,
    'n_estimators': 10000, # 트리 개수(반복횟수)
    'early_stopping_rounds': 100,
    'seed': SEED,
    'verbose': -1,
    'n_jobs': -1,    
}

In [13]:
# train, test, y, features = feature_engineering2(data, '2011-11')

# y_oof, test_preds_2011_11, fi = make_lgb_oof_prediction(train, y, test, features, model_params=model_params)
# print_score(label_2011_11, test_preds_2011_11)

In [15]:
data = pd.read_csv(data_dir + '/train.csv', parse_dates=['order_date'])
train, test, y, features = feature_engineering2(data, '2011-12')
# y_oof, test_preds, fi = make_lgb_oof_prediction(train, y, test, features, model_params=model_params)

(380,)
order_ts-first
order_ts-last
order_ts_diff-mean
order_ts_diff-max
order_ts_diff-min
order_ts_diff-sum
order_ts_diff-count
order_ts_diff-std
order_ts_diff-skew
order_ts_plus-first
order_ts_plus-last
order_ts_plus_diff-mean
order_ts_plus_diff-max
order_ts_plus_diff-min
order_ts_plus_diff-sum
order_ts_plus_diff-count
order_ts_plus_diff-std
order_ts_plus_diff-skew
quantity_diff-mean
quantity_diff-max
quantity_diff-min
quantity_diff-sum
quantity_diff-count
quantity_diff-std
quantity_diff-skew
price_diff-mean
price_diff-max
price_diff-min
price_diff-sum
price_diff-count
price_diff-std
price_diff-skew
total_diff-mean
total_diff-max
total_diff-min
total_diff-sum
total_diff-count
total_diff-std
total_diff-skew
quantity-mean
quantity-max
quantity-min
quantity-sum
quantity-count
quantity-std
quantity-skew
price-mean
price-max
price-min
price-sum
price-count
price-std
price-skew
total-mean
total-max
total-min
total-sum
total-count
total-std
total-skew
cumsum_total_by_cust_id-mean
cumsum_tot

In [16]:

# Quantile 실험
from sklearn.preprocessing import QuantileTransformer
# quan = QuantileTransformer(n_quantiles=2000, output_distribution='normal',random_state=42)
quan = QuantileTransformer(n_quantiles=1000, random_state=42)
X_quan = quan.fit_transform(train[features])
Y_quan = quan.fit_transform(test[features])
x_quan = pd.DataFrame(X_quan, columns=features)
y_quan = pd.DataFrame(Y_quan, columns=features)
train[features] = x_quan[features]
test[features] = y_quan[features]

In [17]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.pretraining import TabNetPretrainer
import torch

In [18]:
# clf = TabNetClassifier(
#     optimizer_fn=torch.optim.Adam,
#     optimizer_params=dict(lr=2e-2),
#     scheduler_params={"step_size":10, # how to use learning rate scheduler
#                       "gamma":0.9},
#     scheduler_fn=torch.optim.lr_scheduler.StepLR,
#     mask_type='sparsemax' # This will be overwritten if using pretrain model
# )

# clf.fit(
#     X_train=train[features].values, y_train=y,
#     eval_set= [(train[features].values, y)],
#     eval_name=['train'], #, 'valid'],
#     eval_metric=['auc'],
# )

In [19]:
unsupervised_model = TabNetPretrainer(
    optimizer_fn = torch.optim.Adam,
    optimizer_params = dict(lr=2e-2),
    mask_type='sparsemax'
)

unsupervised_model.fit(
    X_train = train[features].values,
    pretraining_ratio=0.8,
)

clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, # how to use learning rate scheduler
                      "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax' # This will be overwritten if using pretrain model
)

clf.fit(
    X_train=train[features].values, y_train=y,
    eval_set= [(train[features].values, y)],
    eval_name=['train'], #, 'valid'],
    eval_metric=['auc'],
    from_unsupervised = unsupervised_model
)

Device used : cuda
No early stopping will be performed, last training weights will be used.
epoch 0  | loss: 761507.50805|  0:00:00s
epoch 1  | loss: 150351.20642|  0:00:01s
epoch 2  | loss: 45443.30082|  0:00:02s
epoch 3  | loss: 12429.08759|  0:00:02s
epoch 4  | loss: 2929.05939|  0:00:03s
epoch 5  | loss: 479.02132|  0:00:04s
epoch 6  | loss: 109.1671|  0:00:04s
epoch 7  | loss: 29.0335 |  0:00:05s
epoch 8  | loss: 31.49836|  0:00:05s
epoch 9  | loss: 17.04393|  0:00:06s
epoch 10 | loss: 6.89258 |  0:00:06s
epoch 11 | loss: 7.76593 |  0:00:07s
epoch 12 | loss: 8.30071 |  0:00:07s
epoch 13 | loss: 5.47485 |  0:00:08s
epoch 14 | loss: 4.33075 |  0:00:09s
epoch 15 | loss: 3.9328  |  0:00:09s
epoch 16 | loss: 3.64508 |  0:00:10s
epoch 17 | loss: 3.57547 |  0:00:10s
epoch 18 | loss: 3.3733  |  0:00:11s
epoch 19 | loss: 3.24925 |  0:00:11s
epoch 20 | loss: 2.92094 |  0:00:12s
epoch 21 | loss: 2.74688 |  0:00:13s
epoch 22 | loss: 2.40697 |  0:00:13s
epoch 23 | loss: 2.15205 |  0:00:14s
epo

In [20]:
pred = clf.predict_proba(test[features].values)

In [22]:
output_dir = '../output'
os.makedirs(output_dir, exist_ok=True)
submission = pd.read_csv('../input/sample_submission.csv')
submission['probability'] = pred
submission.to_csv(os.path.join(output_dir, 'output20.csv'), index=False)


In [None]:
fi = plot_feature_importances(fi)