In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import  LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.metrics import classification_report
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgb

import optuna
from optuna.integration import XGBoostPruningCallback
from optuna.samplers import TPESampler
from joblib import dump

from util import make_overheat, make_adjust

In [47]:
df = pd.read_csv('dataset.csv')

# 인코딩
le1 = LabelEncoder()
df['광역'] = le1.fit_transform(df['광역'])
le2 = LabelEncoder()
df['시군구'] = le2.fit_transform(df['시군구'])

dump(le1, 'save/gwangyeok.joblib')
dump(le2, 'save/sigungu.joblib')


# 계약년월 기준으로 정렬해야 TimeSeriesSplit을 쓸 수 있음
df = df.sort_values(['계약년월'])

remove_features = ['target70', 'target80', 'target90']
df = df.drop(columns=remove_features)
df

Unnamed: 0,시군구,계약년월,면적당보증금,조정대상지역,투기과열지구,광역,target60
0,0,201908,207.164427,0,0,0,1
80348,1674,201908,147.397491,0,0,12,1
18191,379,201908,202.168085,0,0,1,1
80300,1673,201908,110.380435,0,0,12,1
18239,380,201908,211.076429,0,0,1,1
...,...,...,...,...,...,...,...
94267,1963,202307,279.589133,0,0,16,1
74395,1549,202307,334.634847,0,0,11,1
24910,518,202307,211.374951,0,0,1,1
74347,1548,202307,320.703608,0,0,11,1


#### target60 학습

In [48]:
# target 순서대로 for 문 돌릴 수 있게 해줘야함
target = 'target60'

x = df.drop(columns=target)
y = df[target]

In [49]:
# train_test 분리
# df가 계약년월 기준 오름차순 데이터이기 때문에 shuffle없이 split해야함. 시계열 데이터 특성 상 과거 데이터는 학습, 최신 데이터는 평가에 쓰기 때문
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, shuffle=False)

In [50]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
    }

    model = xgb.XGBClassifier(**params, n_jobs=-1)
    tscv = TimeSeriesSplit(n_splits=10)
    recalls = []
        
    for train_index, valid_index in tscv.split(x_train):
        train_x, val_x = x_train.iloc[train_index], x_train.iloc[valid_index]
        train_y, val_y = y_train.iloc[train_index], y_train.iloc[valid_index]

        model.fit(train_x, train_y)
        y_pred = model.predict(val_x)
        recalls.append(recall_score(y_pred, val_y))

    return np.mean(recalls)

if __name__ == "__main__":
    # Optuna 스터디 시작
    study = optuna.create_study(direction='maximize')  # recall maximize
    study.optimize(objective, n_trials=100)

    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2023-08-28 01:50:01,584] A new study created in memory with name: no-name-d02ff500-aa5b-466b-b736-ab9a6fa23ea6
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2023-08-28 01:50:08,595] Trial 0 finished with value: 0.8534563840655128 and parameters: {'n_estimators': 90, 'max_depth': 7, 'learning_rate': 0.0018835609432601855, 'subsample': 0.7903210787000047, 'colsample_bytree': 0.9837011820989212, 'gamma': 0.4557689392990189, 'lambda': 1.2911347970478077e-05, 'alpha': 1.9369202355982906e-06, 'min_child_weight': 4.982823812196141}. Best is trial 0 with value: 0.8534563840655128.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2023-08-28 01:50:24,078] Trial 1 finished with value: 0.87514583174118 and parameters: {'n_estimators': 284, 'max_depth': 5, 'learning_rate': 0.04061881979869378, 'subsample': 0.7074285029421654, 'colsample_bytree': 0.9695180404905458, 'gamma': 0.377029359181757, 'lambda': 8.468148137393924e-07, 'alpha': 1.9

Number of finished trials: 100
Best trial:
  Value: 0.8995770393783195
  Params: 
    n_estimators: 228
    max_depth: 10
    learning_rate: 0.0768583812353364
    subsample: 0.9662229301474877
    colsample_bytree: 0.9785005736064512
    gamma: 0.30686503400500115
    lambda: 3.522762852015552e-08
    alpha: 2.2796151107527633e-08
    min_child_weight: 1.0809349434491105


#### target60 평가성능

In [51]:
best_params = study.best_params
sota60 = xgb.XGBClassifier(**best_params, n_jobs=-1)
tscv = TimeSeriesSplit(n_splits=10)

for train_index, valid_index in tscv.split(x_train):
    train_x, val_x = x_train.iloc[train_index], x_train.iloc[valid_index]
    train_y, val_y = y_train.iloc[train_index], y_train.iloc[valid_index]
    sota60.fit(train_x, train_y)

y_pred_test = sota60.predict(x_test)

# 모델 저장
dump(sota60, 'save/apt_target60.joblib')

# 평가 성능 확인
report = classification_report(y_test, y_pred_test)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.28      0.41      4941
           1       0.80      0.97      0.88     14345

    accuracy                           0.80     19286
   macro avg       0.79      0.63      0.65     19286
weighted avg       0.80      0.80      0.76     19286



#### target70 학습

In [23]:
df = pd.read_csv('dataset.csv')

# 인코딩
le1 = LabelEncoder()
df['광역'] = le1.fit_transform(df['광역'])
le2 = LabelEncoder()
df['시군구'] = le2.fit_transform(df['시군구'])

# log 변환
df['log_면적당보증금'] = np.log(df['면적당보증금'])
df.drop(columns='면적당보증금', inplace=True)

# 계약년월 기준으로 정렬해야 TimeSeriesSplit을 쓸 수 있음
df = df.sort_values(['계약년월'])

remove_features = ['target60', 'target80', 'target90']
df_temp = df.drop(columns=remove_features)
df_temp

Unnamed: 0,시군구,계약년월,조정대상지역,투기과열지구,광역,target70,log_면적당보증금
0,0,201908,0,0,0,1,5.333513
80348,1674,201908,0,0,12,0,4.993133
18191,379,201908,0,0,1,1,5.309099
80300,1673,201908,0,0,12,1,4.703933
18239,380,201908,0,0,1,1,5.352220
...,...,...,...,...,...,...,...
94267,1963,202307,0,0,16,1,5.633321
74395,1549,202307,0,0,11,0,5.813040
24910,518,202307,0,0,1,0,5.353634
74347,1548,202307,0,0,11,0,5.770517


In [24]:
# target 순서대로 for 문 돌릴 수 있게 해줘야함
target = 'target70'

x = df_temp.drop(columns=target)
y = df_temp[target]

# train_test 분리
# df가 계약년월 기준 오름차순 데이터이기 때문에 shuffle없이 split해야함. 시계열 데이터 특성 상 과거 데이터는 학습, 최신 데이터는 평가에 쓰기 때문
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.9, shuffle=False)

In [27]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 250),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBClassifier(**params, n_jobs=-1)
    tscv = TimeSeriesSplit(n_splits=10)
    recalls = []
        
    for train_index, valid_index in tscv.split(x_train):
        train_x, val_x = x_train.iloc[train_index], x_train.iloc[valid_index]
        train_y, val_y = y_train.iloc[train_index], y_train.iloc[valid_index]

        model.fit(train_x, train_y)
        y_pred = model.predict(val_x)
        recalls.append(recall_score(y_pred, val_y))

    return np.mean(recalls)

if __name__ == "__main__":
    # Optuna 스터디 시작
    study = optuna.create_study(direction='maximize')  # recall maximize
    study.optimize(objective, n_trials=100)

    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2023-08-27 22:57:55,425] A new study created in memory with name: no-name-a48ba53b-2abf-4167-b0bf-728f902e452d
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.2),
[I 2023-08-27 22:58:05,118] Trial 0 finished with value: 0.7443484715617424 and parameters: {'n_estimators': 84, 'max_depth': 10, 'learning_rate': 0.0027570934106791816, 'subsample': 0.8070572829440938, 'colsample_bytree': 0.7934128361337476, 'gamma': 0.45445372825391667, 'lambda': 2.3403298607193645e-06, 'alpha': 0.018827654173084296, 'min_child_weight': 1}. Best is trial 0 with value: 0.7443484715617424.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.2),
[I 2023-08-27 22:58:19,647] Trial 1 finished with value: 0.7211726154158506 and parameters: {'n_estimators': 205, 'max_depth': 7, 'learning_rate': 0.002701594518657209, 'subsample': 0.6311453347412965, 'colsample_bytree': 0.6995630561551834, 'gamma': 0.48521137006908477, 'lambda': 6.46742160054717e-07, 'alpha': 0.4498895884376

Number of finished trials: 100
Best trial:
  Value: 0.8170352885590756
  Params: 
    n_estimators: 215
    max_depth: 10
    learning_rate: 0.16077919807440758
    subsample: 0.9135532693912264
    colsample_bytree: 0.7197754959056628
    gamma: 0.32176155905630405
    lambda: 8.057588898109088e-07
    alpha: 0.0013400305230964816
    min_child_weight: 1


#### target70 평가 성능

In [28]:
best_params = study.best_params
sota70 = xgb.XGBClassifier(**best_params, n_jobs=-1)
tscv = TimeSeriesSplit(n_splits=10)

for train_index, valid_index in tscv.split(x_train):
    train_x, val_x = x_train.iloc[train_index], x_train.iloc[valid_index]
    train_y, val_y = y_train.iloc[train_index], y_train.iloc[valid_index]
    sota70.fit(train_x, train_y)

y_pred_test = sota70.predict(x_test)

# 모델 저장
dump(sota70, 'save/apt_target70.joblib')

# 평가 성능 확인
report = classification_report(y_test, y_pred_test)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.69      0.74      4911
           1       0.72      0.83      0.77      4732

    accuracy                           0.76      9643
   macro avg       0.76      0.76      0.76      9643
weighted avg       0.76      0.76      0.76      9643



#### target80 학습

In [19]:
df = pd.read_csv('dataset.csv')

# 인코딩
df['광역'] = le1.fit_transform(df['광역'])
df['시군구'] = le2.fit_transform(df['시군구'])

# log 변환
df['log_면적당보증금'] = np.log(df['면적당보증금'])
df.drop(columns='면적당보증금', inplace=True)

# 계약년월 기준으로 정렬해야 TimeSeriesSplit을 쓸 수 있음
df = df.sort_values(['계약년월'])

remove_features = ['target60', 'target70', 'target90']
df_temp = df.drop(columns=remove_features)
df_temp

Unnamed: 0,시군구,계약년월,조정대상지역,투기과열지구,광역,target80,log_면적당보증금
0,0,201908,0,0,0,1,5.333513
80348,1674,201908,0,0,12,0,4.993133
18191,379,201908,0,0,1,1,5.309099
80300,1673,201908,0,0,12,1,4.703933
18239,380,201908,0,0,1,1,5.352220
...,...,...,...,...,...,...,...
94267,1963,202307,0,0,16,1,5.633321
74395,1549,202307,0,0,11,0,5.813040
24910,518,202307,0,0,1,0,5.353634
74347,1548,202307,0,0,11,0,5.770517


In [20]:
# target 순서대로 for 문 돌릴 수 있게 해줘야함
target = 'target80'

x = df_temp.drop(columns=target)
y = df_temp[target]

# train_test 분리
# df가 계약년월 기준 오름차순 데이터이기 때문에 shuffle없이 split해야함. 시계열 데이터 특성 상 과거 데이터는 학습, 최신 데이터는 평가에 쓰기 때문
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.9, shuffle=False)

In [21]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 250),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.2),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }


    model = xgb.XGBClassifier(**params, n_jobs=-1)
    tscv = TimeSeriesSplit(n_splits=10)
    recalls = []
        
    for train_index, valid_index in tscv.split(x_train):
        train_x, val_x = x_train.iloc[train_index], x_train.iloc[valid_index]
        train_y, val_y = y_train.iloc[train_index], y_train.iloc[valid_index]

        model.fit(train_x, train_y)
        y_pred = model.predict(val_x)
        recalls.append(recall_score(y_pred, val_y))

    return np.mean(recalls)

if __name__ == "__main__":
    # Optuna 스터디 시작
    study = optuna.create_study(direction='maximize')  # recall maximize
    study.optimize(objective, n_trials=100)

    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2023-08-27 22:18:01,478] A new study created in memory with name: no-name-cfa2d684-fa76-44cf-a5e0-fa946cd0c372
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.2),
[I 2023-08-27 22:18:09,504] Trial 0 finished with value: 0.6224981260693547 and parameters: {'n_estimators': 115, 'max_depth': 6, 'learning_rate': 0.0018934838487485216, 'subsample': 0.8866450422181953, 'colsample_bytree': 0.6952118498762563, 'gamma': 0.1805684021835352, 'lambda': 3.653479829327894e-05, 'alpha': 0.23263189976140297, 'min_child_weight': 6}. Best is trial 0 with value: 0.6224981260693547.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.2),
[I 2023-08-27 22:18:20,675] Trial 1 finished with value: 0.6723612202945921 and parameters: {'n_estimators': 110, 'max_depth': 10, 'learning_rate': 0.0012966936289925647, 'subsample': 0.5982944565704811, 'colsample_bytree': 0.7981836687447479, 'gamma': 0.040267401236542155, 'lambda': 1.0223149067531458e-08, 'alpha': 2.00813469083

Number of finished trials: 100
Best trial:
  Value: 0.7266520995073746
  Params: 
    n_estimators: 225
    max_depth: 12
    learning_rate: 0.08434351267106448
    subsample: 0.9603746784124344
    colsample_bytree: 0.6839618527946499
    gamma: 0.18450648730656335
    lambda: 0.03916943898020069
    alpha: 0.007893206304307562
    min_child_weight: 1


#### target80 평가성능

In [22]:
best_params = study.best_params
sota80 = xgb.XGBClassifier(**best_params)
tscv = TimeSeriesSplit(n_splits=10)

for train_index, valid_index in tscv.split(x_train):
    train_x, val_x = x_train.iloc[train_index], x_train.iloc[valid_index]
    train_y, val_y = y_train.iloc[train_index], y_train.iloc[valid_index]
    sota80.fit(train_x, train_y)

y_pred_test = sota80.predict(x_test)

# 모델 저장
dump(sota80, 'save/apt_target80.joblib')

# 평가 성능 확인
report = classification_report(y_test, y_pred_test)
print(report)


              precision    recall  f1-score   support

           0       0.88      0.83      0.86      6925
           1       0.63      0.71      0.67      2718

    accuracy                           0.80      9643
   macro avg       0.75      0.77      0.76      9643
weighted avg       0.81      0.80      0.80      9643



#### target90 학습

In [42]:
df = pd.read_csv('dataset.csv')

# 인코딩
df['광역'] = le1.fit_transform(df['광역'])
df['시군구'] = le2.fit_transform(df['시군구'])

# 계약년월 기준으로 정렬해야 TimeSeriesSplit을 쓸 수 있음
df = df.sort_values(['계약년월'])

remove_features = ['target60', 'target70', 'target80']
df_temp = df.drop(columns=remove_features)
df_temp

Unnamed: 0,시군구,계약년월,면적당보증금,조정대상지역,투기과열지구,광역,target90
0,0,201908,207.164427,0,0,0,1
80348,1674,201908,147.397491,0,0,12,0
18191,379,201908,202.168085,0,0,1,0
80300,1673,201908,110.380435,0,0,12,1
18239,380,201908,211.076429,0,0,1,0
...,...,...,...,...,...,...,...
94267,1963,202307,279.589133,0,0,16,1
74395,1549,202307,334.634847,0,0,11,0
24910,518,202307,211.374951,0,0,1,0
74347,1548,202307,320.703608,0,0,11,0


In [45]:
# target 순서대로 for 문 돌릴 수 있게 해줘야함
target = 'target90'

x = df_temp.drop(columns=target)
y = df_temp[target]

# train_test 분리
# df가 계약년월 기준 오름차순 데이터이기 때문에 shuffle없이 split해야함. 시계열 데이터 특성 상 과거 데이터는 학습, 최신 데이터는 평가에 쓰기 때문
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, shuffle=False)

In [46]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBClassifier(**params, n_jobs=-1)
    tscv = TimeSeriesSplit(n_splits=10)
    recalls = []
        
    for train_index, valid_index in tscv.split(x_train):
        train_x, val_x = x_train.iloc[train_index], x_train.iloc[valid_index]
        train_y, val_y = y_train.iloc[train_index], y_train.iloc[valid_index]

        model.fit(train_x, train_y)
        y_pred = model.predict(val_x)
        recalls.append(recall_score(y_pred, val_y))

    return np.mean(recalls)

if __name__ == "__main__":
    # Optuna 스터디 시작
    study = optuna.create_study(direction='maximize')  # recall maximize
    study.optimize(objective, n_trials=100)

    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2023-08-28 01:48:40,209] A new study created in memory with name: no-name-0a1fa1db-0bc1-4245-a3f7-75f17b1db2d1
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2023-08-28 01:48:45,807] Trial 0 finished with value: 0.7447517168321778 and parameters: {'n_estimators': 66, 'max_depth': 9, 'learning_rate': 0.0026542495141854126, 'subsample': 0.8078988722807421, 'colsample_bytree': 0.7627150315602176, 'gamma': 0.3769358268585264, 'lambda': 6.527430051566336e-06, 'alpha': 0.001216700758536511, 'min_child_weight': 4}. Best is trial 0 with value: 0.7447517168321778.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
[I 2023-08-28 01:48:57,822] Trial 1 finished with value: 0.6426801339315388 and parameters: {'n_estimators': 264, 'max_depth': 4, 'learning_rate': 0.052589856918729506, 'subsample': 0.7255022783431117, 'colsample_bytree': 0.9208288958408274, 'gamma': 0.12069802634244242, 'lambda': 0.019504375684092565, 'alpha': 1.0785567401547852

KeyboardInterrupt: 

In [None]:
"""  
  Value: 0.8804029304029305
  Params: 
    n_estimators: 128
    max_depth: 4
    learning_rate: 0.02698777567422039
    subsample: 0.6662096314670163
    colsample_bytree: 0.9217544827819781
    gamma: 0.27038755382214374
    lambda: 0.0002866567623589203
    alpha: 0.0003088636857809953
    min_child_weight: 3.2029904586284736
"""

#### target90 평가성능

In [None]:
best_params = study.best_params
sota90 = xgb.XGBClassifier(**best_params)
tscv = TimeSeriesSplit(n_splits=10)

for train_index, valid_index in tscv.split(x_train):
    train_x, val_x = x_train.iloc[train_index], x_train.iloc[valid_index]
    train_y, val_y = y_train.iloc[train_index], y_train.iloc[valid_index]
    sota90.fit(train_x, train_y)

y_pred_test = sota90.predict(x_test)

# 모델 저장
dump(sota90, 'save/apt_target90.joblib')

# 평가 성능 확인
report = classification_report(y_test, y_pred_test)
print(report)

              precision    recall  f1-score   support

           0       0.83      1.00      0.91     15970
           1       1.00      0.00      0.00      3316

    accuracy                           0.83     19286
   macro avg       0.91      0.50      0.45     19286
weighted avg       0.86      0.83      0.75     19286

