# 라이브러리 불러오기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data.load_dataset import load_dataset
from data.merge_dataset import merge_dataset
from data.feature_engineering import apply_log_transformation, ClusteringModel
# from model.inference import save_csv
# from model.feature_select import select_features
# from model.data_split import split_features_and_target
# from model.model_train import cv_train, set_model, optuna_train
import argparse
import os
import wandb
import warnings
warnings.filterwarnings("ignore")

# Data Load

In [2]:
# 기존 데이터 불러오기
train_data, test_data, sample_submission, interest_data, subway_data, school_data, park_data = load_dataset()

# 기존 데이터에 새로운 feature들을 병합한 데이터프레임 불러오기
train_data, test_data = merge_dataset(train_data, test_data, interest_data, subway_data, school_data, park_data)

# Data Preprocessing

In [3]:
# 위치 중복도 낮은 행 삭제
groups = train_data.groupby(["latitude", "longitude"])["index"].count()
conditioned_groups_index = groups[(groups >= 2) & (groups <= 5)].index # 이 범위를 파라미터로 조정하는걸로
small_groups = train_data[
    train_data["latitude"].isin(conditioned_groups_index.get_level_values(0)) &
    train_data["longitude"].isin(conditioned_groups_index.get_level_values(1))
]
train_data.drop(small_groups.index, axis=0, inplace=True)

In [4]:
# built_year > 2024 행 삭제
train_data = train_data[train_data["built_year"] < 2024]
train_data.reset_index(drop=True, inplace=True)

# Feature Engineering

In [5]:
# log 변환
train_data, test_data = apply_log_transformation(train_data, test_data)

#### 가격 Clustering EDA

In [8]:
# 먼저 데이터 정렬 및 인덱스 리셋
sorted_train_data = train_data.sort_values(by="deposit").reset_index(drop=True)
# deposit을 기준으로 그룹을 나눔
# 10,000 미만은 그룹 0, 10,000~100,000 사이는 그룹 1, 100,000 이상은 100,000 단위로 그룹화
def categorize_deposit(deposit):
    if deposit < 10000:
        return 0  # 10,000 미만
    elif deposit <= 100000:
        return 1  # 10,000 ~ 100,000 사이
    elif deposit <= 200000:
        return 2
    elif deposit <= 300000:
        return 3
    elif deposit <= 400000:
        return 4
    elif deposit <= 500000:
        return 5
    elif deposit <= 600000:
        return 6
    else:
        return 7 
# 그룹화 적용
sorted_train_data["deposit_group"] = sorted_train_data["deposit"].apply(categorize_deposit)
# 그룹별 통계 출력
train_data = sorted_train_data
print(train_data.groupby("deposit_group")["deposit"].agg(["min", "max", "mean", "count"]))

                    min       max           mean    count
deposit_group                                            
0                 300.0    9990.0    7131.513486    54576
1               10000.0  100000.0   36449.974768  1689931
2              100100.0  200000.0  131001.424749    42717
3              201500.0  300000.0  238188.343877     2466
4              304000.0  400000.0  347309.405941      303
5              405000.0  500000.0  450325.000000       80
6              505000.0  600000.0  553764.705882       34
7              620000.0  950000.0  734527.777778       18


In [9]:
# Feature Select
selected_cols = [
    "log_area_m2",
    "built_year",
    "latitude",
    "longitude",
    "log_subway_distance",
    "contract_year_month",
    "num_of_subways_within_radius",
    "park_exists",
    "region",
    "region_mean",
]
X, test_data = train_data[selected_cols], test_data[selected_cols]

# Data Split
y = train_data["deposit_group"]

In [12]:
print(f"X: {X.columns}")
print(f"test_data: {test_data.columns}")
print(f"y: {y.shape}")

X: Index(['log_area_m2', 'built_year', 'latitude', 'longitude',
       'log_subway_distance', 'contract_year_month',
       'num_of_subways_within_radius', 'park_exists', 'region', 'region_mean'],
      dtype='object')
test_data: Index(['log_area_m2', 'built_year', 'latitude', 'longitude',
       'log_subway_distance', 'contract_year_month',
       'num_of_subways_within_radius', 'park_exists', 'region', 'region_mean'],
      dtype='object')
y: (1790125,)


# Modeling

In [13]:
import xgboost as xgb
from sklearn.metrics import log_loss, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import optuna
from tqdm import tqdm

In [14]:
run = str(input("run 이름을 입력하세요 :"))
selected_model = str(input("model 명을 입력하세요 (xgb/rf) :"))
opt = bool(input("Optuna 사용 여부를 입력하세요 (뭐라도 입력 시 사용) :"))

wandb.init(
    settings=wandb.Settings(start_method="thread"),
    dir=None,  # 로컬에 로그 저장하지 않음
    entity="remember-us", # team name,
    project="deposit", # project name
    name=run, # run name
    config={
        "User": os.path.basename(os.path.dirname(os.getcwd())) # jupyter는 이렇게
    } # common setting
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdavinkeem[0m ([33mremember-us[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [15]:
def train(model_name, X, y, params):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    match model_name:
        case "xgb-cls":
            model = xgb.XGBClassifier(**params, random_state=42, device="cuda", use_label_encoder=False, n_jobs=-1)
            model.fit(X_train, y_train)

            y_pred_proba = model.predict_proba(X_valid)
            score = log_loss(y_valid, y_pred_proba)
            
        case "xgb-reg":
            model = xgb.XGBRegressor(**params, random_state=42, device="cuda", n_jobs=-1)
            model.fit(X_train, y_train)

            y_pred = model.predict(X_valid)
            score = mean_absolute_error(y_valid, y_pred)
        
        case "rf-cls":
            model = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
            model.fit(X_train, y_train)

            y_pred_proba = model.predict_proba(X_valid)
            score = log_loss(y_valid, y_pred_proba)
        
        case "rf-reg":
            model = RandomForestRegressor(**params, random_state=42, n_jobs=-1)
            model.fit(X_train, y_train)

            y_pred = model.predict(X_valid)
            score = mean_absolute_error(y_valid, y_pred)
            
        case _:
            raise ValueError(f"지원하지 않는 모델 이름: {model_name}")
    
    return score

In [16]:
def cv_train(model_name, X, y, params):
    cv = 5
    kfold = KFold(n_splits=cv, shuffle=True, random_state=42)

    score_list = []
    for i, (train_idx, valid_idx) in enumerate(kfold.split(X, y), start=1):
        X_train, y_train = X.loc[train_idx, :], y.iloc[train_idx]
        X_valid, y_valid = X.loc[valid_idx, :], y.iloc[valid_idx]
        
        match model_name:
            case "xgb-cls":
                model = xgb.XGBClassifier(**params, random_state=42, device="cuda", use_label_encoder=False, n_jobs=-1)
                model.fit(X_train, y_train)

                y_pred_proba = model.predict_proba(X_valid)
                fold_score = log_loss(y_valid, y_pred_proba)
                
            case "xgb-reg":
                model = xgb.XGBRegressor(**params, random_state=42, device="cuda", n_jobs=-1)
                model.fit(X_train, y_train)

                y_pred = model.predict(X_valid)
                fold_score = mean_absolute_error(y_valid, y_pred)
            
            case "rf-cls":
                model = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
                model.fit(X_train, y_train)

                y_pred_proba = model.predict_proba(X_valid)
                fold_score = log_loss(y_valid, y_pred_proba)
            
            case "rf-reg":
                model = RandomForestRegressor(**params, random_state=42, n_jobs=-1)
                model.fit(X_train, y_train)

                y_pred = model.predict(X_valid)
                fold_score = mean_absolute_error(y_valid, y_pred)
                
            case _:
                raise ValueError(f"지원하지 않는 모델 이름: {model_name}")
                
        score_list.append(fold_score)
    mean_score = np.mean(score_list)
    return mean_score

In [17]:
def optuna_train(model_name, X, y):
    match model_name:
        case "xgb-cls":
            def objective(trial):
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
                    "max_depth": trial.suggest_int("max_depth", 3, 12),
                    "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                    "gamma": trial.suggest_float("gamma", 0, 5),
                }

                return train(model_name, X, y, params)

        case "xgb-reg":
            def objective(trial):
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
                    "max_depth": trial.suggest_int("max_depth", 3, 12),
                    "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                    "gamma": trial.suggest_float("gamma", 0, 5),
                }
                
                return train(model_name, X, y, params)
         
        case "rf-cls":
            def objective(trial):
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                    "max_depth": trial.suggest_int("max_depth", 1, 30),  # 깊이를 1에서 30으로 조정
                    "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),  # 최소 샘플 분할 수
                    "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),  # 최소 리프 샘플 수
                }

                return train(model_name, X, y, params)
        
        case "rf-reg":
            def objective(trial):
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                    "max_depth": trial.suggest_int("max_depth", 1, 30),  # 깊이를 1에서 30으로 조정
                    "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),  # 최소 샘플 분할 수
                    "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),  # 최소 리프 샘플 수
                }

                return train(model_name, X, y, params)

        case _:
            raise ValueError(f"지원하지 않는 모델 이름: {model_name}")
    
    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=50)
    return study.best_params, study.best_value

### deposit_group 학습 및 예측

In [18]:
match selected_model:
    case "xgb": model_name = "xgb-cls"
    case "rf": model_name = "rf-cls"

In [19]:
# if opt:
#     best_params, _ = optuna_train(model_name, X, y) # 26m
# else:
best_params = {
    "n_estimators": 233,
    "learning_rate": 0.07852904743105574,
    "max_depth": 11,
    "subsample": 0.6783519848540355,
    "colsample_bytree": 0.8538253356284413,
    "gamma": 0.5541267582675744
}

# valid_logloss = cv_train(model_name, X, y, best_params)
# print(f"Valid logloss = {valid_logloss}")

[I 2024-10-23 14:25:08,976] A new study created in memory with name: no-name-46fcb6ab-0420-4fdc-9be7-d0b1c782c57f
[I 2024-10-23 14:25:22,273] Trial 0 finished with value: 0.061051012982404604 and parameters: {'n_estimators': 144, 'learning_rate': 0.19063571821788408, 'max_depth': 10, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132}. Best is trial 0 with value: 0.061051012982404604.
[I 2024-10-23 14:25:28,752] Trial 1 finished with value: 0.06993587350373705 and parameters: {'n_estimators': 64, 'learning_rate': 0.1745734676972377, 'max_depth': 9, 'subsample': 0.8540362888980227, 'colsample_bytree': 0.5102922471479012, 'gamma': 4.8495492608099715}. Best is trial 0 with value: 0.061051012982404604.
[I 2024-10-23 14:25:45,140] Trial 2 finished with value: 0.09204055996173294 and parameters: {'n_estimators': 258, 'learning_rate': 0.05034443102887247, 'max_depth': 4, 'subsample': 0.5917022549267169, 'colsample_bytree': 0.6521211214797689, 

In [20]:
best_params

{'n_estimators': 233,
 'learning_rate': 0.07852904743105574,
 'max_depth': 11,
 'subsample': 0.6783519848540355,
 'colsample_bytree': 0.8538253356284413,
 'gamma': 0.5541267582675744}

In [21]:
match selected_model:
    case "xgb":
        best_model = xgb.XGBClassifier(**best_params, random_state=42, device="cuda", use_label_encoder=False, n_jobs=-1)
    case "rf":
        best_model = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)

best_model.fit(X, y)

In [22]:
predicted_groups = best_model.predict(test_data)
test_data["predicted_group"] = predicted_groups

### deposit 예측

In [23]:
match selected_model:
    case "xgb": model_name = "xgb-reg"
    case "rf": model_name = "rf-reg"

In [24]:
# reg
group_params = {
    0: {"n_estimators": 269,
    "learning_rate": 0.0594699816408674,
    "max_depth": 11,
    "subsample": 0.7547912219027157,
    "colsample_bytree": 0.7020843771180812,
    "gamma": 3.037806599477243},
    1: {"n_estimators": 276,
    "learning_rate": 0.15579191199373718,
    "max_depth": 12,
    "subsample": 0.909150931054429,
    "colsample_bytree": 0.8709809907337003,
    "gamma": 3.936332525239126},
    2: {"n_estimators": 187,
    "learning_rate": 0.04512234654985014,
    "max_depth": 12,
    "subsample": 0.8875664116805573,
    "colsample_bytree": 0.9697494707820946,
    "gamma": 4.474136752138244},
    3: {"n_estimators": 279,
    "learning_rate": 0.11548075621633985,
    "max_depth": 5,
    "subsample": 0.6857659688575958,
    "colsample_bytree": 0.86707596884712,
    "gamma": 0.2970741820173067},
    4: {"n_estimators": 262,
    "learning_rate": 0.10181884312738954,
    "max_depth": 12,
    "subsample": 0.9636784876731649,
    "colsample_bytree": 0.9301563662590965,
    "gamma": 3.9023500438592036},
    5: {"n_estimators": 144,
    "learning_rate": 0.19063571821788408,
    "max_depth": 10,
    "subsample": 0.7993292420985183,
    "colsample_bytree": 0.5780093202212182,
    "gamma": 0.7799726016810132},
    6: {"n_estimators": 98,
    "learning_rate": 0.13418531015780658,
    "max_depth": 7,
    "subsample": 0.8210566991625188,
    "colsample_bytree": 0.91306660229789,
    "gamma": 1.1997602717553963},
    7: {"n_estimators": 237,
    "learning_rate": 0.1903026381932035,
    "max_depth": 8,
    "subsample": 0.6737126835787389,
    "colsample_bytree": 0.7374821279913889,
    "gamma": 1.1574290155684595}
}

In [25]:
from imblearn.over_sampling import SMOTE, RandomOverSampler

deposit_group_unique = train_data["deposit_group"].unique()

In [26]:
deposit_group_unique

array([0, 1, 2, 3, 4, 5, 6, 7])

In [27]:
def train_regressors_per_group(train_data):
    group_models = {}
    group_params = {}
    group_scores = {}
    group_lens = {}
    for group in tqdm(deposit_group_unique, desc="Training models per group"):
        group_data = train_data[train_data["deposit_group"] == group]

        y_group = group_data["deposit"]
        if group >= deposit_group_unique.max()-2:
            X_group = group_data[["latitude", "longitude"]]
            ros = RandomOverSampler(random_state=42)
            X_group, y_group = ros.fit_resample(X_group, y_group)
        else:
            X_group = group_data[selected_cols]

        # 모델 훈련
        if opt:
            best_params, best_value = optuna_train(model_name, X_group, y_group) # 26m
            group_params[group] = best_params
            group_scores[group] = best_value
        else:
            best_params = group_params.get(group, {})
            score = train(model_name, X_group, y_group, best_params)
            group_scores[group] = score
        
        match selected_model:
            case "xgb": model = xgb.XGBRegressor(**best_params, random_state=42, device="cuda", n_jobs=-1)
            case "rf": model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
        model.fit(X_group, y_group)
        
        # 각 그룹에 해당하는 모델 저장
        group_models[group] = model
        group_lens[group] = len(y_group)
        
    return group_models, group_params, group_scores, group_lens

In [28]:
# 4. `deposit_group` 별로 회귀 모델 훈련
group_models, group_params, group_scores, group_lens = train_regressors_per_group(train_data)

Training models per group:   0%|          | 0/8 [00:00<?, ?it/s][I 2024-10-23 14:45:31,407] A new study created in memory with name: no-name-375ad804-6a6c-4295-bca5-912dd7f1a779
[I 2024-10-23 14:45:34,604] Trial 0 finished with value: 817.1135795596058 and parameters: {'n_estimators': 144, 'learning_rate': 0.19063571821788408, 'max_depth': 10, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132}. Best is trial 0 with value: 817.1135795596058.
[I 2024-10-23 14:45:35,487] Trial 1 finished with value: 865.1740304870717 and parameters: {'n_estimators': 64, 'learning_rate': 0.1745734676972377, 'max_depth': 9, 'subsample': 0.8540362888980227, 'colsample_bytree': 0.5102922471479012, 'gamma': 4.8495492608099715}. Best is trial 0 with value: 817.1135795596058.
[I 2024-10-23 14:45:37,211] Trial 2 finished with value: 1104.6272257367698 and parameters: {'n_estimators': 258, 'learning_rate': 0.05034443102887247, 'max_depth': 4, 'subsample': 0.591702

In [40]:
group_params

{0: {'n_estimators': 269,
  'learning_rate': 0.0594699816408674,
  'max_depth': 11,
  'subsample': 0.7547912219027157,
  'colsample_bytree': 0.7020843771180812,
  'gamma': 3.037806599477243},
 1: {'n_estimators': 276,
  'learning_rate': 0.15579191199373718,
  'max_depth': 12,
  'subsample': 0.909150931054429,
  'colsample_bytree': 0.8709809907337003,
  'gamma': 3.936332525239126},
 2: {'n_estimators': 187,
  'learning_rate': 0.04512234654985014,
  'max_depth': 12,
  'subsample': 0.8875664116805573,
  'colsample_bytree': 0.9697494707820946,
  'gamma': 4.474136752138244},
 3: {'n_estimators': 279,
  'learning_rate': 0.11548075621633985,
  'max_depth': 5,
  'subsample': 0.6857659688575958,
  'colsample_bytree': 0.86707596884712,
  'gamma': 0.2970741820173067},
 4: {'n_estimators': 262,
  'learning_rate': 0.10181884312738954,
  'max_depth': 12,
  'subsample': 0.9636784876731649,
  'colsample_bytree': 0.9301563662590965,
  'gamma': 3.9023500438592036},
 5: {'n_estimators': 144,
  'learning_

In [30]:
group_scores

{0: 794.3067360133992,
 1: 3927.712754652222,
 2: 10142.438444559048,
 3: 15067.828725961539,
 4: 22157.635245901638,
 5: 16451.714488636364,
 6: 16338.940476190477,
 7: 29481.75}

In [31]:
scores = sum(score * group_lens[group] for group, score in group_scores.items())
total_count = sum(group_lens.values())
mean_score = scores / total_count

print(f"Mean MAE: {mean_score:.4f}")

Mean MAE: 4002.4018


In [34]:
def predict_per_group(test_data, group_models):
    # 예측값을 저장할 배열 초기화
    y_pred = np.zeros(len(test_data))
    
    # 그룹별로 데이터 분리 후 예측
    for group, model in group_models.items():
        group_data = test_data[test_data["predicted_group"] == group]
        if group >= deposit_group_unique.max()-2:
            X_group = group_data[["latitude", "longitude"]]
        else:
            X_group = group_data[selected_cols]
        
        # 각 그룹에 대해 예측
        if len(X_group) > 0:  # 해당 그룹에 데이터가 있는 경우만 예측
            y_pred_group = model.predict(X_group)
            y_pred[test_data["predicted_group"] == group] = y_pred_group

    return y_pred

In [35]:
y_test_pred = predict_per_group(test_data, group_models)

In [36]:
group_params = {int(k): v for k, v in group_params.items()}
wandb.log({
    "features": selected_cols,
    "model": selected_model,
    "params": best_params,
    "group_params": group_params,
    # "Valid logloss": valid_logloss,
    "Valid MAE": mean_score,
    "Optuna": opt
})
wandb.finish()

0,1
Valid MAE,▁

0,1
Optuna,True
Valid MAE,4002.40185
model,xgb


# Inference

In [37]:
sample_submission["deposit"] = y_test_pred

In [38]:
sample_submission

Unnamed: 0,index,deposit
0,0,22709.406250
1,1,7512.338867
2,2,5625.015625
3,3,5625.015625
4,4,6254.698730
...,...,...
150167,150167,37650.242188
150168,150168,44235.730469
150169,150169,44235.730469
150170,150170,38394.164062


In [39]:
sample_submission.to_csv("output.csv", index=False)