# 라이브러리 불러오기

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data.load_dataset import load_dataset
from data.merge_dataset import merge_dataset
from data.feature_engineering import ClusteringModel
# from model.inference import save_csv
from model.feature_select import select_features
from model.data_split import split_features_and_target
from model.log_transformation import apply_log_transformation
# from model.model_train import cv_train, set_model, optuna_train
import argparse
import os
import wandb
import warnings
warnings.filterwarnings("ignore")

# Data Load

In [37]:
# Load the original datasets
train_data, test_data, sample_submission, interest_data, subway_data, school_data, park_data = load_dataset()

# Build the train/test frames with the additional features merged in
train_data, test_data = merge_dataset(train_data, test_data, interest_data, subway_data, school_data, park_data)

# Data Preprocessing

In [38]:
# Drop rows whose exact (latitude, longitude) location occurs between
# min_group_size and max_group_size times (inclusive) in the training data.
# The group size is attached to every row and filtered per row: testing
# latitude and longitude membership independently would also match
# cross-combinations (latitude of one small group with the longitude of
# another) and drop rows that belong to other locations.
min_group_size, max_group_size = 2, 5  # tune this range as a parameter
location_counts = train_data.groupby(["latitude", "longitude"])["index"].transform("count")
small_groups = train_data[location_counts.between(min_group_size, max_group_size)]
train_data.drop(small_groups.index, axis=0, inplace=True)

In [39]:
# Keep only rows with built_year strictly below 2024
# (rows with built_year >= 2024 are removed), then renumber the index.
built_before_2024 = train_data["built_year"] < 2024
train_data = train_data.loc[built_before_2024].reset_index(drop=True)

# Feature Engineering

In [40]:
# Clustering on the (latitude, longitude) coordinates of the training rows.
# NOTE(review): the following cell groups train_data by 'region', so
# kmeans_clustering presumably writes the cluster label into train_data and
# test_data under `label_column` — confirm in data.feature_engineering.
cluster_data = train_data[["latitude", "longitude"]]
clustering_model = ClusteringModel(cluster_data)
kmeans_model = clustering_model.kmeans_clustering(
    n_clusters = 25,
    train_data = train_data,
    test_data = test_data,
    feature_columns = ["latitude", "longitude"],
    label_column = 'region'
)

In [41]:
# Attach the mean training deposit of each region to both frames as 'region_mean'.
region_mean = (
    train_data.groupby('region')['deposit']
    .mean()
    .rename('region_mean')
    .reset_index()
)
train_data = pd.merge(train_data, region_mean, on='region', how='left')
test_data = pd.merge(test_data, region_mean, on='region', how='left')

In [42]:
# Log transformation of the train and test frames (see model.log_transformation)
train_data, test_data = apply_log_transformation(train_data, test_data)

#### 가격 Clustering EDA

In [43]:
# sorted_train_data = train_data.sort_values(by="deposit").reset_index(drop=True)
# sorted_train_data["deposit_group"] = sorted_train_data.index // 180000
# train_data = sorted_train_data
# print(train_data.groupby('deposit_group')['deposit'].agg(['min', 'max', 'mean', 'count']))

In [None]:
# Sort the data by deposit and reset the index first
sorted_train_data = train_data.sort_values(by="deposit").reset_index(drop=True)
# Split the rows into groups based on deposit:
# below 10,000 -> group 0, 10,000 to 100,000 -> group 1, above 100,000 -> one group per 100,000-wide band
def categorize_deposit(deposit):
    """Return the price-band label (0-7) for a single deposit value.

    Band 0 is anything below 10,000. Bands 1-6 are closed above by
    100,000, 200,000, ..., 600,000 respectively. Everything else,
    including values for which every comparison is false (e.g. NaN),
    falls into band 7.
    """
    if deposit < 10000:
        return 0  # below 10,000
    upper_bounds = (100000, 200000, 300000, 400000, 500000, 600000)
    for band, upper in enumerate(upper_bounds, start=1):
        if deposit <= upper:
            return band
    return 7
# Label every row with its deposit band
sorted_train_data["deposit_group"] = sorted_train_data["deposit"].map(categorize_deposit)
train_data = sorted_train_data
# Print per-group statistics
deposit_by_group = train_data.groupby('deposit_group')['deposit']
print(deposit_by_group.agg(['min', 'max', 'mean', 'count']))

In [45]:
# Holdout dataset for performance evaluation: rows whose contract_year_month
# lies in [holdout_start, holdout_end] (both ends inclusive) are moved out of
# the training data.
holdout_start = 202307
holdout_end = 202312
in_holdout = train_data["contract_year_month"].between(holdout_start, holdout_end)

holdout_data = train_data[in_holdout].reset_index(drop=True)
train_data = train_data[~in_holdout].reset_index(drop=True)

In [46]:
# Feature Select: columns fed to the models (commented entries are disabled candidates)
selected_cols = [
    "log_area_m2",
    "built_year",
    "latitude",
    "longitude",
    "log_subway_distance",
    "log_school_distance",
    "log_park_distance",
    "contract_year_month",
    "contract_day"
    # "num_of_subways_within_radius",
    # "num_of_parks_within_radius",
    # "region",
    # "region_mean",
]
X = train_data[selected_cols]
X_hold = holdout_data[selected_cols]
test_data = test_data[selected_cols]

# Data Split: the training target is the deposit band, the holdout target is the deposit itself
y = train_data["deposit_group"]
y_hold = holdout_data["deposit"]

In [47]:
# from sklearn.feature_selection import SelectKBest, f_regression, f_classif, RFE

# def select_kbest(X, y, target, k=10):
#     """
#     SelectKBest 방법을 사용하여 상위 k개의 특성 선택
#     Args:
#         X (DataFrame): 독립변수
#         y (DataFrame): 종속변수
#         target (str): 종속변수 열 중 실제 사용할 target 열 이름
#         k (int, optional): 선택할 상위 k개 특성의 수 (Defaults to 10)
#     Returns:
#         List[str]: 선택된 상위 k개의 특성의 열 이름 리스트
#     """
#     # SelectKBest 적용
#     selector = SelectKBest(score_func=f_classif, k=k)
#     selector.fit(X, y[target])
#     # 선택된 특성의 열 이름 리스트 반환
#     selected_cols = X.columns[selector.get_support()].tolist()
#     return selected_cols

In [48]:
# selected_cols = select_kbest(X, train_data, "deposit_group")
# selected_cols

In [None]:
# Sanity check: columns of every feature frame, then shapes of the targets
for label, frame in (("X", X), ("X_hold", X_hold), ("test_data", test_data)):
    print(f"{label}: {frame.columns}")
for label, target in (("y", y), ("y_hold", y_hold)):
    print(f"{label}: {target.shape}")

# Modeling

In [50]:
import xgboost as xgb
from sklearn.metrics import log_loss, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import optuna
from tqdm import tqdm

In [None]:
# Interactive run configuration. input() already returns a string; any
# non-empty answer to the last prompt enables Optuna.
run = input("run 이름을 입력하세요 :")
selected_model = input("model 명을 입력하세요 (xgb/rf) :")
opt = bool(input("Optuna 사용 여부를 입력하세요 (뭐라도 입력 시 사용) :"))

# Common settings: the parent directory name of the working directory is
# recorded as the user (this is how it is resolved from Jupyter).
wandb_config = {
    "User": os.path.basename(os.path.dirname(os.getcwd()))
}
wandb.init(
    settings=wandb.Settings(start_method="thread"),
    dir=None,  # original note: do not store logs locally — TODO confirm
    entity="remember-us",  # team name
    project="deposit",  # project name
    name=run,  # run name
    config=wandb_config,
)

In [52]:
def train(model_name, X, y, params):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    match model_name:
        case "xgb-cls":
            model = xgb.XGBClassifier(**params, random_state=42, device="cuda", use_label_encoder=False, n_jobs=-1)
            model.fit(X_train, y_train)

            y_pred_proba = model.predict_proba(X_valid)
            score = log_loss(y_valid, y_pred_proba)
            
        case "xgb-reg":
            model = xgb.XGBRegressor(**params, random_state=42, device="cuda", n_jobs=-1)
            model.fit(X_train, y_train)

            y_pred = model.predict(X_valid)
            score = mean_absolute_error(y_valid, y_pred)
        
        case "rf-cls":
            model = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
            model.fit(X_train, y_train)

            y_pred_proba = model.predict_proba(X_valid)
            score = log_loss(y_valid, y_pred_proba)
        
        case "rf-reg":
            model = RandomForestRegressor(**params, random_state=42, n_jobs=-1)
            model.fit(X_train, y_train)

            y_pred = model.predict(X_valid)
            score = mean_absolute_error(y_valid, y_pred)
            
        case _:
            raise ValueError(f"지원하지 않는 모델 이름: {model_name}")
    
    return score

In [53]:
def cv_train(model_name, X, y, params):
    cv = 5
    kfold = KFold(n_splits=cv, shuffle=True, random_state=42)

    score_list = []
    for i, (train_idx, valid_idx) in enumerate(kfold.split(X, y), start=1):
        X_train, y_train = X.loc[train_idx, :], y.iloc[train_idx]
        X_valid, y_valid = X.loc[valid_idx, :], y.iloc[valid_idx]
        
        match model_name:
            case "xgb-cls":
                model = xgb.XGBClassifier(**params, random_state=42, device="cuda", use_label_encoder=False, n_jobs=-1)
                model.fit(X_train, y_train)

                y_pred_proba = model.predict_proba(X_valid)
                fold_score = log_loss(y_valid, y_pred_proba)
                
            case "xgb-reg":
                model = xgb.XGBRegressor(**params, random_state=42, device="cuda", n_jobs=-1)
                model.fit(X_train, y_train)

                y_pred = model.predict(X_valid)
                fold_score = mean_absolute_error(y_valid, y_pred)
            
            case "rf-cls":
                model = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
                model.fit(X_train, y_train)

                y_pred_proba = model.predict_proba(X_valid)
                fold_score = log_loss(y_valid, y_pred_proba)
            
            case "rf-reg":
                model = RandomForestRegressor(**params, random_state=42, n_jobs=-1)
                model.fit(X_train, y_train)

                y_pred = model.predict(X_valid)
                fold_score = mean_absolute_error(y_valid, y_pred)
                
            case _:
                raise ValueError(f"지원하지 않는 모델 이름: {model_name}")
                
        score_list.append(fold_score)
    mean_score = np.mean(score_list)
    return mean_score

In [54]:
def optuna_train(model_name, X, y):
    """Search hyperparameters with Optuna and return the best parameter set.

    Runs 50 trials of a seeded TPE sampler that minimise the score returned
    by train(). The XGBoost models ("xgb-cls", "xgb-reg") share one search
    space and the RandomForest models ("rf-cls", "rf-reg") share another.

    Raises:
        ValueError: If model_name is not one of the four supported names.
    """
    xgb_names = ("xgb-cls", "xgb-reg")
    rf_names = ("rf-cls", "rf-reg")
    if model_name not in xgb_names + rf_names:
        raise ValueError(f"지원하지 않는 모델 이름: {model_name}")

    uses_xgb_space = model_name in xgb_names

    def objective(trial):
        if uses_xgb_space:
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
                "max_depth": trial.suggest_int("max_depth", 3, 12),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                "gamma": trial.suggest_float("gamma", 0, 5),
            }
        else:
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", 1, 30),  # tree depth between 1 and 30
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),  # minimum samples to split
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),  # minimum samples per leaf
            }
        return train(model_name, X, y, params)

    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=50)
    return study.best_params

### deposit_group 학습 및 예측

In [55]:
match selected_model:
    case "xgb": model_name = "xgb-cls"
    case "rf": model_name = "rf-cls"

In [56]:
# Preset hyperparameters; the keys match the RandomForest search space in optuna_train.
# NOTE(review): the next cell assigns best_params again on every branch, so
# this value is not used as-is — confirm whether it was meant as the rf default.
best_params = {'n_estimators': 144, 'max_depth': 29, 'min_samples_split': 8, 'min_samples_leaf': 6}

In [None]:
# Pick the classifier hyperparameters: search with Optuna when requested,
# otherwise fall back to presets that match the selected model family.
if opt:
    best_params = optuna_train(model_name, X, y)  # original note: 26m
elif selected_model == "rf":
    # RandomForest does not accept the XGBoost-only keys below
    # (learning_rate, subsample, colsample_bytree, gamma).
    best_params = {
        'n_estimators': 144,
        'max_depth': 29,
        'min_samples_split': 8,
        'min_samples_leaf': 6
    }
else:
    best_params = {
        'n_estimators': 294,
        'learning_rate': 0.12447572794344873,
        'max_depth': 12,
        'subsample': 0.6648422529245377,
        'colsample_bytree': 0.8403361280374215,
        'gamma': 0.021125621227948838
    }

# Cross-validated log loss of the chosen parameters
valid_logloss = cv_train(model_name, X, y, best_params)
print(f"Valid logloss = {valid_logloss}")

In [None]:
match selected_model:
    case "xgb":
        best_model = xgb.XGBClassifier(**best_params, random_state=42, device="cuda", use_label_encoder=False, n_jobs=-1)
    case "rf":
        best_model = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)

best_model.fit(X, y)

In [25]:
# Predict the deposit band of every holdout row. Only the feature columns are
# passed to the model so this cell can be re-run after 'predicted_group' has
# been appended to X_hold.
predicted_groups = best_model.predict(X_hold[selected_cols])
X_hold['predicted_group'] = predicted_groups

In [26]:
# Predict the deposit band of every test row. Only the feature columns are
# passed to the model so this cell can be re-run after 'predicted_group' has
# been appended to test_data.
predicted_groups = best_model.predict(test_data[selected_cols])
test_data['predicted_group'] = predicted_groups

### deposit 예측

In [27]:
match selected_model:
    case "xgb": model_name = "xgb-reg"
    case "rf": model_name = "rf-reg"

In [28]:
# Preset regressor hyperparameters keyed by deposit_group. The keys of each
# entry match the XGBoost search space in optuna_train.
# NOTE(review): keys run 0-9 while categorize_deposit only produces groups
# 0-7, and entries 6 and 8 are identical — presumably left over from an
# earlier grouping; confirm before relying on them.
group_params = {
    0: {'n_estimators': 271,
        'learning_rate': 0.0891731270740701,
        'max_depth': 12,
        'subsample': 0.9732533070235716,
        'colsample_bytree': 0.8126244533797121,
        'gamma': 3.7808956754662457},
    1: {'n_estimators': 283,
        'learning_rate': 0.09647453018390326,
        'max_depth': 12,
        'subsample': 0.9737151093633044,
        'colsample_bytree': 0.9939494366999071,
        'gamma': 3.9455604749354842},
    2: {'n_estimators': 282,
        'learning_rate': 0.1065334717290639,
        'max_depth': 12,
        'subsample': 0.9702043530164062,
        'colsample_bytree': 0.9789964718750591,
        'gamma': 3.8378676531756986},
    3: {'n_estimators': 282,
        'learning_rate': 0.17336659309722366,
        'max_depth': 12,
        'subsample': 0.9702043530164062,
        'colsample_bytree': 0.9697948745561529,
        'gamma': 0.12678299219713374},
    4: {'n_estimators': 287,
        'learning_rate': 0.09993219107638701,
        'max_depth': 12,
        'subsample': 0.9722495727394489,
        'colsample_bytree': 0.9919371080730972,
        'gamma': 3.939212978268534},
    5: {'n_estimators': 300,
        'learning_rate': 0.11132303462474055,
        'max_depth': 12,
        'subsample': 0.9737522364550457,
        'colsample_bytree': 0.9698805295562679,
        'gamma': 4.13663053619915},
    6: {'n_estimators': 292,
        'learning_rate': 0.1038221683452801,
        'max_depth': 12,
        'subsample': 0.9870909779118721,
        'colsample_bytree': 0.9997869198474675,
        'gamma': 3.9253609956556277},
    7: {'n_estimators': 284,
        'learning_rate': 0.1072259934273102,
        'max_depth': 12,
        'subsample': 0.9340584828362537,
        'colsample_bytree': 0.9719409290683207,
        'gamma': 4.634228201489641},
    8: {'n_estimators': 292,
        'learning_rate': 0.1038221683452801,
        'max_depth': 12,
        'subsample': 0.9870909779118721,
        'colsample_bytree': 0.9997869198474675,
        'gamma': 3.9253609956556277},
    9: {'n_estimators': 295,
        'learning_rate': 0.10456616732278738,
        'max_depth': 12,
        'subsample': 0.9986735852799455,
        'colsample_bytree': 0.9979009436789888,
        'gamma': 3.8785918564436708}
}

In [29]:
def train_regressors_per_group(train_data):
    group_models = {}
    group_params = {}
    for group in tqdm(train_data['deposit_group'].unique(), desc="Training models per group"):
        group_data = train_data[train_data['deposit_group'] == group]
        X_group = group_data[selected_cols]
        y_group = group_data['deposit']

        # 모델 훈련
        if opt:
            best_params = optuna_train(model_name, X_group, y_group) # 26m
            group_params[group] = best_params
        else:
            best_params = group_params[group]
        
        match selected_model:
            case "xgb": model = xgb.XGBRegressor(**best_params, random_state=42, device="cuda", n_jobs=-1)
            case "rf": model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
        model.fit(X_group, y_group)
        
        # 각 그룹에 해당하는 모델 저장
        group_models[group] = model
        
    return group_models, group_params

In [None]:
# 4. Train a regression model for each `deposit_group`
group_models, group_params = train_regressors_per_group(train_data)

In [31]:
def predict_per_group_hold(X_hold, group_models):
    """Predict deposits for the holdout rows with the model of each row's group.

    Args:
        X_hold: Frame with the module-level `selected_cols` and a
            'predicted_group' column.
        group_models: Mapping from group label to a fitted model.

    Returns:
        Array of predictions aligned with the rows of X_hold; rows whose
        predicted group has no model keep the initial value 0.
    """
    predictions = np.zeros(len(X_hold))
    assigned_group = X_hold['predicted_group']

    for group_label, regressor in group_models.items():
        row_mask = (assigned_group == group_label).to_numpy()
        if not row_mask.any():
            continue  # nothing was routed to this group
        features = X_hold.loc[row_mask, selected_cols]
        predictions[row_mask] = regressor.predict(features)

    return predictions

In [32]:
def predict_per_group(test_data, group_models):
    """Predict deposits for the test rows with the model of each row's group.

    Args:
        test_data: Frame with the module-level `selected_cols` and a
            'predicted_group' column.
        group_models: Mapping from group label to a fitted model.

    Returns:
        Array of predictions aligned with the rows of test_data; rows whose
        predicted group has no model keep the initial value 0.
    """
    n_rows = len(test_data)
    y_pred = np.zeros(n_rows)
    routed_to = test_data['predicted_group']

    for label, fitted in group_models.items():
        selector = (routed_to == label).to_numpy()
        if selector.any():  # predict only for groups that received rows
            y_pred[selector] = fitted.predict(test_data.loc[selector, selected_cols])

    return y_pred

In [33]:
# Predict deposits for the holdout and test rows with the per-group regressors
y_hold_pred = predict_per_group_hold(X_hold, group_models)
y_test_pred = predict_per_group(test_data, group_models)

In [None]:
# Holdout MAE of the two-stage prediction. mean_absolute_error already
# returns a single scalar for this 1-D target, so it is formatted directly;
# calling .mean() on it adds nothing and fails when the metric is a builtin float.
hold_mae = mean_absolute_error(holdout_data["deposit"], y_hold_pred)
print(f"Holdout Mean MAE = {hold_mae:.4f}")

In [None]:
# Convert the group labels to builtin ints before logging the per-group parameters.
group_params = {int(group): params for group, params in group_params.items()}

# Everything recorded for this run, logged in one call before the run is closed.
run_summary = {
    "features": selected_cols,
    "model": selected_model,
    "params": best_params,
    "group_params": group_params,
    "Valid logloss": valid_logloss,
    "Holdout MAE": hold_mae,
    "Optuna": opt,
}
wandb.log(run_summary)
wandb.finish()

# Inference

In [27]:
# Write the test-set predictions into the submission frame
sample_submission["deposit"] = y_test_pred

In [None]:
# Bare expression: shows the submission frame as the notebook cell output
sample_submission

In [31]:
# Save the submission to output.csv without the index column
sample_submission.to_csv("output.csv", index=False)