# 라이브러리 불러오기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data.load_dataset import load_dataset
from data.merge_dataset import merge_dataset
from data.feature_engineering import apply_log_transformation, ClusteringModel
from model.inference import save_csv
from model.feature_select import select_features
# from model.data_split import split_features_and_target
# from model.model_train import cv_train, set_model, optuna_train
import argparse
import os
import wandb
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# 데이터 불러오기

In [2]:
# 기존 데이터 불러오기
train_data, test_data, sample_submission, interest_data, subway_data, school_data, park_data = load_dataset()

# 기존 데이터에 새로운 feature들을 병합한 데이터프레임 불러오기
train_data, test_data = merge_dataset(train_data, test_data, interest_data, subway_data, school_data, park_data)

In [3]:
train_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,...,nearest_school_num,nearest_park_num,num_of_subways_within_radius,num_of_schools_within_radius,num_of_parks_within_radius,park,region,nearest_leader_distance,nearest_leader_latitude,nearest_leader_longitude
0,0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,...,1,1,0,2,1,1,9,6364.110019,37.016774,127.099337
1,1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,...,1,1,0,2,1,1,9,6364.110019,37.016774,127.099337
2,2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,...,1,1,0,2,1,1,9,6364.110019,37.016774,127.099337
3,3,59.3400,201907,15,2,1,1986,36.964647,127.055847,33,...,1,1,0,4,0,0,9,6965.283186,37.016774,127.099337
4,4,59.8100,201904,12,2,6,1995,36.972390,127.084514,24,...,1,1,0,0,0,0,9,5107.786855,37.016774,127.099337
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1801223,1801223,114.8126,202311,25,0,5,2010,37.528394,126.659398,13,...,1,1,0,4,1,1,21,1870.242566,37.531248,126.638497
1801224,1801224,101.9088,202311,28,0,6,2010,37.528394,126.659398,13,...,1,1,0,4,1,1,21,1870.242566,37.531248,126.638497
1801225,1801225,114.7900,202312,3,0,19,2010,37.528394,126.659398,13,...,1,1,0,4,1,1,21,1870.242566,37.531248,126.638497
1801226,1801226,101.9088,202312,4,1,15,2010,37.528394,126.659398,13,...,1,1,0,4,1,1,21,1870.242566,37.531248,126.638497


In [4]:
test_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,...,nearest_school_num,nearest_park_num,num_of_subways_within_radius,num_of_schools_within_radius,num_of_parks_within_radius,park,region,nearest_leader_distance,nearest_leader_latitude,nearest_leader_longitude
0,0,84.9610,202404,12,1,14,2016,36.965423,127.048779,8,...,1,1,0,0,0,0,9,7263.960558,37.016774,127.099337
1,1,59.9000,202404,13,0,4,1997,36.963105,127.040678,27,...,1,1,0,0,0,0,9,7921.813207,37.016774,127.099337
2,2,39.2700,202404,29,0,5,1990,36.957089,127.047449,34,...,1,1,0,0,0,0,9,8079.923658,37.016774,127.099337
3,3,39.2700,202405,3,0,1,1990,36.957089,127.047449,34,...,1,1,0,0,0,0,9,8079.923658,37.016774,127.099337
4,4,46.9800,202406,2,0,4,1990,36.957089,127.047449,34,...,1,1,0,0,0,0,9,8079.923658,37.016774,127.099337
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150167,150167,115.5101,202402,27,0,17,2010,37.528394,126.659398,14,...,1,1,0,4,1,1,21,1870.242566,37.531248,126.638497
150168,150168,142.8738,202403,2,0,4,2010,37.528394,126.659398,14,...,1,1,0,4,1,1,21,1870.242566,37.531248,126.638497
150169,150169,142.8738,202403,16,1,13,2010,37.528394,126.659398,14,...,1,1,0,4,1,1,21,1870.242566,37.531248,126.638497
150170,150170,114.9285,202403,22,1,2,2010,37.528394,126.659398,14,...,1,1,0,4,1,1,21,1870.242566,37.531248,126.638497


In [5]:
# 위치 중복도 낮은 행 삭제
groups = train_data.groupby(["latitude", "longitude"])["index"].count()
conditioned_groups_index = groups[(groups >= 2) & (groups <= 5)].index # 이 범위를 파라미터로 조정하는걸로
small_groups = train_data[
    train_data["latitude"].isin(conditioned_groups_index.get_level_values(0)) &
    train_data["longitude"].isin(conditioned_groups_index.get_level_values(1))
]
train_data.drop(small_groups.index, axis=0, inplace=True)

In [6]:
# built_year > 2024 행 삭제
train_data = train_data[train_data["built_year"] < 2024]
train_data.reset_index(drop=True, inplace=True)

In [7]:
#clustering
cluster_data = train_data[["latitude", "longitude"]]
clustering_model = ClusteringModel(cluster_data)
kmeans_model = clustering_model.kmeans_clustering(
    n_clusters = 25,
    train_data = train_data,
    test_data = test_data,
    feature_columns = ["latitude", "longitude"],
    label_column = 'region'
)

In [8]:
# region_mean 병합
region_mean = train_data.groupby('region')['deposit'].mean().reset_index()
region_mean.columns = ['region', 'region_mean']
train_data = train_data.merge(region_mean, on='region', how='left')
test_data = test_data.merge(region_mean, on='region', how='left')

In [9]:
# log 변환
train_data, test_data = apply_log_transformation(train_data, test_data)

In [10]:
# sorted_train_data = train_data.sort_values(by="deposit").reset_index(drop=True)
# sorted_train_data["deposit_group"] = sorted_train_data.index // 180000
# train_data = sorted_train_data
# print(train_data.groupby('deposit_group')['deposit'].agg(['min', 'max', 'mean', 'count']))

In [11]:
# 먼저 데이터 정렬 및 인덱스 리셋
sorted_train_data = train_data.sort_values(by="deposit").reset_index(drop=True)
# deposit을 기준으로 그룹을 나눔
# 10,000 미만은 그룹 0, 10,000~100,000 사이는 그룹 1, 100,000 이상은 100,000 단위로 그룹화
def categorize_deposit(deposit):
    if deposit < 10000:
        return 0  # 10,000 미만
    elif deposit <= 100000:
        return 1  # 10,000 ~ 100,000 사이
    elif deposit <= 200000:
        return 2
    elif deposit <= 300000:
        return 3
    elif deposit <= 400000:
        return 4
    elif deposit <= 500000:
        return 5
    elif deposit <= 600000:
        return 6
    else:
        return 7 
# 그룹화 적용
sorted_train_data["deposit_group"] = sorted_train_data["deposit"].apply(categorize_deposit)
# 그룹별 통계 출력
train_data = sorted_train_data
print(train_data.groupby('deposit_group')['deposit'].agg(['min', 'max', 'mean', 'count']))

                    min       max           mean    count
deposit_group                                            
0                 300.0    9990.0    7131.513486    54576
1               10000.0  100000.0   36449.974768  1689931
2              100100.0  200000.0  131001.424749    42717
3              201500.0  300000.0  238188.343877     2466
4              304000.0  400000.0  347309.405941      303
5              405000.0  500000.0  450325.000000       80
6              505000.0  600000.0  553764.705882       34
7              620000.0  950000.0  734527.777778       18


In [12]:
# Feature Select
train_cols = [
    "deposit_group",
    "deposit",
    "log_deposit",
    "log_area_m2",
    "built_year",
    "latitude",
    "longitude",
    "log_subway_distance",
    "log_school_distance",
    "log_park_distance",
    "contract_year_month",
    "contract_day",
    "num_of_subways_within_radius",
    "num_of_parks_within_radius",
    "region",
    "region_mean",
]
test_cols = [
    "log_area_m2",
    "built_year",
    "latitude",
    "longitude",
    "log_subway_distance",
    "log_school_distance",
    "log_park_distance",
    "contract_year_month",
    "contract_day",
    "num_of_subways_within_radius",
    "num_of_parks_within_radius",
    "region",
    "region_mean",
]
train_data, test_data = train_data[train_cols], test_data[test_cols]

In [13]:
features = train_data.drop(columns=['deposit', 'log_deposit', 'deposit_group'])
target = train_data['deposit_group']

In [14]:
from sklearn.feature_selection import SelectKBest, f_regression, RFE, f_classif

In [15]:
def select_kbest(X, y, target, k=10):
    """
    SelectKBest 방법을 사용하여 상위 k개의 특성 선택
    Args:
        X (DataFrame): 독립변수
        y (DataFrame): 종속변수
        target (str): 종속변수 열 중 실제 사용할 target 열 이름
        k (int, optional): 선택할 상위 k개 특성의 수 (Defaults to 10)
    Returns:
        List[str]: 선택된 상위 k개의 특성의 열 이름 리스트
    """
    # SelectKBest 적용
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X, y[target])
    # 선택된 특성의 열 이름 리스트 반환
    selected_cols = X.columns[selector.get_support()].tolist()
    return selected_cols


selected_cols = select_kbest(features, train_data, "deposit_group")

In [16]:
selected_cols

['log_area_m2',
 'built_year',
 'latitude',
 'longitude',
 'log_subway_distance',
 'log_school_distance',
 'log_park_distance',
 'num_of_subways_within_radius',
 'region',
 'region_mean']

모델 학습에 필요한 라이브러리 import

In [17]:
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [18]:
# feature importance 시각화 함수
def plot_feature_importance(model, feature_names, model_type):
    # feature importance 가져오기
    importances = model.feature_importances_
    
    # 중요도를 기준으로 피처 정렬
    indices = np.argsort(importances)[::-1]
    sorted_importances = importances[indices]
    sorted_features = [feature_names[i] for i in indices]
    
    # 시각화
    plt.figure(figsize=(10, 6))
    plt.title(f"Feature Importances - {model_type}")
    plt.barh(range(len(sorted_features)), sorted_importances, align="center")
    plt.yticks(range(len(sorted_features)), sorted_features)
    plt.gca().invert_yaxis()  # 상위 중요도가 위로 오게 반전
    plt.xlabel("Importance")
    plt.show()

In [19]:
# 2. `deposit_group`을 타겟으로 모델 훈련 (XGBoost 사용)
def train_group_classifier(train_data):
    # 독립변수와 종속변수 분리
    features = train_data.drop(columns=['deposit', 'log_deposit', 'deposit_group'])
    target = train_data['deposit_group']
    
    # RandomForest 모델 훈련
    # best_params = {
    #     'n_estimators': 252,
    #     'learning_rate': 0.07771055959576277,
    #     'max_depth': 11,
    #     'subsample': 0.7986613347562391,
    #     'colsample_bytree': 0.8422383683572395,
    #     'gamma': 0.34407558339867306
    # }
    # classifier = RandomForestClassifier(**best_params, random_state=42, eval_metric='mlogloss')
    classifier = RandomForestClassifier(random_state=42, eval_metric='mlogloss', n_jobs=-1)
    classifier.fit(features, target)

    # XGBoost 모델 훈련
    # best_params = {
    #     'n_estimators': 252,
    #     'learning_rate': 0.07771055959576277,
    #     'max_depth': 11,
    #     'subsample': 0.7986613347562391,
    #     'colsample_bytree': 0.8422383683572395,
    #     'gamma': 0.34407558339867306
    # }
    # # classifier = XGBClassifier(**best_params, random_state=42, use_label_encoder=False, eval_metric='mlogloss')
    # classifier = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
    # classifier.fit(features, target)
    
    return classifier

In [21]:
# group_classifier = train_group_classifier(train_data)
# plot_feature_importance(group_classifier, features.columns, model_type="RandomForestClassifier")

In [22]:
# 3. 테스트 데이터의 `deposit_group` 예측
def predict_deposit_group(classifier, test_data):
    features = test_data
    predicted_groups = classifier.predict(features)
    test_data['predicted_group'] = predicted_groups
    return test_data

In [23]:
# 3. 테스트 데이터의 `deposit_group` 예측
test_data = predict_deposit_group(group_classifier, test_data)

NameError: name 'group_classifier' is not defined

In [24]:
group_params = {
        0: {'n_estimators': 299,
        'learning_rate': 0.07528019634863661,
        'max_depth': 12,
        'subsample': 0.9826861644413183,
        'colsample_bytree': 0.6361974955396621,
        'gamma': 0.13938848002312465},
    1: {'n_estimators': 276,
        'learning_rate': 0.15579191199373718,
        'max_depth': 12,
        'subsample': 0.909150931054429,
        'colsample_bytree': 0.8709809907337003,
        'gamma': 3.936332525239126},
    2: {'n_estimators': 282,
        'learning_rate': 0.05353956138863308,
        'max_depth': 12,
        'subsample': 0.9346361282442123,
        'colsample_bytree': 0.9739714687453176,
        'gamma': 4.082676340052684},
    3: {'n_estimators': 164,
        'learning_rate': 0.08351992311654627,
        'max_depth': 12,
        'subsample': 0.9598482300597749,
        'colsample_bytree': 0.8213376377326619,
        'gamma': 0.13938848002312465},
    4: {'n_estimators': 128,
        'learning_rate': 0.1224960449028662,
        'max_depth': 8,
        'subsample': 0.882895691571187,
        'colsample_bytree': 0.7384358080293545,
        'gamma': 1.3643518616482488},
    5: {'n_estimators': 60,
        'learning_rate': 0.0195312280618686,
        'max_depth': 8,
        'subsample': 0.604197860374508,
        'colsample_bytree': 0.9606755064214969,
        'gamma': 3.8599892180273407},
    6: {'n_estimators': 104,
        'learning_rate': 0.1408633637803759,
        'max_depth': 7,
        'subsample': 0.763889446515016,
        'colsample_bytree': 0.8858320716284691,
        'gamma': 0.9091078632943758},
    7: {'n_estimators': 172,
        'learning_rate': 0.164450647273469,
        'max_depth': 4,
        'subsample': 0.7642767983066922,
        'colsample_bytree': 0.8095967005933605,
        'gamma': 0.8290811474680035}
}

In [25]:
# 4. deposit_group 별로 나눈 후 회귀 모델 생성 및 훈련
def train_regressors_per_group(train_data):
    group_models = {}
    group_scores = {}
    for group in train_data['deposit_group'].unique():
        group_data = train_data[train_data['deposit_group'] == group]
        X_group = group_data.drop(columns=['deposit', 'log_deposit', 'deposit_group'])
        y_group = group_data['deposit']
        
        best_params = group_params[group]
        X_train, X_valid, y_train, y_valid = train_test_split(X_group, y_group, test_size=0.2, random_state=42)
        # RandomForest
        # model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
        model = RandomForestRegressor(random_state=42, n_jobs=-1)
        # XGBoost
        # model = XGBRegressor(**best_params, random_state=42, device="cuda", n_jobs=-1)
        # model = XGBRegressor(random_state=42, device="cuda", n_jobs=-1)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_valid)
        score = mean_absolute_error(y_valid, y_pred)
        group_scores[group] = score
        
        # 모델 훈련
        model.fit(X_group, y_group)
        
        # 각 그룹에 해당하는 모델 저장
        group_models[group] = model
        
    return group_models, group_scores

In [26]:
# 4. deposit_group 별로 회귀 모델 훈련
group_models, group_scores = train_regressors_per_group(train_data)

In [27]:
group_scores

{0: 792.9399569157985,
 1: 3806.0339765716058,
 2: 10117.36475288803,
 3: 15542.699942163099,
 4: 22697.663934426226,
 5: 20714.6875,
 6: 19084.285714285714,
 7: 71775.0}

In [29]:
counts = train_data.groupby('deposit_group')['deposit'].count()
scores = sum(score * counts[group] for group, score in group_scores.items())
total_count = counts.sum()
mean_score = scores / total_count

print(f"Mean MAE: {mean_score:.4f}")

Mean MAE: 3885.8725


In [None]:
# 각 그룹별로 중요도 확인 (예: 그룹 1의 모델)
group = 1  # 확인하고자 하는 그룹
plot_feature_importance(group_models[group], features.columns, model_type=f"RandomForestRegressor (Group {group})")

In [26]:
# 5. 테스트 데이터의 `deposit_group` 별로 예측
def predict_per_group(test_data, group_models):
    # 예측값을 저장할 배열 초기화
    y_pred = np.zeros(len(test_data))
    
    # 그룹별로 데이터 분리 후 예측
    for group, model in group_models.items():
        group_data = test_data[test_data['predicted_group'] == group]
        
        # test_data에는 'deposit'이 없으므로 'deposit' 대신 'predicted_group'만 제외
        X_group = group_data.drop(columns=['predicted_group'])
        
        # 각 그룹에 대해 예측
        if len(X_group) > 0:  # 해당 그룹에 데이터가 있는 경우만 예측
            y_pred_group = model.predict(X_group)
            y_pred[test_data['predicted_group'] == group] = y_pred_group

    return y_pred


In [27]:
 # 5. `deposit_group` 별로 예측 진행
y_test_pred = predict_per_group(test_data, group_models)

In [None]:
y_test_pred

In [29]:
sample_submission["deposit"] = y_test_pred
sample_submission.to_csv("output.csv", index=False)