# 라이브러리 불러오기

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data.load_dataset import load_dataset
from data.merge_dataset import merge_dataset
from data.feature_engineering import ClusteringModel
from model.inference import save_csv
from model.feature_select import select_features
from model.data_split import split_features_and_target
from model.log_transformation import apply_log_transformation
from model.model_train import cv_train, set_model, optuna_train
import argparse
import os
import wandb
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# 데이터 불러오기

In [24]:
# 기존 데이터 불러오기
train_data, test_data, sample_submission, interest_data, subway_data, school_data, park_data = load_dataset()

# 기존 데이터에 새로운 feature들을 병합한 데이터프레임 불러오기
train_data, test_data = merge_dataset(train_data, test_data, interest_data, subway_data, school_data, park_data)

In [25]:
train_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,...,nearest_school_longitude,nearest_park_distance,nearest_park_latitude,nearest_park_longitude,nearest_subway_num,nearest_school_num,nearest_park_num,num_of_subways_within_radius,num_of_schools_within_radius,num_of_parks_within_radius
0,0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,...,127.046337,498.618918,37.051333,127.041019,1,1,1,0,2,2
1,1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,...,127.046337,498.618918,37.051333,127.041019,1,1,1,0,2,2
2,2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,...,127.046337,498.618918,37.051333,127.041019,1,1,1,0,2,2
3,3,59.3400,201907,15,2,1,1986,36.964647,127.055847,33,...,127.056980,169.839678,36.963502,127.054582,1,1,1,0,4,8
4,4,59.8100,201904,12,2,6,1995,36.972390,127.084514,24,...,127.085154,382.401815,36.971743,127.088742,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1801223,1801223,114.8126,202311,25,0,5,2010,37.528394,126.659398,13,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2
1801224,1801224,101.9088,202311,28,0,6,2010,37.528394,126.659398,13,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2
1801225,1801225,114.7900,202312,3,0,19,2010,37.528394,126.659398,13,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2
1801226,1801226,101.9088,202312,4,1,15,2010,37.528394,126.659398,13,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2


In [26]:
test_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,...,nearest_school_longitude,nearest_park_distance,nearest_park_latitude,nearest_park_longitude,nearest_subway_num,nearest_school_num,nearest_park_num,num_of_subways_within_radius,num_of_schools_within_radius,num_of_parks_within_radius
0,0,84.9610,202404,12,1,14,2016,36.965423,127.048779,8,...,127.056980,288.442840,36.964653,127.045679,1,1,1,0,0,7
1,1,59.9000,202404,13,0,4,1997,36.963105,127.040678,27,...,127.056980,153.733042,36.961730,127.040502,1,1,1,0,0,5
2,2,39.2700,202404,29,0,5,1990,36.957089,127.047449,34,...,127.051762,272.286038,36.959505,127.047945,1,1,1,0,0,4
3,3,39.2700,202405,3,0,1,1990,36.957089,127.047449,34,...,127.051762,272.286038,36.959505,127.047945,1,1,1,0,0,4
4,4,46.9800,202406,2,0,4,1990,36.957089,127.047449,34,...,127.051762,272.286038,36.959505,127.047945,1,1,1,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150167,150167,115.5101,202402,27,0,17,2010,37.528394,126.659398,14,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2
150168,150168,142.8738,202403,2,0,4,2010,37.528394,126.659398,14,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2
150169,150169,142.8738,202403,16,1,13,2010,37.528394,126.659398,14,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2
150170,150170,114.9285,202403,22,1,2,2010,37.528394,126.659398,14,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2


In [27]:
# 위치 중복도 낮은 행 삭제
groups = train_data.groupby(["latitude", "longitude"])["index"].count()
conditioned_groups_index = groups[(groups >= 2) & (groups <= 5)].index # 이 범위를 파라미터로 조정하는걸로
small_groups = train_data[
    train_data["latitude"].isin(conditioned_groups_index.get_level_values(0)) &
    train_data["longitude"].isin(conditioned_groups_index.get_level_values(1))
]
train_data.drop(small_groups.index, axis=0, inplace=True)

In [28]:
# built_year > 2024 행 삭제
train_data = train_data[train_data["built_year"] < 2024]
train_data.reset_index(drop=True, inplace=True)

In [29]:
#clustering
cluster_data = train_data[["latitude", "longitude"]]
clustering_model = ClusteringModel(cluster_data)
kmeans_model = clustering_model.kmeans_clustering(
    n_clusters = 25,
    train_data = train_data,
    test_data = test_data,
    feature_columns = ["latitude", "longitude"],
    label_column = 'region'
)

In [30]:
# region_mean 병합
region_mean = train_data.groupby('region')['deposit'].mean().reset_index()
region_mean.columns = ['region', 'region_mean']
train_data = train_data.merge(region_mean, on='region', how='left')
test_data = test_data.merge(region_mean, on='region', how='left')

In [31]:
# log 변환
train_data, test_data = apply_log_transformation(train_data, test_data)

# Feature Select
train_data, test_data = select_features(train_data, test_data)

In [32]:
train_data

Unnamed: 0,deposit,log_deposit,log_area_m2,latitude,longitude,log_subway_distance,log_park_distance,contract_year_month,num_of_subways_within_radius,num_of_parks_within_radius,region,region_mean
0,17000.0,9.741027,4.454325,37.054314,127.045216,6.576404,6.213846,201906,0,2,9,18492.268822
1,23000.0,10.043293,4.454325,37.054314,127.045216,6.576404,6.213846,202003,0,2,9,18492.268822
2,23000.0,10.043293,4.454325,37.054314,127.045216,6.576404,6.213846,202003,0,2,9,18492.268822
3,1800.0,7.496097,4.107754,36.972390,127.084514,7.621041,5.949084,201904,0,1,9,18492.268822
4,20000.0,9.903538,4.453582,36.965423,127.048779,8.363056,5.667958,201904,0,7,9,18492.268822
...,...,...,...,...,...,...,...,...,...,...,...,...
1790120,39000.0,10.571343,4.751973,37.528394,126.659398,7.302526,5.989246,202311,0,2,21,24812.270588
1790121,38000.0,10.545368,4.633843,37.528394,126.659398,7.302526,5.989246,202311,0,2,21,24812.270588
1790122,37000.0,10.518700,4.751778,37.528394,126.659398,7.302526,5.989246,202312,0,2,21,24812.270588
1790123,34400.0,10.445841,4.633843,37.528394,126.659398,7.302526,5.989246,202312,0,2,21,24812.270588


In [33]:
# 먼저 데이터 정렬 및 인덱스 리셋
sorted_train_data = train_data.sort_values(by="deposit").reset_index(drop=True)

In [34]:
# deposit을 기준으로 그룹을 나눔
# 10,000 미만은 그룹 0, 10,000~100,000 사이는 그룹 1, 100,000 이상은 100,000 단위로 그룹화
def categorize_deposit(deposit):
    if deposit < 10000:
        return 0  # 10,000 미만
    elif deposit <= 100000:
        return 1  # 10,000 ~ 100,000 사이
    else:
        return (deposit // 100000) + 1  # 100,000 이상을 100,000 단위로 그룹화

In [35]:
# 그룹화 적용
sorted_train_data["deposit_group"] = sorted_train_data["deposit"].apply(categorize_deposit)

# 그룹별 통계 출력
train_data = sorted_train_data
print(train_data.groupby('deposit_group')['deposit'].agg(['min', 'max', 'mean', 'count']))

                    min       max           mean    count
deposit_group                                            
0.0               300.0    9990.0    7131.513486    54576
1.0             10000.0  100000.0   36449.974768  1689931
2.0            100100.0  199500.0  130020.130609    42118
3.0            200000.0  299200.0  228963.685514     2989
4.0            300000.0  395000.0  333053.267045      352
5.0            400000.0  490000.0  431897.959184       98
6.0            500000.0  590000.0  534947.368421       38
7.0            600000.0  690000.0  626150.000000       10
8.0            700000.0  780000.0  734444.444444        9
9.0            800000.0  800000.0  800000.000000        3
10.0           950000.0  950000.0  950000.000000        1


In [36]:
train_data

Unnamed: 0,deposit,log_deposit,log_area_m2,latitude,longitude,log_subway_distance,log_park_distance,contract_year_month,num_of_subways_within_radius,num_of_parks_within_radius,region,region_mean,deposit_group
0,300.0,5.707110,3.876396,37.013363,127.267601,9.704824,5.419926,201910,0,4,9,18492.268822,0.0
1,500.0,6.216606,2.938733,37.755771,126.779882,7.131259,4.958903,202001,0,11,12,29335.186779,0.0
2,500.0,6.216606,2.820414,37.755771,126.779882,7.131259,4.958903,202002,0,11,12,29335.186779,0.0
3,500.0,6.216606,3.706719,37.323651,126.836041,6.787942,6.030472,202002,0,5,17,23033.924827,0.0
4,500.0,6.216606,2.654614,37.617515,127.074391,4.426952,5.050052,202208,1,10,1,34569.585123,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1790120,780000.0,13.567050,5.230841,37.483569,127.032598,5.255148,5.615110,202307,1,8,14,74881.640092,8.0
1790121,800000.0,13.592368,5.502677,37.536441,127.003471,5.960255,6.714200,202111,1,0,14,74881.640092,9.0
1790122,800000.0,13.592368,5.502677,37.536441,127.003471,5.960255,6.714200,202112,1,0,14,74881.640092,9.0
1790123,800000.0,13.592368,5.502677,37.536441,127.003471,5.960255,6.714200,202303,1,0,14,74881.640092,9.0


In [15]:
# sorted_train_data = train_data.sort_values(by="deposit").reset_index(drop=True)
# sorted_train_data["deposit_group"] = sorted_train_data.index // 180000
# train_data = sorted_train_data
# print(train_data.groupby('deposit_group')['deposit'].agg(['min', 'max', 'mean', 'count']))

In [37]:
features = train_data.drop(columns=['deposit', 'log_deposit', 'deposit_group'])
target = train_data['deposit_group']

In [38]:
from sklearn.feature_selection import SelectKBest, f_regression, RFE, f_classif

In [39]:
def select_kbest(X, y, target, k=10):
    """
    SelectKBest 방법을 사용하여 상위 k개의 특성 선택
    Args:
        X (DataFrame): 독립변수
        y (DataFrame): 종속변수
        target (str): 종속변수 열 중 실제 사용할 target 열 이름
        k (int, optional): 선택할 상위 k개 특성의 수 (Defaults to 10)
    Returns:
        List[str]: 선택된 상위 k개의 특성의 열 이름 리스트
    """
    # SelectKBest 적용
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X, y[target])
    # 선택된 특성의 열 이름 리스트 반환
    selected_cols = X.columns[selector.get_support()].tolist()
    return selected_cols


selected_cols = select_kbest(features, train_data, "deposit_group")

In [40]:
selected_cols

['log_area_m2',
 'latitude',
 'longitude',
 'log_subway_distance',
 'log_park_distance',
 'contract_year_month',
 'num_of_subways_within_radius',
 'num_of_parks_within_radius',
 'region',
 'region_mean']

모델 학습에 필요한 라이브러리 import

In [17]:
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [18]:
# feature importance 시각화 함수
def plot_feature_importance(model, feature_names, model_type):
    # feature importance 가져오기
    importances = model.feature_importances_
    
    # 중요도를 기준으로 피처 정렬
    indices = np.argsort(importances)[::-1]
    sorted_importances = importances[indices]
    sorted_features = [feature_names[i] for i in indices]
    
    # 시각화
    plt.figure(figsize=(10, 6))
    plt.title(f"Feature Importances - {model_type}")
    plt.barh(range(len(sorted_features)), sorted_importances, align="center")
    plt.yticks(range(len(sorted_features)), sorted_features)
    plt.gca().invert_yaxis()  # 상위 중요도가 위로 오게 반전
    plt.xlabel("Importance")
    plt.show()

In [19]:
# 2. `deposit_group`을 타겟으로 모델 훈련 (XGBoost 사용)
def train_group_classifier(train_data):
    # 독립변수와 종속변수 분리
    features = train_data.drop(columns=['deposit', 'log_deposit', 'deposit_group'])
    target = train_data['deposit_group']
    

    classifier = RandomForestClassifier(random_state=42)
    # # XGBoost 모델 훈련
    # classifier = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
    classifier.fit(features, target)
    
    return classifier

In [None]:
group_classifier = train_group_classifier(train_data)
plot_feature_importance(group_classifier, features.columns, model_type="RandomForestClassifier")

In [21]:
# 3. 테스트 데이터의 `deposit_group` 예측
def predict_deposit_group(classifier, test_data):
    features = test_data
    predicted_groups = classifier.predict(features)
    test_data['predicted_group'] = predicted_groups
    return test_data

In [22]:
# 3. 테스트 데이터의 `deposit_group` 예측
test_data = predict_deposit_group(group_classifier, test_data)

In [23]:
# 4. deposit_group 별로 나눈 후 회귀 모델 생성 및 훈련
def train_regressors_per_group(train_data):
    group_models = {}
    for group in train_data['deposit_group'].unique():
        group_data = train_data[train_data['deposit_group'] == group]
        X_group = group_data.drop(columns=['deposit', 'log_deposit', 'deposit_group'])
        y_group = group_data['deposit']
        
        model = RandomForestRegressor(random_state=42)
        # XGBoost Regressor with tuned parameters
        # model = XGBRegressor(
        #     n_estimators=1000,         # 트리 개수 (1000개 트리 생성)
        #     learning_rate=0.05,        # 학습률 (0.05로 설정하여 안정적인 학습)
        #     max_depth=6,               # 트리의 최대 깊이
        #     subsample=0.8,             # 각 트리 학습에 사용될 데이터 비율
        #     colsample_bytree=0.8,      # 각 트리의 피처 샘플링 비율
        #     random_state=42            # 결과 재현성을 위한 시드 값
        # )
        
        # 모델 훈련
        model.fit(X_group, y_group)
        
        # 각 그룹에 해당하는 모델 저장
        group_models[group] = model
        
    return group_models

In [24]:
# 4. deposit_group 별로 회귀 모델 훈련
group_models = train_regressors_per_group(train_data)

In [None]:
# 각 그룹별로 중요도 확인 (예: 그룹 1의 모델)
group = 1  # 확인하고자 하는 그룹
plot_feature_importance(group_models[group], features.columns, model_type=f"RandomForestRegressor (Group {group})")

In [26]:
# 5. 테스트 데이터의 `deposit_group` 별로 예측
def predict_per_group(test_data, group_models):
    # 예측값을 저장할 배열 초기화
    y_pred = np.zeros(len(test_data))
    
    # 그룹별로 데이터 분리 후 예측
    for group, model in group_models.items():
        group_data = test_data[test_data['predicted_group'] == group]
        
        # test_data에는 'deposit'이 없으므로 'deposit' 대신 'predicted_group'만 제외
        X_group = group_data.drop(columns=['predicted_group'])
        
        # 각 그룹에 대해 예측
        if len(X_group) > 0:  # 해당 그룹에 데이터가 있는 경우만 예측
            y_pred_group = model.predict(X_group)
            y_pred[test_data['predicted_group'] == group] = y_pred_group

    return y_pred


In [27]:
 # 5. `deposit_group` 별로 예측 진행
y_test_pred = predict_per_group(test_data, group_models)

In [None]:
y_test_pred

In [29]:
sample_submission["deposit"] = y_test_pred
sample_submission.to_csv("output.csv", index=False)