# 라이브러리 불러오기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data.load_dataset import load_dataset
from data.merge_dataset import merge_dataset
from data.feature_engineering import ClusteringModel
from model.inference import save_csv
from model.feature_select import select_features
from model.data_split import split_features_and_target
from model.log_transformation import apply_log_transformation
from model.model_train import cv_train, set_model, optuna_train
import argparse
import os
import wandb
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# 데이터 불러오기

In [2]:
# 기존 데이터 불러오기
train_data, test_data, sample_submission, interest_data, subway_data, school_data, park_data = load_dataset()

# 기존 데이터에 새로운 feature들을 병합한 데이터프레임 불러오기
train_data, test_data = merge_dataset(train_data, test_data, interest_data, subway_data, school_data, park_data)

In [3]:
train_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,...,nearest_school_longitude,nearest_park_distance,nearest_park_latitude,nearest_park_longitude,nearest_subway_num,nearest_school_num,nearest_park_num,num_of_subways_within_radius,num_of_schools_within_radius,num_of_parks_within_radius
0,0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,...,127.046337,498.618918,37.051333,127.041019,1,1,1,0,2,2
1,1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,...,127.046337,498.618918,37.051333,127.041019,1,1,1,0,2,2
2,2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,...,127.046337,498.618918,37.051333,127.041019,1,1,1,0,2,2
3,3,59.3400,201907,15,2,1,1986,36.964647,127.055847,33,...,127.056980,169.839678,36.963502,127.054582,1,1,1,0,4,8
4,4,59.8100,201904,12,2,6,1995,36.972390,127.084514,24,...,127.085154,382.401815,36.971743,127.088742,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1801223,1801223,114.8126,202311,25,0,5,2010,37.528394,126.659398,13,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2
1801224,1801224,101.9088,202311,28,0,6,2010,37.528394,126.659398,13,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2
1801225,1801225,114.7900,202312,3,0,19,2010,37.528394,126.659398,13,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2
1801226,1801226,101.9088,202312,4,1,15,2010,37.528394,126.659398,13,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2


In [4]:
test_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,...,nearest_school_longitude,nearest_park_distance,nearest_park_latitude,nearest_park_longitude,nearest_subway_num,nearest_school_num,nearest_park_num,num_of_subways_within_radius,num_of_schools_within_radius,num_of_parks_within_radius
0,0,84.9610,202404,12,1,14,2016,36.965423,127.048779,8,...,127.056980,288.442840,36.964653,127.045679,1,1,1,0,0,7
1,1,59.9000,202404,13,0,4,1997,36.963105,127.040678,27,...,127.056980,153.733042,36.961730,127.040502,1,1,1,0,0,5
2,2,39.2700,202404,29,0,5,1990,36.957089,127.047449,34,...,127.051762,272.286038,36.959505,127.047945,1,1,1,0,0,4
3,3,39.2700,202405,3,0,1,1990,36.957089,127.047449,34,...,127.051762,272.286038,36.959505,127.047945,1,1,1,0,0,4
4,4,46.9800,202406,2,0,4,1990,36.957089,127.047449,34,...,127.051762,272.286038,36.959505,127.047945,1,1,1,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150167,150167,115.5101,202402,27,0,17,2010,37.528394,126.659398,14,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2
150168,150168,142.8738,202403,2,0,4,2010,37.528394,126.659398,14,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2
150169,150169,142.8738,202403,16,1,13,2010,37.528394,126.659398,14,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2
150170,150170,114.9285,202403,22,1,2,2010,37.528394,126.659398,14,...,126.657114,398.113485,37.528189,126.654891,1,1,1,0,4,2


In [5]:
# 위치 중복도 낮은 행 삭제
groups = train_data.groupby(["latitude", "longitude"])["index"].count()
conditioned_groups_index = groups[(groups >= 2) & (groups <= 5)].index # 이 범위를 파라미터로 조정하는걸로
small_groups = train_data[
    train_data["latitude"].isin(conditioned_groups_index.get_level_values(0)) &
    train_data["longitude"].isin(conditioned_groups_index.get_level_values(1))
]
train_data.drop(small_groups.index, axis=0, inplace=True)

In [6]:
# built_year > 2024 행 삭제
train_data = train_data[train_data["built_year"] < 2024]
train_data.reset_index(drop=True, inplace=True)

In [7]:
# log 변환
train_data, test_data = apply_log_transformation(train_data, test_data)

# Feature Select
train_data, test_data = select_features(train_data, test_data)

In [8]:
sorted_train_data = train_data.sort_values(by="deposit").reset_index(drop=True)
sorted_train_data["deposit_group"] = sorted_train_data.index // 180000
train_data = sorted_train_data
print(train_data.groupby('deposit_group')['deposit'].agg(['min', 'max', 'mean', 'count']))

                   min       max          mean   count
deposit_group                                         
0                300.0   15000.0  10983.803928  180000
1              15000.0   19764.0  17216.433078  180000
2              19764.0   24000.0  21511.396206  180000
3              24000.0   28000.0  25731.197522  180000
4              28000.0   32000.0  29986.217850  180000
5              32000.0   37800.0  34804.484267  180000
6              37800.0   44000.0  40498.528089  180000
7              44000.0   53000.0  48015.640311  180000
8              53000.0   69000.0  59822.927750  180000
9              69000.0  950000.0  96406.225640  170125


In [9]:
features = train_data.drop(columns=['deposit', 'log_deposit', 'deposit_group'])
target = train_data['deposit_group']

모델 학습에 필요한 라이브러리 import

In [10]:
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [11]:
# 2. `deposit_group`을 타겟으로 모델 훈련 (XGBoost 사용)
def train_group_classifier(train_data):
    # 독립변수와 종속변수 분리
    features = train_data.drop(columns=['deposit', 'log_deposit', 'deposit_group'])
    target = train_data['deposit_group']
    
    # XGBoost 모델 훈련
    classifier = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
    classifier.fit(features, target)
    
    return classifier

In [12]:
group_classifier = train_group_classifier(train_data)

In [13]:
# 3. 테스트 데이터의 `deposit_group` 예측
def predict_deposit_group(classifier, test_data):
    features = test_data
    predicted_groups = classifier.predict(features)
    test_data['predicted_group'] = predicted_groups
    return test_data

In [14]:
# 3. 테스트 데이터의 `deposit_group` 예측
test_data = predict_deposit_group(group_classifier, test_data)

In [17]:
# 4. `deposit_group` 별로 나눈 후 회귀 모델 생성 및 훈련 + Validation MAE 계산
def train_regressors_per_group_with_validation(train_data):
    group_models = {}
    mae_per_group = {}
    
    for group in train_data['deposit_group'].unique():
        # 그룹별 데이터 필터링
        group_data = train_data[train_data['deposit_group'] == group]
        X_group = group_data.drop(columns=['deposit', 'log_deposit', 'deposit_group'])
        y_group = group_data['deposit']
        
        # 학습 데이터와 검증 데이터 분리 (80% 학습, 20% 검증)
        X_train, X_valid, y_train, y_valid = train_test_split(X_group, y_group, test_size=0.2, random_state=42)
        
        # XGBoost Regressor with tuned parameters
        model = XGBRegressor(
            n_estimators=1000,         # 트리 개수 (1000개 트리 생성)
            learning_rate=0.05,        # 학습률 (0.05로 설정하여 안정적인 학습)
            max_depth=6,               # 트리의 최대 깊이
            subsample=0.8,             # 각 트리 학습에 사용될 데이터 비율
            colsample_bytree=0.8,      # 각 트리의 피처 샘플링 비율
            random_state=42            # 결과 재현성을 위한 시드 값
        )
        
        # 모델 훈련
        model.fit(X_train, y_train)
        
        # 검증 데이터에 대해 예측 수행
        y_valid_pred = model.predict(X_valid)
        
        # MAE 계산
        mae = mean_absolute_error(y_valid, y_valid_pred)
        mae_per_group[group] = mae
        
        # 각 그룹에 해당하는 모델 저장
        group_models[group] = model
    
    # 그룹별 MAE 출력
    print("Validation MAE per group:", mae_per_group)
    
    return group_models, mae_per_group

In [18]:
# 그룹별로 회귀 모델 훈련하고, 검증 데이터를 사용해 MAE 계산
group_models, mae_per_group = train_regressors_per_group_with_validation(train_data)


Validation MAE per group: {0: 1161.3406033062406, 1: 839.1927782389323, 2: 912.7632149522569, 3: 893.6327622070313, 4: 874.0783725585937, 5: 1104.0881034613715, 6: 1296.7795780164931, 7: 1909.1024914279515, 8: 2980.6971124131946, 9: 9040.408067941771}


In [22]:
# 5. 테스트 데이터의 `deposit_group` 별로 예측
def predict_per_group(test_data, group_models):
    # 예측값을 저장할 배열 초기화
    y_pred = np.zeros(len(test_data))
    
    # 그룹별로 데이터 분리 후 예측
    for group, model in group_models.items():
        group_data = test_data[test_data['predicted_group'] == group]
        
        # test_data에는 'deposit'이 없으므로 'deposit' 대신 'predicted_group'만 제외
        X_group = group_data.drop(columns=['predicted_group'])
        
        # 각 그룹에 대해 예측
        if len(X_group) > 0:  # 해당 그룹에 데이터가 있는 경우만 예측
            y_pred_group = model.predict(X_group)
            y_pred[test_data['predicted_group'] == group] = y_pred_group

    return y_pred


In [23]:
 # 5. `deposit_group` 별로 예측 진행
y_test_pred = predict_per_group(test_data, group_models)

In [24]:
y_test_pred

array([25270.41015625,  8187.62109375,  5354.25244141, ...,
       40653.1171875 , 34653.74609375, 34753.19140625])

In [27]:
sample_submission["deposit"] = y_test_pred

In [28]:
sample_submission

Unnamed: 0,index,deposit
0,0,25270.410156
1,1,8187.621094
2,2,5354.252441
3,3,5633.290039
4,4,6079.236816
...,...,...
150167,150167,34858.558594
150168,150168,40980.605469
150169,150169,40653.117188
150170,150170,34653.746094


In [29]:
sample_submission.to_csv("output.csv", index=False)