# 라이브러리 불러오기

In [15]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings

warnings.filterwarnings('ignore')

# 랜덤 시드 설정

In [16]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [17]:
RAW_BASE_PATH = os.path.join("..", "..", "data", "raw")
PROCESSED_BASE_PATH = os.path.join("..", "..", "data", "processed")
# train_data = pd.read_csv(os.path.join(BASE_PATH, 'train.csv'))
# test_data = pd.read_csv(os.path.join(BASE_PATH, 'test.csv'))
raw = pd.read_csv(os.path.join(PROCESSED_BASE_PATH, 'all_apt_idx_recent_deposit.csv'))
sample_submission = pd.read_csv(os.path.join(RAW_BASE_PATH, 'sample_submission.csv'))

In [18]:
from src.pre_process.subway.subway_distance_feature_add import SubwayDistanceFeatureAddition

subway_info = pd.read_csv(os.path.join(RAW_BASE_PATH, 'subwayinfo.csv'))
SDFA = SubwayDistanceFeatureAddition(df=raw, subway_info=subway_info)
raw = SDFA.get_data()
raw.drop(columns=["index", "contract_year_month", "contract_day", "area", "area_price"], inplace=True)
raw.head()


Unnamed: 0,area_m2,contract_type,floor,built_year,latitude,longitude,age,deposit,contract_ymd,recent_deposit,apt_idx,area_m2_price,list_subway_idx_within_1km,nearest_subway_distance,nearest_subway_idx,num_subway_within_1km,category_interchange_within_1km,num_subway_within_500m,category_interchange_within_500m
0,72.32,2,16,2016,36.977063,126.928605,3,17000.0,2019-04-01,-999.0,15,235.066372,[],13424,94,0,0,0,0
1,22.27,2,11,2013,36.989441,126.842601,6,4000.0,2019-04-01,-999.0,36,179.61383,[],20096,40,0,0,0,0
2,59.94,2,9,1996,37.126312,127.070305,23,9000.0,2019-04-01,-999.0,53,150.15015,[],2005,69,0,0,0,0
3,59.76,2,1,1992,36.961599,126.918827,27,3000.0,2019-04-01,-999.0,64,50.200803,[],14892,94,0,0,0,0
4,58.764,2,12,2005,37.589086,127.206811,14,18000.0,2019-04-01,-999.0,67,306.309986,[454],312,454,1,1,1,1


In [None]:
from typing import List
from tqdm import tqdm

'''
해당 데이터는  all_apt_idx_recent_deposit.csv 기반으로 만들어진 데이터 입니다.
'''
columns = raw.columns
for i in tqdm(range(0, raw.shape[0])):
    subway_ls_i: List[int] = raw.loc[i, "list_subway_idx_within_1km"]
    for e in subway_ls_i:
        subway_id = f"sb_{e}"
        if subway_id not in columns:
            raw[subway_id] = 0
            columns = raw.columns
        raw.loc[i, subway_id] = 1
# raw.drop(columns=["nearest_subway_distance", "nearest_subway_idx", "list_subway_idx_within_1km"], inplace=True)
raw.head()
# raw.to_csv("V1.csv", index=False)

In [None]:
raw["recent_price"] = raw["recent_deposit"] / raw["area"]

In [None]:
corr_raw = raw.drop(
    columns=["contract_ymd", "nearest_subway_distance", "nearest_subway_idx", "list_subway_idx_within_1km", "latitude",
             "longitude"])
corr_result = None
target = "area_m2_price"
for c in tqdm(corr_raw.columns):
    if c == target:
        continue
    corr_row = corr_raw[[target, c]].corr()[1:]
    if corr_result is None:
        corr_result = corr_row
        continue

    corr_result = pd.concat([corr_result, corr_row], axis=0)[target]

corr_result.to_csv(f"corr_{target}_subway_one_hot.csv")
corr_result
# corr_raw

In [39]:
raw = raw[
    corr_result[corr_result > 0.08].index.append(
        "contract_ymd")]  #.to_csv("V1_apt_idx_recent_deposit_sb_one_hot_corr_ap_8.csv", index=False)

In [41]:

train_data = raw.loc[raw["deposit"] != -999]
test_data = raw.loc[raw["deposit"] == -999]

In [42]:
print("train, test_data, sample_submission shape : ", train_data.shape, test_data.shape, sample_submission.shape)
print("train data 상단 5개 정보 확인 : ", train_data.head())
print("test data 상단 5개 정보 확인 : ", test_data.head())
print("sample_submission 상단 5개 정보 확인 : ", sample_submission.head())
print('train data 변수 요약 정보 확인 : ', train_data.describe())
print("test data 변수 요약 정보 확인 : ", test_data.describe())



train, test_data, sample_submission shape :  (1801228, 45) (150172, 45) (150172, 2)
train data 상단 5개 정보 확인 :     floor  built_year  deposit  recent_deposit  apt_idx  num_subway_within_1km  \
0     16        2016  17000.0          -999.0       15                      0   
1     11        2013   4000.0          -999.0       36                      0   
2      9        1996   9000.0          -999.0       53                      0   
3      1        1992   3000.0          -999.0       64                      0   
4     12        2005  18000.0          -999.0       67                      1   

   category_interchange_within_1km  num_subway_within_500m  \
0                                0                       0   
1                                0                       0   
2                                0                       0   
3                                0                       0   
4                                1                       1   

   category_interchange_within

In [44]:
# holdout_start = "2023-07-01"
# holdout_end = "2023-12-01"
# holdout_data = train_data[
#     (train_data['contract_ymd'] >= holdout_start) & (train_data['contract_ymd'] <= holdout_end)]
# train_data = train_data[
#     ~((train_data['contract_ymd'] >= holdout_start) & (train_data['contract_ymd'] <= holdout_end))]
# # 학습 데이터와 정답 데이터 분리
target = "area_m2_price"
train_data
X_train = train_data.drop(columns=[target, ])
y_train = train_data[target]
X_holdout = train_data.drop(columns=[target])
y_holdout = train_data[target]
X_test = test_data.copy()
# LightGBM 모델 훈련
lgb_model = lgb.LGBMRegressor(random_state=RANDOM_SEED)
lgb_model.fit(X_train, y_train)
# Holdout 데이터셋에 대한 성능 확인
lgb_holdout_pred = lgb_model.predict(X_holdout)
lgb_holdout_mae = mean_absolute_error(y_holdout, lgb_holdout_pred)
print("Holdout 데이터셋 성능:")
print(f"LightGBM MAE: {lgb_holdout_mae:.2f}")
# Sample Submission 제출하기
lgb_test_pred = lgb_model.predict(X_test)
sample_submission['deposit'] = lgb_test_pred
sample_submission.to_csv('output.csv', index=False, encoding='utf-8-sig')

KeyError: "['area_price'] not found in axis"