## 라이브러리 불러오기

In [1]:
import os
# import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from sklearn.neighbors import BallTree
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Data load

In [3]:
data_path: str = "~/house/data"
train_data: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv"))
test_data: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv"))
sample_submission: pd.DataFrame = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))

In [4]:
# 금리, 지하철, 학교, 공원 정보 불러오기
interest_data: pd.DataFrame = pd.read_csv(os.path.join(data_path, "interestRate.csv"))
subway_data: pd.DataFrame = pd.read_csv(os.path.join(data_path, "subwayInfo.csv"))
school_data: pd.DataFrame = pd.read_csv(os.path.join(data_path, "schoolinfo.csv"))
park_data: pd.DataFrame = pd.read_csv(os.path.join(data_path, "parkInfo.csv"))

## 데이터 병합하기

### interestRate

In [5]:
merged_train = pd.merge(train_data, interest_data, left_on="contract_year_month", right_on="year_month", how="left")
merged_train = merged_train.drop(columns=["year_month"])
print(merged_train.head())

   index  area_m2  contract_year_month  contract_day  contract_type  floor  \
0      0  84.9981               201906            25              2      9   
1      1  84.9981               202003            26              2     20   
2      2  84.9981               202003            28              2      8   
3      3  59.3400               201907            15              2      1   
4      4  59.8100               201904            12              2      6   

   built_year   latitude   longitude  age  deposit  interest_rate  
0        2019  37.054314  127.045216    0  17000.0           1.78  
1        2019  37.054314  127.045216    1  23000.0           1.26  
2        2019  37.054314  127.045216    1  23000.0           1.26  
3        1986  36.964647  127.055847   33   5000.0           1.68  
4        1995  36.972390  127.084514   24   1800.0           1.85  


In [6]:
merged_test = pd.merge(test_data, interest_data, left_on="contract_year_month", right_on="year_month", how="left")
merged_test = merged_test.drop(columns=["year_month"])
print(merged_test.head())

   index  area_m2  contract_year_month  contract_day  contract_type  floor  \
0      0   84.961               202404            12              1     14   
1      1   59.900               202404            13              0      4   
2      2   39.270               202404            29              0      5   
3      3   39.270               202405             3              0      1   
4      4   46.980               202406             2              0      4   

   built_year   latitude   longitude  age  interest_rate  
0        2016  36.965423  127.048779    8           3.54  
1        1997  36.963105  127.040678   27           3.54  
2        1990  36.957089  127.047449   34           3.54  
3        1990  36.957089  127.047449   34           3.56  
4        1990  36.957089  127.047449   34            NaN  


### sklearn의 BallTree를 활용한 haversine 거리 계산 방식

In [7]:
def find_closest_distance_haversine(train_data, loc_df):
    # degree->radian 값으로 변환 for 삼각함수
    train_coords = np.radians(train_data[["latitude", "longitude"]].values)
    loc_coords = np.radians(loc_df[["latitude", "longitude"]].values)

    tree = BallTree(loc_coords, metric="haversine") # Ball Tree 생성 

    distances, indices = tree.query(train_coords, k=1) #가까운 1 지점만
    distances_meter = distances * 6371000 # meter값으로 변환

    closest_coords = loc_df[["latitude", "longitude"]].iloc[indices.flatten()].values # 가까운 지점 좌표

    result_df = pd.DataFrame({
        "index" : train_data.index,
        "closest_distance" : distances_meter.flatten(),
        "closest_latitude" : closest_coords[:, 0],
        "closest_longtitude" : closest_coords[:, 1]
    })

    return result_df


In [8]:
subway_result = find_closest_distance_haversine(train_data, subway_data)
subway_result.columns = ["index", "nearest_subway_distance", "nearest_subway_latitude", "nearest_subway_longtitude"]
train_data = pd.merge(train_data, subway_result, on="index")

In [9]:
subway_result = find_closest_distance_haversine(test_data, subway_data)
subway_result.columns = ["index", "nearest_subway_distance", "nearest_subway_latitude", "nearest_subway_longtitude"]
test_data = pd.merge(test_data, subway_result, on="index")

In [10]:
school_result = find_closest_distance_haversine(train_data, school_data)
school_result.columns = ["index", "nearest_school_distance", "nearest_school_latitude", "nearest_school_longtitude"]
train_data = pd.merge(train_data, school_result, on="index")

In [11]:
school_result = find_closest_distance_haversine(test_data, school_data)
school_result.columns = ["index", "nearest_school_distance", "nearest_school_latitude", "nearest_school_longtitude"]
test_data = pd.merge(test_data, school_result, on="index")

In [12]:
park_result = find_closest_distance_haversine(train_data, park_data)
park_result.columns = ["index", "nearest_park_distance", "nearest_park_latitude", "nearest_park_longtitude"]
train_data = pd.merge(train_data, park_result, on="index")

In [13]:
park_result = find_closest_distance_haversine(test_data, park_data)
park_result.columns = ["index", "nearest_park_distance", "nearest_park_latitude", "nearest_park_longtitude"]
test_data = pd.merge(test_data, park_result, on="index")

In [14]:
on = merged_train.columns.drop("interest_rate").tolist() # 병합 기준이 될 column 리스트
train_data = pd.merge(merged_train, train_data, on=on, how="left")
train_data = train_data.drop(columns=["index"])
train_data

Unnamed: 0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,interest_rate,nearest_subway_distance,nearest_subway_latitude,nearest_subway_longtitude,nearest_school_distance,nearest_school_latitude,nearest_school_longtitude,nearest_park_distance,nearest_park_latitude,nearest_park_longtitude
0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,17000.0,1.78,716.952948,37.056496,127.052819,156.120431,37.053232,127.046337,498.618918,37.051333,127.041019
1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,23000.0,1.26,716.952948,37.056496,127.052819,156.120431,37.053232,127.046337,498.618918,37.051333,127.041019
2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,23000.0,1.26,716.952948,37.056496,127.052819,156.120431,37.053232,127.046337,498.618918,37.051333,127.041019
3,59.3400,201907,15,2,1,1986,36.964647,127.055847,33,5000.0,1.68,3897.279708,36.990726,127.085159,214.559689,36.962943,127.056980,169.839678,36.963502,127.054582
4,59.8100,201904,12,2,6,1995,36.972390,127.084514,24,1800.0,1.85,2039.685349,36.990726,127.085159,1708.489263,36.987746,127.085154,382.401815,36.971743,127.088742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1801223,114.8126,202311,25,0,5,2010,37.528394,126.659398,13,39000.0,4.00,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,398.113485,37.528189,126.654891
1801224,101.9088,202311,28,0,6,2010,37.528394,126.659398,13,38000.0,4.00,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,398.113485,37.528189,126.654891
1801225,114.7900,202312,3,0,19,2010,37.528394,126.659398,13,37000.0,3.84,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,398.113485,37.528189,126.654891
1801226,101.9088,202312,4,1,15,2010,37.528394,126.659398,13,34400.0,3.84,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,398.113485,37.528189,126.654891


In [15]:
on = merged_test.columns.drop("interest_rate").tolist() # 병합 기준이 될 column 리스트
test_data = pd.merge(merged_test, test_data, on=on, how="left")
test_data = test_data.drop(columns=["index"])
test_data

Unnamed: 0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,interest_rate,nearest_subway_distance,nearest_subway_latitude,nearest_subway_longtitude,nearest_school_distance,nearest_school_latitude,nearest_school_longtitude,nearest_park_distance,nearest_park_latitude,nearest_park_longtitude
0,84.9610,202404,12,1,14,2016,36.965423,127.048779,8,3.54,4284.771362,36.990726,127.085159,779.057020,36.962943,127.056980,288.442840,36.964653,127.045679
1,59.9000,202404,13,0,4,1997,36.963105,127.040678,27,3.54,5004.558790,36.990726,127.085159,1448.494091,36.962943,127.056980,153.733042,36.961730,127.040502
2,39.2700,202404,29,0,5,1990,36.957089,127.047449,34,3.54,5021.183886,36.990726,127.085159,808.416163,36.950687,127.051762,272.286038,36.959505,127.047945
3,39.2700,202405,3,0,1,1990,36.957089,127.047449,34,3.56,5021.183886,36.990726,127.085159,808.416163,36.950687,127.051762,272.286038,36.959505,127.047945
4,46.9800,202406,2,0,4,1990,36.957089,127.047449,34,,5021.183886,36.990726,127.085159,808.416163,36.950687,127.051762,272.286038,36.959505,127.047945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150167,115.5101,202402,27,0,17,2010,37.528394,126.659398,14,3.62,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,398.113485,37.528189,126.654891
150168,142.8738,202403,2,0,4,2010,37.528394,126.659398,14,3.59,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,398.113485,37.528189,126.654891
150169,142.8738,202403,16,1,13,2010,37.528394,126.659398,14,3.59,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,398.113485,37.528189,126.654891
150170,114.9285,202403,22,1,2,2010,37.528394,126.659398,14,3.59,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,398.113485,37.528189,126.654891


## EDA

In [None]:
print("train, test_data, sample_submission shape : ", train_data.shape, test_data.shape, sample_submission.shape)
print("train data 상단 5개 정보 확인 : ", train_data.head())
print("test data 상단 5개 정보 확인 : ", test_data.head())
print("sample_submission 상단 5개 정보 확인 : ", sample_submission.head())


In [None]:
print("train data 변수 요약 정보 확인 : ", train_data.describe())
print("test data 변수 요약 정보 확인 : ", test_data.describe())

### 연속형 변수 분포(히스토그램)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

train_data["area_m2"].hist(bins=30)

In [None]:
train_data["age"].hist(bins=30)

In [None]:
train_data.hist(bins=20, figsize=(15, 10))
plt.tight_layout()
plt.show()

- skewed 변수 : deposit, nearest~distance 변수 -> log 변환필요/ 이상치 

In [21]:
train_data["log_deposit"] = np.log1p(train_data["deposit"]) # X

In [None]:
train_data["log_deposit"].hist(bins=20, figsize=(6, 4))
plt.xlabel("Log-transformed deposit")
plt.show()

In [23]:
# X
train_data["log_nearest_subway_distance"] = np.log1p(train_data["nearest_subway_distance"])
train_data["log_nearest_school_distance"] = np.log1p(train_data["nearest_school_distance"])
train_data["log_nearest_park_distance"] = np.log1p(train_data["nearest_park_distance"])

### 변수 간 상관관계

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(train_data.corr(), annot=True, cmap="coolwarm", annot_kws={"size":8}, fmt=".2f")
plt.show()

1. age와 built_year -> 0.99 ; 둘 중 하나만 써도 될 것

2. 위도 경도와 근접 위치 간 상관관계가 1 ; 문제 없는건지 궁금!

=> 괜찮다면, **특정 거리 이내에 있는지 여부**?(이산화된 변수)

3. deposit(타겟) 과 area_m2 -> 0.52 ; 면적이 클수록 deposit이 증가

    => **area_m2을 범주화**해서 파생변수 생성 ?

### Scatter Plot

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(15, 10))

sns.scatterplot(x="area_m2", y="deposit", data=train_data, ax=axs[0, 0])
axs[0,0].set_title("Area vs Deposit")

sns.scatterplot(x="floor", y="deposit", data=train_data, ax=axs[0, 1])
axs[0,1].set_title("Floor vs Deposit")

sns.scatterplot(x="age", y="deposit", data=train_data, ax=axs[1, 0])
axs[1,0].set_title("Age vs Deposit")

sns.scatterplot(x="built_year", y="deposit", data=train_data, ax=axs[1, 1])
axs[1,1].set_title("Built_year vs Deposit")

plt.tight_layout()
plt.show()

1. 면적이 클수록 deposit이 높아지는 경향

2. 층수 - 뚜렷한 패턴 X (보통 높을수록 가격 높아지지 않나?)

3. 건물의 나이가 적을수록 deposit이 높아지는 경향 

4. 최근에 지어진 건물일 수록 deposit이 높음

#### 거리 변수

In [None]:
distance_columns = ["nearest_subway_distance", "nearest_school_distance", "nearest_park_distance"]
# X
# 각 거리 변수와 deposit 간의 산점도 시각화
for col in distance_columns:
    plt.figure(figsize=(6, 4))
    #sns.scatterplot(x=train_data[col], y=train_data["deposit"])
    sns.scatterplot(x=np.log1p(train_data[col]), y=train_data["deposit"])
    plt.title(f"{col} vs Deposit")
    plt.show()

#### 특정 거리 이내 여부

In [None]:
# 임계값 설정 (예: 800m 이내 여부) # X
threshold = 800  # 0.8km

# 각 거리 변수에 대해 이산 변수 생성
for col in distance_columns:
    train_data[f"{col}_within_{threshold}m"] = (train_data[col] <= threshold).astype(int)

# 결과 확인
print(train_data[[f"{col}_within_{threshold}m" for col in distance_columns]].head())


In [None]:
# 이산화된 변수와 deposit 간의 상관관계 분석
binary_distance_columns = [f"{col}_within_{threshold}m" for col in distance_columns]
correlation_with_deposit_binary = train_data[binary_distance_columns + ["deposit"]].corr()["deposit"].drop("deposit")
print(correlation_with_deposit_binary)


In [None]:
# 상호작용 변수 생성
train_data["count_within_800m"] = (
    train_data["nearest_subway_distance_within_800m"] +
    train_data["nearest_school_distance_within_800m"] +
    train_data["nearest_park_distance_within_800m"]
).astype(int)

# 결과 확인
print(train_data)

In [None]:
# 상관관계 분석
correlation = train_data[["count_within_800m", "deposit"]].corr()
print(correlation)

In [None]:
plt.figure(figsize=(10, 6))
sns.barplot(x="count_within_800m", y="deposit", data=train_data)
plt.title("Count_within_800m vs deposit")
plt.xlabel("Count_within_800m")
plt.ylabel("deposit")
plt.grid(axis="y")
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x="count_within_800m", y="deposit", data=train_data)
plt.title("Count_within_800m vs deposit")
plt.xlabel("Count_within_800m")
plt.ylabel("Deposit")
plt.grid(axis="y")
plt.show()

### 시간에 따른 패턴 분석

In [None]:
train_data["contract_year_month"] = pd.to_datetime(train_data["contract_year_month"], format="%Y%m")
train_data.groupby("contract_year_month")["deposit"].mean().plot()

- 2019 ~ 2023까지 deposit의 평균값이 전반적으로 상승
- 2022년의 변동성 : 부동산 규제 완화, 금리 인상, 수요 감소..?
>> 금리 인상-> interest_rate 와 추세 비교
[ interest_rate *5000+30000 -> 스케일 임의 조정 ]

In [None]:
# 평균 deposit과 interest_rate 계산
mean_data = train_data.groupby("contract_year_month").agg({
    "deposit": "mean",
    "interest_rate": "mean"
}).reset_index()

# 시각화
plt.figure(figsize=(14, 7))
plt.plot(mean_data["contract_year_month"], mean_data["deposit"], label="Mean Deposit", color="blue")
plt.plot(mean_data["contract_year_month"], mean_data["interest_rate"] * 5000 + 30000, label="Mean Interest Rate", color="orange")

# 플롯 설정
plt.title("Mean Deposit and Interest Rate Over Time")
plt.xlabel("Contract Year Month")
plt.ylabel("Value")
plt.legend()
plt.xticks(rotation=45)
plt.grid()
plt.tight_layout()
plt.show()

### area_m2 변수 활용

In [None]:
print(train_data["area_m2"].min())
print(train_data["area_m2"].max())

In [None]:
# area_m2를 범주화하여 임시 변수 생성
bins = [0, 50, 100, 150, 200, float("inf")]  # 범주 구간 설정
labels = ["0-50", "51-100", "101-150", "151-200", "201 이상"]
train_data["area_category"] = pd.cut(train_data["area_m2"], bins=bins, labels=labels, right=False)

# 범주화된 변수와 deposit 간의 평균 관계 시각화
plt.figure(figsize=(10, 6))
sns.barplot(x="area_category", y="deposit", data=train_data, estimator=lambda x: sum(x) / len(x))
plt.title("Average Deposit by Area Category")
plt.xlabel("Area Category (m2)")
plt.ylabel("Average Deposit")
plt.grid(axis="y")
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x="area_category", y="deposit", data=train_data)
plt.title("Deposit by Area Category")
plt.xlabel("Area Category (m2)")
plt.ylabel("Deposit")
plt.grid(axis="y")
plt.show()

# train

## train_data 변수 생성

In [16]:
train_data["log_deposit"] = np.log1p(train_data["deposit"])
train_data["log_floor"] = np.log1p(train_data["floor"])
train_data["log_area_m2"] = np.log1p(train_data["area_m2"])
train_data["log_nearest_subway_distance"] = np.log1p(train_data["nearest_subway_distance"])
train_data["log_nearest_school_distance"] = np.log1p(train_data["nearest_school_distance"])
train_data["log_nearest_park_distance"] = np.log1p(train_data["nearest_park_distance"])

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [17]:
train_data["log_latitude"] = np.log1p(train_data["latitude"])
train_data["log_longitude"] = np.log1p(train_data["longitude"])

In [18]:
distance_columns = ["nearest_subway_distance", "nearest_school_distance", "nearest_park_distance"]
# 임계값 설정 (예: 800m 이내 여부)
threshold = 800  # 0.8km

for col in distance_columns:
    train_data[f"{col}_within_{threshold}m"] = (train_data[col] <= threshold).astype(int)

In [19]:
train_data["count_within_800m"] = (
    train_data["nearest_subway_distance_within_800m"] +
    train_data["nearest_school_distance_within_800m"] +
    train_data["nearest_park_distance_within_800m"]
).astype(int)

In [20]:
#train_data["area_category"] = pd.cut(test_data["area_m2"], bins=bins, labels=labels, right=False)

## test_data 변수 생성

In [21]:
#test_data["log_deposit"] = np.log1p(test_data["deposit"])
test_data["log_floor"] = np.log1p(test_data["floor"])
test_data["log_area_m2"] = np.log1p(test_data["area_m2"])
test_data["log_nearest_subway_distance"] = np.log1p(test_data["nearest_subway_distance"])
test_data["log_nearest_school_distance"] = np.log1p(test_data["nearest_school_distance"])
test_data["log_nearest_park_distance"] = np.log1p(test_data["nearest_park_distance"])

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [22]:
test_data["log_latitude"] = np.log1p(test_data["latitude"])
test_data["log_longitude"] = np.log1p(test_data["longitude"])

In [23]:
for col in distance_columns:
    test_data[f"{col}_within_{threshold}m"] = (test_data[col] <= threshold).astype(int)

In [24]:
test_data["count_within_800m"] = (
    test_data["nearest_subway_distance_within_800m"] +
    test_data["nearest_school_distance_within_800m"] +
    test_data["nearest_park_distance_within_800m"]
).astype(int)

In [25]:
#test_data["area_category"] = pd.cut(test_data["area_m2"], bins=bins, labels=labels, right=False)

In [26]:
train_data.describe().T

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
area_m2,1801228.0,75.18833,25.526325,10.3215,59.75,77.15,84.96,317.36
contract_year_month,1801228.0,202133.1,135.281455,201904.0,202008.0,202111.0,202212.0,202312.0
contract_day,1801228.0,15.89943,8.625998,1.0,9.0,16.0,23.0,31.0
contract_type,1801228.0,1.203805,0.885279,0.0,0.0,2.0,2.0,2.0
floor,1801228.0,10.05197,6.973359,-4.0,5.0,9.0,14.0,68.0
built_year,1801228.0,2004.199,11.151317,1961.0,1995.0,2004.0,2015.0,2024.0
latitude,1801228.0,37.47721,0.162511,36.91791,37.379762,37.50227,37.582054,38.18194
longitude,1801228.0,126.9662,0.170122,126.4787,126.842159,126.997594,127.088811,127.6609
age,1801228.0,17.06597,11.127956,-3.0,7.0,17.0,26.0,62.0
deposit,1801228.0,38162.23,26103.774198,300.0,21000.0,32000.0,47800.0,1100000.0


## column 추출

In [27]:
columns_needed = ["log_area_m2", "contract_year_month", "log_floor", "built_year", "log_deposit", "log_latitude", "log_longitude", "interest_rate", "log_nearest_subway_distance", "log_nearest_school_distance", "log_nearest_park_distance", "count_within_800m"]
columns_needed_test = ["log_area_m2", "contract_year_month", "log_floor", "built_year", "log_latitude", "log_longitude", "interest_rate", "log_nearest_subway_distance", "log_nearest_school_distance", "log_nearest_park_distance", "count_within_800m"]
train_data = train_data[columns_needed]
test_data = test_data[columns_needed_test]

## Holdout 데이터셋 설정 
baseline 코드

In [28]:
holdout_start = 202307
holdout_end = 202312
holdout_data = train_data[(train_data["contract_year_month"] >= holdout_start) & (train_data["contract_year_month"] <= holdout_end)]
train_data = train_data[~((train_data["contract_year_month"] >= holdout_start) & (train_data["contract_year_month"] <= holdout_end))]

## 학습 데이터와 정답 데이터 분리

In [29]:
X_train = train_data.drop(columns=["log_deposit"])
y_train = train_data["log_deposit"]
X_holdout = holdout_data.drop(columns=["log_deposit"])
y_holdout = holdout_data["log_deposit"]
X_test = test_data.copy()

In [30]:
print(f"X_holdout shape: {X_holdout.shape}")

X_holdout shape: (206866, 11)


## LightGBM 모델 훈련

In [31]:
from sklearn.model_selection import KFold, cross_val_predict
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [33]:
def objective(trial):
    param = {
        "verbose" : -1,
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "learning_rate" : trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves" : trial.suggest_int("num_leaves", 20, 100),
        "max_depth" : trial.suggest_int("max_depth", 3, 10), # 과적합 방지
        #"min_child_weight" : trial.suggest_int("min_child_weight", 1, 5), # 과소적합 방지
        #"subsample" : trial.suggest_uniform("subsample", 0.5, 1.0), # 데이터 샘플링 비율, 과적합 방지
        #"colsample_bytree" : trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
        "random_state" : 42
    }

    model = lgb.LGBMRegressor(**param)
    
    y_pred = cross_val_predict(model, X_train, y_train, cv=kf)

    mae = mean_absolute_error(y_train, y_pred)


    return mae

In [34]:
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=50)

print("Best Hyperparameters : ", study.best_params)

[I 2024-10-09 22:12:07,165] A new study created in memory with name: no-name-f8290c00-85dd-49d8-a368-6f2fdbe00296
[I 2024-10-09 22:12:27,511] Trial 0 finished with value: 0.14639281987112557 and parameters: {'n_estimators': 144, 'learning_rate': 0.28570714885887566, 'num_leaves': 79, 'max_depth': 7}. Best is trial 0 with value: 0.14639281987112557.
[I 2024-10-09 22:12:40,880] Trial 1 finished with value: 0.1981048592205889 and parameters: {'n_estimators': 89, 'learning_rate': 0.055238410897498764, 'num_leaves': 24, 'max_depth': 9}. Best is trial 0 with value: 0.14639281987112557.
[I 2024-10-09 22:13:00,473] Trial 2 finished with value: 0.15972712785878457 and parameters: {'n_estimators': 200, 'learning_rate': 0.21534104756085318, 'num_leaves': 21, 'max_depth': 10}. Best is trial 0 with value: 0.14639281987112557.
[I 2024-10-09 22:13:23,354] Trial 3 finished with value: 0.18402424715937066 and parameters: {'n_estimators': 258, 'learning_rate': 0.07157834209670008, 'num_leaves': 34, 'max

Best Hyperparameters :  {'n_estimators': 286, 'learning_rate': 0.26962704251588226, 'num_leaves': 88, 'max_depth': 8}


In [35]:
best_params = study.best_params
# best_params = {"verbose": -1, "n_estimators": 286, "learning_rate": 0.26962704251588226, "num_leaves": 88, "max_depth": 8, "random_state" : 42}
lgb_model = lgb.LGBMRegressor(**best_params)

lgb_model.fit(X_train, y_train)

## Holdout 데이터셋에 대한 성능 확인

In [36]:
lgb_holdout_pred = lgb_model.predict(X_holdout)
lgb_holdout_mae = mean_absolute_error(y_holdout, lgb_holdout_pred)
print("Holdout 데이터셋 성능:")
print(f"LightGBM MAE: {lgb_holdout_mae}")

Holdout 데이터셋 성능:
LightGBM MAE: 0.12161249122188127


In [37]:
lgb_test_pred_log = lgb_model.predict(X_test)
lgb_test_pred = np.expm1(lgb_test_pred_log)

# Sample Submission 제출하기

In [39]:
sample_submission["deposit"] = lgb_test_pred
sample_submission.to_csv("output.csv", index=False, encoding="utf-8-sig")