In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from lightgbm import LGBMRegressor


In [8]:
df = pd.read_csv('./data/final_df.csv')

train = df[df['_type']=='train'].drop(['_type','deposit'],axis=1)

x_data = train.drop('deposit_by_area',axis=1)
y_data = train['deposit_by_area']

x_data = pd.DataFrame(x_data)
y_data = pd.Series(y_data)

# 1. Performance Drop
변수를 하나씩 빼가며 성능의 증감 확인

In [9]:
# 기준 성능 평가 (모든 피처 포함)
model = LGBMRegressor(n_estimators=100)
model.fit(x_data, y_data)
baseline_pred = model.predict(x_data)
baseline_rmse = np.sqrt(mean_squared_error(y_data, baseline_pred))

# 각 피처의 중요도 평가
feature_importance = {}
for feature in x_data.columns:
    # 해당 피처 제거
    x_data_reduced = x_data.drop(columns=[feature])
    
    # 모델 재학습
    model.fit(x_data_reduced, y_data)
    reduced_pred = model.predict(x_data_reduced)
    reduced_rmse = np.sqrt(mean_squared_error(y_data, reduced_pred))
    
    # 성능 감소량 계산
    performance_drop = baseline_rmse - reduced_rmse
    feature_importance[feature] = performance_drop


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5906
[LightGBM] [Info] Number of data points in the train set: 1801228, number of used features: 42
[LightGBM] [Info] Start training from score 508.489415
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.178091 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5651
[LightGBM] [Info] Number of data points in the train set: 1801228, number of used features: 41
[LightGBM] [Info] Start training from score 508.489415
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045590 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] 

In [10]:
importance_df = pd.DataFrame(list(feature_importance.items()), columns=['Feature', 'Performance Drop'])
importance_df_sorted = importance_df.sort_values(by='Performance Drop', ascending=False).reset_index(drop=True)
importance_df_sorted


Unnamed: 0,Feature,Performance Drop
0,closest_high_distance,0.110559
1,month_cos,0.048773
2,floor,0.040641
3,school_count_within_1km,0.016523
4,nearest_park_distance,0.008085
5,interest_rate_diff,0.006709
6,large_park_count_5km,0.004324
7,Is_Outside,0.0
8,weighted_park_score,-0.005717
9,contract_day,-0.008999


In [11]:
list(importance_df_sorted[importance_df_sorted['Performance Drop']>0]['Feature'])

['closest_high_distance',
 'month_cos',
 'floor',
 'school_count_within_1km',
 'nearest_park_distance',
 'interest_rate_diff',
 'large_park_count_5km']

# 2. RFE

In [12]:
def lgbm_rfe_4040_regressor(x_data, y_data, ratio=0.9, min_feats=40):
    feats = x_data.columns.tolist()
    archive = pd.DataFrame(columns=['model', 'n_feats', 'feats', 'score'])
    
    while True:
        model = LGBMRegressor(n_estimators=100)
        x_train, x_val, y_train, y_val = train_test_split(x_data[feats], y_data, random_state=4040)

        model.fit(x_train, y_train, eval_set=[(x_val, y_val)])
        
        val_pred = model.predict(x_val)
        
        score = np.sqrt(mean_absolute_error(y_val, val_pred))  # RMSE 사용
        
        n_feats = len(feats)
        print(n_feats, score)
        
        new_entry = pd.DataFrame({'model': [model], 'n_feats': [n_feats], 'feats': [feats], 'score': [score]})
        archive = pd.concat([archive, new_entry], ignore_index=True)
        
        # 피처 중요도 기반으로 피처 제거
        feat_imp = pd.Series(model.feature_importances_, index=feats).sort_values(ascending=False)
        
        next_n_feats = int(n_feats * ratio)
        
        # 최소 피처 개수보다 작아지면 종료
        if next_n_feats < min_feats:
            break
        else:
            feats = feat_imp.iloc[:next_n_feats].index.tolist()
    
    return archive


In [13]:
archive = lgbm_rfe_4040_regressor(x_data, y_data, ratio=0.9, min_feats=20)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.155346 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5911
[LightGBM] [Info] Number of data points in the train set: 1350921, number of used features: 42
[LightGBM] [Info] Start training from score 508.545301
43 7.687424326119822


  archive = pd.concat([archive, new_entry], ignore_index=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.105057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5577
[LightGBM] [Info] Number of data points in the train set: 1350921, number of used features: 38
[LightGBM] [Info] Start training from score 508.545301
38 7.680103342806581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098113 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5250
[LightGBM] [Info] Number of data points in the train set: 1350921, number of used features: 34
[LightGBM] [Info] Start training from score 508.545301
34 7.678064985117491
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.092983 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4470
[LightGBM] [Info] Number of data points in the train set: 1350921, num

In [14]:
archive

Unnamed: 0,model,n_feats,feats,score
0,LGBMRegressor(),43,"[index, area_m2, contract_year_month, contract...",7.687424
1,LGBMRegressor(),38,"[apt_deposit_mean, area_m2, contract_year_mont...",7.680103
2,LGBMRegressor(),34,"[apt_deposit_mean, area_m2, contract_year_mont...",7.678065
3,LGBMRegressor(),30,"[apt_deposit_mean, area_m2, contract_year_mont...",7.682378
4,LGBMRegressor(),27,"[apt_deposit_mean, area_m2, contract_year_mont...",7.685856
5,LGBMRegressor(),24,"[apt_deposit_mean, area_m2, contract_year_mont...",7.686079
6,LGBMRegressor(),21,"[apt_deposit_mean, area_m2, contract_year_mont...",7.690657


In [15]:
archive['feats'][5]

['apt_deposit_mean',
 'area_m2',
 'contract_year_month',
 'contract_type',
 'built_year',
 'deposit_mean',
 'distance_from_gangnam',
 'index',
 'floor',
 'interest_rate',
 'age',
 'total_large_park_area_10km',
 'cluster_kmeans',
 'longitude',
 'closest_high_distance',
 'latitude',
 'large_park_count_10km',
 'total_park_area_2000m',
 'month_sin',
 'weighted_park_score',
 'nearest_subway_distance_km',
 'school_count_within_1km',
 'interest_rate_diff',
 'subways_within_2km']