In [13]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

import wandb

In [14]:
wandb.login(key='6004e665028de7fe1304088c8f08e6a7abddab05')
wandb.init(project='level2_rf_hyperparameter_tuning')



VBox(children=(Label(value='0.040 MB of 0.040 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [16]:
BASE_PATH = os.path.join("../..", "data")

train_data = pd.read_csv(os.path.join(BASE_PATH, "v4_baek+recent+rank.csv"))

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1951400 entries, 0 to 1951399
Data columns (total 35 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   apt_idx                               int64  
 1   index                                 int64  
 2   area_m2                               float64
 3   contract_type                         int64  
 4   floor                                 int64  
 5   built_year                            int64  
 6   latitude                              float64
 7   longitude                             float64
 8   age                                   int64  
 9   deposit                               float64
 10  _type                                 object 
 11  area_m2_price                         float64
 12  recent_deposit                        float64
 13  contract_ymd                          object 
 14  grid_id                               float64
 15  nearest_park_di

In [4]:
# train_data["contract_ymd"] = pd.to_datetime(train_data["contract_ymd"], format="%Y-%m-%d")

# "_type" 칼럼을 기준으로 train, test 데이터 분리
train_df = train_data[train_data["_type"] == "train"]
test_df = train_data[train_data["_type"] == "test"]

# 타겟 변수인 deposit과 불필요하다고 판단되는 칼럼 제거
X_train = train_df.drop(columns=["deposit", "index", "_type", 'apt_idx', 'contract_ymd'])
y_train = train_df['deposit']
X_test = train_df.drop(columns=["deposit", "index", "_type", 'apt_idx', 'contract_ymd'])

# baseline code

In [4]:
BASE_PATH = os.path.join("../..", "data")

train_data = pd.read_csv(os.path.join(BASE_PATH, 'raw', "train.csv"))
test_data = pd.read_csv(os.path.join(BASE_PATH, 'raw', "test.csv"))
sample_submission = pd.read_csv(os.path.join(BASE_PATH, 'raw', "sample_submission.csv"))

In [5]:
columns_needed = ['area_m2', 'contract_year_month', 'contract_day', 'contract_type', 'floor', 'latitude', 'longitude',
                  'deposit']
columns_needed_test = ['area_m2', 'contract_year_month', 'contract_day', 'contract_type', 'floor', 'latitude',
                       'longitude']
train_data = train_data[columns_needed]
test_data = test_data[columns_needed_test]

In [6]:
holdout_start = 202307
holdout_end = 202312
holdout_data = train_data[
    (train_data['contract_year_month'] >= holdout_start) & (train_data['contract_year_month'] <= holdout_end)]
train_data = train_data[
    ~((train_data['contract_year_month'] >= holdout_start) & (train_data['contract_year_month'] <= holdout_end))]

In [7]:
X_train = train_data.drop(columns=['deposit'])
y_train = train_data['deposit']
X_holdout = holdout_data.drop(columns=['deposit'])
y_holdout = holdout_data['deposit']
X_test = test_data.copy()

In [8]:
# 하이퍼파라미터 그리드 설정
param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [12]:
# 모델 초기화
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# 랜덤 서치 수행
rf_random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=1,
    scoring='neg_mean_absolute_error'
)

rf_random_search.fit(X_train, y_train)

# 각 실험 결과를 WandB에 기록
cv_results = rf_random_search.cv_results_
for i in range(len(cv_results['params'])):
    wandb.log({
        'params': cv_results['params'][i],
        'mean_test_score': -cv_results['mean_test_score'][i],  # 음수 MSE를 양수로 변환
        'std_test_score': cv_results['std_test_score'][i],
        'rank_test_score': cv_results['rank_test_score'][i]
    })

# 모델 학습

wandb.log({
    'best_params': rf_random_search.best_params_,
    'best_score': rf_random_search.best_score_
})

wandb.finish()

# 최적의 모델 출력
best_rf = rf_random_search.best_estimator_

print("최적의 파라미터: ", rf_random_search.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=True, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   0.0s
[CV] END bootstrap=True, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200;

KeyboardInterrupt: 

In [19]:
y_pred_test = rf.predict(X_holdout)

mae = mean_absolute_error(y_holdout, y_pred_test)

print(f"Holdout MAE: {mae:.2f}")

Holdout MAE: 4725.01


In [18]:
y_pred_test.mean()

38890.92249744313

# v5 진행

In [5]:
v5_data = pd.read_csv(os.path.join(BASE_PATH, "V5-2024-10-16-YDS-APT-RANK-RecentD.csv"))

- rf 학습시에 원핫인코딩 피처 사용 이상 없음 확인
- contract_type이 범주형 변수임에 따라, 원핫인코딩으로 변경

In [6]:
v5_edit_data = v5_data.copy()

v5_edit_data= pd.get_dummies(v5_edit_data, columns=['contract_type'], prefix='contract_type')

In [7]:
# 'contract_ymd'를 datetime 형식으로 변환한 후, YYYYMM 형식으로 변경하고 int64로 변환
v5_edit_data['contract_year_month'] = pd.to_datetime(v5_edit_data['contract_ymd'], format='%Y-%m-%d').dt.strftime('%Y%m').astype(np.int64)

In [8]:
numeric_v5 = v5_edit_data.drop(columns=['_type', 'contract_ymd'])
v5_corr = numeric_v5.corr()
v5_corr['deposit']

index                                   0.217950
area_m2                                 0.521623
floor                                   0.132336
built_year                              0.144960
latitude                                0.075605
longitude                               0.157374
age                                    -0.136479
deposit                                 1.000000
apt_idx                                 0.220488
area                                    0.521651
area_m2_price                           0.784008
nearest_subway_distance                -0.216313
nearest_subway_idx                     -0.048851
num_subway_within_1km                   0.314987
category_interchange_within_1km         0.292699
num_subway_within_500m                  0.221344
category_interchange_within_500m        0.208316
nearest_park_distance                  -0.071564
nearest_park_within_500.0m             -0.009379
has_park_within_500.0m                  0.065762
nearest_elementary_s

In [9]:
v5_train = v5_edit_data[v5_edit_data["_type"] == "train"]
v5_test = v5_edit_data[v5_edit_data["_type"] == "test"]

print('v5_train shape:', v5_train.shape)
print('v5_test shape:', v5_test.shape)

v5_train shape: (1801228, 41)
v5_test shape: (150172, 41)


In [10]:
holdout_start = 202307
holdout_end = 202312
v5_holdout_data = v5_train[
    (v5_train['contract_year_month'] >= holdout_start) & (v5_train['contract_year_month'] <= holdout_end)]
v5_train = v5_train[
    ~((v5_train['contract_year_month'] >= holdout_start) & (v5_train['contract_year_month'] <= holdout_end))]

In [162]:
print('v5_holdout_data shape:', v5_holdout_data.shape)
print('v5_train shape:', v5_train.shape)

v5_holdout_data shape: (206866, 41)
v5_train shape: (1594362, 41)


In [167]:
v5_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1594362 entries, 0 to 1801187
Data columns (total 41 columns):
 #   Column                                Non-Null Count    Dtype  
---  ------                                --------------    -----  
 0   index                                 1594362 non-null  int64  
 1   _type                                 1594362 non-null  object 
 2   area_m2                               1594362 non-null  float64
 3   floor                                 1594362 non-null  int64  
 4   built_year                            1594362 non-null  int64  
 5   latitude                              1594362 non-null  float64
 6   longitude                             1594362 non-null  float64
 7   age                                   1594362 non-null  int64  
 8   deposit                               1594362 non-null  float64
 9   apt_idx                               1594362 non-null  int64  
 10  area                                  1594362 non-null  flo

In [173]:
v5_X_train = v5_train.drop(columns=["deposit", "index", "_type", 'contract_ymd'])
v5_y_train = v5_train['deposit']
v5_X_holdout = v5_holdout_data.drop(columns=["deposit", "index", "_type", 'contract_ymd'])
v5_y_holdout = v5_holdout_data['deposit']
v5_X_test = v5_test.drop(columns=["deposit", "index", "_type", 'contract_ymd'])

In [174]:
print('v5_X_train.shape: ', v5_X_train.shape)
print('v5_y_train.shape: ', v5_y_train.shape)
print('v5_X_holdout.shape: ', v5_X_holdout.shape)
print('v5_y_holdout.shape: ', v5_y_holdout.shape)
print('v5_X_test.shape: ', v5_X_test.shape)

v5_X_train.shape:  (1594362, 37)
v5_y_train.shape:  (1594362,)
v5_X_holdout.shape:  (206866, 37)
v5_y_holdout.shape:  (206866,)
v5_X_test.shape:  (150172, 37)


In [175]:
rf = RandomForestRegressor(n_estimators=50, random_state=42)

rf.fit(v5_X_train, v5_y_train)

In [1]:
# 홀드아웃 데이터를 사용하여 예측 및 평가
v5_y_pred_holdout = rf.predict(v5_X_holdout)
mae = mean_absolute_error(v5_y_holdout, v5_y_pred_holdout)

print(f"Holdout MAE: {mae:.2f}")

NameError: name 'rf' is not defined

In [50]:
v5_test_pred = rf.predict(v5_X_test)

In [53]:
sample_submission['deposit'] = v3_test_pred
sample_submission.to_csv('rf_v2_1.csv', index=False, encoding='utf-8-sig')

In [32]:
print(v3_y_pred.shape)
print(sample_submission.shape)

(206866,)
(150172, 2)


In [52]:
sample_submission.head()

Unnamed: 0,index,deposit
0,0,25630.0
1,1,21410.0
2,2,13657.25
3,3,13657.85
4,4,15482.55
