## 필요한 패키지 불러오기

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import wandb

## 데이터셋 준비 및 분할

In [2]:
# Set the seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Initialize Weights & Biases
wandb.init(project='real-estate-price-prediction', entity = 'qkfdksdldy')  # 수정 필요


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mqkfdksdldy[0m. Use [1m`wandb login --relogin`[0m to force relogin


CommError: failed to upsert bucket: returned error 403 Forbidden: {"errors":[{"message":"permission denied","path":["upsertBucket"],"extensions":{"code":"PERMISSION_ERROR"}}],"data":{"upsertBucket":null}}

In [2]:
# File paths
file_path = '../data/'

# Load the dataset
train_data = pd.read_csv(file_path + 'train.csv')
test_data = pd.read_csv(file_path + 'test.csv')
sample_submission = pd.read_csv(file_path + 'sample_submission.csv')

columns_needed = ['area_m2', 'contract_year_month', 'contract_day', 'contract_type', 'floor', 'latitude', 'longitude', 'deposit']
columns_needed_test = ['area_m2', 'contract_year_month', 'contract_day', 'contract_type', 'floor', 'latitude', 'longitude']
train_data = train_data[columns_needed]
test_data = test_data[columns_needed_test]

# Holdout 데이터 설정
holdout_start = 202307
holdout_end = 202312
holdout_data = train_data[(train_data['contract_year_month'] >= holdout_start) & (train_data['contract_year_month'] <= holdout_end)]
train_data = train_data[~((train_data['contract_year_month'] >= holdout_start) & (train_data['contract_year_month'] <= holdout_end))]

# Train/Test 데이터 분리
X_train = train_data.drop(columns=['deposit'])
y_train = train_data['deposit']
X_holdout = holdout_data.drop(columns=['deposit'])
y_holdout = holdout_data['deposit']
X_test = test_data.copy()

# Stratified Group K-Fold setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

NameError: name 'RANDOM_SEED' is not defined

## 모델 실행

### light GBM 모델

In [4]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
import wandb

# W&B 초기화
wandb.init(project='real-estate-price-prediction', name='LightGBM')

# LightGBM 하이퍼파라미터 그리드 설정
param_grid = {
    'num_leaves': [50],
    'max_depth': [5],
    'learning_rate': [0.05],
    'n_estimators': [20]
}

# LightGBM 모델 및 GridSearchCV 설정
lgb_model = lgb.LGBMRegressor()
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=2)
grid_search.fit(X_train, y_train)

# 최적의 파라미터 및 성능 출력
print(f"최적의 파라미터: {grid_search.best_params_}")
print(f"최고 MAE: {-grid_search.best_score_:.2f}")

# Holdout 데이터셋 예측
lgb_holdout_pred = grid_search.predict(X_holdout)
lgb_holdout_mae = mean_absolute_error(y_holdout, lgb_holdout_pred)
wandb.log({"LightGBM MAE": lgb_holdout_mae})


[34m[1mwandb[0m: Currently logged in as: [33mqkfdksdldy[0m ([33mqkfdksdldy-sungkyunkwan-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 914
[LightGBM] [Info] Number of data points in the train set: 1275489, number of used features: 7
[LightGBM] [Info] Start training from score 40222.736105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.111486 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 915
[LightGBM] [Info] Number of data points in the train set: 1275489, number of used features: 7
[LightGBM] [Info] Start training from score 39133.097043


Exception ignored on calling ctypes callback function: <function _log_callback at 0x7fcd0cc2a7a0>
Traceback (most recent call last):
  File "/data/ephemeral/home/level2-competitiveds-recsys-05/.venv/lib/python3.11/site-packages/lightgbm/basic.py", line 255, in _log_callback
    def _log_callback(msg: bytes) -> None:
    
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf


### Lasso 모델

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# W&B 초기화
wandb.init(project='real-estate-price-prediction', name='Lasso')

# Lasso 하이퍼파라미터 그리드 설정
param_grid = {
    'alpha': [0.01, 0.1, 1.0, 10.0]
}

# Lasso 모델 및 GridSearchCV 설정
lasso_model = Lasso()
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5)
grid_search.fit(X_train, y_train)

# 최적의 파라미터 및 성능 출력
print(f"최적의 파라미터: {grid_search.best_params_}")
print(f"최고 MAE: {-grid_search.best_score_:.2f}")

# Holdout 데이터셋 예측
lasso_holdout_pred = grid_search.predict(X_holdout)
lasso_holdout_mae = mean_absolute_error(y_holdout, lasso_holdout_pred)
wandb.log({"Lasso MAE": lasso_holdout_mae})



### Ridge 모델

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# W&B 초기화
wandb.init(project='real-estate-price-prediction', name='Ridge')

# Ridge 하이퍼파라미터 그리드 설정
param_grid = {
    'alpha': [0.01, 0.1, 1.0, 10.0]
}

# Ridge 모델 및 GridSearchCV 설정
ridge_model = Ridge()
grid_search = GridSearchCV(estimator=ridge_model, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5)
grid_search.fit(X_train, y_train)

# 최적의 파라미터 및 성능 출력
print(f"최적의 파라미터: {grid_search.best_params_}")
print(f"최고 MAE: {-grid_search.best_score_:.2f}")

# Holdout 데이터셋 예측
ridge_holdout_pred = grid_search.predict(X_holdout)
ridge_holdout_mae = mean_absolute_error(y_holdout, ridge_holdout_pred)
wandb.log({"Ridge MAE": ridge_holdout_mae})

### Random Forest 모델

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# W&B 초기화
wandb.init(project='real-estate-price-prediction', name='Random Forest')

# Random Forest 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5]
}

# Random Forest 모델 및 GridSearchCV 설정
rf_model = RandomForestRegressor(random_state=RANDOM_SEED)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5)
grid_search.fit(X_train, y_train)

# 최적의 파라미터 및 성능 출력
print(f"최적의 파라미터: {grid_search.best_params_}")
print(f"최고 MAE: {-grid_search.best_score_:.2f}")

# Holdout 데이터셋 예측
rf_holdout_pred = grid_search.predict(X_holdout)
rf_holdout_mae = mean_absolute_error(y_holdout, rf_holdout_pred)
wandb.log({"Random Forest MAE": rf_holdout_mae})


### XGBoost 모델

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# W&B 초기화
wandb.init(project='real-estate-price-prediction', name='XGBoost')

# XGBoost 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0]
}

# XGBoost 모델 및 GridSearchCV 설정
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_SEED)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5)
grid_search.fit(X_train, y_train)

# 최적의 파라미터 및 성능 출력
print(f"최적의 파라미터: {grid_search.best_params_}")
print(f"최고 MAE: {-grid_search.best_score_:.2f}")

# Holdout 데이터셋 예측
xgb_holdout_pred = grid_search.predict(X_holdout)
xgb_holdout_mae = mean_absolute_error(y_holdout, xgb_holdout_pred)
wandb.log({"XGBoost MAE": xgb_holdout_mae})

### Linear Regression 모델

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

# W&B 초기화
wandb.init(project='real-estate-price-prediction', name='Linear Regression')

# Linear Regression 모델은 하이퍼파라미터가 없음. 기본 모델 학습
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Holdout 데이터셋 예측
lr_holdout_pred = lr_model.predict(X_holdout)
lr_holdout_mae = mean_absolute_error(y_holdout, lr_holdout_pred)
wandb.log({"Linear Regression MAE": lr_holdout_mae})

print("Holdout 데이터셋 성능:")
print(f"Linear Regression MAE: {lr_holdout_mae:.2f}")

## Streamlit 활용 예시

In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load your models (assumes models are trained and saved)
lgb_model = lgb.Booster(model_file='lightgbm_model.txt')
lasso_model = Lasso()  # Load your trained Lasso model
lasso_model.load('lasso_model.pkl')  # Assuming saved as a pickle
ridge_model = Ridge()  # Load your trained Ridge model
ridge_model.load('ridge_model.pkl')  # Assuming saved as a pickle
rf_model = RandomForestRegressor()  # Load your trained Random Forest model
rf_model.load('rf_model.pkl')  # Assuming saved as a pickle
xgb_model = xgb.XGBRegressor()  # Load your trained XGBoost model
xgb_model.load('xgboost_model.pkl')  # Assuming saved as a pickle
lr_model = LinearRegression()  # Load your trained Linear Regression model
lr_model.load('linear_model.pkl')  # Assuming saved as a pickle

# Streamlit UI setup
st.title("부동산 전세가 예측")

# User input fields
area = st.number_input("면적 (m²)", min_value=1.0, max_value=10000.0, value=50.0)
contract_year_month = st.number_input("계약 연도 및 월 (예: 202307)", min_value=202001, max_value=202312, value=202307)
contract_day = st.number_input("계약 일", min_value=1, max_value=31, value=1)
contract_type = st.selectbox("계약 유형", options=["전세", "월세", "매매"])
floor = st.number_input("층", min_value=0, max_value=50, value=1)
latitude = st.number_input("위도", min_value=36.0, max_value=38.5, value=37.5)
longitude = st.number_input("경도", min_value=126.0, max_value=128.0, value=127.0)

# Predict button
if st.button("예측하기"):
    # Create input DataFrame
    input_data = pd.DataFrame({
        'area_m2': [area],
        'contract_year_month': [contract_year_month],
        'contract_day': [contract_day],
        'contract_type': [contract_type],
        'floor': [floor],
        'latitude': [latitude],
        'longitude': [longitude]
    })
    
    # Process the input data as necessary
    # For example, convert categorical variables to numerical
    input_data['contract_type'] = input_data['contract_type'].map({'전세': 0, '월세': 1, '매매': 2})

    # Make predictions
    lgb_pred = lgb_model.predict(input_data)
    lasso_pred = lasso_model.predict(input_data)
    ridge_pred = ridge_model.predict(input_data)
    rf_pred = rf_model.predict(input_data)
    xgb_pred = xgb_model.predict(input_data)
    lr_pred = lr_model.predict(input_data)

    # Display the results
    st.subheader("예측 결과")
    st.write(f"LightGBM 예측: {lgb_pred[0]:.2f} 원")
    st.write(f"Lasso 예측: {lasso_pred[0]:.2f} 원")
    st.write(f"Ridge 예측: {ridge_pred[0]:.2f} 원")
    st.write(f"Random Forest 예측: {rf_pred[0]:.2f} 원")
    st.write(f"XGBoost 예측: {xgb_pred[0]:.2f} 원")
    st.write(f"Linear Regression 예측: {lr_pred[0]:.2f} 원")

## Submission 제출

In [None]:
lgb_test_pred = lgb_model.predict(X_test)
sample_submission['deposit'] = lgb_test_pred
sample_submission.to_csv('output.csv', index=False, encoding='utf-8-sig')