In [1]:
# 필요한 패키지 설치

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error # MSE
from sklearn.metrics import mean_absolute_error # MAE
from sklearn.metrics import mean_absolute_percentage_error # MAPE
from sklearn.metrics import mean_squared_log_error # MSLE

In [2]:
# 데이터 불러오기
# https://www.kaggle.com/datasets/harlfoxem/housesalesprediction
df = pd.read_csv("datasets/kc_house_data.csv")

# 데이터 샘플 확인
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
# 독립변수와 종속변수 분리하여 생성
x = df[[ 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 
       'sqft_living15', 'sqft_lot15']]
# 'id', 'date'는 키값에 해당하므로 변수에서 제외 해준다.
y = df[['price']]

# 학습셋과 테스트셋 분리하여 생성(7:3)
# df_train, df_test = train_test_split(df, test_size = 0.4) 
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.6, test_size=0.4)

In [4]:
# 다중회귀모델 생성
mreg = LinearRegression(fit_intercept=True)
mreg.fit(x_train, y_train)

# 테스트셋에 모델 적용
y_predict = mreg.predict(x_test)

In [5]:
# RMSE 산출 (MSE에 루트 적용)
MSE = mean_squared_error(y_test, y_predict)
RMSE = np.sqrt(MSE)
print(("RMSE : {:.2f}".format(RMSE)))

# MAE 산출
MAE = mean_absolute_error(y_test, y_predict)
print(("MAE : {:.2f}".format(MAE)))

# MAPE 산출
MAPE = mean_absolute_percentage_error(y_test, y_predict)
print(("MAPE : {:.2f}".format(MAPE)))

# RMSLE 산출 (MSLE에 루트 적용)

# 음수값 전처리
y_predict_df = pd.DataFrame(y_predict,columns=['price2'])
y_predict_df2 = y_predict_df.copy()
y_predict_df2.loc[y_predict_df2['price2'] < 0, 'price2'] = 0
y_predict_rmsle = y_predict_df2.to_numpy()

MSLE = mean_squared_log_error(y_test, (y_predict_rmsle))
RMSLE = np.sqrt(MSLE)
print(("RMSLE : {:.2f}".format(RMSLE)))

RMSE : 214125.86
MAE : 139072.03
MAPE : 0.29
RMSLE : 0.69


In [6]:
# RMSLE ver. 2
  
def rmsle(predicted_values, actual_values):
    
    # 테스트셋 y 값과 예측값에 +1 및 로그 
    log_y_test = np.log(y_test + 1)
    log_y_predict = np.log(y_predict + 1)

    # 테스트셋 y 값 - 예측값 및 제곱
    diff = log_y_predict - log_y_test
    diff_square = np.square(diff)

    # 차이값 평균 및 루트
    mean_diff = diff_square.mean()
    final_rmsle = np.sqrt(mean_diff)  

    return final_rmsle

rmsle(y_test, y_predict)

  log_y_predict = np.log(y_predict + 1)


price    0.3703
dtype: float64