In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import BaggingRegressor, VotingRegressor, GradientBoostingRegressor, RandomForestRegressor
from scipy.stats import randint

In [3]:
# Load the raw apartment transactions (path is relative to the notebook's working directory).
apt = pd.read_csv('./apartment_price.csv')

# Hold out 20% of the rows; stratifying on `gu` keeps its distribution similar in both splits.
train, test = train_test_split(apt, test_size=0.2, random_state=7916, stratify=apt['gu'])

# Keep the held-out prices aside and remove them from the test features.
target = test['price'].copy()
test = test.drop('price', axis=1)

# Stack both splits so the column drop below is applied identically to each of them.
# After the concat, `price` is NaN for every test row because the column was removed from `test`.
all_data = pd.concat([train, test], axis=0, ignore_index=True)

drop_features = ['apt_name', 'gu', 'dong', 'transaction_year_month', 'transaction_date', 'year_of_completion', 'commuting_vehicle_sum',
                 'day_care_num', 'permission_year', 'park_area', 'one_park_area', 'citypark_area', 'one_citypark_area']
all_data = all_data.drop(drop_features, axis=1)

# Split back by position rather than by "price is NaN": a training row whose price is
# genuinely missing would otherwise be routed into X_test and break its alignment with `target`.
n_train = len(train)
train_rows = all_data.iloc[:n_train]
X_test = all_data.iloc[n_train:].drop(['price'], axis=1)
assert len(X_test) == len(target), "X_test must have exactly one row per held-out price"

# Training rows without a price cannot be used for fitting, so exclude them explicitly.
train_rows = train_rows[train_rows['price'].notna()]
y_train = train_rows['price']
X_train = train_rows.drop(['price'], axis=1)

# The models below are fitted on the natural log of the price.
log_y = np.log(y_train)

## (9) 랜덤포레스트 (대용)

- GridSearchCV 사용해서 파라미터 선택

In [None]:
# Base estimator for the search; the fixed seed keeps repeated runs comparable.
rnd_forest = RandomForestRegressor(random_state=7916)

# 3 x 3 x 3 = 27 candidate settings.
rf_params = {
    'n_estimators': [180, 200, 220],
    'max_features': [8, 9, 10],
    'max_depth': [10, 20, 30],
}

# Exhaustive 5-fold search scored by negated MSE on the log-price target, using all cores.
gridsearch_forest = GridSearchCV(
    estimator=rnd_forest,
    param_grid=rf_params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
gridsearch_forest.fit(X_train, log_y)

In [None]:
# Show the hyperparameter combination that achieved the best mean CV score in the grid search.
gridsearch_forest.best_params_

In [None]:
cvres = gridsearch_forest.cv_results_

# The search scores with negated MSE, so flip the sign to print a plain MSE
# next to each candidate's parameter dict.
for candidate_idx, candidate_params in enumerate(cvres['params']):
    print(-cvres['mean_test_score'][candidate_idx], candidate_params)

- 랜덤포레스트 최적 모델 ('max_depth': 20, 'max_features': 10, 'n_estimators': 220): 5-fold CV MSE (log price 기준) = 0.0729584701779201

In [None]:
# Take the forest that GridSearchCV refit with the best parameters (refit is left at its default)
# and display its configuration.
best_model_forest = gridsearch_forest.best_estimator_
best_model_forest

In [None]:
# Pair every importance with its column name and list them from most to least important.
importance_pairs = list(zip(best_model_forest.feature_importances_, X_train.columns))
importance_pairs.sort(reverse=True)
importance_pairs

## (10) Voting (Ensemble)

In [4]:
# Base learners for the voting ensemble.

# Linear models.
lin_reg = LinearRegression()
ridge = Ridge(alpha=0.1, random_state=7916)

# Random forest with the hyperparameters recorded from the grid search section.
rnd_forest = RandomForestRegressor(
    max_depth=20,
    max_features=10,
    n_estimators=220,
    random_state=7916,
)

# Support vector regressor with library defaults.
# NOTE(review): SVR is fitted on unscaled features here (StandardScaler is imported but unused);
# confirm whether scaling was intended.
svm = SVR()

In [7]:
# Average the predictions of the four base learners.
base_learners = [('lin', lin_reg), ('rid', ridge), ('rnd', rnd_forest), ('svmr', svm)]
voting_reg = VotingRegressor(estimators=base_learners, n_jobs=-1)

# 2-fold CV on the log-price target; scores are negated MSE, so flip the sign for display.
scores = cross_val_score(
    voting_reg,
    X_train,
    log_y,
    scoring='neg_mean_squared_error',
    cv=2,
)
-scores.mean()

0.1333027429108106

- Voting 모델: 2-fold CV MSE (log price 기준) = 0.1333027429108106 (랜덤포레스트 결과는 5-fold CV 기준이므로 직접 비교 시 주의)