### 랜덤포레스트

In [1]:
import pandas as pd
final = pd.read_csv(r'final.csv', encoding = 'CP949')
final.head(5)

Unnamed: 0,시도,시군구,행정번호,년도,우울증환자수,평균연령,총인구수,스트레스인지율,고용률,코로나확진자수,1인당 정신건강예산(원),공공시설개수,우울증지수
0,서울특별시,서울종로구,1111000000,2018,14837,44.2,153065,34.2,59.6,0,5326,44,9.693268
1,서울특별시,서울중구,1114000000,2018,4352,44.6,125725,28.3,59.6,0,5326,42,3.461523
2,서울특별시,서울용산구,1117000000,2018,2370,43.6,228999,20.0,59.6,0,5326,53,1.034939
3,서울특별시,서울성동구,1120000000,2018,3657,42.0,308221,31.3,59.6,0,5326,50,1.186486
4,서울특별시,서울광진구,1121500000,2018,6917,41.4,355559,29.4,59.6,0,5326,47,1.945387


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [21]:
X = final[['우울증환자수', '평균연령', '총인구수', '스트레스인지율', '고용률', '코로나확진자수', '1인당 정신건강예산(원)', '공공시설개수']]
y = final['우울증지수']

rf = RandomForestRegressor()

# Pipeline 정의
pipeline = Pipeline([
    ('robustscaler', RobustScaler()),
    ('randomforestregressor', rf)
])

# 그리드 서치를 위한 하이퍼파라미터 그리드 설정
param_grid = {
    'randomforestregressor__n_estimators': [50, 100],
    'randomforestregressor__max_depth': [None, 10, 20],
    'randomforestregressor__min_samples_split': [5, 10],
    'randomforestregressor__min_samples_leaf': [10, 20, 30],
    'randomforestregressor__max_features': ['auto', 'sqrt']
}

# GridSearchCV 객체 생성
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# 그리드 서치 수행
grid_search.fit(X, y)

# 최적의 하이퍼파라미터와 모델 성능 출력
print("최적의 하이퍼파라미터:", grid_search.best_params_)

180 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
170 fits failed with the following error:
Traceback (most recent call last):
  File "C:\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_para

최적의 하이퍼파라미터: {'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__min_samples_leaf': 10, 'randomforestregressor__min_samples_split': 5, 'randomforestregressor__n_estimators': 50}


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = make_pipeline(
    RobustScaler(),
    RandomForestRegressor(max_depth=None, max_features='sqrt', min_samples_leaf=10, min_samples_split=5, n_estimators=50)
)
model.fit(X_train, y_train)

# Train 세트에 대한 예측 및 평가
y_pred_train = model.predict(X_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_pred_train)

# 예측 및 성능 평가
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Train - RMSE:", round(rmse_train.mean(),4), "/ MSE:", round(mse_train.mean(),4), "/ r²Score:", round(r2_train.mean(),4))
print("Test - RMSE:", round(rmse.mean(),4), "/ MSE:", round(mse.mean(),4), "/ r²Score:", round(r2.mean(),4))

Train - RMSE: 0.6056 / MSE: 0.3667 / r²Score: 0.7697
Test - RMSE: 0.5919 / MSE: 0.3503 / r²Score: 0.7082
