## 라이브러리 불러오기

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## 데이터 불러오기

In [2]:
wine = pd.read_csv('./input/wine.csv')

In [3]:
wine.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


## quality가 3 ~ 6이면 'Good'  / 7 ~ 9이면 'Best' 할당

In [10]:
# list comprehension
wine['grade'] = ['Good' if x<7 else 'Best' for x in wine['quality']]

## 데이터 분할

In [11]:
wine.columns

Index(['fixed.acidity', 'volatile.acidity', 'citric.acid', 'residual.sugar',
       'chlorides', 'free.sulfur.dioxide', 'total.sulfur.dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'grade'],
      dtype='object')

In [12]:
col = wine.columns.tolist()
col.remove('quality')
col.remove('grade') # 종속변수 제외
col

['fixed.acidity',
 'volatile.acidity',
 'citric.acid',
 'residual.sugar',
 'chlorides',
 'free.sulfur.dioxide',
 'total.sulfur.dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

In [13]:
x_data = wine[col]
y_data = wine['grade'] # target

In [14]:
y_data

0       Good
1       Good
2       Good
3       Good
4       Good
        ... 
4893    Good
4894    Good
4895    Good
4896    Best
4897    Good
Name: grade, Length: 4898, dtype: object

In [15]:
# train, test split
# 굳이 값을 표준화해줄 필요는 없음 : 한 특성에만 반드시 영향받는 것이 아님
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=1)

In [17]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
# reshape 함수 써서 1열(2차원)로 바꿔줌
# type(y_train) ... series : np.array() 거쳐서

(3428, 11)
(1470, 11)
(3428,)
(1470,)


## Random Forest 모델 생성
### 참고 : https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [18]:
from sklearn.ensemble import RandomForestClassifier
# n_estimators : 나무의 수
# 그 외에 나머지 요인들은 의사결정 나무와 동일
# default : 100
# criterion : gini, entropy
# mean_samples_leaf, grid_search, max_feature (auto, sqrt, log2)
# 각 모델을 만들 때마다 max_feature가 각각 들어가게 됨
# ccp_alpha : 기존 오류에 +a, 마지막 노드의 개수 -> 너무 많아지면 비용 복잡도 train에 과적합, 오류값이 높아짐 
# 0-1 사이 값만 -> 음수가 들어가면 오히려 반대의 결과가 나옴

model = RandomForestClassifier()
model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [19]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
model = RandomForestClassifier(n_jobs=-1) # 분산하여 속도를 빠르게 해줌, 비어있는 코어를 모두 활용하도록
model

In [22]:
x_data.columns

Index(['fixed.acidity', 'volatile.acidity', 'citric.acid', 'residual.sugar',
       'chlorides', 'free.sulfur.dioxide', 'total.sulfur.dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [20]:
# feature_importances_ : 변수 중요도 ... 발표에 용이
model.feature_importances_ # 가장 높은 수가 모델을 만드는데 가장 좋은 항목, 너무 안좋으면 빼버려야 함
# PCA 등 : 1200개 가량의 columns... 차원 축소

array([0.06689515, 0.0848353 , 0.06848756, 0.08338603, 0.08560192,
       0.08310574, 0.08230425, 0.11620287, 0.09167989, 0.07333515,
       0.16416614])

In [26]:
data = pd.DataFrame(model.feature_importances_, x_data.columns)

In [27]:
data.head()

Unnamed: 0,0
fixed.acidity,0.066895
volatile.acidity,0.084835
citric.acid,0.068488
residual.sugar,0.083386
chlorides,0.085602


In [28]:
dic = {'column' : x_data.columns.tolist(), 'feature_importance' : model.feature_importances_}

In [31]:
df = pd.DataFrame(dic)

In [32]:
# 정렬
df.sort_values(by='feature_importance', ascending = False)

Unnamed: 0,column,feature_importance
10,alcohol,0.164166
7,density,0.116203
8,pH,0.09168
4,chlorides,0.085602
1,volatile.acidity,0.084835
3,residual.sugar,0.083386
5,free.sulfur.dioxide,0.083106
6,total.sulfur.dioxide,0.082304
9,sulphates,0.073335
2,citric.acid,0.068488


In [None]:
# column이 많을 때는 heatmap 사용하는 것이 좋음

In [34]:
model = RandomForestClassifier(verbose=1, n_jobs=-1)
model.fit(X_train, y_train)
# n_jobs = -1이면 시간이 훨씬 더 짧음, verbose : default = 0, 1인 경우 진행 과정 표시

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=1,
                       warm_start=False)

In [None]:
# 변수 중요도 데이터 프레임 만들기


In [37]:
# 모델 예측
y_pred = model.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [38]:
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_pred, y_test)
confusion_matrix(y_pred, y_test)
# class가 imbalance ... 정확도가 87이라고 해도 좋은 모델이 아님 - 특이도
# train으로만 과적합이 된 상태 

array([[ 195,   41],
       [ 139, 1095]], dtype=int64)

In [None]:
# 적는 순서에 따라
confusion_matrix()

## grid search + cross_validation
#### parameter를 격자무늬 -> 학습을 하나하나 시키기 때문에 개수가 많을 수록 연산이 많아져 시간이 오래 걸림

In [39]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators' : [100, 200, 300],
             'max_features' : ['auto', 'sqrt', 'log2'],
             'ccp_alpha' : [0.1, 0.01, 0.001]}
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1),
                           param_grid,
                           cv=5,
                           return_train_score=True)
# 데이터 나누는 수만큼 iteration이 돌아감
# return_train_score : accuracy 출력


In [42]:
%%time 
# 시간이 얼마나 지났는가
grid_search.fit(X_train, y_train)

Wall time: 2min 9s


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=False,
                                              random_

In [None]:
%%time


In [43]:
# best_params_ : best parameter
grid_search.best_params_

{'ccp_alpha': 0.001, 'max_features': 'auto', 'n_estimators': 200}

In [44]:
# best_score_ : best parameter로 돌린 cv 정확도 결과
grid_search.best_score_

0.8526794492562406

In [45]:
# best_estimator_ : 최고 성능을 낸 파라미터를 가진 모형
grid_search.best_estimator_
# 판단을 해서 다 들어감

RandomForestClassifier(bootstrap=True, ccp_alpha=0.001, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [47]:
# cv_results_ : cross validation 결과
pd.DataFrame(grid_search.cv_results_)
# parameter 따라서 모두 출력
# mean_train_score : 평균 출력
# 나무 100개일 때보다 갑자기 성능이 확 높아짐
# model = RandomForestClassifier가 아니고 grid_search.best_estimator_를 바로 갖다 쓰면 됨


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.901036,1.113717,0.116388,0.005789,0.1,auto,100,"{'ccp_alpha': 0.1, 'max_features': 'auto', 'n_...",0.78863,0.78863,...,0.788215,0.000539,10,0.788111,0.788111,0.788476,0.788188,0.788188,0.788215,0.000135
1,0.580362,0.007729,0.114334,0.002249,0.1,auto,200,"{'ccp_alpha': 0.1, 'max_features': 'auto', 'n_...",0.78863,0.78863,...,0.788215,0.000539,10,0.788111,0.788111,0.788476,0.788188,0.788188,0.788215,0.000135
2,0.88458,0.004851,0.17656,0.048798,0.1,auto,300,"{'ccp_alpha': 0.1, 'max_features': 'auto', 'n_...",0.78863,0.78863,...,0.788215,0.000539,10,0.788111,0.788111,0.788476,0.788188,0.788188,0.788215,0.000135
3,0.341129,0.003835,0.112382,0.000173,0.1,sqrt,100,"{'ccp_alpha': 0.1, 'max_features': 'sqrt', 'n_...",0.78863,0.78863,...,0.788215,0.000539,10,0.788111,0.788111,0.788476,0.788188,0.788188,0.788215,0.000135
4,0.584462,0.006417,0.114246,0.003053,0.1,sqrt,200,"{'ccp_alpha': 0.1, 'max_features': 'sqrt', 'n_...",0.78863,0.78863,...,0.788215,0.000539,10,0.788111,0.788111,0.788476,0.788188,0.788188,0.788215,0.000135
5,0.887678,0.007157,0.197337,0.042213,0.1,sqrt,300,"{'ccp_alpha': 0.1, 'max_features': 'sqrt', 'n_...",0.78863,0.78863,...,0.788215,0.000539,10,0.788111,0.788111,0.788476,0.788188,0.788188,0.788215,0.000135
6,0.336291,0.007127,0.112617,0.000741,0.1,log2,100,"{'ccp_alpha': 0.1, 'max_features': 'log2', 'n_...",0.78863,0.78863,...,0.788215,0.000539,10,0.788111,0.788111,0.788476,0.788188,0.788188,0.788215,0.000135
7,0.582888,0.003207,0.113713,0.00224,0.1,log2,200,"{'ccp_alpha': 0.1, 'max_features': 'log2', 'n_...",0.78863,0.78863,...,0.788215,0.000539,10,0.788111,0.788111,0.788476,0.788188,0.788188,0.788215,0.000135
8,0.88882,0.018758,0.198675,0.043829,0.1,log2,300,"{'ccp_alpha': 0.1, 'max_features': 'log2', 'n_...",0.78863,0.78863,...,0.788215,0.000539,10,0.788111,0.788111,0.788476,0.788188,0.788188,0.788215,0.000135
9,0.344614,0.007594,0.112284,0.002919,0.01,auto,100,"{'ccp_alpha': 0.01, 'max_features': 'auto', 'n...",0.78863,0.78863,...,0.788215,0.000539,10,0.788111,0.788111,0.788476,0.788188,0.788188,0.788215,0.000135


In [48]:
grid_search.best_estimator_.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.001, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [49]:
# 혼동행렬
y_pred =  grid_search.best_estimator_.predict(X_test)

In [50]:
# ccp_alpha가 들어가서 좀 더 성능이 안좋아짐... 많지 않아서 underfitting이 일어남 ->  상황에 맞춰서
accuracy_score(y_pred, y_test)
confusion_matrix(y_pred, y_test)

array([[ 148,   43],
       [ 186, 1093]], dtype=int64)

## RandomForest Regressor

In [51]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [55]:
apt = pd.read_csv('./input/aptPrice.csv', engine = 'python') # C 언어 기반이기 때문에 

In [56]:
apt.head()
# 거래금액이 target, 연속형이기 때문에 classifier가 아님

Unnamed: 0,아파트명,전용면적,층,거래금액,최고층수,승강기수,총동수,연면적,주거전용면적,일반관리인원,...,경과년수,계단식,분양,위탁관리,개별난방,주차관제,브랜드사,로그금액,루트금액,변환금액
0,SK허브프리모,33.8,7,32900.0,20,4,1,35505.34,9680.09,6,...,11,0,1,1,1,0,1,10.401228,181.383571,3972.879806
1,대우디오빌,31.81,4,27000.0,15,2,1,9782.62,4260.59,3,...,13,0,1,1,1,0,1,10.203592,164.316767,3394.049026
2,신동아(22),35.73,1,58500.0,13,20,6,58816.99,45646.06,6,...,20,1,1,1,1,0,0,10.976782,241.867732,6284.383338
3,신동아(22),35.73,9,68000.0,13,20,6,58816.99,45646.06,6,...,20,1,1,1,1,0,0,11.127263,260.768096,7084.890404
4,신동아(22),84.52,6,92500.0,13,20,6,58816.99,45646.06,6,...,20,1,1,1,1,0,0,11.434964,304.138127,9053.280186


In [57]:
apt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1516 entries, 0 to 1515
Data columns (total 27 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   아파트명    1516 non-null   object 
 1   전용면적    1516 non-null   float64
 2   층       1516 non-null   int64  
 3   거래금액    1516 non-null   float64
 4   최고층수    1516 non-null   int64  
 5   승강기수    1516 non-null   int64  
 6   총동수     1516 non-null   int64  
 7   연면적     1516 non-null   float64
 8   주거전용면적  1516 non-null   float64
 9   일반관리인원  1516 non-null   int64  
 10  경비인원    1516 non-null   int64  
 11  청소인원    1516 non-null   int64  
 12  CCTV개수  1516 non-null   int64  
 13  총세대수    1516 non-null   int64  
 14  면적당관리비  1516 non-null   float64
 15  총주차대수   1516 non-null   int64  
 16  세대주차대수  1516 non-null   float64
 17  경과년수    1516 non-null   int64  
 18  계단식     1516 non-null   int64  
 19  분양      1516 non-null   int64  
 20  위탁관리    1516 non-null   int64  
 21  개별난방    1516 non-null   int64  
 22  

In [60]:
col = apt.columns.tolist()
col.remove('아파트명')
col.remove('로그금액') #종속변수와 관련도가 높음
col.remove('루트금액')
col.remove('변환금액')
col.remove('거래금액') # 종속변수
col

['전용면적',
 '층',
 '거래금액',
 '최고층수',
 '승강기수',
 '총동수',
 '연면적',
 '주거전용면적',
 '일반관리인원',
 '경비인원',
 '청소인원',
 'CCTV개수',
 '총세대수',
 '면적당관리비',
 '총주차대수',
 '세대주차대수',
 '경과년수',
 '계단식',
 '분양',
 '위탁관리',
 '개별난방',
 '주차관제',
 '브랜드사']

In [61]:
apt.head() # string type 변수는 

Unnamed: 0,아파트명,전용면적,층,거래금액,최고층수,승강기수,총동수,연면적,주거전용면적,일반관리인원,...,경과년수,계단식,분양,위탁관리,개별난방,주차관제,브랜드사,로그금액,루트금액,변환금액
0,SK허브프리모,33.8,7,32900.0,20,4,1,35505.34,9680.09,6,...,11,0,1,1,1,0,1,10.401228,181.383571,3972.879806
1,대우디오빌,31.81,4,27000.0,15,2,1,9782.62,4260.59,3,...,13,0,1,1,1,0,1,10.203592,164.316767,3394.049026
2,신동아(22),35.73,1,58500.0,13,20,6,58816.99,45646.06,6,...,20,1,1,1,1,0,0,10.976782,241.867732,6284.383338
3,신동아(22),35.73,9,68000.0,13,20,6,58816.99,45646.06,6,...,20,1,1,1,1,0,0,11.127263,260.768096,7084.890404
4,신동아(22),84.52,6,92500.0,13,20,6,58816.99,45646.06,6,...,20,1,1,1,1,0,0,11.434964,304.138127,9053.280186


In [62]:
from sklearn.model_selection import train_test_split
x_data = apt[col]
y_data = apt['거래금액']
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.3, random_state = 2)

In [65]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor (n_estimators = 500, n_jobs=-1)
model

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [66]:
# 모델 학습
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [68]:
# 예측
y_pred = model.predict(X_test)
y_pred
# regressor는 classifier와 달리 명확한 class 분류의 정답이 없으며 예측값과 실제값을 뻈을 때 차이가 없어야 좋은 모델,
# 시험 점수는 100점을 0으로 예측해도 오차가 100, 금액은 정확하게 차이가 난다는 domain 지식이 필요함
# RMSE - 오차 판별

array([ 86972.4  , 139998.   , 204902.   , 124232.   ,  99959.6  ,
        92364.8  , 199959.   ,  72150.   , 136260.   ,  53379.   ,
        99955.   , 188325.   , 146999.   , 184996.   , 146012.   ,
       127934.   , 227969.   ,  58700.2  , 149085.4  ,  78915.144,
       160000.   , 104993.   ,  59993.16 , 125001.   , 105010.8  ,
       160000.   , 199933.   ,  82124.8  , 131268.   , 164209.   ,
       123084.   , 115057.4  , 199928.   , 205035.   , 214966.   ,
        99554.2  , 106955.4  , 112749.8  , 151962.   , 205141.   ,
        37821.8  , 130000.   , 165010.   , 136978.   , 115015.   ,
       114991.4  ,  84029.6  , 201260.   , 127361.   , 136958.   ,
       240138.   ,  80394.256, 180000.   , 102028.   , 175025.   ,
       242288.   , 147969.4  ,  93708.4  , 114803.2  , 160000.   ,
        77944.488, 175025.   , 219128.   , 175993.8  , 110001.   ,
       184998.   , 140008.   , 175004.   , 128957.   , 185084.   ,
       173995.   , 144015.   , 209194.   ,  29021.8  ,  32180.

In [73]:
# RMSE
# np.sqrt((실제값-예측값)^2)
np.sqrt((y_test - y_pred)**2).sum() # 오차의 합
np.sqrt((y_test - y_pred)**2).sum()/len(y_test)

160.733854945055

In [70]:
a=[1,2]
b=[3,4]
#a-b # 계산 불가
np.array(a)-np.array(b)

array([-2, -2])