# 랜덤 포레스트

In [2]:
# Load the breast-cancer dataset that ships with scikit-learn.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as a test set; random_state pins the shuffle so the split is repeatable.
# NOTE(review): X_train / X_test / y_train / y_test are reassigned by the UCI HAR read_csv cell
# further down, so this split is not what the models below are trained on — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.2, random_state=2021)

In [9]:
# pandas is imported here so this cell also runs on a fresh kernel executed top to bottom.
import pandas as pd

# Read the feature-name table: whitespace-separated, no header row, two columns
# (running index and feature name). The separator is a raw string so that `\s`
# reaches the regex engine instead of being parsed as a string escape.
feature_name_df = pd.read_csv('./UCI HAR Dataset/features.txt', sep=r'\s+',
                              header=None, names=['col_index', 'col_name'])
feature_name_df.head()

Unnamed: 0,col_index,col_name
0,1,tBodyAcc-mean()-X
1,2,tBodyAcc-mean()-Y
2,3,tBodyAcc-mean()-Z
3,4,tBodyAcc-std()-X
4,5,tBodyAcc-std()-Y


In [6]:
import pandas as pd
def get_new_feature_name_df(old_df):
    """Return a copy of ``old_df`` whose repeated ``col_name`` values are made unique.

    The first occurrence of a name is kept unchanged; the k-th repeat (k >= 1)
    is renamed to ``"<name>_<k>"``.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Frame containing a ``col_name`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The columns of ``old_df`` followed by ``dup_cnt`` (the number of earlier
        rows carrying the same name), on a fresh 0..n-1 index.
    """
    # Work on the frame that was passed in rather than on the notebook-level
    # `feature_name_df`, so the function is correct for any input.
    new_df = old_df.reset_index(drop=True)
    new_df['dup_cnt'] = new_df.groupby('col_name').cumcount()

    # Only rows that repeat an earlier name receive the numeric suffix.
    is_repeat = new_df['dup_cnt'] > 0
    suffixed = new_df['col_name'] + '_' + new_df['dup_cnt'].astype(str)
    new_df['col_name'] = suffixed.where(is_repeat, new_df['col_name'])
    return new_df

In [12]:
# Make repeated feature names unique, then collect the names as a plain Python list.
new_feature_df = get_new_feature_name_df(feature_name_df)
feature_list = new_feature_df['col_name'].tolist()

In [13]:
# Load the UCI HAR train/test files. Each is read as whitespace-separated with no
# header row, so column names are supplied explicitly: the de-duplicated feature
# names for X, and a single 'action' column for y.
# The separator is a raw string so that `\s` is passed to the regex engine
# instead of being parsed as an (invalid) string escape.
X_train = pd.read_csv('./UCI HAR Dataset/train/X_train.txt',
                      header=None, sep=r'\s+', names=feature_list)
X_test = pd.read_csv('./UCI HAR Dataset/test/X_test.txt',
                     header=None, sep=r'\s+', names=feature_list)
y_train = pd.read_csv('./UCI HAR Dataset/train/y_train.txt',
                      header=None, sep=r'\s+', names=['action'])
y_test = pd.read_csv('./UCI HAR Dataset/test/y_test.txt',
                     header=None, sep=r'\s+', names=['action'])

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings

# Suppress every warning from this point on.
warnings.filterwarnings('ignore')

# Alternative data source: the get_human_dataset() helper used with the decision tree
# returns the train/test DataFrames.
# X_train, X_test, y_train, y_test = get_human_dataset()

# Train a random forest, then measure its accuracy on the separate test set.
rf_clf = RandomForestClassifier(random_state=0)
pred = rf_clf.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, pred)
print('랜덤 포레스트 정확도: {0:.4f}'.format(accuracy))

랜덤 포레스트 정확도: 0.9253


In [18]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 3 values for each of 4 parameters.
params = {
    'n_estimators': [10, 30, 50],
    'max_depth': [8, 12, 16],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [12, 16, 20],
}

# Build the base RandomForestClassifier (this rebinds `rf_clf`), then run a
# 3-fold grid search over `params`.
rf_clf = RandomForestClassifier(random_state=156, n_jobs=-1)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the best cross-validated score and the parameter combination that achieved it.
best_score, best_params = grid_cv.best_score_, grid_cv.best_params_
print('최고 예측 정확도: {0:.4f}'.format(best_score))
print('최적 하이퍼 파라미터:\n', best_params)

최고 예측 정확도: 0.9218
최적 하이퍼 파라미터:
 {'max_depth': 12, 'min_samples_leaf': 12, 'min_samples_split': 12, 'n_estimators': 50}


In [19]:
# Refit a forest with a fixed set of hyperparameters and score it on the test set.
rf_clf1 = RandomForestClassifier(
    n_estimators=50,
    max_depth=12,
    min_samples_leaf=12,
    min_samples_split=12,
    random_state=0,
)
pred = rf_clf1.fit(X_train, y_train).predict(X_test)
print('예측 정확도: {0:.4f}'.format(accuracy_score(y_test, pred)))

예측 정확도: 0.9216


In [22]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier configured with n_neighbors=5.
knn = KNeighborsClassifier(n_neighbors=5)

# Display the estimator's full hyperparameter dictionary.
knn.get_params()


{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [23]:
# Fit the k-NN model and predict the test set in one chain (fit returns the estimator itself),
# then display the test accuracy as the cell's output.
pred = knn.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, pred)

0.9015948422124194