## 랜덤 포레스트
- 전체 학습 데이터에서 서브셋을 복원추출해서 뽑기 때문에 중복 데이터가 들어가거나 아예 들어가지 않는 데이터가 들어가고, 이렇게 해서 서브셋의 원소개수와 전체 데이터의 원소개수를 맞춘다.

In [3]:
import pandas as pd

feature_name_df = pd.read_csv('./data/UCI_HAR_Dataset/features.txt',
header = None, delim_whitespace=True, # sep = "\s+" 와 같은 효과
names=['col_index','col_name'])
feature_name_df.head()

Unnamed: 0,col_index,col_name
0,1,tBodyAcc-mean()-X
1,2,tBodyAcc-mean()-Y
2,3,tBodyAcc-mean()-Z
3,4,tBodyAcc-std()-X
4,5,tBodyAcc-std()-Y


In [6]:
def get_new_feature_name_df(old_df):
    dup_df = pd.DataFrame({'dup_cnt':feature_name_df.groupby('col_name').cumcount()})
    new_df = pd.merge(old_df.reset_index(),dup_df.reset_index())
    new_df.col_name = new_df.apply(lambda r: r.col_name + '_' +  str(r.dup_cnt) if r.dup_cnt > 0 else r.col_name, axis = 1)
    del new_df['index']
    return new_df

In [7]:
new_feature_df = get_new_feature_name_df(feature_name_df)
feature_list = list(new_feature_df.col_name.values)

In [9]:
X_train = pd.read_csv('./data/UCI_HAR_Dataset/train/X_train.txt', 
                      header=None, sep='\s+', names=feature_list)
X_test = pd.read_csv('./data/UCI_HAR_Dataset/test/X_test.txt', 
                     header=None, sep='\s+', names=feature_list)
y_train = pd.read_csv('./data/UCI_HAR_Dataset/train/y_train.txt', 
                      header=None, sep='\s+', names=['action'])
y_test = pd.read_csv('./data/UCI_HAR_Dataset/test/y_test.txt', 
                     header=None, sep='\s+', names=['action'])

랜덤 포레스트 모델 생성/학습/예측/평가

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'랜덤 포레스트 정확도: {acc:.4f}')

랜덤 포레스트 정확도: 0.9304


In [14]:
rf_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [15]:
params = {
    'n_estimators': [90, 100, 110]
}

In [17]:
from sklearn.model_selection import GridSearchCV
grid_clf = GridSearchCV(rf_clf, param_grid = params, scoring='accuracy', cv=5)
grid_clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'n_estimators': [90, 100, 110]}, scoring='accuracy')

In [18]:
print('GridSearchCV 최적 파라미터:', grid_clf.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_clf.best_score_))

GridSearchCV 최적 파라미터: {'n_estimators': 90}
GridSearchCV 최고 정확도: 0.9181


In [21]:
df = pd.DataFrame(grid_clf.cv_results_)
df = df[['param_n_estimators','mean_test_score']]
df

Unnamed: 0,param_n_estimators,mean_test_score
0,90,0.918122
1,100,0.917441
2,110,0.91581


In [22]:
best_clf = grid_clf.best_estimator_
pred = best_clf.predict(X_test)
accuracy_score(y_test, pred)

0.9267051238547676

## K 최근접 이웃 (K-Nearest Neighbor)
- 학습하지 않고 군집화만

In [23]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [24]:
knn.fit(X_train, y_train)
pred = knn.predicT(X_test)
accuracy_score(y_test, pred)

AttributeError: 'KNeighborsClassifier' object has no attribute 'predicT'