# 사용자 행동 인식 데이터셋

## 기본 개념

### * pilot: 유의미한 중복값을 처리하는 방법
: cumcount + merge

In [3]:
import pandas as pd
df = pd.DataFrame([['a'],['a'],['a'],['b'],['b'],['a']],columns=['A'])
df

Unnamed: 0,A
0,a
1,a
2,a
3,b
4,b
5,a


In [4]:
# A칼럼의 유니크값들을 기준으로 그룹바이 + 누적카운트
df.groupby(['A']).cumcount()

0    0
1    1
2    2
3    0
4    1
5    3
dtype: int64

In [5]:
old_feature_name_df = pd.DataFrame([['a'],['a'],['a'],['b'],['b'],['a']],columns=['column_names'])
old_feature_name_df

feature_dup_df = pd.DataFrame(data=old_feature_name_df.groupby(['column_names']).cumcount(),columns=['dup_cnt'])
feature_dup_df

Unnamed: 0,dup_cnt
0,0
1,1
2,2
3,0
4,1
5,3


In [6]:
feature_dup_df = feature_dup_df.reset_index()
feature_dup_df

Unnamed: 0,index,dup_cnt
0,0,0
1,1,1
2,2,2
3,3,0
4,4,1
5,5,3


In [7]:
old_feature_name_df.reset_index()

Unnamed: 0,index,column_names
0,0,a
1,1,a
2,2,a
3,3,b
4,4,b
5,5,a


In [8]:
new_feature_name_df = pd.merge(old_feature_name_df.reset_index(),
                               feature_dup_df,how='outer')
new_feature_name_df

Unnamed: 0,index,column_names,dup_cnt
0,0,a,0
1,1,a,1
2,2,a,2
3,3,b,0
4,4,b,1
5,5,a,3


In [9]:
new_feature_name_df['column_names'] = new_feature_name_df[['column_names','dup_cnt']].apply(lambda x : x[0]+'_'+str(x[1]) if x[1]>0 else x[0],axis=1)
new_feature_name_df

Unnamed: 0,index,column_names,dup_cnt
0,0,a,0
1,1,a_1,1
2,2,a_2,2
3,3,b,0
4,4,b_1,1
5,5,a_3,3


## 의사결정나무 실습

In [10]:
import pandas as pd
import matplotlib.pyplot as plt

feature_name_df = pd.read_csv('./human_activity/features.txt',sep='\s+',header=None,names=['column_index','column_name'])

feature_name = feature_name_df.iloc[:,1].values.tolist()
print('전체 피쳐명에서 10개만 추출: ',feature_name[:10])

전체 피쳐명에서 10개만 추출:  ['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z', 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z', 'tBodyAcc-max()-X']


In [11]:
feature_name_df.head(10)

Unnamed: 0,column_index,column_name
0,1,tBodyAcc-mean()-X
1,2,tBodyAcc-mean()-Y
2,3,tBodyAcc-mean()-Z
3,4,tBodyAcc-std()-X
4,5,tBodyAcc-std()-Y
5,6,tBodyAcc-std()-Z
6,7,tBodyAcc-mad()-X
7,8,tBodyAcc-mad()-Y
8,9,tBodyAcc-mad()-Z
9,10,tBodyAcc-max()-X


In [25]:
feature_name_df

Unnamed: 0,column_index,column_name
0,1,tBodyAcc-mean()-X
1,2,tBodyAcc-mean()-Y
2,3,tBodyAcc-mean()-Z
3,4,tBodyAcc-std()-X
4,5,tBodyAcc-std()-Y
...,...,...
556,557,"angle(tBodyGyroMean,gravityMean)"
557,558,"angle(tBodyGyroJerkMean,gravityMean)"
558,559,"angle(X,gravityMean)"
559,560,"angle(Y,gravityMean)"


### 전처리

#### 중복된 피쳐명 확인

In [21]:
feature_dup_df = pd.DataFrame(data=feature_name_df.groupby('column_name').cumcount(), columns=['dup_cnt'])
feature_dup_df['dup_cnt']

0      0
1      0
2      0
3      0
4      0
      ..
556    0
557    0
558    0
559    0
560    0
Name: dup_cnt, Length: 561, dtype: int64

In [13]:
# 전처리 함수 (중복 피쳐명에 번호 붙여서 unique 값으로 만든다)
def get_new_feature_name_df(old_feature_name_df):
    feature_dup_df = pd.DataFrame(data=old_feature_name_df.groupby('column_name').cumcount(), columns=['dup_cnt'])

    feature_dup_df = feature_dup_df.reset_index()

    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df, how='outer')
    new_feature_name_df['column_name'] = new_feature_name_df[['column_name','dup_cnt']].apply(
        lambda x : x[0]+'_'+str(x[1]) if x[1] > 0 else x[0],axis=1)
    
    return new_feature_name_df

In [14]:
import pandas as pd

def get_human_dataset( ):
    feature_name_df = pd.read_csv('./human_activity/features.txt',sep='\s+',header=None,names=['column_index','column_name'])
    
    #중복된 피쳐명 수정 함수 활용
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    feature_name = new_feature_name_df.iloc[:,1].values.tolist()
    
    X_train = pd.read_csv('./human_activity/train/X_train.txt',sep='\s+',names=feature_name)
    X_test = pd.read_csv('./human_activity/test/X_test.txt',sep='\s+',names=feature_name)
    
    y_train = pd.read_csv('./human_activity/train/y_train.txt',sep='\s+',header=None,names=['action'])
    y_test = pd.read_csv('./human_activity/test/y_test.txt',sep='\s+',header=None,names=['action'])
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = get_human_dataset()

In [15]:
print('## 학습 피처 데이터셋 info()')
print(X_train.info())

## 학습 피처 데이터셋 info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7352 entries, 0 to 7351
Columns: 561 entries, 1 to 561
dtypes: float64(561)
memory usage: 31.5 MB
None


In [16]:
print(y_train['action'].value_counts())

action
6    1407
5    1374
4    1286
1    1226
2    1073
3     986
Name: count, dtype: int64


### 모델 학습

#### * Basic Machine Learning 

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# 의사결정나무, 랜덤포레스트, 로지스틱 회귀를 위한 사이킷런 Classifier 클래스 생성
dt_clf = DecisionTreeClassifier(random_state=11)


#DecisionTreeClassifier
dt_clf.fit(X_train,y_train)
dt_pred = dt_clf.predict(X_test)
print('DecisionTreeClassifier 정확도: {0:4f}'.format(accuracy_score(y_test,dt_pred)))


DecisionTreeClassifier 정확도: 0.861215


#### * Machine Learning with GridSearchCV

In [18]:
import pandas as pd
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':[10,15,20],'min_samples_split':[5],'min_samples_leaf':[5]}
grid_dt_clf = GridSearchCV(dt_clf,param_grid = parameters,cv=8, refit=True,scoring='accuracy')
grid_dt_clf.fit(X_train,y_train)

scores_df_dt = pd.DataFrame(grid_dt_clf.cv_results_)

KeyboardInterrupt: 

In [None]:
scores_df_dt[['params','mean_test_score','rank_test_score','split0_test_score','split1_test_score','split2_test_score','split3_test_score','split4_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
0,"{'max_depth': 10, 'min_samples_leaf': 5, 'min_...",0.873912,1,0.845484,0.891186,0.88901,0.81284,0.855277
1,"{'max_depth': 15, 'min_samples_leaf': 5, 'min_...",0.865479,3,0.832427,0.881393,0.881393,0.808487,0.856366
2,"{'max_depth': 20, 'min_samples_leaf': 5, 'min_...",0.865887,2,0.832427,0.881393,0.881393,0.808487,0.856366


In [None]:
print('GridSearchCV 최적 하이퍼 파라미터: ',grid_dt_clf.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_dt_clf.best_score_))
best_dt_clf = grid_dt_clf.best_estimator_

GridSearchCV 최적 하이퍼 파라미터:  {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 5}
GridSearchCV 최고 정확도: 0.8739


In [None]:
# GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행

best_dt_pred = best_dt_clf.predict(X_test)

accuracy = accuracy_score(y_test,best_dt_pred)

print('테스트 세트에서의 DecisionTreeClassifier 정확도: {0:.4f}'.format(accuracy))

테스트 세트에서의 DecisionTreeClassifier 정확도: 0.8755
