#**스마트폰 센서 데이터 기반 모션 분류**
# 단계3 : 단계별 모델링


## 0.미션

단계별로 나눠서 모델링을 수행하고자 합니다.  

* 단계1 : 정적(0), 동적(1) 행동 분류 모델 생성
* 단계2 : 세부 동작에 대한 분류모델 생성
    * 단계1 모델에서 0으로 예측 -> 정적 행동 3가지 분류 모델링
    * 단계1 모델에서 1으로 예측 -> 동적 행동 3가지 분류 모델링 
* 모델 통합
    * 두 단계 모델을 통합하고, 새로운 데이터에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
* 성능 비교
    * 기본 모델링의 성능과 비교
    * 모든 모델링은 [다양한 알고리즘 + 성능 튜닝]을 수행해야 합니다.
    * 성능 가이드
            * Accuracy : 0.980~1.00


## 1.환경설정

* 세부 요구사항
    - 경로 설정 : 다음의 두가지 방법 중 하나를 선택하여 폴더를 준비하고 데이터를 로딩하시오.
        * 1) 로컬 수행(Ananconda)
            * 제공된 압축파일을 다운받아 압축을 풀고
            * anaconda의 root directory(보통 C:/Users/< ID > 에 project 폴더를 만들고, 복사해 넣습니다.
        * 2) 구글콜랩
            * 구글 드라이브 바로 밑에 project 폴더를 만들고, 
            * 데이터 파일을 복사해 넣습니다.
    
    - 기본적으로 필요한 라이브러리를 import 하도록 코드가 작성되어 있습니다. 
        * 필요하다고 판단되는 라이브러리를 추가하세요.


### (1) 경로 설정

#### 1) 로컬 수행(Anaconda)
* project 폴더에 필요한 파일들을 넣고, 본 파일을 열었다면, 별도 경로 지정이 필요하지 않습니다.

In [1]:
path = 'C:/Users/User/program/mini_pjt/mini_3/'

#### 2) 구글 콜랩 수행

* 구글 드라이브 연결

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# path = '/content/drive/MyDrive/project/'

### (2) 라이브러리 불러오기

#### 1) 라이브러리 로딩

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 필요하다고 판단되는 라이브러리를 추가하세요.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import *

#### 2) 제공 함수 생성
* 변수 중요도를 시각화할 수 있는 함수를 제공합니다.
* 입력 : 
    * importance : 트리모델의 변수 중요도(예: model.feature_importances_)
    * names : 변수 이름 목록(예 : x_train.columns
    * result_only  : 변수 중요도 순으로 데이터프레임만 return할지, 그래프도 포함할지 결정. False이면 결과 데이터프레임 + 그래프
    * topn : 중요도 상위 n개만 표시. all 이면 전체.
* 출력 : 
    * 중요도 그래프 : 중요도 내림차순으로 정렬
    * 중요도 데이터프레임 : 중요도 내림차순으로 정렬

In [5]:
# 변수의 특성 중요도 계산하기
def plot_feature_importance(importance, names, result_only = False, topn = 'all'):
    feature_importance = np.array(importance)
    feature_name = np.array(names)

    data={'feature_name':feature_name,'feature_importance':feature_importance}
    fi_temp = pd.DataFrame(data)

    #변수의 특성 중요도 순으로 정렬하기
    fi_temp.sort_values(by=['feature_importance'], ascending=False,inplace=True)
    fi_temp.reset_index(drop=True, inplace = True)

    if topn == 'all' :
        fi_df = fi_temp.copy()
    else :
        fi_df = fi_temp.iloc[:topn]

    #변수의 특성 중요도 그래프로 그리기
    if result_only == False :
        plt.figure(figsize=(10,20))
        sns.barplot(x='feature_importance', y='feature_name', data = fi_df)

        plt.xlabel('importance')
        plt.ylabel('feature name')
        plt.grid()

    return fi_df

### (3) 데이터 불러오기

* 주어진 데이터셋
    * data01_train.csv : 학습 및 검증용
    * data01_test.csv : 테스트용
    * feature.csv : feature 이름을 계층구조로 정리한 데이터

* 세부 요구사항
    * 칼럼 삭제 : data01_train.csv와 data01_test.csv 에서 'subject' 칼럼은 불필요하므로 삭제합니다.

#### 1) 데이터로딩

In [6]:
train = pd.read_csv('data01_train.csv')
test = pd.read_csv('data01_test.csv')
feature = pd.read_csv('features.csv')

In [7]:
train.drop('subject', axis=1, inplace=True)
test.drop('subject', axis=1, inplace=True)

In [8]:
train.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.487737,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,STANDING
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.23782,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.535287,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.004012,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.157832,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS


In [9]:
test.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity
0,0.284379,-0.021981,-0.116683,-0.99249,-0.97964,-0.963321,-0.992563,-0.977304,-0.958142,-0.93885,...,-0.509523,-0.850065,-0.018043,0.092304,0.07422,-0.714534,-0.671943,-0.018351,-0.185733,SITTING
1,0.27744,-0.028086,-0.118412,-0.99662,-0.927676,-0.972294,-0.997346,-0.931405,-0.971788,-0.939837,...,-0.210792,-0.613367,-0.022456,-0.155414,0.247498,-0.112257,-0.826816,0.184489,-0.068699,STANDING
2,0.305833,-0.041023,-0.087303,0.00688,0.1828,-0.237984,0.005642,0.028616,-0.236474,0.016311,...,0.579587,0.394388,-0.362616,0.171069,0.576349,-0.688314,-0.743234,0.272186,0.053101,WALKING
3,0.276053,-0.016487,-0.108381,-0.995379,-0.983978,-0.975854,-0.995877,-0.98528,-0.974907,-0.941425,...,-0.566291,-0.841455,0.289548,0.079801,-0.020033,0.291898,-0.639435,-0.111998,-0.123298,SITTING
4,0.271998,0.016904,-0.078856,-0.973468,-0.702462,-0.86945,-0.97981,-0.711601,-0.856807,-0.92076,...,0.447577,0.214219,0.010111,0.114179,-0.830776,-0.325098,-0.840817,0.116237,-0.096615,STANDING


#### 2) 기본 정보 조회

In [10]:
train.describe()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
count,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,...,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0
mean,0.274811,-0.017799,-0.109396,-0.603138,-0.509815,-0.604058,-0.628151,-0.525944,-0.605374,-0.46549,...,0.126955,-0.305883,-0.623548,0.008524,-0.001185,0.00934,-0.007099,-0.491501,0.059299,-0.054594
std,0.067614,0.039422,0.058373,0.448807,0.501815,0.417319,0.424345,0.485115,0.413043,0.544995,...,0.249176,0.322808,0.310371,0.33973,0.447197,0.60819,0.476738,0.509069,0.29734,0.278479
min,-0.503823,-0.684893,-1.0,-1.0,-0.999844,-0.999667,-1.0,-0.999419,-1.0,-1.0,...,-0.965725,-0.979261,-0.999765,-0.97658,-1.0,-1.0,-1.0,-1.0,-1.0,-0.980143
25%,0.262919,-0.024877,-0.121051,-0.992774,-0.97768,-0.980127,-0.993602,-0.977865,-0.980112,-0.936067,...,-0.02161,-0.541969,-0.845985,-0.122361,-0.294369,-0.481718,-0.373345,-0.811397,-0.018203,-0.141555
50%,0.277154,-0.017221,-0.108781,-0.943933,-0.844575,-0.856352,-0.948501,-0.849266,-0.849896,-0.878729,...,0.133887,-0.342923,-0.712677,0.010278,0.005146,0.011448,-0.000847,-0.709441,0.182893,0.003951
75%,0.288526,-0.01092,-0.098163,-0.24213,-0.034499,-0.26269,-0.291138,-0.068857,-0.268539,-0.01369,...,0.288944,-0.127371,-0.501158,0.154985,0.28503,0.499857,0.356236,-0.51133,0.248435,0.111932
max,1.0,1.0,1.0,1.0,0.916238,1.0,1.0,0.967664,1.0,1.0,...,0.9467,0.989538,0.956845,1.0,1.0,0.998702,0.996078,0.977344,0.478157,1.0


In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5881 entries, 0 to 5880
Columns: 562 entries, tBodyAcc-mean()-X to Activity
dtypes: float64(561), object(1)
memory usage: 25.2+ MB


In [12]:
train.shape

(5881, 562)

In [13]:
test.describe()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
count,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,...,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0
mean,0.273198,-0.017281,-0.108123,-0.614634,-0.515427,-0.607533,-0.63995,-0.53076,-0.609256,-0.481055,...,0.118651,-0.311513,-0.632273,0.009324,0.015666,0.006272,-0.00151,-0.481737,0.055771,-0.064194
std,0.079989,0.045957,0.049082,0.44848,0.506094,0.424243,0.422994,0.489381,0.418536,0.542756,...,0.258112,0.313792,0.296179,0.324864,0.452616,0.608954,0.483028,0.522714,0.298124,0.281645
min,-1.0,-1.0,-0.418354,-0.999717,-0.999873,-1.0,-0.999867,-1.0,-0.999879,-0.948723,...,-1.0,-0.995357,-0.994664,-0.937468,-0.990492,-0.995222,-0.969066,-0.99938,-0.995073,-1.0
25%,0.263787,-0.024792,-0.120733,-0.992669,-0.979082,-0.98107,-0.993498,-0.979214,-0.980659,-0.936791,...,-0.035193,-0.546342,-0.844547,-0.119166,-0.265533,-0.485998,-0.3803,-0.81406,-0.017413,-0.148445
50%,0.277322,-0.017187,-0.108124,-0.952426,-0.867309,-0.86989,-0.958705,-0.873891,-0.863451,-0.890491,...,0.135282,-0.347433,-0.706699,0.005049,0.023421,-0.005036,0.002408,-0.708911,0.178814,-0.002243
75%,0.288058,-0.010238,-0.096606,-0.245405,-0.030639,-0.260223,-0.29726,-0.058824,-0.256657,-0.030692,...,0.289832,-0.125796,-0.51691,0.135698,0.31269,0.518184,0.374583,-0.486534,0.248126,0.096674
max,0.63151,0.359587,0.543939,0.899922,0.78259,0.931308,0.950758,0.602458,0.784041,0.821218,...,0.860512,0.941113,0.89421,0.980889,0.991899,0.994366,0.979522,1.0,0.432496,0.992766


In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1471 entries, 0 to 1470
Columns: 562 entries, tBodyAcc-mean()-X to Activity
dtypes: float64(561), object(1)
memory usage: 6.3+ MB


In [15]:
test.shape

(1471, 562)

## 2.데이터 전처리

* 세부 요구사항
    - Label 추가 : data 에 Activity_dynamic 를 추가합니다. Activity_dynamic은 과제1에서 is_dynamic과 동일한 값입니다.
    - x와 y1, y2로 분할하시오.
        * y1 : Activity
        * y2 : Activity_dynamic
    - train : val = 8 : 2 혹은 7 : 3
    - random_state 옵션을 사용하여 다른 모델과 비교를 위해 성능이 재현되도록 합니다.

In [16]:
train['Activity_dynamic'] = train['Activity'].map({'STANDING':0, 'SITTING':0, 'LAYING':0, 
                  'WALKING':1, 'WALKING_UPSTAIRS':1, 'WALKING_DOWNSTAIRS':1})
train

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity,Activity_dynamic
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989000,-0.962596,-0.965650,-0.929747,...,-0.816696,-0.042494,-0.044218,0.307873,0.072790,-0.601120,0.331298,0.165163,STANDING,0
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.993870,-0.987558,-0.937337,...,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING,0
2,0.278709,-0.014511,-0.108717,-0.997720,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING,0
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING,1
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.109680,-0.346372,0.584131,...,-0.563437,-0.044344,-0.845268,-0.974650,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5876,0.277194,-0.012389,-0.131974,-0.994046,-0.940578,-0.917337,-0.994261,-0.932830,-0.908088,-0.936219,...,-0.690363,-0.034888,-0.261437,-0.391477,-0.877612,-0.912365,0.114009,0.080146,SITTING,0
5877,0.191568,0.013328,-0.105174,-0.126969,-0.121729,-0.327480,-0.192523,-0.109923,-0.295286,0.078644,...,-0.879215,0.721718,0.623151,0.866858,-0.445660,-0.690278,0.303194,-0.044188,WALKING_UPSTAIRS,1
5878,0.267981,-0.018348,-0.107440,-0.991303,-0.989881,-0.990313,-0.992386,-0.988852,-0.991237,-0.936099,...,-0.886851,0.060173,0.228739,0.684400,-0.216665,0.620363,-0.437247,-0.571840,LAYING,0
5879,0.212787,-0.048130,-0.121001,-0.041373,0.052449,-0.585361,-0.100714,0.023353,-0.554707,0.219814,...,-0.053556,0.260880,0.551742,-0.943773,-0.862899,-0.718009,0.292856,0.024920,WALKING_UPSTAIRS,1


In [17]:
target1 = 'Activity'
target2 = 'Activity_dynamic'

x = train.drop(target1, axis=1)
y1 = train.loc[:, target1]
x = x.drop(target2, axis=1)
y2 = train.loc[:, target2]

x_train, x_val, y_train, y_val = train_test_split(x, y1, test_size=0.2, random_state=2024)
x_train, x_val, y_train2, y_val2 = train_test_split(x, y2, test_size=0.2, random_state=2024)

## **3.단계별 모델링**

![](https://github.com/DA4BAM/image/blob/main/step%20by%20step.png?raw=true)

### (1) 단계1 : 정적/동적 행동 분류 모델

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)과 동적 행동(동적 : Walking, Walking-Up, Walking-Down)을 구분하는 모델 생성.
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

#### 1) Logistic Regression

In [18]:
model_lr = LogisticRegression()

In [21]:
model_lr.fit(x_train, y_train2)

In [24]:
y_pred_1 = model_lr.predict(x_val)

In [27]:
# 평가
print('accuracy :',accuracy_score(y_val2, y_pred_1))
print('='*60)
print(confusion_matrix(y_val2, y_pred_1))
print('='*60)
print(classification_report(y_val2, y_pred_1))

accuracy : 1.0
[[637   0]
 [  0 540]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       637
           1       1.00      1.00      1.00       540

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177



#### 2) RandomForest

In [28]:
model_rf = RandomForestClassifier()
model_rf.fit(x_train, y_train2)
y_pred_2 = model_rf.predict(x_val)

In [29]:
# 평가
print('accuracy :',accuracy_score(y_val2, y_pred_2))
print('='*60)
print(confusion_matrix(y_val2, y_pred_2))
print('='*60)
print(classification_report(y_val2, y_pred_2))

accuracy : 1.0
[[637   0]
 [  0 540]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       637
           1       1.00      1.00      1.00       540

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177



### (2) 단계2-1 : 정적 동작 세부 분류

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)인 데이터 추출
    * Laying, Sitting, Standing 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [40]:
s_data = train.loc[train['Activity_dynamic'] == 0]
target = 'Activity'
s_x = s_data.drop(target, axis=1)
s_y = s_data.loc[:, target]

i_train, i_val, t_train, t_val = train_test_split(s_x, s_y, test_size=0.3, random_state=2024)

In [42]:
model_lr2 = LogisticRegression()
model_lr2.fit(i_train, t_train)
s_pred = model_lr2.predict(i_val)

# 평가
print('accuracy :',accuracy_score(t_val, s_pred))
print('='*60)
print(confusion_matrix(t_val, s_pred))
print('='*60)
print(classification_report(t_val, s_pred))

accuracy : 0.9742533470648815
[[358   0   0]
 [  0 293  14]
 [  0  11 295]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       358
     SITTING       0.96      0.95      0.96       307
    STANDING       0.95      0.96      0.96       306

    accuracy                           0.97       971
   macro avg       0.97      0.97      0.97       971
weighted avg       0.97      0.97      0.97       971



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
model_rf2 = RandomForestClassifier()
model_rf2.fit(i_train, t_train) 
s_pred = model_rf2.predict(i_val)

# 평가
print('accuracy :',accuracy_score(t_val, s_pred))
print('='*60)
print(confusion_matrix(t_val, s_pred))
print('='*60)
print(classification_report(t_val, s_pred))

accuracy : 0.9742533470648815
[[358   0   0]
 [  0 291  16]
 [  0   9 297]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       358
     SITTING       0.97      0.95      0.96       307
    STANDING       0.95      0.97      0.96       306

    accuracy                           0.97       971
   macro avg       0.97      0.97      0.97       971
weighted avg       0.97      0.97      0.97       971



In [32]:
model_lr2 = LogisticRegression()
model_lr2.fit(x_train, y_train)
s_pred = model_lr2.predict(x_val)

# 평가
print('accuracy :',accuracy_score(y_val, s_pred))
print('='*60)
print(confusion_matrix(y_val, s_pred))
print('='*60)
print(classification_report(y_val, s_pred))

accuracy : 0.9821580288870009
[[215   0   0   0   0   0]
 [  0 202  10   0   0   0]
 [  0  11 199   0   0   0]
 [  0   0   0 195   0   0]
 [  0   0   0   0 154   0]
 [  0   0   0   0   0 191]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       215
           SITTING       0.95      0.95      0.95       212
          STANDING       0.95      0.95      0.95       210
           WALKING       1.00      1.00      1.00       195
WALKING_DOWNSTAIRS       1.00      1.00      1.00       154
  WALKING_UPSTAIRS       1.00      1.00      1.00       191

          accuracy                           0.98      1177
         macro avg       0.98      0.98      0.98      1177
      weighted avg       0.98      0.98      0.98      1177



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
model_rf2 = RandomForestClassifier()
model_rf2.fit(x_train, y_train)
s_pred = model_rf2.predict(x_val)

# 평가
print('accuracy :',accuracy_score(y_val, s_pred))
print('='*60)
print(confusion_matrix(y_val, s_pred))
print('='*60)
print(classification_report(y_val, s_pred))

accuracy : 0.9813084112149533
[[215   0   0   0   0   0]
 [  0 206   6   0   0   0]
 [  0   7 203   0   0   0]
 [  0   0   0 194   1   0]
 [  0   0   0   1 151   2]
 [  0   0   0   0   5 186]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       215
           SITTING       0.97      0.97      0.97       212
          STANDING       0.97      0.97      0.97       210
           WALKING       0.99      0.99      0.99       195
WALKING_DOWNSTAIRS       0.96      0.98      0.97       154
  WALKING_UPSTAIRS       0.99      0.97      0.98       191

          accuracy                           0.98      1177
         macro avg       0.98      0.98      0.98      1177
      weighted avg       0.98      0.98      0.98      1177



### (3) 단계2-2 : 동적 동작 세부 분류

* 세부 요구사항
    * 동적 행동(Walking, Walking Upstairs, Walking Downstairs)인 데이터 추출
    * Walking, Walking Upstairs, Walking Downstairs 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [44]:
s_data = train.loc[train['Activity_dynamic'] == 1]
target = 'Activity'
s_x = s_data.drop(target, axis=1)
s_y = s_data.loc[:, target]

i_train, i_val, t_train, t_val = train_test_split(s_x, s_y, test_size=0.3, random_state=2024)

In [45]:
model_lr2 = LogisticRegression()
model_lr2.fit(x_train, y_train) 
s_pred = model_lr2.predict(x_val)

# 평가
print('accuracy :',accuracy_score(y_val, s_pred))
print('='*60)
print(confusion_matrix(y_val, s_pred))
print('='*60)
print(classification_report(y_val, s_pred))

accuracy : 0.9821580288870009
[[215   0   0   0   0   0]
 [  0 202  10   0   0   0]
 [  0  11 199   0   0   0]
 [  0   0   0 195   0   0]
 [  0   0   0   0 154   0]
 [  0   0   0   0   0 191]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       215
           SITTING       0.95      0.95      0.95       212
          STANDING       0.95      0.95      0.95       210
           WALKING       1.00      1.00      1.00       195
WALKING_DOWNSTAIRS       1.00      1.00      1.00       154
  WALKING_UPSTAIRS       1.00      1.00      1.00       191

          accuracy                           0.98      1177
         macro avg       0.98      0.98      0.98      1177
      weighted avg       0.98      0.98      0.98      1177



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
model_rf2 = RandomForestClassifier()
model_rf2.fit(x_train, y_train) 
s_pred = model_rf2.predict(x_val)

# 평가
print('accuracy :',accuracy_score(y_val, s_pred))
print('='*60)
print(confusion_matrix(y_val, s_pred))
print('='*60)
print(classification_report(y_val, s_pred))

accuracy : 0.9787595581988106
[[215   0   0   0   0   0]
 [  0 203   9   0   0   0]
 [  0   8 202   0   0   0]
 [  0   0   0 193   2   0]
 [  0   0   0   0 153   1]
 [  0   0   0   0   5 186]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       215
           SITTING       0.96      0.96      0.96       212
          STANDING       0.96      0.96      0.96       210
           WALKING       1.00      0.99      0.99       195
WALKING_DOWNSTAIRS       0.96      0.99      0.97       154
  WALKING_UPSTAIRS       0.99      0.97      0.98       191

          accuracy                           0.98      1177
         macro avg       0.98      0.98      0.98      1177
      weighted avg       0.98      0.98      0.98      1177



### (4) 분류 모델 합치기


* 세부 요구사항
    * 두 단계 모델을 통합하고, 새로운 데이터(test)에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
    * 데이터 파이프라인 구축 : test데이터가 로딩되어 전처리 과정을 거치고, 예측 및 성능 평가 수행

![](https://github.com/DA4BAM/image/blob/main/pipeline%20function.png?raw=true)

#### 1) 함수 만들어서 분류모델 합치기

In [72]:
def evaluate_second_target(train, target1, target2, test_size=0.2, random_state=2024):
    """
    주어진 데이터에 대해 두 번째 타겟 변수에 대한 두 가지 모델을 평가하는 함수
    
    - train (DataFrame): 학습에 사용할 데이터셋
    - target1 (str): 첫 번째 타겟 변수 이름
    - target2 (str): 두 번째 타겟 변수 이름
    - test_size (float, optional): 테스트 데이터셋의 비율 (기본값: 0.2)
    - random_state (int, optional): 랜덤 시드 값 (기본값: 2024)
    """
    # 특징 데이터(x)와 타겟 데이터(y) 설정
    x = train.drop([target1, target2], axis=1)
    y1 = train[target1]
    y2 = train[target2]
    
    # 데이터 분할
    x_train, x_val, y_train1, y_val1 = train_test_split(x, y1, test_size=test_size, random_state=random_state)
    x_train, x_val, y_train2, y_val2 = train_test_split(x, y2, test_size=test_size, random_state=random_state)

    # 모델 학습 (로지스틱 회귀)
    model_lr = LogisticRegression()
    model_lr.fit(x_train, y_train2)
    
    # 로지스틱 회귀 모델 평가
    y_pred_lr = model_lr.predict(x_val)
    print("Logistic Regression 모델 평가 결과:")
    print('accuracy :', accuracy_score(y_val2, y_pred_lr))
    print('='*60)
    print(confusion_matrix(y_val2, y_pred_lr))
    print('='*60)
    print(classification_report(y_val2, y_pred_lr))
    print('='*60)
    
    # 모델 학습 (랜덤 포레스트)
    model_rf = RandomForestClassifier(random_state=random_state)
    model_rf.fit(x_train, y_train1)
    
    # 랜덤 포레스트 모델 평가
    y_pred_rf = model_rf.predict(x_val)
    print("Random Forest 모델 평가 결과:")
    print('accuracy :', accuracy_score(y_val1, y_pred_rf))
    print('='*60)
    print(confusion_matrix(y_val1, y_pred_rf))
    print('='*60)
    print(classification_report(y_val1, y_pred_rf))
    print('='*60)

In [73]:
evaluate_second_target(train, 'Activity', 'Activity_dynamic')

Logistic Regression 모델 평가 결과:
accuracy : 1.0
[[637   0]
 [  0 540]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       637
           1       1.00      1.00      1.00       540

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177

Random Forest 모델 평가 결과:
accuracy : 0.9804587935429057
[[215   0   0   0   0   0]
 [  0 205   7   0   0   0]
 [  0   9 201   0   0   0]
 [  0   0   0 193   2   0]
 [  0   0   0   1 152   1]
 [  0   0   0   0   3 188]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       215
           SITTING       0.96      0.97      0.96       212
          STANDING       0.97      0.96      0.96       210
           WALKING       0.99      0.99      0.99       195
WALKING_DOWNSTAIRS       0.97      0.99      0.98       154
  WALKING_UPSTAIRS       0.

#### 2) test 셋으로 예측하고 평가하기

In [95]:
def evaluate_second_target(train, target1, target2, test_size=0.2, random_state=2024):
    """
    주어진 데이터에 대해 두 번째 타겟 변수에 대한 두 가지 모델을 평가하고, 그리드 서치를 사용하여 최적의 파라미터를 찾아 출력하는 함수
    
    - train (DataFrame): 학습에 사용할 데이터셋
    - target1 (str): 첫 번째 타겟 변수 이름
    - target2 (str): 두 번째 타겟 변수 이름
    - test_size (float, optional): 테스트 데이터셋의 비율 (기본값: 0.2)
    - random_state (int, optional): 랜덤 시드 값 (기본값: 2024)
    """
    # 특징 데이터(x)와 타겟 데이터(y) 설정
    x = train.drop([target1, target2], axis=1)
    y1 = train[target1]
    y2 = train[target2]
    
    # 데이터 분할
    x_train, x_val, y_train1, y_val1 = train_test_split(x, y1, test_size=test_size, random_state=random_state)
    x_train, x_val, y_train2, y_val2 = train_test_split(x, y2, test_size=test_size, random_state=random_state)

    # 모델 학습 (로지스틱 회귀)
    param_grid_lr = {
        'n_estimators': [205, 220, 240],
        'max_depth' : [23, 30, 50, 60],
}
    grid_search_lr = GridSearchCV(RandomForestClassifier(), param_grid_lr, cv=5, scoring='accuracy')
    grid_search_lr.fit(x_train, y_train2)
    best_lr_model = grid_search_lr.best_estimator_
    best_lr_params = grid_search_lr.best_params_
    best_lr_score = grid_search_lr.best_score_
    
    # 로지스틱 회귀 모델 평가
    y_pred_lr = best_lr_model.predict(x_val)
    print("Logistic Regression 모델 평가 결과:")
    print('accuracy :', accuracy_score(y_val2, y_pred_lr))
    print('='*60)
    print(confusion_matrix(y_val2, y_pred_lr))
    print('='*60)
    print(classification_report(y_val2, y_pred_lr))
    print('='*60)
    print("최적의 파라미터:", best_lr_params)
    print("최고 성능:", best_lr_score)
    print('='*60)
    
    # 모델 학습 (랜덤 포레스트)
    param_grid_rf = {
        'n_estimators': [205, 220, 240],
        'max_depth' : [23, 30, 50, 60],
    }
    grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=random_state), param_grid_rf, cv=5, scoring='accuracy')
    grid_search_rf.fit(x_train, y_train1)
    best_rf_model = grid_search_rf.best_estimator_
    best_rf_params = grid_search_rf.best_params_
    best_rf_score = grid_search_rf.best_score_
    
    # 랜덤 포레스트 모델 평가
    y_pred_rf = best_rf_model.predict(x_val)
    print("Random Forest 모델 평가 결과:")
    print('accuracy :', accuracy_score(y_val1, y_pred_rf))
    print('='*60)
    print(confusion_matrix(y_val1, y_pred_rf))
    print('='*60)
    print(classification_report(y_val1, y_pred_rf))
    print('='*60)
    print("최적의 파라미터:", best_rf_params)
    print("최고 성능:", best_rf_score)
    print('='*60)

In [97]:
# 함수 호출
evaluate_second_target(train, 'Activity', 'Activity_dynamic')

Logistic Regression 모델 평가 결과:
accuracy : 1.0
[[637   0]
 [  0 540]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       637
           1       1.00      1.00      1.00       540

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177

최적의 파라미터: {'max_depth': 23, 'n_estimators': 205}
최고 성능: 0.9991496144888868


KeyboardInterrupt: 

* 성능 평가