# 0. 코드 설명

data/raw 디렉토리에서 가공되지 않은 데이터를 가져와 1차 전처리를 시행함
* NaN 처리 (육안 검사 결과가 없는 경우 'no'로 대체)
* Label Normalization
* 실험 데이터와 라벨을 대응시킨 후, 한 데이터프레임으로 묶기
* 필요 없는 데이터 제거

# 1. 라이브러리 가져오기

In [52]:
import pandas as pd
import numpy as np
import glob
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy import signal

# 2. 데이터 불러오기

### 레이블 데이터 불러오기

In [53]:
# Load the label table (train.csv), report its shape, and show the first rows.
experiment_result = pd.read_csv("../../data/raw/train.csv")
print(f'train.csv : {experiment_result.shape}')
experiment_result.head()

train.csv : (25, 7)


Unnamed: 0,No,material,feedrate,clamp_pressure,tool_condition,machining_finalized,passed_visual_inspection
0,1,aluminum,6,4.0,unworn,yes,yes
1,2,aluminum,20,4.0,unworn,yes,yes
2,3,aluminum,6,3.0,unworn,yes,yes
3,4,aluminum,6,2.5,unworn,no,
4,5,aluminum,20,3.0,unworn,no,


### NaN 처리

In [54]:
# When machining was stopped, no visual inspection took place and the result is
# NaN; record those cases as 'no'.
experiment_result.fillna({'passed_visual_inspection': 'no'}, inplace=True)
experiment_result.head()

Unnamed: 0,No,material,feedrate,clamp_pressure,tool_condition,machining_finalized,passed_visual_inspection
0,1,aluminum,6,4.0,unworn,yes,yes
1,2,aluminum,20,4.0,unworn,yes,yes
2,3,aluminum,6,3.0,unworn,yes,yes
3,4,aluminum,6,2.5,unworn,no,no
4,5,aluminum,20,3.0,unworn,no,no


### 실험 데이터 불러오기

In [55]:
# Collect the experiment file paths; glob returns them in arbitrary order, so
# sort them to get a stable, ascending listing.
all_files = sorted(glob.glob("../../data/raw/experiment_*.csv"))
all_files

['../../data/raw/experiment_01.csv',
 '../../data/raw/experiment_02.csv',
 '../../data/raw/experiment_03.csv',
 '../../data/raw/experiment_04.csv',
 '../../data/raw/experiment_05.csv',
 '../../data/raw/experiment_06.csv',
 '../../data/raw/experiment_07.csv',
 '../../data/raw/experiment_08.csv',
 '../../data/raw/experiment_09.csv',
 '../../data/raw/experiment_10.csv',
 '../../data/raw/experiment_11.csv',
 '../../data/raw/experiment_12.csv',
 '../../data/raw/experiment_13.csv',
 '../../data/raw/experiment_14.csv',
 '../../data/raw/experiment_15.csv',
 '../../data/raw/experiment_16.csv',
 '../../data/raw/experiment_17.csv',
 '../../data/raw/experiment_18.csv',
 '../../data/raw/experiment_19.csv',
 '../../data/raw/experiment_20.csv',
 '../../data/raw/experiment_21.csv',
 '../../data/raw/experiment_22.csv',
 '../../data/raw/experiment_23.csv',
 '../../data/raw/experiment_24.csv',
 '../../data/raw/experiment_25.csv']

In [56]:
# Read the first experiment file to inspect its shape and leading rows.
experiment_tmp = pd.read_csv(all_files[0])
print(f'experiment_01.csv : {experiment_tmp.shape}')
experiment_tmp.head(3)

experiment_01.csv : (1055, 48)


Unnamed: 0,X_ActualPosition,X_ActualVelocity,X_ActualAcceleration,X_SetPosition,X_SetVelocity,X_SetAcceleration,X_CurrentFeedback,X_DCBusVoltage,X_OutputCurrent,X_OutputVoltage,...,S_CurrentFeedback,S_DCBusVoltage,S_OutputCurrent,S_OutputVoltage,S_OutputPower,S_SystemInertia,M_CURRENT_PROGRAM_NUMBER,M_sequence_number,M_CURRENT_FEEDRATE,Machining_Process
0,202,4.0,4.0,202,4.0,4.0,0.18,0.0207,329,2.77,...,0.524,2.74e-19,329,0.0,6.96e-07,16,1,0,50,Starting
1,202,-6.8,-346.0,202,-9.6,-354.0,-10.9,0.186,328,23.3,...,-0.288,2.74e-19,328,0.0,-5.27e-07,16,1,4,50,Prep
2,200,-13.8,-2.25,200,-13.9,3.999905,-8.59,0.14,328,30.6,...,0.524,2.74e-19,328,0.0,9.1e-07,16,1,7,50,Prep


### 데이터 묶어주기

In [57]:
# Attach each experiment's settings and outcome labels (from train.csv) to every
# row of its recording, then stack all experiments into one DataFrame `df`.

# Columns copied from the label table, in the order they are appended:
# experiment settings first, then experiment results.
label_columns = [
    'material',
    'feedrate',
    'clamp_pressure',
    'tool_condition',
    'machining_finalized',
    'passed_visual_inspection',
]

frames = []

for filename in all_files:
    # Take the experiment number from the file name (experiment_07.csv -> 7)
    # instead of from the loop position, so a missing or extra file cannot
    # silently pair a recording with another experiment's labels.
    file_stem = os.path.splitext(os.path.basename(filename))[0]
    exp_num = int(file_stem.rsplit('_', 1)[-1])

    matching_rows = experiment_result[experiment_result['No'] == exp_num]
    if matching_rows.empty:
        raise ValueError(f'train.csv has no row with No == {exp_num} (needed for {filename})')
    exp_result_row = matching_rows.iloc[0]  # look the label row up once

    frame = pd.read_csv(filename)
    frame['exp_num'] = exp_num  # experiment-number column, used later for grouping
    for column in label_columns:
        frame[column] = exp_result_row[column]  # scalar is broadcast to every row

    frames.append(frame)

df = pd.concat(frames, ignore_index=True)
print(df.shape)
df.head(3)

(32048, 55)


Unnamed: 0,X_ActualPosition,X_ActualVelocity,X_ActualAcceleration,X_SetPosition,X_SetVelocity,X_SetAcceleration,X_CurrentFeedback,X_DCBusVoltage,X_OutputCurrent,X_OutputVoltage,...,M_sequence_number,M_CURRENT_FEEDRATE,Machining_Process,exp_num,material,feedrate,clamp_pressure,tool_condition,machining_finalized,passed_visual_inspection
0,202.0,4.0,4.0,202.0,4.0,4.0,0.18,0.0207,329,2.77,...,0,50,Starting,1,aluminum,6,4.0,unworn,yes,yes
1,202.0,-6.8,-346.0,202.0,-9.6,-354.0,-10.9,0.186,328,23.3,...,4,50,Prep,1,aluminum,6,4.0,unworn,yes,yes
2,200.0,-13.8,-2.25,200.0,-13.9,3.999905,-8.59,0.14,328,30.6,...,7,50,Prep,1,aluminum,6,4.0,unworn,yes,yes


In [58]:
# Integrity check: True if any cell in the merged DataFrame is null.
df.isnull().values.any()

False

In [59]:
# Inspect column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32048 entries, 0 to 32047
Data columns (total 55 columns):
X_ActualPosition            32048 non-null float64
X_ActualVelocity            32048 non-null float64
X_ActualAcceleration        32048 non-null float64
X_SetPosition               32048 non-null float64
X_SetVelocity               32048 non-null float64
X_SetAcceleration           32048 non-null float64
X_CurrentFeedback           32048 non-null float64
X_DCBusVoltage              32048 non-null float64
X_OutputCurrent             32048 non-null int64
X_OutputVoltage             32048 non-null float64
X_OutputPower               32048 non-null float64
Y_ActualPosition            32048 non-null float64
Y_ActualVelocity            32048 non-null float64
Y_ActualAcceleration        32048 non-null float64
Y_SetPosition               32048 non-null float64
Y_SetVelocity               32048 non-null float64
Y_SetAcceleration           32048 non-null float64
Y_CurrentFeedback         

### Label Normalization
Starting(1개)과 end(8개)는 다른 레이블에 비해 샘플 수가 매우 적기 때문에, 아래와 같이 기존 레이블로 합쳐서 normalize 해줌.
- Starting -> Prep
- end -> End

In [60]:
# Row count per Machining_Process label, ordered by label name.
df['Machining_Process'].value_counts().sort_index()

End              3207
Layer 1 Down     3548
Layer 1 Up       5344
Layer 2 Down     3012
Layer 2 Up       3853
Layer 3 Down     2805
Layer 3 Up       3335
Prep             2394
Repositioning    4541
Starting            1
end                 8
Name: Machining_Process, dtype: int64

In [61]:
# Fold the two rare labels into existing ones, then re-check the label counts.
process_label_map = {'Starting': 'Prep', 'end': 'End'}
df['Machining_Process'] = df['Machining_Process'].replace(process_label_map)
df['Machining_Process'].value_counts().sort_index()

End              3215
Layer 1 Down     3548
Layer 1 Up       5344
Layer 2 Down     3012
Layer 2 Up       3853
Layer 3 Down     2805
Layer 3 Up       3335
Prep             2395
Repositioning    4541
Name: Machining_Process, dtype: int64

### Feature Engineering 진행

In [62]:
# Integer-encode the categorical columns. Each column is fitted with its own
# fresh LabelEncoder and stored with the dtype listed here.
encoded_dtypes = {
    'Machining_Process': np.int64,
    'tool_condition': np.int8,
    'machining_finalized': np.int8,
    'passed_visual_inspection': np.int8,
}
for column_name, target_dtype in encoded_dtypes.items():
    df[column_name] = LabelEncoder().fit_transform(df[column_name]).astype(target_dtype)

# Drop uninformative columns in one call:
# - the four Z-axis signals (original note: plots showed 0 in every experiment)
# - 'material' (original note: identical for all rows)
df.drop(
    ['Z_CurrentFeedback', 'Z_DCBusVoltage', 'Z_OutputCurrent', 'Z_OutputVoltage', 'material'],
    axis=1,
    inplace=True,
)

df.head(3)

Unnamed: 0,X_ActualPosition,X_ActualVelocity,X_ActualAcceleration,X_SetPosition,X_SetVelocity,X_SetAcceleration,X_CurrentFeedback,X_DCBusVoltage,X_OutputCurrent,X_OutputVoltage,...,M_CURRENT_PROGRAM_NUMBER,M_sequence_number,M_CURRENT_FEEDRATE,Machining_Process,exp_num,feedrate,clamp_pressure,tool_condition,machining_finalized,passed_visual_inspection
0,202.0,4.0,4.0,202.0,4.0,4.0,0.18,0.0207,329,2.77,...,1,0,50,7,1,6,4.0,0,1,1
1,202.0,-6.8,-346.0,202.0,-9.6,-354.0,-10.9,0.186,328,23.3,...,1,4,50,7,1,6,4.0,0,1,1
2,200.0,-13.8,-2.25,200.0,-13.9,3.999905,-8.59,0.14,328,30.6,...,1,7,50,7,1,6,4.0,0,1,1


## 첫 번째 파일 저장

In [63]:
# Work on a copy without the experiment-number helper column; `df` itself keeps
# 'exp_num' because later cells still group by it.
prepare_df = df.drop(['exp_num'], axis=1)

# Write straight to the target path instead of chdir-ing into the output
# directory and back: the working directory is never changed, so a failed write
# cannot leave the notebook stranded in data/processed.
prepare_df.to_csv("../../data/processed/prepared_data.csv", index=False)

### FFT Features 생성

In [64]:
def _dominant_peak(values, dt=1):
    """Return (amplitude, frequency) of the strongest local peak of the FFT magnitude.

    values: 1-D numeric sequence holding one signal of one experiment.
    dt: sample spacing used to build the frequency axis.

    Returns (0, 0) when the magnitude spectrum has no strict local maximum.
    """
    n = len(values)
    # Frequency axis: n points from 0 to 1/dt, one per FFT bin.
    # NOTE(review): linspace includes the endpoint, so bin k maps to k/((n-1)*dt)
    # rather than numpy's k/(n*dt) (np.fft.fftfreq). Kept as-is so the generated
    # feature values do not change — confirm whether this is intended.
    freq = np.linspace(0, 1.0 / dt, n)
    # Magnitude spectrum scaled by n/2.
    magnitude = np.abs(np.fft.fft(values)) / (n / 2)
    # Bin 0 is the zero-frequency (DC) term, i.e. the plain sum of the samples,
    # which is why it is much larger than the other bins; it is halved once more.
    magnitude[0] = magnitude[0] / 2
    # Indices of strict local maxima (each compared with its direct neighbours).
    peak_idx = signal.argrelmax(magnitude, order=1)[0]
    if len(peak_idx) == 0:
        return 0, 0
    strongest = peak_idx[np.argmax(magnitude[peak_idx])]
    return magnitude[strongest], freq[strongest]


# For every experiment and every available <axis>_<signal> column, add three
# columns that are constant within the experiment:
#   <axis>_<signal>_High_Amp       amplitude of the strongest spectral peak
#   <axis>_<signal>_High_Freq      frequency-axis value at that peak
#   <axis>_<signal>_High_Amp_Freq  product of the two
signal_names = ['ActualPosition','ActualVelocity','ActualAcceleration','CurrentFeedback','DCBusVoltage','OutputCurrent','OutputVoltage','OutputPower']
axes = ['X','Y','Z','S']
dt = 1  # sample spacing; original note says one sample per second — TODO confirm

for i in df['exp_num'].unique():
    rows = df['exp_num'] == i  # row mask of this experiment, built once
    for col in signal_names:
        for ax in axes:
            # Not every axis has every signal (some columns were dropped
            # earlier), so skip the combinations that do not exist.
            if f'{ax}_{col}' not in df.columns:
                continue

            high_amp, high_freq = _dominant_peak(df.loc[rows, f'{ax}_{col}'].values, dt)

            # Broadcast the per-experiment scalars to all rows of the experiment.
            df.loc[rows, f'{ax}_{col}_High_Amp'] = high_amp
            df.loc[rows, f'{ax}_{col}_High_Freq'] = high_freq
            df.loc[rows, f'{ax}_{col}_High_Amp_Freq'] = high_amp * high_freq

In [65]:
# Confirm the new column count after adding the FFT features and preview the rows.
print(df.shape)
df.head()

(32048, 131)


Unnamed: 0,X_ActualPosition,X_ActualVelocity,X_ActualAcceleration,X_SetPosition,X_SetVelocity,X_SetAcceleration,X_CurrentFeedback,X_DCBusVoltage,X_OutputCurrent,X_OutputVoltage,...,S_OutputVoltage_High_Amp_Freq,X_OutputPower_High_Amp,X_OutputPower_High_Freq,X_OutputPower_High_Amp_Freq,Y_OutputPower_High_Amp,Y_OutputPower_High_Freq,Y_OutputPower_High_Amp_Freq,S_OutputPower_High_Amp,S_OutputPower_High_Freq,S_OutputPower_High_Amp_Freq
0,202.0,4.0,4.0,202.0,4.0,4.0,0.18,0.0207,329,2.77,...,0.02224,0.000492,0.019924,1e-05,0.000725,0.014231,1e-05,0.011293,0.335863,0.003793
1,202.0,-6.8,-346.0,202.0,-9.6,-354.0,-10.9,0.186,328,23.3,...,0.02224,0.000492,0.019924,1e-05,0.000725,0.014231,1e-05,0.011293,0.335863,0.003793
2,200.0,-13.8,-2.25,200.0,-13.9,3.999905,-8.59,0.14,328,30.6,...,0.02224,0.000492,0.019924,1e-05,0.000725,0.014231,1e-05,0.011293,0.335863,0.003793
3,198.0,-14.0,4.0,198.0,-13.9,3.999905,-6.11,0.13,327,30.3,...,0.02224,0.000492,0.019924,1e-05,0.000725,0.014231,1e-05,0.011293,0.335863,0.003793
4,197.0,-13.9,-14.8,196.0,-13.9,4.000095,-5.7,0.114,328,30.5,...,0.02224,0.000492,0.019924,1e-05,0.000725,0.014231,1e-05,0.011293,0.335863,0.003793


In [66]:
# Copy of the feature table without the experiment-number helper column.
feature_df = df.drop(['exp_num'], axis=1)

# Write straight to the target path instead of chdir-ing into the output
# directory and back: the working directory is never changed, so a failed write
# cannot leave the notebook stranded in data/processed.
feature_df.to_csv("../../data/processed/feature_data.csv", index=False)