# isee_machine & isee_quality
- isee_quality의 모든 정보가 'NG' 즉, 모두 비정상이라는 가정하에 매칭해봄
- NG 라벨링 part
- 18.06.12

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Load the raw machine log; the file is read with the cp949 encoding.
isee_machine = pd.read_csv(
    './data/isee_machine_data_20180901_to_20190331_C123.csv',
    encoding='cp949',
)

# Parse the timestamp strings (month/day/year hour:minute) into datetimes.
isee_machine['creationTime'] = pd.to_datetime(
    isee_machine['creationTime'],
    format='%m/%d/%Y %H:%M',
)

# Reverse the row order (descending index sort) and renumber the index from 0.
# NOTE(review): presumably the file is stored newest-first, so this leaves the
# rows in chronological order -- confirm against the data.
isee_machine = isee_machine.sort_index(ascending=False)
isee_machine = isee_machine.reset_index(drop=True)

In [3]:
# Add the calendar date of each record as a string (object) column.
creation_day = isee_machine['creationTime'].dt.date
isee_machine['creationDate'] = creation_day.astype(str)

In [4]:
# Hour-of-day component of each record's timestamp.
isee_machine['creationHour'] = isee_machine.creationTime.dt.hour

In [5]:
# Year-month key of each record, formatted as 'YYYY-MM' and stored as a string.
year_month = isee_machine['creationTime'].dt.strftime('%Y-%m')
isee_machine['YM'] = year_month.astype(str)

In [6]:
# Seconds elapsed between each record and the record before it.
#   * first row: no predecessor, so the gap is 0
#   * last row:  forced to 3600 regardless of its real gap; the cycle-splitting
#                step looks for gaps >= 3600 s, so this appears to serve as an
#                end-of-data marker
# Computed with a vectorised diff instead of a per-row Python loop.
creationtime = isee_machine['creationTime']
gap_seconds = creationtime.diff().dt.total_seconds()
if len(gap_seconds) > 0:
    gap_seconds.iloc[-1] = 3600
    # Written after the sentinel so a single-row table still gets 0.
    gap_seconds.iloc[0] = 0
creationtime_diff = gap_seconds.tolist()
isee_machine['creationTime_diff'] = creationtime_diff

In [7]:
# Report the total number of machine records.
total_rows = len(isee_machine)
print('총 데이터 건: ',total_rows)

총 데이터 건:  236696


### cycle 컬럼 추가
- cycle 기준(초) > 1h == 3600 sec

In [8]:
# Split the log into operating cycles: a row whose gap to the previous record
# is at least `cycle_std_sec` seconds opens a new cycle. Cycles are numbered
# from 1 in row order.
cycle_std_sec = 3600

is_cycle_start = isee_machine['creationTime_diff'] >= cycle_std_sec

# Row labels at or above the threshold (includes the final sentinel row).
last_cycle_idx = isee_machine[is_cycle_start].index.tolist()

# The final row carries an artificial 3600 s gap, so it closes the last cycle
# rather than opening a new one.
if len(is_cycle_start) > 0:
    is_cycle_start.iloc[-1] = False

# Cycle number = 1 + number of cycle starts seen up to and including the row.
# Unlike the index-slicing loop this replaces, it also works when the table
# has no internal gap at all (a single cycle) or when `cycle_std_sec` is set
# above the 3600 s sentinel.
isee_machine['CycleNum'] = is_cycle_start.cumsum() + 1
cycle_num = isee_machine['CycleNum'].tolist()
cycle_len = len(set(cycle_num))

### 가동 (초기,중기, 말기) 라벨링
- beginning_std_min : 초기기준 (시각~ 00분까지 / 분으로 계산하여 삽입)
- end_std_min: 말기기준(00분 ~ 끝 /  분으로 계산하여 삽입)

In [9]:
# Label every record by its phase inside its operating cycle:
#   0 = beginning: within `beginning_std_min` minutes of the cycle's first record
#   2 = end:       within `end_std_min` minutes of the cycle's last record
#   1 = middle:    everything else
# When both windows cover a record (short cycles), "beginning" wins.
beginning_std_min = 5
end_std_min = 5

cycle_num = list(isee_machine['CycleNum'].unique())

# Broadcast each cycle's first / last timestamp onto all rows of that cycle,
# so labels are aligned row by row instead of re-filtering the whole table
# once per cycle.
# NOTE(review): 'first'/'last' skip missing values; this assumes creationTime
# contains no NaT -- confirm.
times_by_cycle = isee_machine.groupby('CycleNum')['creationTime']
cycle_start = times_by_cycle.transform('first')
cycle_end = times_by_cycle.transform('last')

record_time = isee_machine['creationTime']
in_beginning = record_time <= cycle_start + timedelta(minutes=beginning_std_min)
in_end = record_time >= cycle_end - timedelta(minutes=end_std_min)

phase = np.ones(len(isee_machine), dtype='int64')
phase[in_end.values] = 2
phase[in_beginning.values] = 0  # applied last so it overrides "end"
operation_label = phase.tolist()

isee_machine['Operation_label'] = operation_label

In [10]:
# Actual labelling: find the cycles whose records span more than one calendar
# date (63 such cycles in the recorded run).
cycle_num = list(isee_machine['CycleNum'].unique())
oneCycle_diffDate = {}
for cycle_id in cycle_num:
    in_cycle = isee_machine.CycleNum == cycle_id
    dates_seen = isee_machine.loc[in_cycle, 'creationDate'].unique()
    if len(dates_seen) != 1:
        oneCycle_diffDate[cycle_id] = dates_seen
len(oneCycle_diffDate)

63

### NG라벨링
- quality 파일에 있는 모든 데이터를 이상이라고 봄

In [11]:
# Load the quality log (read with the cp949 encoding) and parse its timestamps.
quality = pd.read_csv(
    './data/isee_quality_data_20180901_to_201902.csv',
    encoding='cp949',
)
quality['CreationTime'] = pd.to_datetime(
    quality['CreationTime'],
    format='%m/%d/%Y %H:%M',
)

# Distinct quality timestamps; grouping de-duplicates and sorts the keys.
rows_per_time = quality.groupby(['CreationTime']).count()
qua_createtime = pd.Series(rows_per_time.reset_index()['CreationTime'])

- Qua_Match_YN == 0 : quality와 매칭되지 않는 기간 (quality의 마지막 시각 이후), Qua_Match_YN == 1 : 매칭 대상 기간

In [12]:
# Qua_Match_YN marks whether a machine record lies inside the period covered
# by the quality log: 1 = at or before the last quality timestamp, 0 = after.
# The column is built in a single assignment; writing through
# `df[col].loc[mask] = 1` is chained indexing, which may update a temporary
# copy and leave the frame unchanged.
last_quality_time = qua_createtime.iloc[-1]
isee_machine['Qua_Match_YN'] = (
    isee_machine['creationTime'] <= last_quality_time
).astype('int64')

In [19]:
# Left-join the machine records inside the quality period against the distinct
# quality timestamps. A row gets a non-null 'CreationTime' only when a quality
# record exists at exactly the same timestamp.
mer_df = pd.merge(
    isee_machine[isee_machine.Qua_Match_YN == 1],
    pd.DataFrame(qua_createtime),
    how='left',
    left_on='creationTime',
    right_on='CreationTime',
)

# Collapse the joined column into a 0/1 flag (1 = matched a quality record).
# Done as one whole-column assignment: chained `df[col].loc[mask] = value`
# may write to a temporary copy, and it also mixed ints into a datetime column.
mer_df['CreationTime'] = mer_df['CreationTime'].notnull().astype('int64')

In [20]:
# The match flag held in 'CreationTime' becomes the NG (defect) label column.
ng_column_names = {'CreationTime': 'NG'}
mer_df = mer_df.rename(columns=ng_column_names)

In [21]:
# Count the distinct quality timestamps that did / did not hit a machine record.
matched_ng_times = mer_df.loc[mer_df.NG == 1, 'creationTime'].unique()
print('매칭되지 않은 라벨 건: ',len(qua_createtime) - len(matched_ng_times))
print('매칭된 라벨 건: ',len(matched_ng_times))

매칭되지 않은 라벨 건:  499
매칭된 라벨 건:  4720


### NG5 라벨링
- 불량 시간 5분전도 불량으로 라벨링

In [22]:
# Treat the minutes leading up to a defect as defective too: for every quality
# timestamp, generate the timestamps 1..add_f_min minutes before it.
add_f_min = 5
add_date = pd.Series([
    defect_time - timedelta(minutes=minutes_back)
    for defect_time in qua_createtime
    for minutes_back in range(1, add_f_min + 1)
])

In [23]:
# Defect timestamps plus their look-back minutes, de-duplicated, named 'NG5'.
all_ng5_times = pd.concat([qua_createtime, add_date])
qua_createtime_5 = pd.Series(all_ng5_times.unique(), name='NG5')

In [27]:
# Left-join against the extended timestamp set (defect time plus the minutes
# before it). A row gets a non-null 'NG5' only when its timestamp is in the set.
mer_df = pd.merge(
    mer_df,
    pd.DataFrame(qua_createtime_5),
    how='left',
    left_on='creationTime',
    right_on='NG5',
)

# Collapse the joined column into a 0/1 flag (1 = inside a defect window).
# Done as one whole-column assignment: chained `df[col].loc[mask] = value`
# may write to a temporary copy, and it also mixed ints into a datetime column.
mer_df['NG5'] = mer_df['NG5'].notnull().astype('int64')

In [28]:
# mer_df[mer_df.creationTime.astype(str).str.startswith('2018-09-07 02:2')][['creationTime','NG','NG5']]

In [29]:
# Count the distinct NG5 timestamps that did / did not hit a machine record.
# The last label previously read '매칭된 않은 라벨 건', a garbled mix of the
# "matched" and "unmatched" labels; the value printed is the matched count.
matched_ng5_times = mer_df.loc[mer_df.NG5 == 1, 'creationTime'].unique()
print('qua_createtime_5 : ',len(qua_createtime_5))
print('매칭되지 않은 라벨 건: ',len(qua_createtime_5) - len(matched_ng5_times))
print('매칭된 라벨 건: ',len(matched_ng5_times))

qua_createtime_5 :  18883
매칭되지 않은 라벨 건:  1769
매칭된 않은 라벨 건:  17114


In [30]:
# Summary of the labelled subset: row count, cycle count, and the number of
# defect / normal rows under each labelling rule.
subset_rows = len(mer_df)
subset_cycles = len(mer_df['CycleNum'].unique())
ng_rows = int((mer_df.NG == 1).sum())
ng5_rows = int((mer_df.NG5 == 1).sum())

print('총 데이터 건: ',subset_rows)
print('cycle 수: ',subset_cycles)
print('NG라벨 불량 건: ',ng_rows)
print('NG라벨 정상 건: ',subset_rows - ng_rows)
print('NG5라벨 불량 건: ',ng5_rows)
print('NG5라벨 정상 건: ',subset_rows - ng5_rows)

총 데이터 건:  185496
cycle 수:  124
NG라벨 불량 건:  7540
NG라벨 정상 건:  177956
NG5라벨 불량 건:  27461
NG5라벨 정상 건:  158035


In [31]:
# Write the labelled subset to disk without the row index.
mer_df.to_csv(path_or_buf='./data/machine_NG.csv', index=False)

### 원본(isee_machine)에 NG, NG5 붙이기
- NG : 0정상, 1이상
- NG5 : 0정상, 1이상
##### ***quality와 매칭되지 않는 기간 (Qua_Match_YN == 0) 부분도 정상(0)으로 라벨링됨

In [32]:
# Attach the NG label to the full machine table: 1 = defect, 0 = normal.
# Records that have no NG == 1 counterpart in mer_df stay 0.
# The label is looked up by timestamp instead of being copied by row position:
# mer_df carries its own 0..n-1 index after the merge, so a positional copy is
# only correct when the merged rows are exactly the leading rows of
# isee_machine. It also avoids chained `df[col].iloc[...] = ...` assignment,
# which may write to a temporary copy.
ng_times = mer_df.loc[mer_df['NG'] == 1, 'creationTime']
isee_machine['NG'] = isee_machine['creationTime'].isin(ng_times).astype('int64')

In [33]:
# Attach the NG5 label to the full machine table: 1 = defect, 0 = normal.
# Records that have no NG5 == 1 counterpart in mer_df stay 0.
# The label is looked up by timestamp instead of being copied by index label:
# mer_df carries its own 0..n-1 index after the merge, so index-based copying
# is only correct when the merged rows are exactly the leading rows of
# isee_machine. It also avoids chained `df[col].loc[...] = ...` assignment,
# which may write to a temporary copy.
ng5_times = mer_df.loc[mer_df['NG5'] == 1, 'creationTime']
isee_machine['NG5'] = isee_machine['creationTime'].isin(ng5_times).astype('int64')

In [34]:
# Summary of the full machine table: row count, cycle count, and the number of
# defect (1) / normal (0) rows under each labelling rule.
ng_flag = isee_machine.NG
ng5_flag = isee_machine.NG5

print('총 데이터 건: ',len(isee_machine))
print('cycle 수: ',len(isee_machine['CycleNum'].unique()))
print('NG라벨 불량 건: ',int((ng_flag == 1).sum()))
print('NG라벨 정상 건: ',int((ng_flag == 0).sum()))
print('NG5라벨 불량 건: ',int((ng5_flag == 1).sum()))
print('NG5라벨 정상 건: ',int((ng5_flag == 0).sum()))

총 데이터 건:  236696
cycle 수:  160
NG라벨 불량 건:  7540
NG라벨 정상 건:  229156
NG5라벨 불량 건:  27461
NG5라벨 정상 건:  209235


In [35]:
# Write the fully labelled machine table to disk without the row index.
isee_machine.to_csv(path_or_buf='./data/isee_machine_NG.csv', index=False)

### 불량 cycle check

In [27]:
# y_cycle_num = list(mer_df[mer_df.NG == 1]['CycleNum'].unique())

# for num in y_cycle_num:
#     date = list(mer_df[(mer_df.NG == 1) & (mer_df.CycleNum == num)]['creationDate'].unique())
#     print('Cycle {} : {}'.format(num,date))