# Prediction Data Preprocessing
Input: 
> 沖壓機清洗後資料

> 瞬測儀清洗後資料

Output: 
> test: test data (data_num, sequence_num, feature_num)

> label: ID (編號) of the first record in each test sequence (data_num, 1)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Mount Google Drive so the input files and the output folder are reachable
# from this Colab runtime (force_remount=True remounts even if already mounted).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

sequence_num = 100  # number of consecutive records in one sequence (window length)
slide_num = 10      # stride between the starts of consecutive sequences; e.g. slide_num = 5 gives windows n ~ n+sequence_num, n+5 ~ n+5+sequence_num
print("Sequence number: ", sequence_num)
print("Slide number: ", slide_num)

# Put your output folder path here, ex: '/content/drive/MyDrive/.../'
# NOTE: the save step appends the file name directly to this string,
# so the path should end with a '/'.
folder_path = 'folderpath'

Mounted at /content/drive
Sequence number:  100
Slide number:  10


In [None]:
# Put your data paths here, ex: '/content/drive/MyDrive/.../'
input_data_machine = '沖壓機 file path'
input_data_rate = '瞬測儀 file path'

# Only these four columns are read from the press-machine workbook;
# the gauge workbook is read in full. Rows with any missing value are dropped.
machine_columns = ["編號", "frequency", "Speed", "Status"]
data_machine = pd.read_excel(input_data_machine, usecols=machine_columns).dropna()
data_rate = pd.read_excel(input_data_rate).dropna()

# Quick sanity check: first rows of both tables, then their shapes.
for frame in (data_machine, data_rate):
  print(frame.head())
for frame in (data_machine, data_rate):
  print(frame.shape)

All packages

In [None]:
# Keep only the rows that belong to one specific part number.
part_number_mask = data_rate["料號"] == "0162B00100"
data_rate = data_rate[part_number_mask]

By specific package

In [None]:
# Keep only one specific part number AND one specific machine/work-order ID.
# The ID is everything from character 14 onward of the inspection key.
is_target_part = data_rate["料號"] == "0162B00100"
is_target_order = data_rate["產生檢驗單號的時間"].str[14:] == "D-001_101183683"
data_rate = data_rate[is_target_part & is_target_order]

By specific machine

In [None]:
# Keep only one specific part number AND one specific machine.
# The machine code is characters 14-18 of the inspection key.
is_target_part = data_rate["料號"] == "0162B00100"
is_target_machine = data_rate["產生檢驗單號的時間"].str[14:19] == "D-005"
data_rate = data_rate[is_target_part & is_target_machine]

In [None]:
"""
將瞬測儀和機台數據資料分包&配對
"""
data_machine.set_index("編號", inplace=True)
all_data_machineId = np.array(data_machine.index.drop_duplicates(keep='first').values)
data_machine.reset_index(inplace=True)

data_rate.set_index("產生檢驗單號的時間", inplace=True)
all_data_rateId = np.array(data_rate.index.drop_duplicates(keep='first').values)
data_rate.reset_index(inplace=True)

# 取得瞬測儀機台工單號序列
all_data_rateId_drop = pd.DataFrame()
for data_rateId in all_data_rateId:
  rateId = pd.Series(data_rateId[14:29]) #機台工單號
  all_data_rateId_drop = pd.concat([all_data_rateId_drop, rateId])

all_data_rateId_drop = all_data_rateId_drop.drop_duplicates() #去除重複機台工單號
all_data_rateId_drop = np.array(all_data_rateId_drop.values)

print("沖壓機不重複編號個數: ", all_data_machineId.size)
print("瞬測儀不重複機台工單號個數: ", all_data_rateId_drop.size)

pkg_num = 0

"""
機台資料以同機台同工單號分包
"""
for data_rateId in all_data_rateId_drop:
  data_machineId = str(data_rateId.tolist())[2:17] #指定為瞬測儀機台工單號

  pkg_num += 1
  globals()['x_'+str(pkg_num)] = data_machine[data_machine["編號"].str.contains(data_machineId)] #取得該機台工單號的機台資料

max_pkg_num = pkg_num
print()
print("Total package number: ", max_pkg_num)

"""
計算筆數小於sequence_num筆包數
"""
usable_pkg = 0
lessthan = 0

for pkg_num in range(1, max_pkg_num+1):
  if len(globals()['x_'+str(pkg_num)]) < sequence_num:
    lessthan += 1
  else:
    if usable_pkg == 0:
      first_pkg = pkg_num 
    usable_pkg += 1

print("Less than sequence_num data package: ",lessthan)
print("Usable package number: ",usable_pkg)
print("First package number: ",first_pkg)

沖壓機不重複編號個數:  351091
瞬測儀不重複機台工單號個數:  86

Total package number:  86
Less than sequence_num data package:  13
Usable package number:  73
First package number:  1


In [None]:
"""
機台資料取同機台同工單以slide_num筆產生sequence_num筆序列, 計算sequence筆數
"""
sequence_pkg_num = 0

for pkg_num in range(1, max_pkg_num+1):
  if len(globals()['x_'+str(pkg_num)]) < sequence_num:
    pass
  else:
    for start_num in range(0, len(globals()['x_'+str(pkg_num)])-sequence_num, slide_num):
      sequence_pkg_num += 1
      globals()['x_sequence_'+str(sequence_pkg_num)] = globals()['x_'+str(pkg_num)][start_num:start_num+sequence_num]

max_sequence_pkg_num = sequence_pkg_num

for sequence_pkg_num in range(1, max_sequence_pkg_num+1):
  globals()['label_sequence_'+str(sequence_pkg_num)] = pd.Series([globals()['x_sequence_'+str(sequence_pkg_num)]["編號"].iloc[0]]) #沖壓機序列第一筆之編號
  globals()['x_sequence_'+str(sequence_pkg_num)] = globals()['x_sequence_'+str(sequence_pkg_num)].drop(["編號"], axis=1)           #沖壓機序列

print("Usable sequence package number: ",max_sequence_pkg_num)
print(globals()['x_sequence_'+str(max_sequence_pkg_num)].shape)       #(sequence_num, feature_num)
print(globals()['label_sequence_'+str(max_sequence_pkg_num)].values)  #編號

Usable sequence package number:  21564
(100, 3)
['2021112318:29_D-001_101368684']


In [None]:
"""
依序存進data & label, 將pd格式轉為np格式
"""
for sequence_pkg_num in range(1, max_sequence_pkg_num+1):
  if sequence_pkg_num == 1:
    data = pd.DataFrame(globals()['x_sequence_'+str(sequence_pkg_num)])  
    label = pd.DataFrame(globals()['label_sequence_'+str(sequence_pkg_num)])   
  else:
    data = pd.concat([data, pd.DataFrame(globals()['x_sequence_'+str(sequence_pkg_num)])])
    label = pd.concat([label, pd.DataFrame(globals()['label_sequence_'+str(sequence_pkg_num)])])

data = data.values
label = label.values
print(data.shape)     #(data_num*sequence_num, feature_num)
print(label.shape)    #(data_num, output_num)

(2156400, 3)
(21564, 1)


In [None]:
"""
將data從(data_num*sequence_num, feature_num)轉為(data_num, sequence_num, feature_num)
"""
slide_size = 0
data_temp = []

for sequence_pkg_num in range(1, max_sequence_pkg_num+1):
  data_temp.append(data[slide_size:(slide_size+sequence_num), 0:3])
  slide_size += sequence_num

data = np.array(data_temp)
print(data.shape)  #(data_num, sequence_num, feature_num)

(21564, 100, 3)


In [None]:
"""
將test data & label儲存為npy檔
"""
test = data
label = label

print(test.shape)
print(label.shape)

np.save(folder_path + 'test_all.npy', test)
np.save(folder_path + 'label_all.npy', label)
print("Done")

(21564, 100, 3)
(21564, 1)
Done
