# Data pre-processing for WM-811k and MixedWM38

## Import required libraries

In [1]:
import pickle
import numpy as np
import pandas as pd

from pathlib import Path
from skimage.transform import resize

## WM-811k

In [4]:
# Side length (in pixels) that the WM-811k wafer maps are resized to.
DIM = 64
# Raw LSWMD pickle (input) and root directory of the processed dataset (output).
raw_pkl_path = Path("../dataset/LSWMD.pkl")
save_path = Path("../dataset/WM811k")

# Create the output directory together with any missing parents. An already
# existing directory is left untouched, so re-running the cell is safe.
save_path.mkdir(parents=True, exist_ok=True)


In [6]:
# NOTE(review): loading a pickle can execute arbitrary code; only read
# LSWMD.pkl when it comes from a trusted source.
df = pd.read_pickle(raw_pkl_path)
# Drop columns that the steps below do not use.
df = df.drop(columns=['waferIndex', 'trianTestLabel', 'lotName'])

# Integer code assigned to each failure-type name.
mapping_type = {
    'Center': 0,
    'Donut': 1,
    'Edge-Loc': 2,
    'Edge-Ring': 3,
    'Loc': 4,
    'Random': 5,
    'Scratch': 6,
    'Near-full': 7,
    'none': 8,
}
# failureNum starts as a copy of failureType and is then translated to codes.
df['failureNum'] = df['failureType']
df = df.replace({'failureNum': mapping_type})

# Keep labeled wafer maps only (rows whose failureNum compares >= 0).
df_withlabel = df[df['failureNum'] >= 0]

# Remove wafers whose die size is below 100, then renumber the rows;
# reset_index() keeps the previous row labels in a new 'index' column.
too_small = df_withlabel.index[df_withlabel['dieSize'] < 100]
df_withlabel = df_withlabel.drop(index=too_small).reset_index()

  op = lambda x: operator.eq(x, b)


In [7]:
X = df_withlabel.waferMap
# Binarize each wafer map: values <= 1 become 0, everything else becomes 1.
X_binary = [np.where(x <= 1, 0, 1) for x in X]
# Resize every map to DIM x DIM with bi-linear interpolation (order=1).
X_resize = np.array([
    resize(x, (DIM, DIM), order=1, preserve_range=True, anti_aliasing=False)
    for x in X_binary
])
# Add a trailing channel axis, multiply by 255 and store as uint8.
X_resize = (X_resize.reshape(-1, DIM, DIM, 1) * 255).astype(np.uint8)
# np.int was only an alias of the builtin int; it was deprecated in NumPy 1.20
# and removed in 1.24, so the builtin is used directly (same resulting dtype).
y = np.array(df_withlabel['failureNum']).astype(int)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """


In [8]:
"""
save each resized wafer map as numpy array (.npy)
"""

# File name of every exported wafer map, in row order.
labelednpyPath = []
labeled_total = X_resize.shape[0]

# All .npy files go into the "wafermaps" sub-directory of the output root.
npy_root = save_path / "wafermaps"
if not npy_root.exists():
    npy_root.mkdir()

for i, wafer in enumerate(X_resize):
    # Files are named after the row label the wafer had before reset_index().
    fname = str(df_withlabel['index'][i])
    target = npy_root / fname

    # np.save appends the ".npy" suffix, so it is added to the stored name too.
    np.save(target, wafer)
    labelednpyPath.append(target.name + '.npy')

    # Progress report every 1000 wafers.
    if i % 1000 == 0:
        print('{}/{} done'.format(i, labeled_total))

0/172946 done
1000/172946 done
2000/172946 done
3000/172946 done
4000/172946 done
5000/172946 done
6000/172946 done
7000/172946 done
8000/172946 done
9000/172946 done
10000/172946 done
11000/172946 done
12000/172946 done
13000/172946 done
14000/172946 done
15000/172946 done
16000/172946 done
17000/172946 done
18000/172946 done
19000/172946 done
20000/172946 done
21000/172946 done
22000/172946 done
23000/172946 done
24000/172946 done
25000/172946 done
26000/172946 done
27000/172946 done
28000/172946 done
29000/172946 done
30000/172946 done
31000/172946 done
32000/172946 done
33000/172946 done
34000/172946 done
35000/172946 done
36000/172946 done
37000/172946 done
38000/172946 done
39000/172946 done
40000/172946 done
41000/172946 done
42000/172946 done
43000/172946 done
44000/172946 done
45000/172946 done
46000/172946 done
47000/172946 done
48000/172946 done
49000/172946 done
50000/172946 done
51000/172946 done
52000/172946 done
53000/172946 done
54000/172946 done
55000/172946 done
56000

In [9]:
# Record the saved file name of every wafer, then discard the columns that
# are not written to the CSV index.
df_withlabel['npyPath'] = labelednpyPath
df_withlabel.drop(
    columns=['index', 'waferMap', 'dieSize', 'failureType'],
    inplace=True,
)

In [10]:
# The CSV index files are written directly into the dataset root.
csv_root = save_path
# Full table of labeled wafers, written without the row index.
labeled_csv = csv_root / 'labeled.csv'
df_withlabel.to_csv(labeled_csv, index=False)

In [11]:
# Read the table back from disk as a sanity check. The path is derived from
# csv_root so it cannot drift from the location the file was written to.
df_withlabel = pd.read_csv(csv_root / 'labeled.csv')  # to check

In [12]:
"""
The validation set comprises 20% of all labeled wafer maps; the remaining 80% form the training set.
"""

# Hold out 20% of the labeled wafers for validation. A fixed random_state
# makes the split reproducible, so re-running the notebook yields the same
# training/validation files.
df_labeled_validation = df_withlabel.sample(frac=0.2, random_state=42)
# Rows that were not sampled stay in df_withlabel and form the training set.
df_withlabel.drop(df_labeled_validation.index, axis=0, inplace=True)

df_labeled_validation.to_csv(csv_root / 'labeled_validation.csv', index=False)
df_withlabel.to_csv(csv_root / 'labeled_training.csv', index=False)

## MixedWM38
Every wafer map in MixedWM38 already has the same size (52 $\times$ 52), so no resizing is applied here.

In [13]:
# Raw MixedWM38 archive (input).
raw_npz_path = Path('../dataset/MixedWM38.npz')

# Root directory of the processed MixedWM38 dataset (output); note that this
# rebinds save_path, which previously pointed at the WM-811k output.
save_path = Path('../dataset/MixedWM38')

# Create the output directory together with any missing parents. An already
# existing directory is left untouched, so re-running the cell is safe.
save_path.mkdir(parents=True, exist_ok=True)


In [14]:
# Load the raw MixedWM38 archive; its 'arr_0' and 'arr_1' entries are read
# when the wafers and labels are exported.
data = np.load(raw_npz_path)

In [16]:
# File names of the exported wafer maps and of their label arrays, in order.
waferPaths = []
labelPaths = []

# Wafer maps and labels are stored in separate sub-directories.
wafer_npy_root = save_path / 'wafermaps'
label_npy_root = save_path / 'labels'

wafer_npy_root.mkdir(parents=True, exist_ok=True)
label_npy_root.mkdir(parents=True, exist_ok=True)

# Read each array from the archive once and reuse it for both the loop and
# the progress total.
wafers = data['arr_0']
labels = data['arr_1']
mixed_total = len(wafers)

for idx, (wafer, label) in enumerate(zip(wafers, labels)):

    # A wafer and its label share the same running number as file name.
    fname = str(idx)

    np.save(wafer_npy_root / fname, wafer)
    np.save(label_npy_root / fname, label)

    # np.save appends the ".npy" suffix, so it is added to the stored names.
    waferPaths.append((wafer_npy_root / fname).name + '.npy')
    labelPaths.append((label_npy_root / fname).name + '.npy')

    # Report progress with this loop's own counter and total; the previous
    # code printed the leftover i / labeled_total of the WM-811k export.
    if idx % 1000 == 0:
        print('{}/{} done'.format(idx, mixed_total))

In [19]:
# One row per exported wafer: the wafer-map file name and the matching
# label file name.
df = pd.DataFrame({'wafernpyPath':waferPaths,'labelnpyPath':labelPaths})

In [21]:
# Preview the first rows of the file-name table.
df.head()

Unnamed: 0,wafernpyPath,labelnpyPath
0,0.npy,0.npy
1,1.npy,1.npy
2,2.npy,2.npy
3,3.npy,3.npy
4,4.npy,4.npy


In [22]:
# Write the complete file-name table to dataset.csv in the output directory,
# without the row index.
csv_path = save_path/'dataset.csv'
df.to_csv(csv_path,index=False)

In [23]:
# Hold out 20% of the wafers for validation. A fixed random_state makes the
# split reproducible, so re-running the notebook yields the same files.
df_validation = df.sample(frac=0.2, random_state=42)
# Rows that were not sampled stay in df and form the training set.
df.drop(df_validation.index, axis=0, inplace=True)

df_validation.to_csv(save_path / 'validation.csv', index=False)
df.to_csv(save_path / 'training.csv', index=False)