%%

In [None]:
import os

In [None]:
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import sys

In [None]:
import matplotlib.pylab as pl

set data path

In [None]:
# Current working directory of the kernel (expected to be the folder holding this notebook).
dir_path = pathlib.Path.cwd()
# Location of the dataset, relative to that directory.
file = './MOM4_data_201229.csv'
data_path = os.path.join(dir_path, file)
print('MOM4 data located at:', data_path)

read data

In [None]:
# Load the CSV, discard the stray 'Unnamed: 0' column and renumber the rows from 0.
df = (
	pd.read_csv(data_path, index_col=False)
	.drop(columns=['Unnamed: 0'])
	.reset_index(drop=True)
)
# The table must not contain any missing values at this point.
assert not df.isnull().values.any()
# find which (PartType, Orientation, Job) is missing

In [None]:
# Scan every (PartType, Orient., Job) group and report the ones that hold fewer
# rows than expected (reps * stencils).
dfgroups = df.groupby(['PartType', 'Orient.', 'Job'])
reps = 15
stencils = 5 * 5  # solder offset
expected_rows = stencils * reps
t = tqdm(dfgroups)
for idx, (name, group) in enumerate(t):
	t.set_description(f'Checking: {name}')
	chiptype, orient, jobtype = name
	# R0402 groups are excluded from this check (reason not stated in this notebook).
	if chiptype == 'R0402':
		continue
	# 'Job' is one of the grouping keys, so every row of `group` carries the same
	# job and the per-job count is simply the group size.
	found_rows = len(group)
	if found_rows < expected_rows:
		# Print how many rows are missing from this group.
		print('Missing:\t', chiptype, f'/ {orient} deg /', jobtype, ':', expected_rows - found_rows)

%%

find which CRD is missing


    <br>
CRD prefix: B for R0402, C for R0603, D for R1005<br>
orient 0:  376-750 for each job<br>
orient 90: 1-375 for each job<br>


pull dataframe with conditions as found above

In [None]:
# Labels ("PartType-Orient-Job") of the incomplete groups, keyed by running number.
missing_chip = dict(enumerate(['R0603-0-Job_2_2', 'R1005-0-Job_8_5', 'R1005-90-Job_0_3']))

In [None]:
# One sub-frame per incomplete (PartType, Orient., Job) combination.
temp1, temp2, temp3 = temps = [
	df.loc[(df['PartType'] == part) & (df['Orient.'] == angle) & (df['Job'] == job_name)]
	for part, angle, job_name in (
		('R0603', 0, 'Job_2_2'),
		('R1005', 0, 'Job_8_5'),
		('R1005', 90, 'Job_0_3'),
	)
]

In [None]:
# CRD numbers expected per job: 1..375 for 90 deg and 376..750 for 0 deg.
# np.arange excludes its stop value, so the stop must be one past the last CRD.
expected_crds_90deg = np.arange(1, 376, 1)
expected_crds_0deg = np.arange(376, 751, 1)

In [None]:
# For each incomplete group, print every expected CRD that does not occur in it.
# The printed CRD is the part-type letter followed by the number.
crd_prefix = {'R0603': 'C', 'R1005': 'D'}
t = tqdm(temps)
for i, temp in enumerate(t):
	crds_orig = temp['CRD'].values
	# Drop the leading letter; a set gives constant-time membership tests below.
	crds = {int(crd[1:]) for crd in crds_orig}
	orient = temp['Orient.'].values[0]
	chiptype = temp['PartType'].values[0]
	# NOTE(review): crds_orig[i] is simply the i-th CRD of the group; confirm that
	# this is the value meant to be shown in the progress description.
	t.set_description(f'Checking: ({chiptype}, {orient}, {crds_orig[i]})')
	expected_crds = expected_crds_0deg if orient == 0 else expected_crds_90deg
	prefix = crd_prefix.get(chiptype)
	if prefix is None:
		# Part types without a known prefix are not reported.
		continue
	for item in expected_crds:
		if item not in crds:
			print(f'Missing CRD in {missing_chip[i]}:\t{prefix}{item}')

%%

In [None]:
# CRD assigned to the row that will be imputed for each incomplete group,
# keyed by the same running number as missing_chip.
missing_crd = dict(enumerate(['C395', 'D492', 'D143']))

%%

append columns: mean SPI x AND y

In [None]:
# Add the per-row mean of the X1/X2 and Y1/Y2 SPI columns, unless already present.
if 'SPI_X_AVG' not in df.columns:
	avg_spi_x = df[['SPI_X1', 'SPI_X2']].mean(axis=1)
	avg_spi_y = df[['SPI_Y1', 'SPI_Y2']].mean(axis=1)
	# Fixed column positions 11 and 12.
	df.insert(11, 'SPI_X_AVG', avg_spi_x)
	df.insert(12, 'SPI_Y_AVG', avg_spi_y)
# Placeholder columns; they start out as empty strings.
for percent_col in ('SPI_L_PERCENT', 'SPI_W_PERCENT'):
	df[percent_col] = ""

In [None]:
# Re-select the incomplete groups from the current state of df.
temp1, temp2, temp3 = temps = [
	df.loc[(df['PartType'] == part) & (df['Orient.'] == angle) & (df['Job'] == job_name)]
	for part, angle, job_name in (
		('R0603', 0, 'Job_2_2'),
		('R1005', 0, 'Job_8_5'),
		('R1005', 90, 'Job_0_3'),
	)
]

In [None]:
# [PartType, Orient., Job] of every incomplete group.
missing_case = [
	['R0603', 0, 'Job_2_2'],
	['R1005', 0, 'Job_8_5'],
	['R1005', 90, 'Job_0_3'],
]
# %%

## visualize (x, y)

x1 = 'SPI_X1'<br>
y1 = 'SPI_Y1'<br>
x2 = 'SPI_X2'<br>
y2 = 'SPI_Y2'

In [None]:
# Column names of the averaged SPI x / y values.
xavg, yavg = 'SPI_X_AVG', 'SPI_Y_AVG'

temp#_df: contains SPI_X_AVG and SPI_Y_AVG

In [None]:
# Keep only the averaged x / y columns of each incomplete group.
temp1_df, temp2_df, temp3_df = (frame[[xavg, yavg]] for frame in (temp1, temp2, temp3))

clustering

In [None]:
from collections import defaultdict

In [None]:
def list_duplicates(seq):
	"""Yield ``(value, positions)`` for every value occurring more than once in ``seq``.

	``positions`` is the list of indices at which the value appears, in
	ascending order. Values are reported in order of first appearance.
	The result is a generator.
	"""
	positions = defaultdict(list)
	for position, value in enumerate(seq):
		positions[value].append(position)
	return (entry for entry in positions.items() if len(entry[1]) >= 2)

1. scikit learn - Kmeans

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Filled by the clustering loop: running number -> (cluster label, row positions).
missing_sample = {}

In [None]:
def _ordered_levels(values, n_levels=5):
	"""Cluster 1-D ``values`` into ``n_levels`` groups and rank the groups by centre.

	Returns an integer array with one entry per element of ``values``: the rank
	of the group that element was assigned to, where 0 is the group with the
	smallest centre and ``n_levels - 1`` the group with the largest.
	"""
	fit = KMeans(n_clusters=n_levels).fit(values.reshape(-1, 1))
	# order[k] is the label of the cluster with the k-th smallest centre ...
	order = np.argsort(fit.cluster_centers_.reshape(-1))
	# ... so the rank of a cluster label is the inverse of that permutation.
	rank_of_cluster = np.empty(n_levels, dtype=int)
	rank_of_cluster[order] = np.arange(n_levels)
	return rank_of_cluster[fit.labels_.astype(int)]


fig = plt.figure(figsize=(18, 5), facecolor='white')
temp_dfs = [temp1_df, temp2_df, temp3_df]
jobs = ['Job_%d_%d' % (i, j) for i in range(9) for j in range(9)]
temp_idx = 0
# NOTE(review): KMeans is not seeded here, so cluster labels can differ between runs.
for chip_type in ['R0402', 'R0603', 'R1005']:
	for orient in [0, 90]:
		for job in jobs:
			temp_df = df.loc[(df['PartType'] == chip_type) & (df['Orient.'] == orient) & (df['Job'] == job)]
			temp_df = temp_df[['SPI_L', 'SPI_W']]
			# 25 clusters in the (SPI_L, SPI_W) plane.
			kmeans = KMeans(n_clusters=25).fit(temp_df)
			centroids = kmeans.cluster_centers_
			label = kmeans.labels_.astype(int)

			# Rank every centroid 0..4 along each axis separately.
			l_label = _ordered_levels(centroids[:, 0])
			w_label = _ordered_levels(centroids[:, 1])

			# Ranks 0..4 become -20, -10, 0, 10, 20; written back to df in place.
			# (.loc replaces DataFrame.set_value / Index.get_values, which were removed from pandas.)
			df.loc[temp_df.index, 'SPI_L_PERCENT'] = 10 * (l_label[label] - 2)
			df.loc[temp_df.index, 'SPI_W_PERCENT'] = 10 * (w_label[label] - 2)

			if [chip_type, orient, job] in missing_case:
				missing_sample[temp_idx] = []
				for dup in sorted(list_duplicates(kmeans.labels_)):
					# dup = (cluster label, positions of the rows in that cluster)
					if len(dup[1]) < 15:
						missing_sample[temp_idx] = (dup[0], dup[1])
						chip_orient_job = missing_chip[temp_idx]
						print(
							f'{chip_orient_job}: centroid {dup[0]} (Count: {len(dup[1])})')
				temp_idx += 1
			# ax = fig.add_subplot(1, 3, temp_idx)
			# ax.scatter(temp_df[xavg], temp_df[yavg], c=kmeans.labels_.astype(float), s=50, alpha=0.5, label='SPI_AVG')
			# ax.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50, label='centroid')
			# ax.set_xlabel(f'{xavg} (\u03BCm)')
			# ax.set_ylabel(f'{yavg} (\u03BCm)')
			# ax.legend(loc='upper right')
			# ax.grid()
			# ax.set_title(missing_chip[temp_idx])
# plt.show()
# fig.savefig('./stencils.png')

check: (parttype, orient, job) sample

In [None]:
# Show the label and the first row of the first incomplete group.
sample_idx = 0
print('Sample:\n')
print(missing_chip[sample_idx])
print(temps[sample_idx].head(1))

check centroid values<br>
missing sample contains (centroid number, index in temp_dfs[i] for the centroid)<br>
temp_dfs contains dataframe corresponding to the chosen indices (for the centroids)

In [None]:
print('Centroids:\n')
# v is (cluster label, row positions); show the first row of that cluster.
for i, (k, v) in enumerate(missing_sample.items()):
	first_row = temp_dfs[i].iloc[v[1], :].head(1)
	print(f'{missing_chip[k]}:\n(index: {v}), \n(df: {first_row})')

%%

imputation: KNN Imputer(2 neighbors)

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# Impute one extra row for each under-populated cluster and add it to a copy of df.
imputer = KNNImputer(n_neighbors=2, weights="uniform")
df_temp_save = df.copy()
# df_temp_save = df.copy().drop(columns=['SPI_VOLUME1', 'SPI_VOLUME2', 'SPI_R','PRE_R','POST_R','SPI_VOLUME_DIFF'])

# Columns handed to the imputer; every other column is filled in by hand below.
cols_to_impute = ['X', 'Y', 'SPI_VOLUME_MEAN', 'SPI_X1', 'SPI_Y1', 'SPI_X2', 'SPI_Y2', 'SPI_X_AVG', 'SPI_Y_AVG',
				  'SPI_L1', 'SPI_W1', 'SPI_L2', 'SPI_W2', 'SPI_L', 'SPI_W',
				  'PRE_X', 'PRE_Y', 'PRE_A', 'PRE_L', 'PRE_W',
				  'POST_X', 'POST_Y', 'POST_A', 'POST_L', 'POST_W', 'Orient.']
# Columns copied unchanged from an existing row of the same cluster.
cols_to_copy = ('Job', 'PartType', 'Chip_L', 'Chip_W')

imputed_rows = []
for i in range(len(missing_sample)):
	# Rows of the under-populated cluster, restricted to the imputable columns.
	cluster_rows = temps[i].iloc[missing_sample[i][1], :][cols_to_impute]
	# Add one all-NaN row at the end; this is the sample the imputer fills in.
	tmp = cluster_rows.reset_index(drop=True).reindex(range(len(cluster_rows) + 1))

	transformed = pd.DataFrame(imputer.fit_transform(tmp), columns=cols_to_impute).round(3)
	imputed = transformed.iloc[-1, :].to_dict()

	centroid_index_start = cluster_rows.index[0]
	centroid_index_end = cluster_rows.index[-1]
	print('size changed: from', len(cluster_rows),
		  'to', len(transformed), '/ indices corresp. to each centroid (from orig df): from', centroid_index_start,
		  'to', centroid_index_end)

	# centroid_index_end is an index label, hence .loc rather than .iloc.
	reference = df_temp_save.loc[centroid_index_end]
	for col in df_temp_save.columns:
		if col in cols_to_impute:
			continue
		if col in cols_to_copy:
			imputed[col] = reference[col]
		elif col == 'CRD':
			imputed[col] = missing_crd[i]
		else:
			# NOTE(review): this leaves e.g. SPI_L_PERCENT / SPI_W_PERCENT empty for
			# the imputed row; confirm that this is intended.
			imputed[col] = np.nan
	assert len(imputed) == len(df_temp_save.columns)
	imputed_rows.append(imputed)

# DataFrame.append was removed in pandas 2.0; add all new rows with a single concat.
if imputed_rows:
	df_temp_save = pd.concat(
		[df_temp_save, pd.DataFrame(imputed_rows, columns=df_temp_save.columns)],
		ignore_index=True,
	)

scan for missing (parttype, orientation, job):<br>
>> if none printed, then none missing!<br>
>> all 15 reps, 25 stencil jobs, 81 chip jobs present

In [None]:
# Re-run the completeness scan on the imputed frame: print every
# (PartType, Orient., Job) group that still holds fewer than reps * stencils rows.
dfgroups = df_temp_save.groupby(['PartType', 'Orient.', 'Job'])
reps = 3 * 5  # chip offset
stencils = 5 * 5  # solder offset
expected_rows = stencils * reps
t = tqdm(dfgroups)
for idx, (name, group) in enumerate(t):
	t.set_description(f'Checking: {name}')
	chiptype, orient, jobtype = name
	# 'Job' is one of the grouping keys, so the per-job count is the group size.
	found_rows = len(group)
	if found_rows < expected_rows:
		print(chiptype, f'/ {orient} deg /', jobtype, ':', found_rows)

In [None]:
# Columns written to disk, in output order.
export_columns = [
	'X', 'Y', 'SPI_VOLUME_MEAN', 'SPI_X1', 'SPI_Y1', 'SPI_X2', 'SPI_Y2', 'SPI_X_AVG', 'SPI_Y_AVG',
	'SPI_L1', 'SPI_W1', 'SPI_L2', 'SPI_W2', 'SPI_L', 'SPI_W', 'SPI_L_PERCENT', 'SPI_W_PERCENT',
	'PRE_X', 'PRE_Y', 'PRE_A', 'PRE_L', 'PRE_W',
	'POST_X', 'POST_Y', 'POST_A', 'POST_L', 'POST_W', 'Orient.',
]
df_temp_save = df_temp_save[export_columns]
# df_imputed = df_temp_save.drop(columns=['SPI_VOLUME1', 'SPI_VOLUME2', 'SPI_R', 'PRE_R', 'POST_R', 'SPI_VOLUME_DIFF'])
df_temp_save.to_csv('./test.csv')

%%

df_temp_save.head(5)