# Surfscan Data Analysis

Purpose: This code provides a temporary solution for interpretation of Surfscan summary output files. As is, the Surfscan files may be hard to intepret and provides unnecessary information for operations. Our goal is to be able to parse these text files and provide a digestible output. 

In [4]:
#First, Downloading all of the necessary packages
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import scipy as sp
import plotly.express as px
from pathlib import Path
import os

In [5]:
#!{sys.executable} -m pip install --upgrade pip
#Magical code to install packages you don't have already
#import sys
#!{sys.executable} -m pip install plotly

In [6]:
#First, we will try to convert the text file into a csv file that we can begin to manipulate
# file_path = "../../Wafer Particles/Wafer-Particles/Summary_TXT_Files/ZZBARESI_HT_MANUTEST_20221205_1600.txt"
print("Data_Summary_Analysis is running.")
df = pd.read_csv(file_path, encoding= 'unicode_escape')

#Adding a column name so that it is easier to call out. The column header had spaces.
df.columns=['Parse1']
#Removing the separation dashes
df= df[df.Parse1 !='--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------']


#Trying to clean up the file and the unecessary spaces using two steps
#STEP 1: Trimming space from the ends of each values
#Let's define what it means to trim the dataframe of unecessary spaces
def trim_all_columns(df):
    #Trim whitespace from ends of each value across all series in dataframe
    trim_strings = lambda x: x.strip() if isinstance(x, str) else x
    return df.applymap(trim_strings)

#Now, let's utilize our trim definition for our dataframe
df = trim_all_columns(df)

#STEP2: Replacing every all greater that 1 space [' +'] with 1 space [' ']. Reason for this is so we can split the df by space
df = df['Parse1'].str.replace(' +',' ', regex = True)
df

0                          Session: ZZBARESI_HT_MANUTEST
1                                      Lot ID: LONG TEST
3                Wafer Info DCO Totals DCO Haze DCO Bins
4      Side/Wafer ID Grade Src/Dest All Lpd LpdN LpdE...
5      [#] [#] [#] [#] [#] [#] [mm] [#] [mm²] [ppm] [...
                             ...                        
338    Min 18 12 NA NA NA NA NA 2 0.000 0.001 0.000 0...
339    Max 1223 948 NA NA NA NA NA 275 1.483 0.005 0....
340    Mean 388.8 342.1 NA NA NA NA NA 46.71 0.296 0....
341    Std. Dev. 352.0 308.7 NA NA NA NA NA 54.04 0.3...
343                                 100 wafer(s) scanned
Name: Parse1, Length: 332, dtype: object

In [7]:
#Here, we split the df by space. We are creating a total of 23 columns, therefore we use number 22 because (0,1,2,3,...,22)
df = df.str.split(' ',22, expand=True)

#Adding proper column headers 
df.columns = ['Side','Grade','Source/Destination','All','Lpd','LpdN',
    'LpsED','uScr','Scr','Slip','Area (#)','Area (mm^2)','Avg','Median','STDV','Bin1','Bin2','Bin3',
    'Bin4','Bin5','Bin6','Bin7','Bin8'] 
df

  df = df.str.split(' ',22, expand=True)


Unnamed: 0,Side,Grade,Source/Destination,All,Lpd,LpdN,LpsED,uScr,Scr,Slip,...,Median,STDV,Bin1,Bin2,Bin3,Bin4,Bin5,Bin6,Bin7,Bin8
0,Session:,ZZBARESI_HT_MANUTEST,,,,,,,,,...,,,,,,,,,,
1,Lot,ID:,LONG,TEST,,,,,,,...,,,,,,,,,,
3,Wafer,Info,DCO,Totals,DCO,Haze,DCO,Bins,,,...,,,,,,,,,,
4,Side/Wafer,ID,Grade,Src/Dest,All,Lpd,LpdN,LpdES,µScr,Scr,...,Median,STDV,Bin1,Bin2,Bin3,Bin4,Bin5,Bin6,Bin7,Bin8
5,[#],[#],[#],[#],[#],[#],[mm],[#],[mm²],[ppm],...,[#],[#],[#],[#],[#],[#],[#],,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,Min,18,12,,,,,,2,0.000,...,0,0,0,0,0,0,0,0,,
339,Max,1223,948,,,,,,275,1.483,...,77,31,263,249,188,138,99,35,,
340,Mean,388.8,342.1,,,,,,46.71,0.296,...,8.640,3.790,97.39,82.56,65.66,46.02,28.98,9.010,,
341,Std.,Dev.,352.0,308.7,,,,,,54.04,...,0.001,13.90,6.114,96.16,81.17,64.82,45.23,28.55,7.933,


In [8]:
#Here, we identify the index of the row we want to stop
#We want to stop the code when the 'WaferID' column reads 'wafer(s)'
row = df[df['Grade'] == 'wafer(s)'].index.tolist()[0]

#Keeping only the rows ip to the row we identfied
df = df.iloc[:row-3]
df

Unnamed: 0,Side,Grade,Source/Destination,All,Lpd,LpdN,LpsED,uScr,Scr,Slip,...,Median,STDV,Bin1,Bin2,Bin3,Bin4,Bin5,Bin6,Bin7,Bin8
0,Session:,ZZBARESI_HT_MANUTEST,,,,,,,,,...,,,,,,,,,,
1,Lot,ID:,LONG,TEST,,,,,,,...,,,,,,,,,,
3,Wafer,Info,DCO,Totals,DCO,Haze,DCO,Bins,,,...,,,,,,,,,,
4,Side/Wafer,ID,Grade,Src/Dest,All,Lpd,LpdN,LpdES,µScr,Scr,...,Median,STDV,Bin1,Bin2,Bin3,Bin4,Bin5,Bin6,Bin7,Bin8
5,[#],[#],[#],[#],[#],[#],[mm],[#],[mm²],[ppm],...,[#],[#],[#],[#],[#],[#],[#],,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,Min,20,13,,,,,,3,0.000,...,6,1,0,0,0,0,0,0,,
109,Max,1440,1158,,,,,,283,1.592,...,138,65,274,252,188,138,99,35,,
110,Mean,415.6,365.9,,,,,,49.67,0.317,...,23.72,10.46,98.89,83.18,65.66,46.02,28.98,9.010,,
111,Std.,Dev.,381.1,335.9,,,,,,54.83,...,0.009,25.02,11.35,97.92,81.87,64.82,45.23,28.55,7.933,


Now that we have have filtered to the DCO data that we are interested in, it is time to continue parsing the dataframe to extract relevant data.

In [9]:
#Deleting some irrelavent columns
del df['LpdN']
del df['LpsED']
del df['uScr']
del df['Scr']
del df['Slip']
del df['Area (#)']
del df['Area (mm^2)']
del df['Avg']
del df['Median']
del df['STDV']


In [10]:
# Removes extension from file name
fileName = Path(file_path).stem
print(fileName)

# Extracts date and time from file name
dateTime = ''.join(i for i in fileName if i.isdigit())
print(dateTime)

date = dateTime[0:8]
print(date)

time = dateTime[8:]
print(time)

# Extracts session from dataframe
session = df.iloc[0]['Grade']
print(session)

# Extracts Lot ID from dataframe
a = df.iloc[1]['Source/Destination']
b = df.iloc[1]['All']
lotID = " ".join([a, b])
print(lotID)

ZZBARESI_HT_MANUTEST_20221205_1600
202212051600
20221205
1600
ZZBARESI_HT_MANUTEST
LONG TEST


In [11]:
# Adds additional columns to dataframe
df = df.assign(
    Session = lambda x: session,
    Lot_ID = lambda x: lotID,
    Date = lambda x: date,
    Time = lambda x: time
)
df

Unnamed: 0,Side,Grade,Source/Destination,All,Lpd,Bin1,Bin2,Bin3,Bin4,Bin5,Bin6,Bin7,Bin8,Session,Lot_ID,Date,Time
0,Session:,ZZBARESI_HT_MANUTEST,,,,,,,,,,,,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
1,Lot,ID:,LONG,TEST,,,,,,,,,,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
3,Wafer,Info,DCO,Totals,DCO,,,,,,,,,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
4,Side/Wafer,ID,Grade,Src/Dest,All,Bin1,Bin2,Bin3,Bin4,Bin5,Bin6,Bin7,Bin8,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
5,[#],[#],[#],[#],[#],[#],[#],[#],[#],[#],,,,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,Min,20,13,,,0,0,0,0,0,0,,,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
109,Max,1440,1158,,,274,252,188,138,99,35,,,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
110,Mean,415.6,365.9,,,98.89,83.18,65.66,46.02,28.98,9.010,,,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
111,Std.,Dev.,381.1,335.9,,11.35,97.92,81.87,64.82,45.23,28.55,7.933,,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600


In [12]:
# Removes irrelevant rows
df = df.drop(df[df.Side != 'F'].index).reset_index(drop=True)
df

Unnamed: 0,Side,Grade,Source/Destination,All,Lpd,Bin1,Bin2,Bin3,Bin4,Bin5,Bin6,Bin7,Bin8,Session,Lot_ID,Date,Time
0,F,Rejected,1/25-1/25,833,775,50,38,216,162,129,110,59,11,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
1,F,Rejected,1/24-1/24,708,658,14,9,178,163,131,96,47,20,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
2,F,Rejected,1/23-1/23,750,695,23,20,223,158,120,82,55,14,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
3,F,Rejected,1/22-1/22,657,617,15,7,188,129,126,76,60,16,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
4,F,Rejected,1/21-1/21,646,544,14,6,161,144,91,73,39,16,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,F,Rejected,1/5-1/5,46,35,12,8,1,6,3,2,2,1,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
96,F,Rejected,1/4-1/4,63,48,24,5,4,2,3,7,2,1,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
97,F,Rejected,1/3-1/3,57,45,18,9,2,3,4,2,3,4,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600
98,F,Rejected,1/2-1/2,32,25,11,5,0,1,4,1,1,2,ZZBARESI_HT_MANUTEST,LONG TEST,20221205,1600


In [13]:
# Exports table as an excel file
# Paste path for parsed output folder before '{fileName}.csv' followed by a forward slash
# If on windows, replace backslash (\) with double backslash(\\)

df.to_csv(f'../../Wafer Particles/Wafer-Particles/Parsed_Output/{fileName}.csv', encoding='utf-8', index=False)