# Surfscan Data Analysis

Purpose: This code provides a temporary solution for interpreting Surfscan summary output files. As is, the Surfscan files can be hard to interpret and provide unnecessary information for operations. Our goal is to parse these text files and provide a digestible output.

In [75]:
#First, importing all of the necessary packages
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import scipy as sp
import plotly.express as px
from pathlib import Path
import os

In [76]:
#!{sys.executable} -m pip install --upgrade pip
#Magical code to install packages you don't have already
#import sys
#!{sys.executable} -m pip install plotly

In [77]:
# Load the Surfscan summary text file as a single-column dataframe, so that
# every line of the report becomes one row that can be cleaned and split later.
file_path = r'C:\Users\anhtu\Desktop\Wafer Particles\Wafer-Particles\Summary_TXT_Files\ZZBARESI_HT_20221013_1657.txt'
df = pd.read_csv(file_path, encoding= 'unicode_escape')

# read_csv takes the first line of the file as the column header. That header
# text contains spaces, so give the column a short name that is easy to call.
df.columns=['Parse1']

# Remove the separator rows. Any row made up solely of dashes (optionally
# padded with whitespace) counts as a separator, so the filter does not depend
# on the exact width of the dashed line in one particular report.
# na=False keeps rows that are not strings instead of dropping them.
df = df[~df['Parse1'].str.fullmatch(r'\s*-+\s*', na=False)]


#Trying to clean up the file and the unnecessary spaces using two steps
#STEP 1: Trimming whitespace from the ends of each value
#Let's define what it means to trim the dataframe of unnecessary spaces
def trim_all_columns(df):
    """Return a copy of ``df`` with whitespace stripped from every string cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells may hold strings with leading/trailing whitespace.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns. Cells that are not ``str``
        instances are passed through unchanged; the input frame is not
        modified.
    """
    def _strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated as of pandas 2.1. Mapping each column
    # with Series.map is the same element-wise operation and runs without a
    # deprecation warning on both older and newer pandas releases.
    return df.apply(lambda column: column.map(_strip_if_str))

#Apply the trimming helper to the dataframe, then (STEP 2) collapse every run
#of one or more spaces [' +'] into a single space [' '] so that the text can
#be split on single spaces afterwards.
#After this statement df is a Series (the 'Parse1' column), not a DataFrame.
df = trim_all_columns(df)['Parse1'].str.replace(' +', ' ', regex=True)
df

0                                  Session: ZZBARESI_HT
1                                               Lot ID:
3               Wafer Info DCO Totals DCO Haze DCO Bins
4     Side/Wafer ID Grade Src/Dest All Lpd LpdN LpdE...
5     [#] [#] [#] [#] [#] [#] [mm] [#] [mm²] [ppm] [...
7     F Rejected 1/22-1/22 187 116 NA NA NA NA NA 71...
8     F Rejected 1/16-1/16 166 58 NA NA NA NA NA 108...
9     F Rejected 1/10-1/10 299 124 NA NA NA NA NA 17...
10    F Rejected 1/5-1/5 67 35 NA NA NA NA NA 32 0.0...
12    Min 67 35 NA NA NA NA NA 32 0.000 0.024 0.023 ...
13    Max 299 124 NA NA NA NA NA 175 0.298 0.041 0.0...
14    Mean 179.8 83.25 NA NA NA NA NA 96.50 0.094 0....
15    Std. Dev. 95.17 43.58 NA NA NA NA NA 60.84 0.1...
17                                   4 wafer(s) scanned
18                               SP1 Long Wafer Summary
19                                 Session: ZZBARESI_HT
20                                              Lot ID:
22              Wafer Info DWO Totals DWO Haze D

In [78]:
#Here, we split each line on single spaces into at most 23 columns.
#n is the maximum number of splits, so n=22 yields 23 pieces (0,1,2,...,22);
#anything after the 22nd space stays together in the last column.
#n and expand are passed by keyword: passing them positionally is deprecated
#in pandas 1.5 and rejected by pandas 2.x.
df = df.str.split(' ', n=22, expand=True)

#Adding proper column headers
#NOTE(review): the report's own header row reads 'LpdES'; the label 'LpsED' is
#kept unchanged here because a later cell deletes the column by that name.
df.columns = ['Side','Grade','Source/Destination','All','Lpd','LpdN',
    'LpsED','uScr','Scr','Slip','Area (#)','Area (mm^2)','Avg','Median','STDV','Bin1','Bin2','Bin3',
    'Bin4','Bin5','Bin6','Bin7','Bin8'] 
df

  df = df.str.split(' ',22, expand=True)


Unnamed: 0,Side,Grade,Source/Destination,All,Lpd,LpdN,LpsED,uScr,Scr,Slip,...,Median,STDV,Bin1,Bin2,Bin3,Bin4,Bin5,Bin6,Bin7,Bin8
0,Session:,ZZBARESI_HT,,,,,,,,,...,,,,,,,,,,
1,Lot,ID:,,,,,,,,,...,,,,,,,,,,
3,Wafer,Info,DCO,Totals,DCO,Haze,DCO,Bins,,,...,,,,,,,,,,
4,Side/Wafer,ID,Grade,Src/Dest,All,Lpd,LpdN,LpdES,µScr,Scr,...,Median,STDV,Bin1,Bin2,Bin3,Bin4,Bin5,Bin6,Bin7,Bin8
5,[#],[#],[#],[#],[#],[#],[mm],[#],[mm²],[ppm],...,[#],[#],[#],[#],[#],[#],[#],,,
7,F,Rejected,1/22-1/22,187,116,,,,,,...,0.023,0.002,3,5,2,13,4,10,3,10
8,F,Rejected,1/16-1/16,166,58,,,,,,...,0.041,0.002,4,5,6,2,4,1,1,5
9,F,Rejected,1/10-1/10,299,124,,,,,,...,0.040,0.016,10,7,8,7,4,6,6,13
10,F,Rejected,1/5-1/5,67,35,,,,,,...,0.028,0.002,4,2,3,3,1,3,0,3
12,Min,67,35,,,,,,32,0.000,...,3,2,2,2,1,1,0,3,,


In [79]:
#Here, we identify the index label of the row where we want to stop.
#That is the footer line "<n> wafer(s) scanned": after the split, the word
#'wafer(s)' lands in the 'Grade' column.
row = df[df['Grade'] == 'wafer(s)'].index.tolist()[0]

#Keeping only the rows up to and including the row we identified.
#`row` is an index *label*, and labels no longer match row positions because
#the separator rows were removed earlier. Slicing by label with .loc gives the
#same rows as the previous positional slice for this report, without relying
#on a hand-tuned offset that assumes a fixed number of removed rows.
df = df.loc[:row]
df

Unnamed: 0,Side,Grade,Source/Destination,All,Lpd,LpdN,LpsED,uScr,Scr,Slip,...,Median,STDV,Bin1,Bin2,Bin3,Bin4,Bin5,Bin6,Bin7,Bin8
0,Session:,ZZBARESI_HT,,,,,,,,,...,,,,,,,,,,
1,Lot,ID:,,,,,,,,,...,,,,,,,,,,
3,Wafer,Info,DCO,Totals,DCO,Haze,DCO,Bins,,,...,,,,,,,,,,
4,Side/Wafer,ID,Grade,Src/Dest,All,Lpd,LpdN,LpdES,µScr,Scr,...,Median,STDV,Bin1,Bin2,Bin3,Bin4,Bin5,Bin6,Bin7,Bin8
5,[#],[#],[#],[#],[#],[#],[mm],[#],[mm²],[ppm],...,[#],[#],[#],[#],[#],[#],[#],,,
7,F,Rejected,1/22-1/22,187,116,,,,,,...,0.023,0.002,3,5,2,13,4,10,3,10
8,F,Rejected,1/16-1/16,166,58,,,,,,...,0.041,0.002,4,5,6,2,4,1,1,5
9,F,Rejected,1/10-1/10,299,124,,,,,,...,0.040,0.016,10,7,8,7,4,6,6,13
10,F,Rejected,1/5-1/5,67,35,,,,,,...,0.028,0.002,4,2,3,3,1,3,0,3
12,Min,67,35,,,,,,32,0.000,...,3,2,2,2,1,1,0,3,,


Now that we have filtered to the DCO data that we are interested in, it is time to continue parsing the dataframe to extract relevant data.

In [80]:
#Removing the columns that are not needed, one by one and in place
for unwanted in ('Side', 'LpdN', 'LpsED', 'uScr', 'Scr', 'Slip',
                 'Area (#)', 'Area (mm^2)', 'Avg', 'Median', 'STDV'):
    del df[unwanted]

#Display the table without rows 4 and 5 (the column-name and unit rows of the
#report). The result of drop() is not assigned, so df itself keeps those rows.
df.drop([4,5])



Unnamed: 0,Grade,Source/Destination,All,Lpd,Bin1,Bin2,Bin3,Bin4,Bin5,Bin6,Bin7,Bin8
0,ZZBARESI_HT,,,,,,,,,,,
1,ID:,,,,,,,,,,,
3,Info,DCO,Totals,DCO,,,,,,,,
7,Rejected,1/22-1/22,187,116,3.0,5.0,2.0,13.0,4.0,10.0,3.0,10.0
8,Rejected,1/16-1/16,166,58,4.0,5.0,6.0,2.0,4.0,1.0,1.0,5.0
9,Rejected,1/10-1/10,299,124,10.0,7.0,8.0,7.0,4.0,6.0,6.0,13.0
10,Rejected,1/5-1/5,67,35,4.0,2.0,3.0,3.0,1.0,3.0,0.0,3.0
12,67,35,,,2.0,2.0,1.0,1.0,0.0,3.0,,
13,299,124,,,8.0,13.0,4.0,10.0,6.0,13.0,,
14,179.8,83.25,,,4.75,6.25,3.25,5.0,2.5,7.75,,


In [81]:
#Playing with extracting information from the file name itself.
x= 'name_of_txt_file.txt'

#Removing the extension (the '.txt') from the file name.
#Path.stem drops only the final suffix, whatever its length, instead of
#assuming the extension is exactly four characters. If x ever holds a full
#path, stem also leaves out the directory part and keeps just the file name.
x= Path(x).stem

#Splitting the file name into its parts, which are separated by underscores
y= x.split('_')
y

['name', 'of', 'txt', 'file']