In [1]:
import re
import csv
import shutil

import numpy as np
import pandas as pd

from pathlib import Path

In [2]:
np.random.seed(42)

In [3]:
TRAIN_DIRECTORY = True
TEST_DIRECTORY = False

In [4]:
class NoFileException(Exception):
    '''
        This exception is raised when no csv files are found in the directory.
    '''
    def __init__(self, dirName: str):
        self.message = f'No csv files found in "{dirName}" directory.'
        super().__init__(self.message)

In [5]:
class NoGoodFileException(Exception):
    '''
        This exception is raised when no good files are found in the directory.
    '''
    def __init__(self, dirName: str):
        self.message = f'No good files found in "{dirName}" directory.'
        super().__init__(self.message)

In [6]:
def validateFileName(filepath: str) -> bool:
    '''
        Validate Filename to be of correct format - Wafer_[8 digit date (ddmmyyyy)]_[6 digit time (hhmmss)].csv
        
        Inputs:
            filepath: str => Path of the file with filename at the end.
        
        Outputs:
            validName: bool => True if validation is successful, else False.
        
        Exceptions:
            TypeError => This exception is raised if the filepath is not of correct data type.
    '''
    
    if type(filepath) != str:
        raise TypeError('Invalid datatype of "filepath" parameter.')
    
    validName = False
    filename = filepath.split('\\')[-1]
    pattern = re.compile('wafer_[0-3][\d][01][\d][12][\d]{3}_[0-2][\d][0-5][\d][0-5][\d].csv')
    
    if pattern.match(filename.lower()):
        validName = True
    
    return validName

In [7]:
def validateColumns(filepath: str, trainDir: bool) -> bool:
    '''
        Validate if all the columns are proper in the given file based on directory the file is stored in.
        
        Inputs:
            filepath: str => Path of the file with filename at the end.
            trainDir: bool => True if the directory is train directory, False if it is test directory.
        
        Outputs:
            validColumnStructure: bool => True if file follows the desired column structure, else False.
            
        Exceptions:
            TypeError => This exception is raised if the filepath is not of correct data type.
    '''
    
    if type(filepath) != str:
        raise TypeError('Invalid datatype of "filepath" parameter.')
        
    validColumnStructure = False
    
    expectedColumnStructure = list()
    expectedColumnStructure.append('Unnamed: 0')
    
    for i in range(1, 591):
        expectedColumnStructure.append(f'Sensor-{i}')
        
    if trainDir:
        expectedColumnStructure[0] = ''
        expectedColumnStructure.append('Good/Bad')
    
    with open(filepath, newline='') as csvfile:
        reader = csv.reader(csvfile)
        headers = next(reader)
        if len(list(set(expectedColumnStructure).difference(set(headers)))) == 0:
            validColumnStructure = True
            
    return validColumnStructure

In [8]:
def identifyGoodFilesAndGetDir(dirName: str, trainDir: bool) -> str:
    '''
        Identify all the good csv files for the Wafer dataset based on filename and column structure as specified by client,\
        and stores the valid files in a new directory called goodTrainFiles or goodTestFiles (based on the trainDir parameter).
        
        Filename Validation:
            Wafer_[8 digit date (ddmmyyyy)]_[6 digit time (hhmmss)].csv
            
        Column Structure Validation:
            Train: Wafer Name, Sensor - 1, Sensor - 2, ..., Sensor - 590, Output
            Test: Wafer Name, Sensor - 1, Sensor - 2, ..., Sensor - 590
        
        Inputs:
            dirName: str => Name of the directory where the files are located which are to be identified
            trainDir: bool => Does the directory contain files used for training or testing
        
        Outputs:
            goodFilesDir: str => Relative Path of the folder containing the files identified as good files to folder of this file.
        
        Exceptions:
            NoFileException => This error is raised when no csv files are found in the directory
            NoGoodFileException => This error is raised when no good files are found in the directory
    '''
    
    goodFilesByName = list()
    all_csv_files = list([str(filename) for filename in Path(dirName).glob('*.csv')])
    
    if len(all_csv_files) == 0:
        raise NoFileException(dirName)
    
    for file in all_csv_files:
        if validateFileName(file):
            goodFilesByName.append(file)
    
    if len(goodFilesByName) == 0:
        raise NoGoodFileException(dirName)
    
    goodFiles = list()
    
    for file in goodFilesByName:
        if validateColumns(file, trainDir):
            goodFiles.append(file)
    
    if len(goodFiles) == 0:
        raise NoGoodFileException(dirName)
    
    goodFilesDir = 'Dataset/goodTrainFiles' if trainDir else 'Dataset/goodTestFiles'
    
    path = Path(goodFilesDir)
    if path.exists() and path.is_dir():
        shutil.rmtree(path)
    path.mkdir(parents=True)
    
    for filepath in goodFiles:
        shutil.copy(filepath, path)
        
    return goodFilesDir

In [9]:
goodTrainFilesPath = identifyGoodFilesAndGetDir('Dataset/Training_Batch_Files', TRAIN_DIRECTORY)
goodTestFilesPath = identifyGoodFilesAndGetDir('Dataset/Prediction_Batch_files', TEST_DIRECTORY)

In [10]:
column_names = list()
column_names.append('Wafer')
for i in range(1, 591):
    column_names.append(f'Sensor-{i}')
column_names.append('Output')

In [11]:
train_data = pd.DataFrame(columns=column_names)
for train_file in Path(goodTrainFilesPath).glob('*.csv'):
    df_new = pd.read_csv(str(train_file))
    df_new.rename(columns={'Unnamed: 0': 'Wafer', 'Good/Bad': 'Output'}, inplace=True)
    train_data = pd.concat([train_data, df_new], axis=0, ignore_index=True)

In [12]:
for col in train_data.columns:
    if 'sensor' in col.lower():
        train_data[col] = train_data[col].astype(np.float64)

In [13]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1537 entries, 0 to 1536
Columns: 592 entries, Wafer to Output
dtypes: float64(590), object(2)
memory usage: 6.9+ MB


In [14]:
train_data.describe()

Unnamed: 0,Sensor-1,Sensor-2,Sensor-3,Sensor-4,Sensor-5,Sensor-6,Sensor-7,Sensor-8,Sensor-9,Sensor-10,...,Sensor-581,Sensor-582,Sensor-583,Sensor-584,Sensor-585,Sensor-586,Sensor-587,Sensor-588,Sensor-589,Sensor-590
count,1531.0,1530.0,1523.0,1523.0,1523.0,1523.0,1523.0,1528.0,1535.0,1535.0,...,595.0,595.0,1536.0,1536.0,1536.0,1536.0,1537.0,1537.0,1537.0,1537.0
mean,3015.024193,2495.483771,2200.370187,1400.184797,4.255665,100.0,101.062428,0.121817,1.461905,-0.000798,...,0.005352,97.796315,0.500105,0.015029,0.003782,3.007635,0.021432,0.016414,0.005268,99.777881
std,73.908774,80.495576,29.46146,443.937101,56.906664,0.0,6.267841,0.009045,0.074078,0.01502,...,0.00311,88.689937,0.003357,0.0127,0.002733,2.641657,0.012398,0.008768,0.002865,94.560344
min,2743.24,2158.75,2060.66,0.0,0.6815,100.0,82.1311,0.0,1.191,-0.0534,...,0.001,0.0,0.4778,0.006,0.0017,1.1975,-0.0169,0.0032,0.001,0.0
25%,2967.465,2452.1825,2180.9666,1084.3779,1.0177,100.0,97.84,0.1211,1.4103,-0.01065,...,0.0033,45.833,0.4979,0.0116,0.0031,2.309525,0.0134,0.0106,0.0033,44.2355
50%,3012.09,2498.84,2200.9889,1287.3538,1.3168,100.0,101.4922,0.1224,1.4607,-0.0013,...,0.0046,71.5333,0.50015,0.0138,0.0036,2.75895,0.0207,0.0148,0.0046,71.5753
75%,3057.39,2538.5,2217.8667,1593.122,1.5291,100.0,104.52555,0.1238,1.5164,0.0083,...,0.0064,116.88855,0.502325,0.0165,0.0041,3.295575,0.0276,0.0203,0.0064,115.1005
max,3356.35,2846.44,2315.2667,3715.0417,1114.5366,100.0,129.2522,0.1286,1.6564,0.0749,...,0.0286,737.3048,0.5098,0.4714,0.1039,98.6628,0.1028,0.0799,0.0286,737.3048


In [15]:
train_data.dtypes

Wafer          object
Sensor-1      float64
Sensor-2      float64
Sensor-3      float64
Sensor-4      float64
               ...   
Sensor-587    float64
Sensor-588    float64
Sensor-589    float64
Sensor-590    float64
Output         object
Length: 592, dtype: object

In [16]:
train_data.head()

Unnamed: 0,Wafer,Sensor-1,Sensor-2,Sensor-3,Sensor-4,Sensor-5,Sensor-6,Sensor-7,Sensor-8,Sensor-9,...,Sensor-582,Sensor-583,Sensor-584,Sensor-585,Sensor-586,Sensor-587,Sensor-588,Sensor-589,Sensor-590,Output
0,Wafer-501,3076.81,2158.75,2208.2334,1517.0152,1.098,100.0,110.19,0.1247,1.4357,...,64.2405,0.5016,0.0152,0.004,3.0319,0.0465,0.0299,0.009,64.2405,-1
1,Wafer-502,2951.62,2511.92,2253.5111,1397.506,0.966,100.0,109.7611,0.121,1.5527,...,0.0,0.4953,0.0105,0.0037,2.1266,-0.0012,0.0252,0.0081,0.0,-1
2,Wafer-503,2930.42,2505.17,2235.0556,1302.6607,1.6347,100.0,109.9856,0.123,1.4588,...,,0.4958,0.0111,0.0033,2.2296,-0.0012,0.0252,0.0081,0.0,-1
3,Wafer-504,2997.28,2357.99,2141.0667,1236.5212,0.9698,100.0,98.3344,0.1238,1.5973,...,,0.4962,0.0086,0.0024,1.7297,-0.0012,0.0252,0.0081,0.0,-1
4,Wafer-505,3025.1,2475.18,2235.0556,1302.6607,1.6347,100.0,109.9856,0.123,1.5525,...,,0.4983,0.0159,0.0041,3.1927,-0.0012,0.0252,0.0081,0.0,-1


In [17]:
train_data.drop_duplicates(inplace=True)

In [18]:
train_data.drop(columns=['Wafer'], inplace=True)

In [19]:
train_data['Output'].replace({-1: 0}, inplace=True)

In [20]:
train_data.head()

Unnamed: 0,Sensor-1,Sensor-2,Sensor-3,Sensor-4,Sensor-5,Sensor-6,Sensor-7,Sensor-8,Sensor-9,Sensor-10,...,Sensor-582,Sensor-583,Sensor-584,Sensor-585,Sensor-586,Sensor-587,Sensor-588,Sensor-589,Sensor-590,Output
0,3076.81,2158.75,2208.2334,1517.0152,1.098,100.0,110.19,0.1247,1.4357,0.0089,...,64.2405,0.5016,0.0152,0.004,3.0319,0.0465,0.0299,0.009,64.2405,0
1,2951.62,2511.92,2253.5111,1397.506,0.966,100.0,109.7611,0.121,1.5527,0.0119,...,0.0,0.4953,0.0105,0.0037,2.1266,-0.0012,0.0252,0.0081,0.0,0
2,2930.42,2505.17,2235.0556,1302.6607,1.6347,100.0,109.9856,0.123,1.4588,-0.0143,...,,0.4958,0.0111,0.0033,2.2296,-0.0012,0.0252,0.0081,0.0,0
3,2997.28,2357.99,2141.0667,1236.5212,0.9698,100.0,98.3344,0.1238,1.5973,-0.0534,...,,0.4962,0.0086,0.0024,1.7297,-0.0012,0.0252,0.0081,0.0,0
4,3025.1,2475.18,2235.0556,1302.6607,1.6347,100.0,109.9856,0.123,1.5525,-0.0078,...,,0.4983,0.0159,0.0041,3.1927,-0.0012,0.0252,0.0081,0.0,0


In [21]:
X, y = train_data.drop(columns=['Output']), train_data['Output']

In [22]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
X = imputer.fit_transform(X)

In [23]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [24]:
from sklearn.decomposition import PCA
def getGoodPCA(data: np.ndarray) -> PCA:
    '''
        This function finds the best number of components for a minimum 95% explained variance for PCA and returns the PCA model.
        
        Inputs:
            data: pd.DataFrame => The dataframe on which PCA is to be performed.
        
        Outputs:
            model: PCA => PCA model with minimum number of components required for atleast 95% explained variance.
        
        Exceptions:
            None
    '''
    
    for i in range(5, len(data), 5):
        pca = PCA(n_components=i)
        pca.fit(data)
        if sum(pca.explained_variance_ratio_) > 0.95:
            return pca

In [25]:
pca = getGoodPCA(X)

In [26]:
X_train_data = pca.transform(X)
pd.DataFrame(X_train_data, columns=[f'PC-{i+1}' for i in range(pca.n_components_)]).head()

Unnamed: 0,PC-1,PC-2,PC-3,PC-4,PC-5,PC-6,PC-7,PC-8,PC-9,PC-10,...,PC-161,PC-162,PC-163,PC-164,PC-165,PC-166,PC-167,PC-168,PC-169,PC-170
0,-0.481504,1.915886,4.25694,-3.690993,1.874274,0.115925,0.494321,2.077702,-3.558232,-3.511658,...,-0.186853,0.616042,-0.89336,-0.070553,-0.666121,0.96325,-1.258177,-1.423634,0.512887,0.590612
1,0.320273,2.695612,4.916052,-4.068747,0.63819,0.277909,-0.021832,0.848662,0.447673,-2.278988,...,0.818737,0.648885,-0.969883,0.202978,1.522963,0.597412,0.124704,-0.767485,0.756795,-0.651954
2,0.475491,-0.539261,1.739185,-1.546155,0.32793,0.35428,0.354387,-1.060128,-1.629294,-6.134755,...,0.706982,0.081556,0.218223,0.581033,0.662268,-0.188053,0.112732,0.638733,1.387466,-0.959399
3,1.365005,2.299948,6.871609,-4.206412,2.253421,2.58744,-1.753602,3.233079,-4.263065,-3.643337,...,-0.281418,0.058932,0.365627,-0.379447,-0.217056,0.09167,-0.691744,-1.474498,-0.11041,1.108375
4,0.786307,0.570597,2.717501,-2.443015,0.893007,1.906021,1.049044,0.84891,-2.083137,-3.017119,...,0.176929,1.221855,-0.41198,0.631736,-1.406072,-0.766394,0.235794,-0.59532,-0.281904,-0.60705


In [27]:
list(y).count(0), list(y).count(1)

(1448, 89)

In [28]:
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_data, y_data = sm.fit_resample(X_train_data, y)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier, DMatrix

  from pandas import MultiIndex, Int64Index
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [30]:
classifiers = {
    'lr': LogisticRegression(max_iter=250),
    'nb': GaussianNB(),
    'knn': KNeighborsClassifier(),
    'rfc': RandomForestClassifier(),
    'dtc': DecisionTreeClassifier(),
    'svc': SVC(),
}

xgbc_params = {"objective":"binary:logistic",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}
xgc = XGBClassifier(**xgbc_params)

In [31]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
kfold = StratifiedKFold(n_splits=5, shuffle=True)
clf_reports = {}
for clf_name in classifiers:
    model = classifiers[clf_name]
    clf_reports[clf_name] = list()
    for train_index, test_index in kfold.split(X, y):
        X_train, X_test, y_train, y_test = X_data[train_index], X_data[test_index], y_data[train_index], y_data[test_index]
        model.fit(X_train, y_train)
        y_preds = model.predict(X_test)
        clf_reports[clf_name] = classification_report(y_test, y_preds, target_names=['Bad Wafer', 'Good Wafer'], zero_division=0)

In [32]:
clf_reports['xgc'] = list()
for train_index, test_index in kfold.split(X, y):
    X_train, X_test, y_train, y_test = X_data[train_index], X_data[test_index], y_data[train_index], y_data[test_index]
    xgc.fit(X_train, y_train)
    y_preds = model.predict(X_test)
    clf_reports['xgc'] = classification_report(y_test, y_preds, target_names=['Bad Wafer', 'Good Wafer'], zero_division=0)

In [33]:
for clf_name in clf_reports:
    print('\nModel Name:', clf_name)
    print('Classification Report:')
    print(clf_reports[clf_name])


Model Name: lr
Classification Report:
              precision    recall  f1-score   support

   Bad Wafer       0.95      0.97      0.96       289
  Good Wafer       0.18      0.11      0.14        18

    accuracy                           0.92       307
   macro avg       0.56      0.54      0.55       307
weighted avg       0.90      0.92      0.91       307


Model Name: nb
Classification Report:
              precision    recall  f1-score   support

   Bad Wafer       0.94      0.94      0.94       289
  Good Wafer       0.10      0.11      0.11        18

    accuracy                           0.89       307
   macro avg       0.52      0.52      0.52       307
weighted avg       0.89      0.89      0.89       307


Model Name: knn
Classification Report:
              precision    recall  f1-score   support

   Bad Wafer       0.94      1.00      0.97       289
  Good Wafer       0.00      0.00      0.00        18

    accuracy                           0.94       307
   macro a