In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
%matplotlib inline
import pyodbc
con = pyodbc.connect('DSN=ISDW')
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn import naive_bayes 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.externals import joblib

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from numpy.core.umath_tests import inner1d


# Model Training

## Radial and Smiley

### Pull Training Features

In [2]:
# Query parameters; each is substituted into SQLQuery by the .format() call that closes the string.
daysback = 300  # look-back window in days, compared against TestedWafer.Last_test_date
Calcdefs = 'S2561'  # value matched against TestedWafer.Calcdefs
corner = 'SRAM256MST00p655CHIPA'  # value matched against the `corner` column
categoryName = 'BIT'  # category whose `count` is summed into BitCount

# Per-chip feature query made of two derived tables joined on wafer id and chip X/Y:
#   * APRCTable - sums `count` per chip for rows matching `corner` and `categoryName`.
#   * ElecTable - sums parmValue per chip for each of the four Q0..Q3 FBC parameter labels.
# The six `{}` placeholders are filled positionally, in the order they appear in the text.
SQLQuery = """
SELECT lot_id_base, APRCTable.wafer_id, APRCTable.ChipX || ', ' || APRCTable.ChipY as ChipXY, family_Code,
     radius_Center_5, quadrant as waf_quadrant, locationType,
     --APRCTable.corner, categoryName as aprc, 
     BitCount, Q0FailCount, Q1FailCount, Q2FailCount, Q3FailCount
     
FROM
(
Select lot_id_base, wafer_Id, ChipX, ChipY, family_Code, radius_Center_5, quadrant, locationType, SUM(BitCount) as BitCount
FROM
(
SELECT left(lot_Id, 5) as lot_id_base, wafer_Id, normalized_TestX as ChipX, normalized_TestY as ChipY, family_Code,
     corner, categoryName, count, radius_Center_5, quadrant, locationType,
     case
         when corner = '{}' and categoryName = '{}' then count
         else 0
     end as BitCount
FROM DMIW.PattRecChipFactR prcfr
JOIN DMIW_SYSTEMS.DerivedDataSetup dds ON prcfr.derivedSetupKey = dds.derivedSetupKey
JOIN DMIW_SYSTEMS.PattRecCategory prc ON prcfr.pattRecCatKey = prc.pattRecCatKey
JOIN DMIW_SYSTEMS.Geography geo ON prcfr.geographyKey = geo.geographyKey
JOIN (select testedWaferKey, lot_Id, wafer_id, family_Code, cur_Lot_Grade as lot_Grade, last_TestTimeStamp as date,
     level, testProgramName, last_Eqp_Id
     from DMIW_SYSTEMS.TestedWafer tw
     where Last_test_date >= (current date - {} days) and Tech_id = '7HPP' and Calcdefs = '{}'
     group by testedWaferKey, lot_Id, wafer_Id, family_Code, cur_Lot_Grade, last_TestTimeStamp, level, 
     testProgramName, last_Eqp_Id) tw ON prcfr.testedWaferKey = tw.testedWaferKey
WHERE categoryName <> '' and exclude_Flag = 'N'
)
GROUP BY lot_id_base, wafer_Id, ChipX, ChipY, family_Code, radius_Center_5, quadrant, locationType
ORDER BY lot_Id_base, wafer_Id, ChipX, ChipY
) APRCTable

JOIN
(
SELECT wafer_Id, ChipX, ChipY,
    SUM(Q0FailCount) as Q0FailCount, SUM(Q1FailCount) as Q1FailCount,
    SUM(Q2FailCount) as Q2FailCount, SUM(Q3FailCount) as Q3FailCount
FROM
(
SELECT wafer_Id, normalized_TestX as ChipX, normalized_TestY as ChipY, tp.parm_Label, parmValue,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q0_FBC_0P65' then parmValue
         else 0
     end as Q0FailCount,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q1_FBC_0P65' then parmValue
         else 0
     end as Q1FailCount,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q2_FBC_0P65' then parmValue
         else 0
     end as Q2FailCount,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q3_FBC_0P65' then parmValue
         else 0
     end as Q3FailCount
FROM DMIW.ChipParmFactR cpfr
JOIN DMIW_SYSTEMS.TestParm tp ON cpfr.testParmKey = tp.testParmKey
JOIN DMIW_SYSTEMS.Geography geo ON cpfr.geographyKey = geo.geographyKey
JOIN (select testedWaferKey, lot_Id, wafer_Id, family_Code, cur_Lot_Grade as lot_Grade, 
     last_TestTimeStamp as date, level, last_Eqp_Id 
     from DMIW_SYSTEMS.TestedWafer testedWafer 
     where Last_test_date >= (current date - {} days) and Tech_id = '7HPP' and Calcdefs = '{}' 
     group by testedWaferKey, lot_Id, wafer_Id, family_Code, cur_Lot_Grade, last_TestTimeStamp,
     level, last_Eqp_Id) tw ON cpfr.testedWaferKey = tw.testedWaferKey
WHERE cpfr.parmValue is not null and abs(parmValue) < 1e25 and
    (ucase(tp.parm_Label) in ('S256~H2~Q0_FBC_0P65', 'S256~H2~Q1_FBC_0P65', 'S256~H2~Q2_FBC_0P65', 'S256~H2~Q3_FBC_0P65'))
)
GROUP BY wafer_Id, ChipX, ChipY
ORDER BY wafer_Id, ChipX, ChipY
) ElecTable

ON APRCTable.wafer_Id = ElecTable.wafer_Id and APRCTable.ChipX = ElecTable.ChipX and APRCTable.ChipY = ElecTable.ChipY
ORDER BY lot_Id_base, wafer_Id
""".format(corner, categoryName, daysback, Calcdefs, daysback, Calcdefs)

In [3]:
# Run the feature query over the open warehouse connection and load the rows.
df = pd.read_sql(sql=SQLQuery, con=con)

In [4]:
# Reshape to one row per wafer and one column per (measure, chip X/Y) pair.
# Cells with no data for a wafer/chip combination are filled with 0.
pivot = df.pivot_table(
    values=('BITCOUNT', 'Q0FAILCOUNT', 'Q1FAILCOUNT', 'Q2FAILCOUNT', 'Q3FAILCOUNT'),
    index=['WAFER_ID'],
    columns=['CHIPXY'],
    aggfunc=np.mean,
).fillna(0)

In [5]:
# Load the wafer signature labels from the local CSV file.
labels = pd.read_csv(filepath_or_buffer="data/7nmLabels.csv")

In [6]:
# Attach the Radial / Smiley labels by wafer id. The join is a left join, so
# wafers without a label row come back as NaN and are removed by dropna().
train = (
    pivot
    .join(labels.set_index('WAFER_ID')[['Radial', 'Smiley']])
    .dropna()
)



In [7]:
train.head()

Unnamed: 0_level_0,"(BITCOUNT, 16, 21)","(BITCOUNT, 16, 28)","(BITCOUNT, 16, 35)","(BITCOUNT, 16, 42)","(BITCOUNT, 25, 14)","(BITCOUNT, 25, 21)","(BITCOUNT, 25, 28)","(BITCOUNT, 25, 35)","(BITCOUNT, 25, 42)","(BITCOUNT, 25, 49)",...,"(Q3FAILCOUNT, 88, 28)","(Q3FAILCOUNT, 88, 35)","(Q3FAILCOUNT, 88, 42)","(Q3FAILCOUNT, 88, 49)","(Q3FAILCOUNT, 97, 21)","(Q3FAILCOUNT, 97, 28)","(Q3FAILCOUNT, 97, 35)","(Q3FAILCOUNT, 97, 42)",Radial,Smiley
WAFER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AZ2RG-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
AZ2RG-04,1.0,2.0,2.0,1.0,2.0,1.0,3.0,4.0,4.0,1.0,...,1.0,2.0,8.0,0.0,638976.0,554.0,2.0,0.0,0.0,0.0
AZ2RG-06,0.0,5.0,2.0,0.0,2.0,2.0,10.0,18.0,5.0,1.0,...,4.0,0.0,0.0,1.0,0.0,0.0,6942931.0,10.0,1.0,0.0
AZ2RG-10,0.0,1.0,1.0,0.0,0.0,0.0,8.0,8.0,2.0,0.0,...,1.0,0.0,0.0,24.0,0.0,42.0,9.0,0.0,0.0,0.0
AZ2RG-17,69.0,7.0,15.0,46.0,1.0,14.0,32.0,69.0,55.0,28.0,...,5.0,1.0,0.0,0.0,0.0,344.0,32802.0,0.0,1.0,0.0


### Radial Random Forest Model

In [8]:
# Target: the Radial label. Features: every column that is not a label column.
y = train['Radial']
X = train.loc[:, ~((train.columns == 'Radial') | (train.columns == 'Smiley'))]

In [9]:
def find_model_accuracy(train_data, train_labels, test_data, test_labels, model):
    """Fit ``model``, plot its test-set confusion matrix, and return its accuracy.

    Parameters
    ----------
    train_data, train_labels
        Features and labels passed to ``model.fit``.
    test_data, test_labels
        Held-out features passed to ``model.predict`` and the labels the
        predictions are compared against.
    model
        Estimator exposing ``fit`` and ``predict``. It is fitted in place.

    Returns
    -------
    tuple
        ``(accuracy, test_predict, test_labels)``: the accuracy score on the
        test set, the predicted labels, and the ``test_labels`` argument
        unchanged.
    """
    # Fit the model to the training set
    model.fit(train_data, train_labels)
    test_predict = model.predict(test_data)

    # Pin the label order so the matrix is always 2x2 and lines up with the
    # hard-coded Neg/Pos tick labels, even if a class is missing from the split.
    # Assumes labels are encoded as 0 (negative) / 1 (positive) - TODO confirm.
    confusion = metrics.confusion_matrix(test_labels, test_predict, labels=[0, 1])
    ax = sns.heatmap(confusion,
                     xticklabels=['pred: Neg', 'pred: Pos'],
                     annot=True,
                     fmt='g',
                     cmap='Blues')
    ax.set_yticklabels(['actual: Neg', 'actual: Pos'], rotation=0)
    plt.tight_layout()
    plt.show()

    # accuracy_score is documented as (y_true, y_pred); the value is the same
    # either way, but the documented order is used here.
    accuracy = metrics.accuracy_score(test_labels, test_predict)
    return accuracy, test_predict, test_labels

In [None]:
# Split Train and Test Sets
# Hold out 20% of the labelled wafers. The previous call had an unclosed
# `train.loc[` bracket and could not be parsed.
train_data, test_data, train_labels, test_labels = train_test_split(X, y, test_size=0.2)

In [10]:
# Radial classifier: a random forest with fixed hyperparameters, fitted on every
# row of X / y (no rows are held out in this cell).
model = ensemble.RandomForestClassifier(
    n_estimators=220,
    max_depth=6,
    criterion='entropy',
    class_weight=None,
    n_jobs=-1,
)
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=220, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
# Exhaustive hyperparameter search for a random forest on X / y.
clf = ensemble.RandomForestClassifier(n_jobs=-1)

# Search space: depth 5..11, forest size 200..240 in steps of 10,
# both split criteria, with and without class balancing.
parameter_grid = dict(
    max_depth=np.arange(5, 12, 1),
    n_estimators=np.arange(200, 250, 10),
    criterion=['gini', 'entropy'],
    class_weight=[None, 'balanced'],
)

grid_search = model_selection.GridSearchCV(estimator=clf, param_grid=parameter_grid)
grid_search.fit(X, y)
grid_search.best_estimator_

In [None]:
# Performance estimate for the tuned forest: score the grid search's best
# estimator on 1000 random shuffle splits of X / y, each holding out 20%.
clf = grid_search.best_estimator_
cv = model_selection.ShuffleSplit(test_size=.2, n_splits=1000)
cv_scores = model_selection.cross_val_score(estimator=clf, X=X, y=y, cv=cv)

# Plot the distribution of the per-split scores, then print their mean.
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.5)
sns.distributions.distplot(tuple(cv_scores))
print("average accuracy score:", cv_scores.mean())

In [None]:
# Create a dataframe showing all the mistakes made by the classifier.
# Out-of-fold predictions are used so that each wafer is scored by a forest that
# did not see it during training; cross_val_predict fits clones, so the fitted
# `model` itself is left untouched.
# NOTE(review): cv=5 assumes each class has at least a handful of wafers - confirm.
predicted = model_selection.cross_val_predict(model, X, y, cv=5)
fail_analysis = y.to_frame()
fail_analysis['predicted'] = predicted
# Keep only the misclassified wafers.
fail_analysis = fail_analysis[fail_analysis[y.name] != fail_analysis['predicted']]

In [None]:
fail_analysis

In [11]:
# Persist the fitted Radial forest to disk.
joblib.dump(value=model, filename='models/7nmRadial.sav')

['models/7nmRadial.sav']

In [5]:
# Load the persisted Radial forest from the local file (nothing is downloaded).
# NOTE: joblib files are pickles; only load files that come from a trusted source.
Radial_model = joblib.load(filename='models/7nmRadial.sav')

### Smiley Random Forest Model

In [13]:
# Target: the Smiley label. Features: every column that is not a label column.
y = train['Smiley']
X = train.loc[:, ~((train.columns == 'Radial') | (train.columns == 'Smiley'))]

In [None]:
# Split Train and Test Sets
# Hold out 20% of the labelled wafers. The previous call had an unclosed
# `train.loc[` bracket and could not be parsed.
train_data, test_data, train_labels, test_labels = train_test_split(X, y, test_size=0.2)

In [14]:
# Smiley classifier: a class-balanced random forest with fixed hyperparameters,
# fitted on every row of X / y (no rows are held out in this cell).
model = ensemble.RandomForestClassifier(
    n_estimators=220,
    max_depth=7,
    criterion='gini',
    class_weight='balanced',
    n_jobs=-1,
)
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=7, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=220, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [None]:
# Exhaustive hyperparameter search for a random forest on X / y.
clf = ensemble.RandomForestClassifier(n_jobs=-1)

# Search space: depth 5..11, forest size 200..240 in steps of 10,
# both split criteria, with and without class balancing.
parameter_grid = dict(
    max_depth=np.arange(5, 12, 1),
    n_estimators=np.arange(200, 250, 10),
    criterion=['gini', 'entropy'],
    class_weight=[None, 'balanced'],
)

grid_search = model_selection.GridSearchCV(estimator=clf, param_grid=parameter_grid)
grid_search.fit(X, y)
grid_search.best_estimator_

In [None]:
# Performance estimate for the tuned forest: score the grid search's best
# estimator on 1000 random shuffle splits of X / y, each holding out 20%.
clf = grid_search.best_estimator_
cv = model_selection.ShuffleSplit(test_size=.2, n_splits=1000)
cv_scores = model_selection.cross_val_score(estimator=clf, X=X, y=y, cv=cv)

# Plot the distribution of the per-split scores, then print their mean.
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.5)
sns.distributions.distplot(tuple(cv_scores))
print("average accuracy score:", cv_scores.mean())

In [None]:
# Create a dataframe showing all the mistakes made by the classifier.
# Out-of-fold predictions are used so that each wafer is scored by a forest that
# did not see it during training; cross_val_predict fits clones, so the fitted
# `model` itself is left untouched.
# NOTE(review): cv=5 assumes each class has at least a handful of wafers - confirm.
predicted = model_selection.cross_val_predict(model, X, y, cv=5)
fail_analysis = y.to_frame()
fail_analysis['predicted'] = predicted
# Keep only the misclassified wafers.
fail_analysis = fail_analysis[fail_analysis[y.name] != fail_analysis['predicted']]

In [None]:
fail_analysis

In [15]:
# Persist the fitted Smiley forest to disk.
joblib.dump(value=model, filename='models/7nmSmiley.sav')

['models/7nmSmiley.sav']

In [6]:
# Load the persisted Smiley forest from the local file (nothing is downloaded).
# NOTE: joblib files are pickles; only load files that come from a trusted source.
Smiley_model = joblib.load(filename='models/7nmSmiley.sav')

## Big and Small Center

### Pull Training Features

In [7]:
# Query parameters; each is substituted into SQLQuery by the .format() call that closes the string.
daysback = 300  # look-back window in days, compared against TestedWafer.Last_test_date
Calcdefs = 'S2561'  # value matched against TestedWafer.Calcdefs
corner = 'SRAM256MST00p655CHIPA'  # value matched against the `corner` column (used for both BIT and DBC)

# Per-chip feature query made of two derived tables joined on wafer id and chip X/Y:
#   * APRCTable - sums `count` per chip separately for categoryName 'BIT' and 'DBC' at `corner`.
#   * ElecTable - sums parmValue per chip for each of the four Q0..Q3 FBC parameter labels.
# The six `{}` placeholders are filled positionally, in the order they appear in the text.
SQLQuery = """
SELECT lot_id_base, APRCTable.wafer_id, APRCTable.ChipX || ', ' || APRCTable.ChipY as ChipXY, family_Code,
     radius_Center_5, quadrant as waf_quadrant, locationType,
     --APRCTable.corner, categoryName as aprc, 
     BitCount,DBCCount, Q0FailCount, Q1FailCount, Q2FailCount, Q3FailCount
     
FROM
(
Select lot_id_base, wafer_Id, ChipX, ChipY, family_Code, radius_Center_5, quadrant, locationType, 
SUM(BitCount) as BitCount, SUM(DBCCount) as DBCCount
FROM
(
SELECT left(lot_Id, 5) as lot_id_base, wafer_Id, normalized_TestX as ChipX, normalized_TestY as ChipY, family_Code,
     corner, categoryName, count, radius_Center_5, quadrant, locationType,
     case
         when corner = '{}' and categoryName = 'BIT' then count
         else 0
     end as BitCount,
     case
         when corner = '{}' and categoryName = 'DBC' then count
         else 0
     end as DBCCount
FROM DMIW.PattRecChipFactR prcfr
JOIN DMIW_SYSTEMS.DerivedDataSetup dds ON prcfr.derivedSetupKey = dds.derivedSetupKey
JOIN DMIW_SYSTEMS.PattRecCategory prc ON prcfr.pattRecCatKey = prc.pattRecCatKey
JOIN DMIW_SYSTEMS.Geography geo ON prcfr.geographyKey = geo.geographyKey
JOIN (select testedWaferKey, lot_Id, wafer_id, family_Code, cur_Lot_Grade as lot_Grade, last_TestTimeStamp as date,
     level, testProgramName, last_Eqp_Id
     from DMIW_SYSTEMS.TestedWafer tw
     where Last_test_date >= (current date - {} days) and Tech_id = '7HPP' and Calcdefs = '{}'
     group by testedWaferKey, lot_Id, wafer_Id, family_Code, cur_Lot_Grade, last_TestTimeStamp, level, 
     testProgramName, last_Eqp_Id) tw ON prcfr.testedWaferKey = tw.testedWaferKey
WHERE categoryName <> '' and exclude_Flag = 'N'
)
GROUP BY lot_id_base, wafer_Id, ChipX, ChipY, family_Code, radius_Center_5, quadrant, locationType
ORDER BY lot_Id_base, wafer_Id, ChipX, ChipY
) APRCTable

JOIN
(
SELECT wafer_Id, ChipX, ChipY,
    SUM(Q0FailCount) as Q0FailCount, SUM(Q1FailCount) as Q1FailCount,
    SUM(Q2FailCount) as Q2FailCount, SUM(Q3FailCount) as Q3FailCount
FROM
(
SELECT wafer_Id, normalized_TestX as ChipX, normalized_TestY as ChipY, tp.parm_Label, parmValue,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q0_FBC_0P65' then parmValue
         else 0
     end as Q0FailCount,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q1_FBC_0P65' then parmValue
         else 0
     end as Q1FailCount,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q2_FBC_0P65' then parmValue
         else 0
     end as Q2FailCount,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q3_FBC_0P65' then parmValue
         else 0
     end as Q3FailCount
FROM DMIW.ChipParmFactR cpfr
JOIN DMIW_SYSTEMS.TestParm tp ON cpfr.testParmKey = tp.testParmKey
JOIN DMIW_SYSTEMS.Geography geo ON cpfr.geographyKey = geo.geographyKey
JOIN (select testedWaferKey, lot_Id, wafer_Id, family_Code, cur_Lot_Grade as lot_Grade, 
     last_TestTimeStamp as date, level, last_Eqp_Id 
     from DMIW_SYSTEMS.TestedWafer testedWafer 
     where Last_test_date >= (current date - {} days) and Tech_id = '7HPP' and Calcdefs = '{}' 
     group by testedWaferKey, lot_Id, wafer_Id, family_Code, cur_Lot_Grade, last_TestTimeStamp,
     level, last_Eqp_Id) tw ON cpfr.testedWaferKey = tw.testedWaferKey
WHERE cpfr.parmValue is not null and abs(parmValue) < 1e25 and
    (ucase(tp.parm_Label) in ('S256~H2~Q0_FBC_0P65', 'S256~H2~Q1_FBC_0P65', 'S256~H2~Q2_FBC_0P65', 'S256~H2~Q3_FBC_0P65'))
)
GROUP BY wafer_Id, ChipX, ChipY
ORDER BY wafer_Id, ChipX, ChipY
) ElecTable

ON APRCTable.wafer_Id = ElecTable.wafer_Id and APRCTable.ChipX = ElecTable.ChipX and APRCTable.ChipY = ElecTable.ChipY
ORDER BY lot_Id_base, wafer_Id
""".format(corner, corner, daysback, Calcdefs, daysback, Calcdefs)

In [8]:
# Run the feature query over the open warehouse connection and load the rows.
df = pd.read_sql(sql=SQLQuery, con=con)

In [9]:
# Reshape to one row per wafer and one column per (measure, chip X/Y) pair.
# Cells with no data for a wafer/chip combination are filled with 0.
pivot = df.pivot_table(
    values=('BITCOUNT', 'DBCCOUNT', 'Q0FAILCOUNT', 'Q1FAILCOUNT', 'Q2FAILCOUNT', 'Q3FAILCOUNT'),
    index=['WAFER_ID'],
    columns=['CHIPXY'],
    aggfunc=np.mean,
).fillna(0)

In [20]:
# Load the wafer signature labels from the local CSV file.
labels = pd.read_csv(filepath_or_buffer="data/7nmLabels.csv")

In [21]:
# Attach the Small_Center / Big_Center labels by wafer id. The join is a left
# join, so wafers without a label row come back as NaN and are removed by dropna().
train2 = (
    pivot
    .join(labels.set_index('WAFER_ID')[['Small_Center', 'Big_Center']])
    .dropna()
)



In [22]:
train2.head()

Unnamed: 0_level_0,"(BITCOUNT, 16, 21)","(BITCOUNT, 16, 28)","(BITCOUNT, 16, 35)","(BITCOUNT, 16, 42)","(BITCOUNT, 25, 14)","(BITCOUNT, 25, 21)","(BITCOUNT, 25, 28)","(BITCOUNT, 25, 35)","(BITCOUNT, 25, 42)","(BITCOUNT, 25, 49)",...,"(Q3FAILCOUNT, 88, 28)","(Q3FAILCOUNT, 88, 35)","(Q3FAILCOUNT, 88, 42)","(Q3FAILCOUNT, 88, 49)","(Q3FAILCOUNT, 97, 21)","(Q3FAILCOUNT, 97, 28)","(Q3FAILCOUNT, 97, 35)","(Q3FAILCOUNT, 97, 42)",Small_Center,Big_Center
WAFER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AZ2RG-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
AZ2RG-04,1.0,2.0,2.0,1.0,2.0,1.0,3.0,4.0,4.0,1.0,...,1.0,2.0,8.0,0.0,638976.0,554.0,2.0,0.0,1.0,0.0
AZ2RG-06,0.0,5.0,2.0,0.0,2.0,2.0,10.0,18.0,5.0,1.0,...,4.0,0.0,0.0,1.0,0.0,0.0,6942931.0,10.0,0.0,0.0
AZ2RG-10,0.0,1.0,1.0,0.0,0.0,0.0,8.0,8.0,2.0,0.0,...,1.0,0.0,0.0,24.0,0.0,42.0,9.0,0.0,0.0,1.0
AZ2RG-17,69.0,7.0,15.0,46.0,1.0,14.0,32.0,69.0,55.0,28.0,...,5.0,1.0,0.0,0.0,0.0,344.0,32802.0,0.0,0.0,0.0


### Small Center Random Forest Classifier

In [23]:
# Target: the Small_Center label. Features: every column that is not a label column.
y = train2['Small_Center']
X = train2.loc[:, ~((train2.columns == 'Small_Center') | (train2.columns == 'Big_Center'))]

In [None]:
# Split Train and Test Sets
# Hold out 20% of the labelled wafers. The previous call had an unclosed
# `train.loc[` bracket and could not be parsed.
train_data, test_data, train_labels, test_labels = train_test_split(X, y, test_size=0.2)

In [24]:
# Small-center classifier: a class-balanced random forest with fixed
# hyperparameters, fitted on every row of X / y (no rows are held out in this cell).
model = ensemble.RandomForestClassifier(
    n_estimators=230,
    max_depth=9,
    criterion='gini',
    class_weight='balanced',
    n_jobs=-1,
)
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=9, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=230, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [None]:
# Exhaustive hyperparameter search for a random forest on X / y.
clf = ensemble.RandomForestClassifier(n_jobs=-1)

# Search space: depth 5..11, forest size 200..240 in steps of 10,
# both split criteria, with and without class balancing.
parameter_grid = dict(
    max_depth=np.arange(5, 12, 1),
    n_estimators=np.arange(200, 250, 10),
    criterion=['gini', 'entropy'],
    class_weight=[None, 'balanced'],
)

grid_search = model_selection.GridSearchCV(estimator=clf, param_grid=parameter_grid)
grid_search.fit(X, y)
grid_search.best_estimator_

In [None]:
# Performance estimate for the tuned forest: score the grid search's best
# estimator on 1000 random shuffle splits of X / y, each holding out 20%.
clf = grid_search.best_estimator_
cv = model_selection.ShuffleSplit(test_size=.2, n_splits=1000)
cv_scores = model_selection.cross_val_score(estimator=clf, X=X, y=y, cv=cv)

# Plot the distribution of the per-split scores, then print their mean.
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.5)
sns.distributions.distplot(tuple(cv_scores))
print("average accuracy score:", cv_scores.mean())

In [None]:
# Create a dataframe showing all the mistakes made by the classifier.
# Out-of-fold predictions are used so that each wafer is scored by a forest that
# did not see it during training; cross_val_predict fits clones, so the fitted
# `model` itself is left untouched.
# NOTE(review): cv=5 assumes each class has at least a handful of wafers - confirm.
predicted = model_selection.cross_val_predict(model, X, y, cv=5)
fail_analysis = y.to_frame()
fail_analysis['predicted'] = predicted
# Keep only the misclassified wafers.
fail_analysis = fail_analysis[fail_analysis[y.name] != fail_analysis['predicted']]

In [None]:
fail_analysis

In [25]:
# Persist the fitted small-center forest to disk.
joblib.dump(value=model, filename='models/7nmSmallCenter.sav')

['models/7nmSmallCenter.sav']

In [10]:
# Load the persisted small-center forest from the local file (nothing is downloaded).
# NOTE: joblib files are pickles; only load files that come from a trusted source.
SmallCenter_model = joblib.load(filename='models/7nmSmallCenter.sav')

### Big Center Random Forest Classifier

In [27]:
# Target: the Big_Center label. Features: every column that is not a label column.
y = train2['Big_Center']
X = train2.loc[:, ~((train2.columns == 'Small_Center') | (train2.columns == 'Big_Center'))]

In [None]:
# Split Train and Test Sets
# Hold out 20% of the labelled wafers. The previous call had an unclosed
# `train.loc[` bracket and could not be parsed.
train_data, test_data, train_labels, test_labels = train_test_split(X, y, test_size=0.2)

In [28]:
# Big-center classifier: a random forest with fixed hyperparameters, fitted on
# every row of X / y (no rows are held out in this cell).
model = ensemble.RandomForestClassifier(
    n_estimators=210,
    max_depth=5,
    criterion='entropy',
    class_weight=None,
    n_jobs=-1,
)
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=210, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
# Exhaustive hyperparameter search for a random forest on X / y.
clf = ensemble.RandomForestClassifier(n_jobs=-1)

# Search space: depth 5..11, forest size 200..240 in steps of 10,
# both split criteria, with and without class balancing.
parameter_grid = dict(
    max_depth=np.arange(5, 12, 1),
    n_estimators=np.arange(200, 250, 10),
    criterion=['gini', 'entropy'],
    class_weight=[None, 'balanced'],
)

grid_search = model_selection.GridSearchCV(estimator=clf, param_grid=parameter_grid)
grid_search.fit(X, y)
grid_search.best_estimator_

In [None]:
# Performance estimate for the tuned forest: score the grid search's best
# estimator on 1000 random shuffle splits of X / y, each holding out 20%.
clf = grid_search.best_estimator_
cv = model_selection.ShuffleSplit(test_size=.2, n_splits=1000)
cv_scores = model_selection.cross_val_score(estimator=clf, X=X, y=y, cv=cv)

# Plot the distribution of the per-split scores, then print their mean.
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.5)
sns.distributions.distplot(tuple(cv_scores))
print("average accuracy score:", cv_scores.mean())

In [None]:
# Create a dataframe showing all the mistakes made by the classifier.
# Out-of-fold predictions are used so that each wafer is scored by a forest that
# did not see it during training; cross_val_predict fits clones, so the fitted
# `model` itself is left untouched.
# NOTE(review): cv=5 assumes each class has at least a handful of wafers - confirm.
predicted = model_selection.cross_val_predict(model, X, y, cv=5)
fail_analysis = y.to_frame()
fail_analysis['predicted'] = predicted
# Keep only the misclassified wafers.
fail_analysis = fail_analysis[fail_analysis[y.name] != fail_analysis['predicted']]

In [None]:
fail_analysis

In [29]:
# Persist the fitted big-center forest to disk.
joblib.dump(value=model, filename='models/7nmBigCenter.sav')

['models/7nmBigCenter.sav']

In [11]:
# Load the persisted big-center forest from the local file (nothing is downloaded).
# NOTE: joblib files are pickles; only load files that come from a trusted source.
BigCenter_model = joblib.load(filename='models/7nmBigCenter.sav')

# Classify Data

## Radial and Smiley

### Pull Classification Features

In [12]:
# Query parameters; each is substituted into SQLQuery by the .format() call that closes the string.
daysback = 300  # look-back window in days, compared against TestedWafer.Last_test_date
Calcdefs = 'S2561'  # value matched against TestedWafer.Calcdefs
corner = 'SRAM256MST00p655CHIPA'  # value matched against the `corner` column
categoryName = 'BIT'  # category whose `count` is summed into BitCount

# Per-chip feature query made of two derived tables joined on wafer id and chip X/Y:
#   * APRCTable - sums `count` per chip for rows matching `corner` and `categoryName`.
#   * ElecTable - sums parmValue per chip for each of the four Q0..Q3 FBC parameter labels.
# The six `{}` placeholders are filled positionally, in the order they appear in the text.
SQLQuery = """
SELECT lot_id_base, APRCTable.wafer_id, APRCTable.ChipX || ', ' || APRCTable.ChipY as ChipXY, family_Code,
     radius_Center_5, quadrant as waf_quadrant, locationType,
     --APRCTable.corner, categoryName as aprc, 
     BitCount, Q0FailCount, Q1FailCount, Q2FailCount, Q3FailCount
     
FROM
(
Select lot_id_base, wafer_Id, ChipX, ChipY, family_Code, radius_Center_5, quadrant, locationType, SUM(BitCount) as BitCount
FROM
(
SELECT left(lot_Id, 5) as lot_id_base, wafer_Id, normalized_TestX as ChipX, normalized_TestY as ChipY, family_Code,
     corner, categoryName, count, radius_Center_5, quadrant, locationType,
     case
         when corner = '{}' and categoryName = '{}' then count
         else 0
     end as BitCount
FROM DMIW.PattRecChipFactR prcfr
JOIN DMIW_SYSTEMS.DerivedDataSetup dds ON prcfr.derivedSetupKey = dds.derivedSetupKey
JOIN DMIW_SYSTEMS.PattRecCategory prc ON prcfr.pattRecCatKey = prc.pattRecCatKey
JOIN DMIW_SYSTEMS.Geography geo ON prcfr.geographyKey = geo.geographyKey
JOIN (select testedWaferKey, lot_Id, wafer_id, family_Code, cur_Lot_Grade as lot_Grade, last_TestTimeStamp as date,
     level, testProgramName, last_Eqp_Id
     from DMIW_SYSTEMS.TestedWafer tw
     where Last_test_date >= (current date - {} days) and Tech_id = '7HPP' and Calcdefs = '{}'
     group by testedWaferKey, lot_Id, wafer_Id, family_Code, cur_Lot_Grade, last_TestTimeStamp, level, 
     testProgramName, last_Eqp_Id) tw ON prcfr.testedWaferKey = tw.testedWaferKey
WHERE categoryName <> '' and exclude_Flag = 'N'
)
GROUP BY lot_id_base, wafer_Id, ChipX, ChipY, family_Code, radius_Center_5, quadrant, locationType
ORDER BY lot_Id_base, wafer_Id, ChipX, ChipY
) APRCTable

JOIN
(
SELECT wafer_Id, ChipX, ChipY,
    SUM(Q0FailCount) as Q0FailCount, SUM(Q1FailCount) as Q1FailCount,
    SUM(Q2FailCount) as Q2FailCount, SUM(Q3FailCount) as Q3FailCount
FROM
(
SELECT wafer_Id, normalized_TestX as ChipX, normalized_TestY as ChipY, tp.parm_Label, parmValue,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q0_FBC_0P65' then parmValue
         else 0
     end as Q0FailCount,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q1_FBC_0P65' then parmValue
         else 0
     end as Q1FailCount,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q2_FBC_0P65' then parmValue
         else 0
     end as Q2FailCount,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q3_FBC_0P65' then parmValue
         else 0
     end as Q3FailCount
FROM DMIW.ChipParmFactR cpfr
JOIN DMIW_SYSTEMS.TestParm tp ON cpfr.testParmKey = tp.testParmKey
JOIN DMIW_SYSTEMS.Geography geo ON cpfr.geographyKey = geo.geographyKey
JOIN (select testedWaferKey, lot_Id, wafer_Id, family_Code, cur_Lot_Grade as lot_Grade, 
     last_TestTimeStamp as date, level, last_Eqp_Id 
     from DMIW_SYSTEMS.TestedWafer testedWafer 
     where Last_test_date >= (current date - {} days) and Tech_id = '7HPP' and Calcdefs = '{}' 
     group by testedWaferKey, lot_Id, wafer_Id, family_Code, cur_Lot_Grade, last_TestTimeStamp,
     level, last_Eqp_Id) tw ON cpfr.testedWaferKey = tw.testedWaferKey
WHERE cpfr.parmValue is not null and abs(parmValue) < 1e25 and
    (ucase(tp.parm_Label) in ('S256~H2~Q0_FBC_0P65', 'S256~H2~Q1_FBC_0P65', 'S256~H2~Q2_FBC_0P65', 'S256~H2~Q3_FBC_0P65'))
)
GROUP BY wafer_Id, ChipX, ChipY
ORDER BY wafer_Id, ChipX, ChipY
) ElecTable

ON APRCTable.wafer_Id = ElecTable.wafer_Id and APRCTable.ChipX = ElecTable.ChipX and APRCTable.ChipY = ElecTable.ChipY
ORDER BY lot_Id_base, wafer_Id
""".format(corner, categoryName, daysback, Calcdefs, daysback, Calcdefs)

# Run the query over the open warehouse connection and load the rows.
df = pd.read_sql(SQLQuery,con)
    
# One row per wafer, one column per (measure, chip X/Y) pair; missing cells become 0.
# NOTE(review): the fitted forests presumably expect the same columns, in the same
# order, as the frame they were trained on - confirm this pivot yields that layout.
features1 = pd.pivot_table(df, values=('BITCOUNT','Q0FAILCOUNT','Q1FAILCOUNT','Q2FAILCOUNT','Q3FAILCOUNT'), index=['WAFER_ID'], columns=['CHIPXY'], aggfunc=np.mean).fillna(0)

In [17]:
# Query parameters; each is substituted into SQLQuery by the .format() call that closes the string.
daysback = 300  # look-back window in days, compared against TestedWafer.Last_test_date
Calcdefs = 'S2561'  # value matched against TestedWafer.Calcdefs
corner = 'SRAM256MST00p655CHIPA'  # value matched against the `corner` column (used for both BIT and DBC)

# Per-chip feature query made of two derived tables joined on wafer id and chip X/Y:
#   * APRCTable - sums `count` per chip separately for categoryName 'BIT' and 'DBC' at `corner`.
#   * ElecTable - sums parmValue per chip for each of the four Q0..Q3 FBC parameter labels.
# The six `{}` placeholders are filled positionally, in the order they appear in the text.
SQLQuery = """
SELECT lot_id_base, APRCTable.wafer_id, APRCTable.ChipX || ', ' || APRCTable.ChipY as ChipXY, family_Code,
     radius_Center_5, quadrant as waf_quadrant, locationType,
     --APRCTable.corner, categoryName as aprc, 
     BitCount,DBCCount, Q0FailCount, Q1FailCount, Q2FailCount, Q3FailCount
     
FROM
(
Select lot_id_base, wafer_Id, ChipX, ChipY, family_Code, radius_Center_5, quadrant, locationType, 
SUM(BitCount) as BitCount, SUM(DBCCount) as DBCCount
FROM
(
SELECT left(lot_Id, 5) as lot_id_base, wafer_Id, normalized_TestX as ChipX, normalized_TestY as ChipY, family_Code,
     corner, categoryName, count, radius_Center_5, quadrant, locationType,
     case
         when corner = '{}' and categoryName = 'BIT' then count
         else 0
     end as BitCount,
     case
         when corner = '{}' and categoryName = 'DBC' then count
         else 0
     end as DBCCount
FROM DMIW.PattRecChipFactR prcfr
JOIN DMIW_SYSTEMS.DerivedDataSetup dds ON prcfr.derivedSetupKey = dds.derivedSetupKey
JOIN DMIW_SYSTEMS.PattRecCategory prc ON prcfr.pattRecCatKey = prc.pattRecCatKey
JOIN DMIW_SYSTEMS.Geography geo ON prcfr.geographyKey = geo.geographyKey
JOIN (select testedWaferKey, lot_Id, wafer_id, family_Code, cur_Lot_Grade as lot_Grade, last_TestTimeStamp as date,
     level, testProgramName, last_Eqp_Id
     from DMIW_SYSTEMS.TestedWafer tw
     where Last_test_date >= (current date - {} days) and Tech_id = '7HPP' and Calcdefs = '{}'
     group by testedWaferKey, lot_Id, wafer_Id, family_Code, cur_Lot_Grade, last_TestTimeStamp, level, 
     testProgramName, last_Eqp_Id) tw ON prcfr.testedWaferKey = tw.testedWaferKey
WHERE categoryName <> '' and exclude_Flag = 'N'
)
GROUP BY lot_id_base, wafer_Id, ChipX, ChipY, family_Code, radius_Center_5, quadrant, locationType
ORDER BY lot_Id_base, wafer_Id, ChipX, ChipY
) APRCTable

JOIN
(
SELECT wafer_Id, ChipX, ChipY,
    SUM(Q0FailCount) as Q0FailCount, SUM(Q1FailCount) as Q1FailCount,
    SUM(Q2FailCount) as Q2FailCount, SUM(Q3FailCount) as Q3FailCount
FROM
(
SELECT wafer_Id, normalized_TestX as ChipX, normalized_TestY as ChipY, tp.parm_Label, parmValue,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q0_FBC_0P65' then parmValue
         else 0
     end as Q0FailCount,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q1_FBC_0P65' then parmValue
         else 0
     end as Q1FailCount,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q2_FBC_0P65' then parmValue
         else 0
     end as Q2FailCount,
     case
         when ucase(tp.parm_Label) = 'S256~H2~Q3_FBC_0P65' then parmValue
         else 0
     end as Q3FailCount
FROM DMIW.ChipParmFactR cpfr
JOIN DMIW_SYSTEMS.TestParm tp ON cpfr.testParmKey = tp.testParmKey
JOIN DMIW_SYSTEMS.Geography geo ON cpfr.geographyKey = geo.geographyKey
JOIN (select testedWaferKey, lot_Id, wafer_Id, family_Code, cur_Lot_Grade as lot_Grade, 
     last_TestTimeStamp as date, level, last_Eqp_Id 
     from DMIW_SYSTEMS.TestedWafer testedWafer 
     where Last_test_date >= (current date - {} days) and Tech_id = '7HPP' and Calcdefs = '{}' 
     group by testedWaferKey, lot_Id, wafer_Id, family_Code, cur_Lot_Grade, last_TestTimeStamp,
     level, last_Eqp_Id) tw ON cpfr.testedWaferKey = tw.testedWaferKey
WHERE cpfr.parmValue is not null and abs(parmValue) < 1e25 and
    (ucase(tp.parm_Label) in ('S256~H2~Q0_FBC_0P65', 'S256~H2~Q1_FBC_0P65', 'S256~H2~Q2_FBC_0P65', 'S256~H2~Q3_FBC_0P65'))
)
GROUP BY wafer_Id, ChipX, ChipY
ORDER BY wafer_Id, ChipX, ChipY
) ElecTable

ON APRCTable.wafer_Id = ElecTable.wafer_Id and APRCTable.ChipX = ElecTable.ChipX and APRCTable.ChipY = ElecTable.ChipY
ORDER BY lot_Id_base, wafer_Id
""".format(corner, corner, daysback, Calcdefs, daysback, Calcdefs)
    
# Run the query over the open warehouse connection and load the rows.
df = pd.read_sql(SQLQuery,con)
    
# One row per wafer, one column per (measure, chip X/Y) pair; missing cells become 0.
# NOTE(review): the fitted forests presumably expect the same columns, in the same
# order, as the frame they were trained on - confirm this pivot yields that layout.
features2 = pd.pivot_table(df, values=('BITCOUNT','DBCCOUNT','Q0FAILCOUNT','Q1FAILCOUNT','Q2FAILCOUNT','Q3FAILCOUNT'), index=['WAFER_ID'], columns=['CHIPXY'], aggfunc=np.mean).fillna(0)

### Apply Models

In [13]:
# Feature matrix for the Radial / Smiley forests: every column of features1
# that is not named after one of the two labels.
X = features1.loc[:, ~((features1.columns == 'Radial') | (features1.columns == 'Smiley'))]

In [14]:
# Score every wafer in X with the Radial forest: the hard prediction plus the
# probability taken from column 1 of predict_proba.
model = Radial_model
Radial_results = pd.DataFrame({
    'WaferId': X.index,
    'Rad_pred': model.predict(X),
    'Rad_prob': model.predict_proba(X)[:, 1],
})

In [15]:
# Score every wafer in X with the Smiley forest: the hard prediction plus the
# probability taken from column 1 of predict_proba.
model = Smiley_model
Smiley_results = pd.DataFrame({
    'WaferId': X.index,
    'Smile_pred': model.predict(X),
    'Smile_prob': model.predict_proba(X)[:, 1],
})

In [18]:
# Feature matrix for the center forests: every column of features2 that is not
# named after one of the two labels.
X = features2.loc[:, ~((features2.columns == 'Small_Center') | (features2.columns == 'Big_Center'))]

In [19]:
# Score every wafer in X with the small-center forest: the hard prediction plus
# the probability taken from column 1 of predict_proba.
model = SmallCenter_model
SmallC_results = pd.DataFrame({
    'WaferId': X.index,
    'SmallC_pred': model.predict(X),
    'SmallC_prob': model.predict_proba(X)[:, 1],
})

In [20]:
# Score every wafer in X with the big-center forest: the hard prediction plus
# the probability taken from column 1 of predict_proba.
model = BigCenter_model
BigC_results = pd.DataFrame({
    'WaferId': X.index,
    'BigC_pred': model.predict(X),
    'BigC_prob': model.predict_proba(X)[:, 1],
})

In [21]:
# Combine the four per-signature result tables into one frame indexed by WaferId.
# Each join is a left join, so the row set is that of Radial_results.
output = (
    Radial_results.set_index('WaferId')
    .join(Smiley_results.set_index('WaferId'))
    .join(SmallC_results.set_index('WaferId'))
    .join(BigC_results.set_index('WaferId'))
)

In [51]:
# Write the combined predictions to a CSV file in the user's Downloads folder.
output.to_csv(path_or_buf='~/Downloads/Fail_Signatures.csv')

In [23]:
# Convert to CSV File for Cheng Tin, then upload it to the shared GSA directory over SFTP.
import getpass
import os

import paramiko

localpath = "/Users/acyang@us.ibm.com/Downloads/Fail_Signatures.csv"
gsapath = "/gsa/pokgsa/home/a/c/acyang/public/Fail_Signatures.csv"

output.to_csv(localpath)

host = "pokgsa.ibm.com"
username = "acyang"

# SECURITY: never store the password in the notebook. It is read from the
# GSA_PASSWORD environment variable when set, otherwise prompted for
# interactively. The password that used to be hard-coded in this cell was saved
# with the notebook and must be treated as compromised and rotated.
password = os.environ.get("GSA_PASSWORD") or getpass.getpass(
    "Password for {}@{}: ".format(username, host))

# Port 22 is what paramiko uses by default when only a host name is given.
transport = paramiko.Transport((host, 22))
try:
    transport.connect(username=username, password=password)
    sftp = paramiko.SFTPClient.from_transport(transport)
    try:
        upload_attrs = sftp.put(localpath, gsapath)
    finally:
        # Always release the SFTP channel, even if the upload fails.
        sftp.close()
finally:
    # Always close the SSH transport so the connection is not leaked.
    transport.close()

# Display the attributes of the uploaded remote file.
upload_attrs

<SFTPAttributes: [ size=16949 uid=563230 gid=1024768 mode=0o200100660 atime=1571939257 mtime=1572898915 ]>

In [35]:
output[output.index.str.contains("AZ3FY")]

Unnamed: 0_level_0,Rad_pred,Rad_prob,Smile_pred,Smile_prob,SmallC_pred,SmallC_prob,BigC_pred,BigC_prob
WaferId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AZ3FY-01,0.0,0.031818,0.0,0.0,0.0,0.326087,0.0,0.033333
AZ3FY-04,0.0,0.140909,0.0,0.018182,0.0,0.03913,0.0,0.014286
AZ3FY-05,0.0,0.077273,0.0,0.0,0.0,0.017391,0.0,0.009524
AZ3FY-09,0.0,0.040909,0.0,0.0,0.0,0.030435,0.0,0.009524
AZ3FY-11,0.0,0.140909,0.0,0.009091,0.0,0.008696,0.0,0.014286
AZ3FY-12,0.0,0.086364,0.0,0.009091,0.0,0.013043,0.0,0.042857
AZ3FY-13,0.0,0.090909,0.0,0.013636,0.0,0.004348,0.0,0.004762
AZ3FY-14,0.0,0.077273,0.0,0.009091,0.0,0.017391,0.0,0.0
AZ3FY-15,0.0,0.05,0.0,0.0,0.0,0.026087,0.0,0.014286
AZ3FY-16,0.0,0.122727,0.0,0.022727,0.0,0.008696,0.0,0.033333
