## PHME 2022 Data Challenge

This is the skeleton of the Jupyter Notebook you have to fill.
The Notebook must define three functions, each for solving a separate classification task of the challenge.
They are `classification_1`, `classification_2` and `classification_3`, and they must solve tasks 1, 2 and 3 of the challenge, respectively.

**Automatic Scoring and Leader Board:** 
For each participant, we will consider the file located at `data-challenge-phme/solution.ipynb` as the proposed solution. We will execute the Notebook and, at the end, invoke the functions with the test data.
We will evaluate the output of the functions and compute the performance on it.

**Note:** if the execution of the notebook leads to an error or an exception, the functions will not be defined and we will not evaluate the performance of your solution.

**Note:** the notebook must have a reasonable execution time. We will not evaluate notebooks requiring more than **10 minutes** to be executed.
If you want to train complex models that require a large amount of time, do it in a separate notebook. Thus, in `solution.ipynb`, load pre-trained models.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm 

from sklearn.neighbors import KernelDensity
import joblib
from sklearn.metrics import roc_auc_score
import os
import random
import xgboost
import numpy as np
import matplotlib.pylab as plt
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.preprocessing import StandardScaler

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

import category_encoders as ce


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda3/envs/bigdatalab_cpu_202101/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.2/matplotlibrc.template
or from the matplotlib source distribution


In [3]:
# The input is the SPI data in form of a Pandas DataFrame, exactly as it is read with pd.read_csv()
# The output must be the list of predicted defects. Each defect is a tuple (Panel, Figure, Component)

def classification_1 (spi):
    """Predict the defective components of task 1 from the SPI measurements.

    Rows with any missing value are discarded, the component identifier is
    encoded with the pre-fitted encoder stored in ``Cat_boost_step1`` and the
    rows are scored with the pre-trained model stored in ``xgboost_step1``.
    Rows predicted as class 0 are the ones reported as defects.

    Args:
        spi: SPI data as a pandas DataFrame, exactly as it is read with
            ``pd.read_csv()``. The caller's frame is not modified.

    Returns:
        A list of ``(PanelID, FigureID, ComponentID)`` tuples, one per SPI row
        predicted as defective. ``FigureID`` is returned as the string form of
        its integer value.
    """
    percent_features = ['Volume(%)', 'Area(%)', 'OffsetX(%)', 'OffsetY(%)']
    numeric_features = ['Shape(um)', 'PosX(mm)', 'PosY(mm)', 'SizeX', 'SizeY']
    encoded_inputs = ["ComponentID2"]

    # Work on a private copy and drop every row that has a missing value.
    spi_real = spi.copy().dropna()

    # Keep the figure identifier aside; it is what ends up in the output.
    spi_real["FigureID2"] = spi_real["FigureID"]

    # Cast column by column so that each column really gets a float dtype.
    for column in numeric_features:
        spi_real[column] = spi_real[column].astype('float')

    # Integer category codes of the component identifier feed the encoder.
    # NOTE(review): the codes depend on the set of ComponentID values present
    # in this input - confirm this matches how the encoder was fitted.
    spi_real["ComponentID2"] = spi_real["ComponentID"].astype('category').cat.codes
    encoder = joblib.load("Cat_boost_step1")
    encoded = encoder.transform(spi_real[encoded_inputs].copy())
    encoded.columns = [name + "_encoded" for name in encoded.columns.tolist()]
    spi_real = pd.concat([spi_real, encoded], axis=1)

    feature_columns = ["ComponentID2_encoded"] + percent_features + numeric_features
    model = joblib.load("xgboost_step1")
    spi_real['Pred_label'] = model.predict(spi_real[feature_columns].to_numpy())

    # Keep the rows predicted as class 0. The explicit copy makes the column
    # assignment below operate on an independent frame instead of a slice.
    flagged = spi_real.loc[spi_real['Pred_label'] == 0]
    flagged = flagged.dropna(subset=['FigureID2']).copy()
    flagged["FigureID2"] = flagged["FigureID2"].astype(int).astype(str)

    defects = list(
        flagged[["PanelID", "FigureID2", "ComponentID"]].itertuples(index=False, name=None)
    )
    return defects

In [4]:
# The first input is the SPI data in form of a Pandas DataFrame, exactly as it is read with pd.read_csv()
# The second input is the AOI data. OperatorLabel and RepairLabel are not included, as you must predict OperatorLabel
# The output must be the classification result. Each entry is a tuple (Panel, Figure, Component, PredictedOperatorLabel)

def classification_2 (spi, aoi):
    """Predict the operator label of every AOI component (task 2).

    The AOI rows are split in two groups that are scored by different
    pre-trained models:

    * rows with a pin number are inner-joined with the complete SPI rows of
      the same panel/figure/component/pin and scored with
      ``xgboost_step2_inner``;
    * rows without a pin number are left-joined with the per-component mean
      of the SPI percentage features and scored with ``xgboost_step2_outter``.

    The row predictions are then reduced per component with ``min``, so a
    component is reported as "Bad" as soon as one of its rows is predicted 0.
    AOI rows that have a pin number but no matching complete SPI row drop out
    of the inner join and therefore contribute no prediction.

    Args:
        spi: SPI data as a pandas DataFrame, exactly as it is read with
            ``pd.read_csv()``. Not modified.
        aoi: AOI data as a pandas DataFrame without ``OperatorLabel`` and
            ``RepairLabel``. Not modified.

    Returns:
        A list of ``(PanelID, FigureID, ComponentID, label)`` tuples where
        ``label`` is "Bad" for a predicted 0 and "Good" for a predicted 1.
    """
    encoded_inputs = ["AOILabel", "ComponentID", "FigureID_ComponentID"]
    encoded_names = [name + "_encoded" for name in encoded_inputs]
    percent_features = ['Volume(%)', 'Area(%)', 'OffsetX(%)', 'OffsetY(%)']
    numeric_features = ['Shape(um)', 'PosX(mm)', 'PosY(mm)', 'SizeX', 'SizeY']
    count_features = ["Count_Pin", "Count_Pin_Figure"]
    component_key = ["PanelID", "FigureID2", "ComponentID2"]

    # Only complete SPI rows are used, both for the pin-level join and for
    # the per-component means. dropna() returns a new frame, so the caller's
    # data is left untouched.
    spi_real = spi.dropna()

    aoi_real = aoi.copy()
    # Missing pin numbers become the literal string '<NA>', which is used
    # further down to select the rows that carry no pin number.
    aoi_real['PinNumber'] = aoi_real['PinNumber'].astype('Int64').astype(str)
    # ComponentID is overwritten with category codes later on, so the raw
    # identifiers are kept in the *2 columns for grouping and for the output.
    aoi_real['FigureID2'] = aoi_real['FigureID']
    aoi_real['ComponentID2'] = aoi_real['ComponentID']
    aoi_real['Count_Pin'] = aoi_real.groupby(component_key)["PinNumber"].transform('count')
    aoi_real['Count_Pin_Figure'] = aoi_real.groupby(["PanelID", "FigureID2"])["PinNumber"].transform('count')

    # ---- AOI rows that match an SPI pin -----------------------------------
    aoi_inner = aoi_real.merge(spi_real, on=['PanelID', 'FigureID', "ComponentID", "PinNumber"], how="inner")
    # Must be built before ComponentID is replaced by its category codes.
    aoi_inner['FigureID_ComponentID'] = (
        aoi_inner['FigureID'].astype(str) + '_' + aoi_inner['ComponentID'].astype(str)
    ).astype("category").cat.codes
    for column in encoded_inputs:
        aoi_inner[column] = aoi_inner[column].astype('category').cat.codes

    encoder = joblib.load("Cat_boost_step2_inner")
    encoded = encoder.transform(aoi_inner[encoded_inputs].copy())
    encoded.columns = [name + "_encoded" for name in encoded.columns.tolist()]
    aoi_inner = pd.concat([aoi_inner, encoded], axis=1)

    inner_features = encoded_names + numeric_features + percent_features + count_features
    aoi_inner = aoi_inner.replace(np.nan, 0)
    xgb_cl = joblib.load("xgboost_step2_inner")
    aoi_inner['Pred'] = xgb_cl.predict(aoi_inner[inner_features].to_numpy())

    # ---- AOI rows without a pin number ------------------------------------
    spi_component_means = (
        spi_real.groupby(["PanelID", "FigureID", "ComponentID"])[percent_features].mean().reset_index()
    )
    aoi_outter = aoi_real.loc[aoi_real.PinNumber == '<NA>']
    aoi_outter = aoi_outter.merge(spi_component_means, on=['PanelID', 'FigureID', "ComponentID"], how="left")
    aoi_outter['FigureID_ComponentID'] = (
        aoi_outter['FigureID'].astype(str) + '_' + aoi_outter['ComponentID'].astype(str)
    ).astype("category").cat.codes
    for column in encoded_inputs:
        aoi_outter[column] = aoi_outter[column].astype('category').cat.codes

    encoder = joblib.load("Cat_boost_step2_outer")
    encoded = encoder.transform(aoi_outter[encoded_inputs].copy())
    encoded.columns = [name + "_encoded" for name in encoded.columns.tolist()]
    aoi_outter = pd.concat([aoi_outter, encoded], axis=1)

    outer_features = percent_features + encoded_names + count_features
    xgb_cl = joblib.load("xgboost_step2_outter")
    aoi_outter['Pred'] = xgb_cl.predict(aoi_outter[outer_features].to_numpy())

    # ---- One label per component -------------------------------------------
    scored = pd.concat([aoi_outter, aoi_inner])
    aoi_group = scored.groupby(component_key)['Pred'].min().reset_index()

    # Compute the masks on the numeric predictions, then switch the column to
    # object dtype before writing the string labels into it.
    is_bad = aoi_group['Pred'] == 0
    is_good = aoi_group['Pred'] == 1
    aoi_group['Pred'] = aoi_group['Pred'].astype(object)
    aoi_group.loc[is_bad, 'Pred'] = "Bad"
    aoi_group.loc[is_good, 'Pred'] = "Good"

    predicted = list(aoi_group[component_key + ['Pred']].itertuples(index=False, name=None))
    return predicted

In [5]:
# The first input is the SPI data in form of a Pandas DataFrame, exactly as it is read with pd.read_csv()
# The second input is the AOI data. RepairLabel is not included, as you must predict it
# The output must be the classification result. Each entry is a tuple (Panel, Figure, Component, PredictedRepairLabel)

def classification_3 (spi, aoi):
    """Predict the repair label of the components an operator marked "Bad" (task 3).

    Only AOI rows whose ``OperatorLabel`` is "Bad" are considered. For each
    component the features are: pin counts, one indicator per known AOI label
    (1 when the component shows that label on any of its rows), the
    per-component mean of the SPI measurements, the encoded component
    identifiers and the machine code. One row per component is scored with
    the pre-trained model stored in ``xgboost_step3_no_fusion``.

    Args:
        spi: SPI data as a pandas DataFrame, exactly as it is read with
            ``pd.read_csv()``. Not modified.
        aoi: AOI data as a pandas DataFrame including ``OperatorLabel`` but
            not ``RepairLabel``. Not modified.

    Returns:
        A list of ``(PanelID, FigureID, ComponentID, label)`` tuples where
        ``label`` is "FalseScrap" for a predicted 0 and "NotPossibleToRepair"
        for a predicted 1.
    """
    percent_features = ['Volume(%)', 'Area(%)', 'OffsetX(%)', 'OffsetY(%)']
    numeric_features = ['Shape(um)', 'PosX(mm)', 'PosY(mm)', 'SizeX', 'SizeY']
    count_features = ["Count_Pin", "Count_Pin_Figure", "Count_Pin_Panel"]
    encoded_inputs = ["ComponentID", "FigureID_ComponentID"]
    component_key = ["PanelID", "FigureID2", "ComponentID2"]

    spi_real = spi.copy()
    aoi_real = aoi.copy()

    aoi_real['PinNumber'] = aoi_real['PinNumber'].astype('Int64').astype(str)
    # ComponentID is overwritten with category codes later on, so the raw
    # identifiers are kept in the *2 columns for grouping and for the output.
    aoi_real['FigureID2'] = aoi_real['FigureID']
    aoi_real['ComponentID2'] = aoi_real['ComponentID']
    # The counts are taken over all AOI rows, before the "Bad" filter below.
    aoi_real['Count_Pin'] = aoi_real.groupby(component_key)["PinNumber"].transform('count')
    aoi_real['Count_Pin_Figure'] = aoi_real.groupby(["PanelID", "FigureID2"])["PinNumber"].transform('count')
    aoi_real['Count_Pin_Panel'] = aoi_real.groupby(["PanelID"])["PinNumber"].transform('count')

    aoi_real["MachineID"] = aoi_real["MachineID"].astype('category').cat.codes

    spi_real['FigureID2'] = spi_real['FigureID']
    spi_real['ComponentID2'] = spi_real['ComponentID']
    # NOTE(review): when FigureID contains NaN the int cast is skipped
    # (errors='ignore') and the key is built from the un-cast values - confirm
    # that the SPI and AOI keys still match in that case.
    spi_real['key_spi'] = (spi_real['PanelID'].astype(str) + '_' + spi_real['FigureID'].astype(int, errors='ignore').astype(str) + '_' + spi_real['ComponentID'].astype(str))
    aoi_real['key_spi'] = (aoi_real['PanelID'].astype(str) + '_' + aoi_real['FigureID'].astype(int, errors='ignore').astype(str) + '_' + aoi_real['ComponentID'].astype(str))

    # Explicit copy: the filtered frame receives new columns below.
    aoi_real = aoi_real.loc[aoi_real.OperatorLabel == "Bad"].copy()

    aoi_real['PanelID_FigureID_ComponentID'] = aoi_real['PanelID'].astype(str) + '_' + aoi_real['FigureID'].astype(str) + '_' + aoi_real['ComponentID'].astype(str)
    listAOI_Label = joblib.load('listAOI_Label')

    # One indicator column per known AOI label: 1 on every row of a component
    # that shows the label on at least one of its rows, 0 otherwise. A grouped
    # max does this in one pass instead of re-filtering the frame for every
    # component. Grouping by the raw array keeps the operation positional.
    component_ids = aoi_real['PanelID_FigureID_ComponentID'].to_numpy()
    for label in listAOI_Label:
        has_label = (aoi_real['AOILabel'] == label).astype(int)
        aoi_real[label] = has_label.groupby(component_ids).transform('max').to_numpy()

    spi_real = spi_real.dropna(subset=['FigureID'])
    aoi_real['FigureID_ComponentID'] = (
        aoi_real['FigureID'].astype(str) + '_' + aoi_real['ComponentID'].astype(str)
    ).astype("category").cat.codes

    # Keep only the SPI rows of components that are still in the AOI frame.
    wanted_keys = aoi_real['key_spi'].unique().tolist()
    spi_real = spi_real.loc[spi_real['key_spi'].isin(wanted_keys)].copy()
    spi_real['Shape(um)'] = pd.to_numeric(spi_real['Shape(um)'])

    spi_component_means = (
        spi_real.groupby(component_key)[percent_features + numeric_features].mean().reset_index()
    )
    aoi_real = aoi_real.merge(spi_component_means, how="left", on=component_key)
    # All feature columns are per-component at this point, so one row each is enough.
    aoi_real = aoi_real.drop_duplicates(subset=component_key).copy()

    for column in encoded_inputs:
        aoi_real[column] = aoi_real[column].astype('category').cat.codes
    encoder = joblib.load("Cat_boost_step3_no_fusion")
    encoded = encoder.transform(aoi_real[encoded_inputs].copy())
    encoded.columns = [name + "_encoded" for name in encoded.columns.tolist()]
    features_frame = pd.concat([aoi_real, encoded], axis=1)

    feature_columns = (
        numeric_features
        + [name + "_encoded" for name in encoded_inputs]
        + count_features
        + listAOI_Label
        + percent_features
        + ["MachineID"]
    )
    features_frame = features_frame.replace(np.nan, 0)
    xgb_cl = joblib.load("xgboost_step3_no_fusion")
    aoi_real['Pred'] = xgb_cl.predict(features_frame[feature_columns].to_numpy())

    aoi_group = aoi_real.groupby(component_key)['Pred'].min().reset_index()

    # Compute the masks on the numeric predictions, then switch the column to
    # object dtype before writing the string labels into it.
    is_false_scrap = aoi_group['Pred'] == 0
    is_not_repairable = aoi_group['Pred'] == 1
    aoi_group['Pred'] = aoi_group['Pred'].astype(object)
    aoi_group.loc[is_false_scrap, 'Pred'] = "FalseScrap"
    aoi_group.loc[is_not_repairable, 'Pred'] = "NotPossibleToRepair"

    predicted = list(aoi_group[component_key + ['Pred']].itertuples(index=False, name=None))
    return predicted

## Test the code

Below is some code that you can use to test whether your script handles the data correctly.

We will use a very similar piece of code to run your Notebook to build the leaderboard.

In [6]:
from sklearn.metrics import classification_report
import pandas as pd
import statistics
import math
import glob

# Read every SPI archive and stack the rows into a single frame.
dfs = [pd.read_csv(path) for path in glob.glob("data/SPI_*.csv.zip")]
SPI = pd.concat(dfs)

# Same for the AOI archives.
dfs = [pd.read_csv(path) for path in glob.glob("data/AOI_*.csv.zip")]
AOI = pd.concat(dfs)

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
def _build_validation_frame(reference_rows, predictions, label_column):
    """Pair the true label of each reference row with its prediction.

    ``predictions`` maps ``(PanelID, FigureID, ComponentID)`` string triples
    to a predicted label; components without a prediction get "-".
    """
    records = []
    for row in reference_rows.itertuples():
        lookup_key = (str(row.PanelID), str(row.FigureID), str(row.ComponentID))
        records.append((
            row.PanelID,
            row.FigureID,
            row.ComponentID,
            getattr(row, label_column),
            predictions.get(lookup_key, "-"),
        ))
    return pd.DataFrame(records, columns = ["PanelID","FigureID","ComponentID", "Real", "Predicted"])


component_columns = ["PanelID","FigureID","ComponentID"]
aoi_input_columns = ["PanelID","FigureID","MachineID","ComponentID","PinNumber","AOILabel"]

# Run the three classifiers on the loaded data.
results_1 = set(classification_1(SPI.copy()))
results_2 = classification_2(SPI, AOI[aoi_input_columns])
results_3 = classification_3(SPI, AOI[aoi_input_columns + ["OperatorLabel"]])

# Performance Task 1: every component listed in AOI counts as a true defect.
groundtruth_1 = {tuple(str(value) for value in row) for row in AOI[component_columns].values}
true_positives_1 = len(results_1 & groundtruth_1)
precision_1 = true_positives_1 / len(results_1) if len(results_1) > 0 else 0
recall_1 = true_positives_1 / len(groundtruth_1) if len(groundtruth_1) > 0 else 0
if precision_1 + recall_1 > 0:
    f1_1 = 2 * precision_1 * recall_1 / (precision_1 + recall_1)
else:
    f1_1 = 0

# Performance Task 2: F1 of the "Bad" class, one row per component.
results_dict_2 = {(str(p), str(f), str(c)): label for p, f, c, label in results_2}
validationdata_2 = _build_validation_frame(
    AOI.drop_duplicates(subset=component_columns, keep="first"),
    results_dict_2,
    "OperatorLabel",
)
f1_2 = classification_report(validationdata_2["Real"], validationdata_2["Predicted"],output_dict=True)["Bad"]["f1-score"]

# Performance Task 3: mean F1 of the two repair classes, one row per component.
results_dict_3 = {(str(p), str(f), str(c)): label for p, f, c, label in results_3}
repair_rows = AOI[AOI["RepairLabel"].isin({"FalseScrap","NotPossibleToRepair"})]
validationdata_3 = _build_validation_frame(
    repair_rows.drop_duplicates(subset=component_columns, keep="first"),
    results_dict_3,
    "RepairLabel",
)
cr = classification_report(validationdata_3["Real"], validationdata_3["Predicted"],output_dict=True)
f1_3 = (cr["FalseScrap"]["f1-score"] + cr["NotPossibleToRepair"]["f1-score"])/2

print("F1 Score Task 1:", f1_1)
print("F1 Score Task 2:", f1_2)
print("F1 Score Task 3:", f1_3)
print("Final Score:", statistics.mean([f1_1, f1_2, f1_3]))

  _warn_prf(average, modifier, msg_start, len(result))


F1 Score Task 1: 0.411160647197315
F1 Score Task 2: 0.6576168929110106
F1 Score Task 3: 0.9021846370683579
Final Score: 0.6569873923922278
