### User Defined functions / classes and library initiations

In [1]:
# base libraries
import pandas as pd
import numpy as np

# preprocessing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# feature selection libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import VarianceThreshold
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

# modelling libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC,SV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

#model selection & tuning libraries
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut

# visualisation libraries
import plotly.figure_factory as ff
from plotly import express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# compute performance libraries
from joblib import Parallel, delayed

# storage & retrieval
import pickle

In [2]:
pd.set_option('display.max_columns',50)
pd.set_option('display.max_rows',100)

In [3]:
def nulsCount(df):
    """summarise missing/unexpected values"""
    
    d2=pd.DataFrame(columns=["NULL","NAN","BLANKS","UNEXP"])
    try:
        d2["NULL"] = df.isnull().sum().astype('uint32') # check for null values
        d2["NAN"]=df.isna().sum().astype('uint32') # check for NaN
        d2["BLANKS"]=df.isin([""," "]).sum().astype('uint32') # check for blanks
        d2["UNEXP"]=df.isin(["-","?",".","NA","N/A","Unknown"]).sum().astype('uint32') # check for other unexpected values
    except:
        pass
    d2=d2.loc[(d2["NULL"]!=0) | (d2["NAN"]!=0) | (d2["BLANKS"]!=0) | (d2["UNEXP"]!=0)] # shortlist for the missing values
    
    # convert to percentages
    d2["NULL %"] = d2["NULL"].mul(100/df.shape[0]).round(2)
    d2["NAN %"] = d2["NAN"].mul(100/df.shape[0]).round(2)
    d2["BLANKS %"] = d2["BLANKS"].mul(100/df.shape[0]).round(2)
    d2["UNEXP %"] = d2["UNEXP"].mul(100/df.shape[0]).round(2)
    
    # rearrange
    d2=d2[["NULL","NULL %","NAN","NAN %","BLANKS","BLANKS %","UNEXP","UNEXP %"]]
    
    if d2.shape[0]==0:
        return
    else:     
        return d2

In [4]:
class SCFS():
    """https://www.frontiersin.org/articles/10.3389/fgene.2021.684100/full
    Reference article for feature scoring
    SCFS (Standard deviation and Cosine similarity based Feature Selection)
    Credits to: Juanying Xie, Mingzhao Wang, Shengquan Xu, Zhao Huang and Philip W. Grant"""
    
    def __init__(self,kind='exp'):
        """kind = {'exp','reciprocal','anti-similarity'} default='exp'"""
        self.kind=kind
        self.fitted=False
        
    def discernibility(self):
        """list down the feature discernibility
        same as sample standard deviations"""
        m=self.df.shape[0]
        self.dis=[np.sqrt(sum((self.df[i]-sum(self.df[i])/m)**2)/(m-1)) for i in self.df.columns]
        self.dis=pd.Series(self.dis,index=self.df.columns,dtype=float)
    
    def cosineSimilarity(self):
        """populate the cosine similarities (absolute)"""
        self.cosdf=pd.DataFrame(columns=self.df.columns,index=self.df.columns)
        for i in self.df.columns:
            for j in self.df.columns:
                norm_i=np.sqrt(self.df[i].dot(self.df[i]))
                norm_j=np.sqrt(self.df[j].dot(self.df[j]))
                self.cosdf.loc[i,j] = (np.abs(self.df[i].dot(self.df[j])))/(norm_i*norm_j)
                
    def independence(self):
        """evaluate the feature independance"""
        dismaxarg=self.dis.index[np.argmax(self.dis)]
        self.ind=pd.Series(index=self.df.columns,dtype=float)

        for i in self.df.columns:
            if i == dismaxarg: # for feature with max stddev
                if self.kind == 'exp':
                    self.ind[i] = np.exp(max(-self.cosdf.loc[i]))
                elif self.kind == 'reciprocal':
                    self.ind[i] = max(1/self.cosdf.loc[i])
                elif self.kind == 'anti-similarity':
                    self.ind[i] = max(1-self.cosdf.loc[i])
            else:
                if self.kind == 'exp':
                    self.ind[i] = np.exp(min(-self.cosdf.loc[i,self.dis[self.dis>self.dis[i]].index]))
                elif self.kind == 'reciprocal':
                    self.ind[i] = min(1/self.cosdf.loc[i,self.dis[self.dis>self.dis[i]].index])
                elif self.kind == 'anti-similarity':
                    self.ind[i] = min(1-self.cosdf.loc[i,self.dis[self.dis>self.dis[i]].index])
                    
    def fit(self,df):
        """evaluate feature scores of df"""
        self.df=df.copy()
        
        self.discernibility()
        self.cosineSimilarity()
        self.independence()
        
        self.fscore=self.dis.mul(self.ind)
        self.fitted=True

In [5]:
# delete instance of global variable scoreLog
try:
    del scoreLog
    print("scoreLog deleted")
except:
    print("scoreLog undefined")
    
# defining a function to report classification metrics
def reporter(Y_train, pred_train, Y_test, pred_test,model_name):
    """Classification report
    logs test scores to global dataframe named scoreLog
    the scoreLog (with any previous scores) will be displayed
    also displays confusion matrices of current instance of arguments
    ---------------------------------------------------------------------------
    Y_train ==> TRUE classes used for training (pandas series object or numpy array of 1-D)
    pred_train ==> PREDICTION on training data (pandas series object or numpy array of 1-D)
    Y_test ==> TRUE classes to be used for testing (pandas series object or numpy array of 1-D)
    pred_test ==> PREDICTION on test data (pandas series object or numpy array of 1-D)
    model_name ==> str name for current model, to be used as index for scoreLog
    ---------------------------------------------------------------------------
    """
    from sklearn import metrics
    import plotly.figure_factory as ff
    import numpy as np
    import pandas as pd
    
    global scoreLog
    
    classes=list(Y_test.unique())
    cols=["accuracy"]
    cols.extend(["precision_"+str(classes[i]) for i in range(len(classes))])
    cols.extend(["recall_"+str(classes[i]) for i in range(len(classes))])
    cols.extend(["fscore_"+str(classes[i]) for i in range(len(classes))])
    
    try:
        type(scoreLog)
    except:
        scoreLog=pd.DataFrame(columns=cols)
    
    #metrics based on training set
    #confusion matrix
    z=pd.DataFrame(metrics.confusion_matrix(Y_train, pred_train))
    fig1=ff.create_annotated_heatmap(np.array(z),annotation_text=np.array(z),
                                    x=list(np.sort(np.unique(Y_train))),y=list(np.sort(np.unique(Y_train))),
                                    colorscale='Mint',font_colors = ['grey','white'],name="TRAINING SET",
                                    hovertemplate="Prediction: %{x:d}<br>True: %{y:d}<br>Count: %{z:d}")
    fig1.update_layout(height=350,width=350)
    fig1.update_xaxes(title_text="PREDICTED (TRAINING SET) - "+model_name)
    fig1.update_yaxes(title_text="TRUE",tickangle=270)
    
    #scores
    score=[metrics.accuracy_score(Y_train,pred_train)]
    score.extend(metrics.precision_score(Y_train,pred_train,labels=classes,average=None))
    score.extend(metrics.recall_score(Y_train,pred_train,labels=classes,average=None))
    score.extend(metrics.f1_score(Y_train,pred_train,labels=classes,average=None))
    scoreLog=scoreLog.append(pd.DataFrame(score,index=cols,columns=[model_name+"_training"]).T)
    
    #metrics based on test set
    #confusion matrix
    z=pd.DataFrame(metrics.confusion_matrix(Y_test, pred_test))
    fig2=ff.create_annotated_heatmap(np.array(z),annotation_text=np.array(z),
                                    x=list(np.sort(np.unique(Y_test))),y=list(np.sort(np.unique(Y_test))),
                                    colorscale='Mint',font_colors = ['grey','white'],name="TEST SET",
                                    hovertemplate="Prediction: %{x:d}<br>True: %{y:d}<br>Count: %{z:d}")
    fig2.update_layout(height=350,width=350)
    fig2.update_xaxes(title_text="PREDICTED (TEST SET) - "+model_name)
    fig2.update_yaxes(title_text="TRUE",tickangle=270)
    
    #scores
    score=[metrics.accuracy_score(Y_test,pred_test)]
    score.extend(metrics.precision_score(Y_test,pred_test,labels=classes,average=None))
    score.extend(metrics.recall_score(Y_test,pred_test,labels=classes,average=None))
    score.extend(metrics.f1_score(Y_test,pred_test,labels=classes,average=None))
    scoreLog=scoreLog.append(pd.DataFrame(score,index=cols,columns=[model_name+"_test"]).T)
    
    # merge both confusion matrix heatplots
    fig=make_subplots(rows=1,cols=2,horizontal_spacing=0.05)
    fig.add_trace(fig1.data[0],row=1,col=1)#,name="training data")
    fig.add_trace(fig2.data[0],row=1,col=2)#,name="test data")

    annot1 = list(fig1.layout.annotations)
    annot2 = list(fig2.layout.annotations)
    for k  in range(len(annot2)):
        annot2[k]['xref'] = 'x2'
        annot2[k]['yref'] = 'y2'
    fig.update_layout(annotations=annot1+annot2) 
    fig.layout.xaxis.update(fig1.layout.xaxis)
    fig.layout.yaxis.update(fig1.layout.yaxis)
    fig.layout.xaxis2.update(fig2.layout.xaxis)
    fig.layout.yaxis2.update(fig2.layout.yaxis)
    fig.layout.yaxis2.update({'title': {'text': ''}})
    
    display(scoreLog)
    fig.show()

scoreLog undefined


In [6]:
def cvSplitter(X,Y,k=10,seed=129):
    """Splits K folds and returns array of copied dataframes"""
    X=X.copy()
    Y=Y.copy()
    L=X.shape[0]
    # seed pseudo random generator
    np.random.seed(seed)
    indices=np.random.choice(X.index,L,False)
    sets=[(int(np.floor(L*(i)/k)),int(np.floor(L*(i+1)/k))) for i in range(k)]
    Xtrains=[]
    Xvals=[]
    Ytrains=[]
    Yvals=[]
    ss=0
    for i in range(k):
        se=int(np.floor(L*(i+1)/k))
        Xvals.append(X.loc[list(indices[ss:se])].copy())
        Yvals.append(Y.loc[list(indices[ss:se])].copy())
        Xtrains.append(X.loc[list(indices[[j not in indices[ss:se] for j in indices]])].copy())
        Ytrains.append(Y.loc[list(indices[[j not in indices[ss:se] for j in indices]])].copy())
        ss=se
    return Xtrains,Ytrains,Xvals,Yvals

In [7]:
class remap():
    def __init__(self):
        """performs skew correction and z-score standardisation"""
        from sklearn.preprocessing import StandardScaler
        self.fitted=False
        
    def fit(self,df):
        """registers stats of the dataframe"""
        df=df.copy()
        self.fitting_info=pd.DataFrame(columns=["skew","kurt","min","max","reflect","r_min","r_max","mms","log","sqrt"],
                                       index=df.columns)
        
        # initialise flags
        self.fitting_info["reflect"] = False
        self.fitting_info["mms"] = False
        self.fitting_info["log"] = False
        self.fitting_info["sqrt"] = False
        
        # reocird basic stats
        self.fitting_info["skew"] = df.skew()
        self.fitting_info["kurt"] = df.kurt()
        self.fitting_info["min"] = df.min()
        self.fitting_info["max"] = df.max()
        
        # test need for reflected transforms
        collist=list(self.fitting_info.loc[self.fitting_info["skew"]<=-0.75].index)
        for col in collist:            
            # read basic stats
            [cskew,cmin,cmax]=self.fitting_info.loc[col,["skew","min","max"]]
            
            # reflect
            temp_r = cmax+1-df[col]
            cmin=temp_r.min()
            cmax=temp_r.max()
            self.fitting_info.loc[col,["r_min","r_max"]]=[cmin,cmax]
            # scale between 0-500
            temp_r_mms = (temp_r-cmin)*500/(cmax-cmin)
            self.fitting_info.loc[col,["mms_min","mms_max"]]=[temp_r_mms.min(),temp_r_mms.max()]
            # scaled log tranform
            temp_r_mms_l = (temp_r_mms+1).apply(np.log)
            # scaled sqrt tranform
            temp_r_mms_s = temp_r_mms.apply(np.sqrt)
            # plain log tranform
            temp_r_l = (temp_r+1).apply(np.log)
            # plain sqrt tranform
            temp_r_s = temp_r.apply(np.sqrt)
            # transformed skews
            t_skew = np.abs([temp_r_l.skew(),temp_r_s.skew(),temp_r_mms_l.skew(),temp_r_mms_s.skew()])
            # register flags
            if round(min(t_skew),2)<round(abs(cskew),2):
                self.fitting_info.loc[col,"reflect"]=True
                if min(t_skew)==t_skew[0]:
                    self.fitting_info.loc[col,"log"]=True
                    df[col]=temp_r_l
                elif min(t_skew)==t_skew[1]:
                    self.fitting_info.loc[col,"sqrt"]=True
                    df[col]=temp_r_s
                elif min(t_skew)==t_skew[2]:
                    self.fitting_info.loc[col,["log","mms"]]=[True,True]
                    df[col]=temp_r_mms_l
                elif min(t_skew)==t_skew[3]:
                    self.fitting_info.loc[col,["sqrt","mms"]]=[True,True]
                    df[col]=temp_r_mms_s                
        
        # test need for plain transforms
        collist=list(self.fitting_info.loc[self.fitting_info["skew"]>=0.75].index)
        for col in collist:            
            # read basic stats
            [cskew,cmin,cmax]=self.fitting_info.loc[col,["skew","min","max"]]
            
            # scale between 0-500
            temp_mms = (df[col]-cmin)*500/(cmax-cmin)
            self.fitting_info.loc[col,["mms_min","mms_max"]]=[temp_mms.min(),temp_mms.max()]
            # scaled log tranform
            temp_mms_l = (temp_mms+1).apply(np.log)
            # scaled sqrt tranform
            temp_mms_s = temp_mms.apply(np.sqrt)
            # plain log tranform
            temp_l = (df[col]+1).apply(np.log)
            # plain sqrt tranform
            temp_s = df[col].apply(np.sqrt)
            # transformed skews
            t_skew = np.abs([temp_l.skew(),temp_s.skew(),temp_mms_l.skew(),temp_mms_s.skew()])
            # register flags
            if round(min(t_skew),2)<round(abs(cskew),2):
                if min(t_skew)==t_skew[0]:
                    self.fitting_info.loc[col,"log"]=True
                    df[col]=temp_l
                elif min(t_skew)==t_skew[1]:
                    self.fitting_info.loc[col,"sqrt"]=True
                    df[col]=temp_s
                elif min(t_skew)==t_skew[2]:
                    self.fitting_info.loc[col,["log","mms"]]=True
                    df[col]=temp_mms_l
                elif min(t_skew)==t_skew[3]:
                    self.fitting_info.loc[col,["sqrt","mms"]]=[True,True]
                    df[col]=temp_mms_s
        
        # set fitted flag
        self.fitted=True             
    
    def transform(self,df):
        """perform transforms & scaling"""
        if not self.fitted:
            raise ValueError("please fit remap")
            return
        df=df.copy()
        for col in df.columns:            
            # find min max value
            cmin = self.fitting_info.loc[col,"min"]
            cmax = self.fitting_info.loc[col,"max"]
            
            # 1. reflection
            if self.fitting_info.loc[col,"reflect"]:
                temp = cmax+1-df[col]
                df[col] = temp
                # update min max
                cmin = self.fitting_info.loc[col,"r_min"] 
                cmax = self.fitting_info.loc[col,"r_max"]
                    
            # 2. min max scaling for log / sqrt
            if self.fitting_info.loc[col,"mms"]:
                temp = (df[col]-cmin)*500/(cmax-cmin)
                df[col] = temp
                # update min max
                cmin = self.fitting_info.loc[col,"mms_min"] 
                cmax = self.fitting_info.loc[col,"mms_max"]
            
            # 3. shift data to +ve scale
            if cmin<0:
                df[col]=df[col]-cmin 
            if df[col].min()<0: # reconfirm
                df[col]=df[col]-df[col].min()
                    
            # 4. log transform
            if self.fitting_info.loc[col,"log"]:
                df[col]=(df[col]+1).apply(np.log)
                
            # 5. sqrt transform
            if self.fitting_info.loc[col,"sqrt"]:
                df[col]=df[col].apply(np.sqrt)
                
            # 6. reverse Reflection
            if self.fitting_info.loc[col,"reflect"]:
                temp = np.log(cmax)+1-df[col]
                df[col] = temp
            
            # find skew
            self.fitting_info.loc[col,"trans_skew"]=df[col].skew()
        
        # find scaled skew
        self.fitting_info["trans_scaled_skew"]=df.skew()
            
        return df
    
    def fit_transform(self,df):
        """fit, remap"""
        self.fit(df)
        df=self.transform(df)
        return df

In [8]:
class pandaPoly():
    """PolynomialFeatures extraction and returns Pandas DataFrame"""
    from sklearn.preprocessing import PolynomialFeatures
    
    def __init__(self,degree=2, interaction_only=True):
        self.poly = self.PolynomialFeatures(degree=2, interaction_only=True)
        self.fitted=False
    
    def fit(self,df):
        self.poly.fit(df)
        self.fitted=True
    
    def transform(self,df):
        if self.fitted:
            df=df.copy()
            d2=pd.DataFrame(self.poly.transform(df),index=df.index)
            d2=pd.merge(df,d2,left_index=True,right_index=True)
            return d2
        else:
            raise ValueError("please fit pandaPoly")
    
    def fit_transform(self,df):
        self.fit(df)
        df=self.transform(df)
        return df    

In [9]:
class dummies:
    """to implement encoding without data leak"""
    def __init__(self):
        """input : dataframe"""
        self.ref={}
        self.fitted=False
    
    def fit(self,df):
        """Collect required encoding information"""
        cat=list(df.select_dtypes(include='object').columns)
        for col in cat:
            unq=list(df[col].value_counts().index)
            self.ref.update({col:unq})
        self.fitted=True
        return
    
    def transform(self,df):
        """perform encoding"""
        df=df.copy()
        if not self.fitted:
            raise ValueError("please fit first")
            return
        cat=list(self.ref.keys())
        for col in cat:
            unq=self.ref.get(col)
            for i in unq:
                df[col+"_"+str(i)]=df[col]
                df.loc[df[col+"_"+str(i)]==i,[col+"_"+str(i)]]=1
                df.loc[df[col+"_"+str(i)]!=1,[col+"_"+str(i)]]=0
            df.drop(col,axis=1,inplace=True)
            df.drop(col+"_"+str(unq[i]),axis=1,inplace=True) #drop_first=True
            df = df.apply(pd.to_numeric,errors='ignore',downcast='float',axis=0)
        return df
    
    def fit_transform(self,df):
        """learn and encode"""
        self.fit(df)
        df=self.transform(df)
        return df

In [10]:
class pandaCluster():
    """performs KMeans Clustering and returnd Pandas DataFrame with cluster encoded columns"""
    
    def __init__(self,n_clusters=4): # 4 selected for simplicity
        self.fitted=False
        # models
        self.scl = remap()
        self.dum = dummies()
        self.clt = KMeans(n_clusters=n_clusters)
    
    def fit(self,df):
        df=df.copy()
        
        # scale incoming data
        df= self.scl.fit_transform(df)
        # cluster fitting
        self.clt.fit(df)
        # encoder fitting for clusters
        pred=pd.DataFrame(self.clt.predict(df),columns=["CLUSTER"],index=df.index,dtype='object')
        self.dum.fit(pred)
        
        self.fitted=True
    
    def transform(self,df):
        if self.fitted:
            df=df.copy()
            dforig=df.copy()
            # scale data
            df= self.scl.transform(df)
            # predict clusters
            pred=pd.DataFrame(self.clt.predict(df),columns=["CLUSTER"],index=df.index,dtype='object')
            # encode cluster columns
            pred=self.dum.transform(pred)
            # merge with source
            df=pd.merge(dforig,pred,left_index=True,right_index=True)
            return df
        else:
            raise ValueError("please fit pandaCluster")
    
    def fit_transform(self,df):
        self.fit(df)
        df=self.transform(df)
        return df

below function implements the following steps<br>
**4. Data pre-processing:**<br>
B. Check for target balancing and ix it if found imbalanced.<br>
**5. Model training, testing and tuning:**<br>
A. Use any Supervised Learning technique to train a model.<br>
E. Display and explain the classi ication report in detail.<br>
**6. Post Training and Conclusion:**<br>
A. Display and compare all the models designed with their train and test accuracies.

In [11]:
def dtc_pipe(X_train, X_test, Y_train, Y_test, mname):
    """basic model+predict+log cycle"""
    
    # standardize
    scl=StandardScaler()
    X_train_std = pd.DataFrame(scl.fit_transform(X_train),columns=X_train.columns,index=X_train.index)
    X_test_std = pd.DataFrame(scl.transform(X_test),columns=X_test.columns,index=X_test.index) # transform only
    
    # balance training data set
    balancer = SMOTE(sampling_strategy='not majority', random_state=129)
    X_train_bal, Y_train_bal = balancer.fit_resample(X_train_std,Y_train)
    
    # model learning
    dtc=DecisionTreeClassifier(criterion = 'gini', max_depth = 3, random_state=129)
    dtc.fit(X_train_bal,Y_train_bal)
    
    # predict
    pred_train=dtc.predict(X_train_std) # predict sufficient on imbalanced X
    pred_test=dtc.predict(X_test_std)

    # record scores
    reporter(Y_train,pred_train,Y_test,pred_test,mname)
    # generate reports (custom-built function : code in the begining of notebook)

## Project

**DOMAIN:** Semiconductor manufacturing process<br>
**• CONTEXT:**<br>
A complex modern semiconductor manufacturing process is normally under constant surveillance via the monitoring of signals/variables collected from sensors and or process measurement points. However, not all of these signals are equally valuable in a specific monitoring system. The measured signals contain a combination of useful information, irrelevant information as well as noise. Engineers typically have a much larger number of signals than are actually required. If we consider each type of signal as a feature, then feature selection may be applied to identify the most relevant signals. The Process Engineers may then use these signals to determine key factors contributing to yield excursions downstream in the process. This will enable an increase in process throughput, decreased time to learning and reduce the per unit production costs. These signals can be used as features to predict the yield type. And by analysing and trying out different combinations of features, essential signals that are impacting the yield type can be identified.<br>
**• DATA DESCRIPTION: sensor-data.csv :** (1567, 592)<br>
The data consists of 1567 datapoints each with 591 features. The dataset presented in this case represents a selection of such features where each example represents a single production entity with associated measured features and the labels represent a simple pass/fail yield for in house line testing. Target column “ –1” corresponds to a pass and “1” corresponds to a fail and the data time stamp is for that speci ic test point.<br>
**• PROJECT OBJECTIVE:**<br>
We will build a classifier to predict the Pass/Fail yield of a particular process entity and analyse whether all the
features are required to build the model or not.

**Steps and tasks:**<br>
**1.Import and understand the data.**<br>
A. Import ‘signal-data.csv’ as DataFrame.<br>
B. Print 5 point summary and share at least 2 observations.<br>

In [12]:
# read the dataset
df=pd.read_csv("signal-data.csv")

In [13]:
df.head()

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,...,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,-0.0034,0.9455,202.4396,0.0,7.9558,414.871,10.0433,0.968,192.3963,12.519,1.4026,-5419.0,2916.5,-4043.75,...,,,,,533.85,2.1113,8.95,0.3157,3.0624,0.1026,1.6765,14.9509,,,,,0.5005,0.0118,0.0035,2.363,,,,,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,-0.0148,0.9627,200.547,0.0,10.1548,414.7347,9.2599,0.9701,191.2872,12.4608,1.3825,-5441.5,2604.25,-3498.75,...,,,,,535.0164,2.4335,5.92,0.2653,2.0111,0.0772,1.1065,10.9003,0.0096,0.0201,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,0.0013,0.9615,202.0179,0.0,9.5157,416.7075,9.3144,0.9674,192.7035,12.5404,1.4123,-5447.75,2701.75,-4047.0,...,0.4122,0.2562,0.4119,68.8489,535.0245,2.0293,11.21,0.1882,4.0923,0.064,2.0952,9.2721,0.0584,0.0484,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2008-07-19 14:43:00,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,-0.0033,0.9629,201.8482,0.0,9.6052,422.2894,9.6924,0.9687,192.1557,12.4782,1.4011,-5468.25,2648.25,-4515.0,...,3.5611,0.067,2.729,25.0363,530.5682,2.0253,9.33,0.1738,2.8971,0.0525,1.7585,8.5831,0.0202,0.0149,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,-0.0072,0.9569,201.9424,0.0,10.5661,420.5925,10.3387,0.9735,191.6037,12.4735,1.3888,-5476.25,2635.25,-3987.5,...,,,,,532.0155,2.0275,8.83,0.2224,3.1776,0.0706,1.6597,10.9698,,,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,-1


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 592 entries, Time to Pass/Fail
dtypes: float64(590), int64(1), object(1)
memory usage: 7.1+ MB


In [15]:
df.select_dtypes(include='int64').columns

Index(['Pass/Fail'], dtype='object')

In [16]:
df.select_dtypes(include='object').columns

Index(['Time'], dtype='object')

In [17]:
df.select_dtypes(include='object').describe()

Unnamed: 0,Time
count,1567
unique,1534
top,2008-10-15 01:52:00
freq,3


every column is a numeric data except for Time column<br>
later, lets see if we could extract features from Time column else drop it<br>
Also the Time column seems to have duplicates, which could be the same with all the other columns too.<br>
need to confirm to drop those.<br>

In [18]:
# typecast to datetime
df.Time=pd.to_datetime(df.Time)

In [19]:
# print 5 point summary
df.describe(datetime_is_numeric=True)[-5:]

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,...,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,Pass/Fail
25%,2008-07-09 15:32:00,2966.26,2452.2475,2181.0444,1081.8758,1.0177,100.0,97.92,0.1211,1.4112,-0.0108,-0.0056,0.9581,198.1307,0.0,7.094875,406.1274,9.567625,0.9682,188.299825,12.46,1.3965,-5933.25,2578.0,-4371.75,...,2.0902,0.0382,1.8844,15.4662,530.7027,1.9829,7.5,0.24225,2.56785,0.0751,1.40845,11.50155,0.0138,0.0106,0.0034,46.1849,0.4979,0.0116,0.0031,2.3065,0.013425,0.0106,0.0033,44.3686,-1.0
50%,2008-08-23 13:02:00,3011.49,2499.405,2201.0667,1285.2144,1.3168,100.0,101.5122,0.1224,1.4616,-0.0013,0.0004,0.9658,199.5356,0.0,8.967,412.2191,9.85175,0.9726,189.6642,12.4996,1.406,-5523.25,2664.0,-3820.75,...,2.15045,0.04865,1.9997,16.98835,532.3982,2.1186,8.65,0.2934,2.9758,0.0895,1.6245,13.8179,0.0204,0.0148,0.0047,72.2889,0.5002,0.0138,0.0036,2.75765,0.0205,0.0148,0.0046,71.9005,-1.0
75%,2008-09-22 11:16:30,3056.65,2538.8225,2218.0555,1591.2235,1.5257,100.0,104.5867,0.1238,1.5169,0.0084,0.0059,0.9713,202.0071,0.0,10.861875,419.089275,10.128175,0.9768,192.189375,12.5471,1.415,-5356.25,2841.75,-3352.75,...,3.098725,0.075275,2.97085,24.772175,534.3564,2.29065,10.13,0.3669,3.4925,0.11215,1.902,17.0809,0.0277,0.02,0.006475,116.53915,0.502375,0.0165,0.0041,3.295175,0.0276,0.0203,0.0064,114.7497,-1.0
max,2008-12-10 18:47:00,3356.35,2846.44,2315.2667,3715.0417,1114.5366,100.0,129.2522,0.1286,1.6564,0.0749,0.053,0.9848,272.0451,0.0,19.5465,824.9271,102.8677,0.9848,215.5977,12.9898,1.4534,0.0,3656.25,2363.0,...,14.0141,0.2932,12.7462,84.8024,589.5082,2.7395,454.56,2.1967,170.0204,0.5502,90.4235,96.9601,0.1028,0.0799,0.0286,737.3048,0.5098,0.4766,0.1045,99.3032,0.1028,0.0799,0.0286,737.3048,1.0
std,,73.621787,80.407705,29.513152,441.69164,56.35554,0.0,6.237214,0.008961,0.073897,0.015116,0.009302,0.012452,3.257276,0.0,2.796596,17.221095,2.403867,0.012062,2.781041,0.217965,0.016737,626.822178,295.498535,1380.162148,...,1.032761,0.032761,0.996644,10.213294,17.499736,0.275112,86.304681,0.248478,26.92015,0.067791,16.921369,12.485267,0.01173,0.00964,0.003116,87.520966,0.003404,0.01718,0.00372,3.578033,0.012358,0.008808,0.002867,93.891919,0.49801


There are few constant columns like "13","42",...<br>
There are few extreme skewed or quasi-constant columns like "4","21"...<br>
There are few near-perfect bell curves like "24"

Need to review and remove columns that doesn't add information to target
In reference to the target, the dataset seems imbalanced as more than 75% of data corresponds to -1

**2.Data cleansing:**<br>
A. Write a for loop which will remove all the features with 20%+ Null values and impute rest with mean of the feature.<br>
B. Identify and drop the features which are having same value for all the rows.

In [20]:
# verify if target column has nans
df["Pass/Fail"].isna().sum()

0

safe to continue without dropping any records

In [21]:
%%time
#lets review the nulls
nulsCount(df)
#(custom-built function : code in the begining of notebook)

CPU times: user 253 ms, sys: 39.5 ms, total: 292 ms
Wall time: 291 ms


Unnamed: 0,NULL,NULL %,NAN,NAN %,BLANKS,BLANKS %,UNEXP,UNEXP %
0,6,0.38,6,0.38,0,0.0,0,0.0
1,7,0.45,7,0.45,0,0.0,0,0.0
2,14,0.89,14,0.89,0,0.0,0,0.0
3,14,0.89,14,0.89,0,0.0,0,0.0
4,14,0.89,14,0.89,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...
585,1,0.06,1,0.06,0,0.0,0,0.0
586,1,0.06,1,0.06,0,0.0,0,0.0
587,1,0.06,1,0.06,0,0.0,0,0.0
588,1,0.06,1,0.06,0,0.0,0,0.0


In [22]:
%%time
# lets review least number of uniques in the features
df.nunique().sort_values()[:5]

CPU times: user 121 ms, sys: 0 ns, total: 121 ms
Wall time: 156 ms


262    1
263    1
264    1
265    1
266    1
dtype: int64

In [23]:
# benchmark shape
df.shape

(1567, 592)

In [24]:
df_raw=df.copy()

In [25]:
%%time
for col in df.columns:
    if df[col].nunique()==1: # features having same values for all rows
        df.drop([col],axis=1,inplace=True)
    elif df[col].isnull().sum()/df.shape[0]>0.2: # features with 20%+ Null values
        df.drop([col],axis=1,inplace=True)
    elif df[col].isnull().sum()>0: # features having at least 1 null
        df[col].fillna(df[col].mean().astype('float32'),inplace=True)

CPU times: user 547 ms, sys: 7.34 ms, total: 554 ms
Wall time: 551 ms


In [26]:
# review shape
df.shape

(1567, 444)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 444 entries, Time to Pass/Fail
dtypes: datetime64[ns](1), float64(442), int64(1)
memory usage: 5.3 MB


In [28]:
# review nulls
nulsCount(df)

none found, hence lets proceed

----------------------------------------------------------------------------<br>
Let us set a base line model using DecisionTreeClassifier

In [29]:
# seperate predictors & targets
X = df[df.columns[1:-1]]
Y = df[df.columns[-1]]

# Train & Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.20, # split ratio of 80:20
                                                    random_state=129) # random seed

In [30]:
# model learning
dtc=DecisionTreeClassifier(criterion = 'gini', max_depth = 3, random_state=129)
dtc.fit(X_train,Y_train)

# predict
pred_train=dtc.predict(X_train)
pred_test=dtc.predict(X_test)

# record scores
reporter(Y_train,pred_train,Y_test,pred_test,"DTC_raw")
# generate reports (custom-built function : code in the begining of notebook)

Unnamed: 0,accuracy,precision_-1,precision_1,recall_-1,recall_1,fscore_-1,fscore_1
DTC_raw_training,0.950519,0.95273,0.846154,0.99659,0.275,0.974167,0.415094
DTC_raw_test,0.914013,0.925566,0.2,0.986207,0.041667,0.954925,0.068966


pretty impressive accuracy and low execution time<br>
but unfortunately the precision, recall and f1_score for FAIL class (+1) is very poor<br>
they are poor in training data prediction, probably  because of imbalanced data<br>
in the test data prediction, those have fallen even lower, indicating over-fit model<br>
Lets build on our modelling

before proceeding further, lets extract some timestamp features & inherent clusters

In [31]:
# benchmark
X_train.shape

(1253, 442)

the below snippet adds to the following step<br>
**2.Data cleansing:**<br>
E. Make all relevant modi ications on the data using both functional/logical reasoning/assumptions.

In [32]:
# lets extract some features from the date time
X["Year"]=df.Time.dt.year
X["Mon"]=df.Time.dt.month
X["day"]=df.Time.dt.day
X["day_of_week"]=df.Time.dt.day_of_week
X["day_of_year"]=df.Time.dt.day_of_year
X["weekofyear"]=df.Time.dt.isocalendar().week
X["Hour"]=df.Time.dt.hour
X["Min"]=df.Time.dt.minute
X["Sec"]=df.Time.dt.second
X["Qtr"]=df.Time.dt.quarter

In [33]:
# Train & Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.20, # split ratio of 80:20
                                                    random_state=129) # random seed

In [34]:
# review
X_train.shape

(1253, 452)

In [35]:
%%time
# lets add few features about the inherent clusters in the dataset
clt=pandaCluster() # custom class : code in the begining of notebook
X_train_clt=clt.fit_transform(X_train)
X_test_clt=clt.transform(X_test)

CPU times: user 5.23 s, sys: 82.7 ms, total: 5.32 s
Wall time: 2.89 s


In [36]:
# review
X_train_clt.shape

(1253, 455)

below function implements the following steps<br>
**4. Data pre-processing:**<br>
B. Check for target balancing and ix it if found imbalanced.<br>
**5. Model training, testing and tuning:**<br>
A. Use any Supervised Learning technique to train a model.<br>
E. Display and explain the classification report in detail.<br>
**6. Post Training and Conclusion:**<br>
A. Display and compare all the models designed with their train and test accuracies.

In [37]:
# review model performance
# custom pipe : code in the begining of notebook
dtc_pipe(X_train_clt, X_test_clt, Y_train, Y_test,"DTC2_time_clt")

Unnamed: 0,accuracy,precision_-1,precision_1,recall_-1,recall_1,fscore_-1,fscore_1
DTC_raw_training,0.950519,0.95273,0.846154,0.99659,0.275,0.974167,0.415094
DTC_raw_test,0.914013,0.925566,0.2,0.986207,0.041667,0.954925,0.068966
DTC2_time_clt_training,0.865922,0.958904,0.221519,0.895141,0.4375,0.925926,0.294118
DTC2_time_clt_test,0.863057,0.942652,0.228571,0.906897,0.333333,0.924429,0.271186


the model perfromance has significantly improved in terms of FAIL class<br>
probably caused by the combined effect of feature additons, standardisation & target class balancing

**2.Data cleansing:**<br>
C. Drop other features if required using relevant functional knowledge. Clearly justify the same.<br>

In [38]:
# let us review spread of all features
stddev=pd.DataFrame(X_train_clt.std(),columns=["stddev"])
gdata=list(stddev.stddev)
fig = ff.create_distplot([gdata],['Stdandard Deviations'],
                         curve_type='kde',show_hist=True,
                         show_rug=True
                        )
fig.update_layout(height=500,width=1000,showlegend=False)
fig.show()

In [39]:
stddev.describe(percentiles=[0.1,0.2,0.25,0.3,0.4,0.5,0.6,0.7,0.75,0.8,0.9]).T

Unnamed: 0,count,mean,std,min,10%,20%,25%,30%,40%,50%,60%,70%,75%,80%,90%,max
stddev,455.0,74.49824,435.643811,0.0,0.009242,0.027431,0.05275,0.075622,0.25302,1.063889,2.917544,6.515314,9.233332,17.836337,61.769207,6522.814779


the variances (or standard deviations) of several fearutes are condensed below unity<br>
this indicates a that several features would not contribute to the model learning<br>
though z-score tranformation will shift & rescale the distributions, it would also leverage all the noises in the data towards model learning<br>
hence let us use few feature selection techniques to shrink our dataset

In [40]:
# let us remove any quasi-constant features
quasi = VarianceThreshold(threshold=0.01) #quasi-constant ness of 1%
X_train_quasi=pd.DataFrame(quasi.fit_transform(X_train_clt),
                           columns=X_train_clt.columns[quasi.get_support()],index=X_train_clt.index)
X_test_quasi=pd.DataFrame(quasi.transform(X_test_clt),
                          columns=X_train_clt.columns[quasi.get_support()],index=X_test_clt.index)

In [41]:
X_train_quasi.shape

(1253, 306)

In [42]:
X_train_clt.shape[1]-X_train_quasi.shape[1]

149

149 Quasi constant features were trimmed off leaving behind 306 features

In [43]:
# review model performance
# custom pipe : code in the begining of notebook
dtc_pipe(X_train_quasi, X_test_quasi, Y_train, Y_test,"DTC3_quasi")

Unnamed: 0,accuracy,precision_-1,precision_1,recall_-1,recall_1,fscore_-1,fscore_1
DTC_raw_training,0.950519,0.95273,0.846154,0.99659,0.275,0.974167,0.415094
DTC_raw_test,0.914013,0.925566,0.2,0.986207,0.041667,0.954925,0.068966
DTC2_time_clt_training,0.865922,0.958904,0.221519,0.895141,0.4375,0.925926,0.294118
DTC2_time_clt_test,0.863057,0.942652,0.228571,0.906897,0.333333,0.924429,0.271186
DTC3_quasi_training,0.642458,0.971391,0.119835,0.636829,0.725,0.76931,0.205674
DTC3_quasi_test,0.585987,0.949438,0.110294,0.582759,0.625,0.722222,0.1875


though the test scores seems to have reduced, the quantum of features dropped is a good trade off against it<br>
lets study furhter

**SCFS** (Standard deviation and Cosine similarity based Feature Selection)<br>
Reference article for feature scoring<br>
using custom method based on published paper from<br>
https://www.frontiersin.org/articles/10.3389/fgene.2021.684100/full<br>
Credits to: Juanying Xie, Mingzhao Wang, Shengquan Xu, Zhao Huang and Philip W. Grant<br>

**Explanation & Justification to use the method**<br>
The discernibility of a feature, refers to its distinguishable capability between categories<br>
Feature selection aims to detect the features whose distinguishable capability is strong while the redundancy between them is less<br>
To represent the redundancy between a feature and the other features, cosine similarity is used<br>
Feature independence is deduced from cosine similarity ( in 3 possible ways)<br>
The method guarantees that a feature will have the maximal independence as far as possible once it has the maximal discernibility<br>

In [44]:
%%time
# custom class code written at the beginning of the notebook
scfs=SCFS(kind='exp')

# evaluate feature scores
scfs.fit(X_train_quasi)

# lets review the feature scores
fig=go.Figure()
gdata=scfs.fscore.sort_values()
fig.add_trace(go.Scatter(x=gdata.index, y=gdata,name='feature score'))
fig.update_xaxes(title="features-->")
fig.update_yaxes(title="scores-->")
fig.show()

fig=go.Figure()
fig.add_trace(go.Scatter(x=scfs.dis,y=scfs.ind,mode='markers',name='discernibility vs independence'))
fig.update_xaxes(title="discernibility-->")
fig.update_yaxes(title="independence-->")
fig.show()

CPU times: user 17.6 s, sys: 73.3 ms, total: 17.7 s
Wall time: 17.5 s


the feature discernibility scales has over powered the feature independence scale, thus the above curve seems asymptotic to axes<br>
let us perform standardisation and then use the SCFS technique

In [45]:
%%time
# custom class code written at the beginning of the notebook
scfs_std=SCFS(kind='exp')

# standardize
scl=StandardScaler()
X_train_std = pd.DataFrame(scl.fit_transform(X_train_quasi),
                           columns=X_train_quasi.columns,index=X_train_quasi.index)

# evaluate feature scores
scfs_std.fit(X_train_std)

# lets review the feature scores
fig=go.Figure()
gdata=scfs_std.fscore.sort_values()
fig.add_trace(go.Scatter(x=gdata.index, y=gdata,name='feature score'))
fig.update_xaxes(title="features-->")
fig.update_yaxes(title="scores-->")
fig.show()

fig=go.Figure()
fig.add_trace(go.Scatter(x=scfs_std.dis,y=scfs_std.ind,mode='markers'))
fig.update_xaxes(title="discernibility-->")
fig.update_yaxes(title="independence-->")
fig.show()

CPU times: user 17.3 s, sys: 101 ms, total: 17.4 s
Wall time: 17.2 s


standard scaler has changed all feature discernibility to unity rendering no meaningful information<br>
let us try minmaxscaler

In [46]:
%%time
# custom class code written at the beginning of the notebook
scfs_mima=SCFS(kind='exp')

# standardize
mima=MinMaxScaler(feature_range=(-1,1))
X_train_mima = pd.DataFrame(mima.fit_transform(X_train_quasi),
                           columns=X_train_quasi.columns,index=X_train_quasi.index)

# evaluate feature scores
scfs_mima.fit(X_train_mima)

# lets review the feature scores
fig=go.Figure()
gdata=scfs_mima.fscore.sort_values()
fig.add_trace(go.Scatter(x=gdata.index, y=gdata,name='feature score'))
fig.update_xaxes(title="features-->")
fig.update_yaxes(title="scores-->")
fig.show()

fig=go.Figure()
fig.add_trace(go.Scatter(x=scfs_mima.dis,y=scfs_mima.ind,mode='markers'))
fig.update_xaxes(title="discernibility-->")
fig.update_yaxes(title="independence-->")
fig.show()

CPU times: user 17.1 s, sys: 59.1 ms, total: 17.1 s
Wall time: 17.1 s


In [47]:
# review potential trimmed feature count
(scfs_mima.fscore>0.2).sum()

44

the above feature scoring plot seems meaningful with an approximate elbow formed around a certain feature score<br>
let us try other 2 independence score kinds using same minmaxscaler

In [48]:
%%time
# custom class code written at the beginning of the notebook
scfs_mima_reci=SCFS(kind='reciprocal')

# standardize
mima=MinMaxScaler(feature_range=(-1,1))
X_train_mima = pd.DataFrame(mima.fit_transform(X_train_quasi),
                           columns=X_train_quasi.columns,index=X_train_quasi.index)

# evaluate feature scores
scfs_mima_reci.fit(X_train_mima)

# lets review the feature scores
fig=go.Figure()
gdata=scfs_mima_reci.fscore.sort_values()
fig.add_trace(go.Scatter(x=gdata.index, y=np.log(gdata),name='feature score')) # graph y-scale enhanced
fig.update_xaxes(title="features-->")
fig.update_yaxes(title="log(scores)-->")
fig.show()

fig=go.Figure()
fig.add_trace(go.Scatter(x=scfs_mima_reci.dis,y=np.log(scfs_mima_reci.ind),mode='markers')) # graph y-scale enhanced
fig.update_xaxes(title="discernibility-->")
fig.update_yaxes(title="log(independence)-->")
fig.show()

CPU times: user 17.7 s, sys: 101 ms, total: 17.8 s
Wall time: 17.6 s


In [49]:
# review potential trimmed feature count
(np.log(scfs_mima_reci.fscore)>0).sum()

31

reciprocal method gives significantly reduced number of features

In [50]:
%%time
# custom class code written at the beginning of the notebook
scfs_mima_as=SCFS(kind='anti-similarity')

# standardize
mima=MinMaxScaler(feature_range=(-1,1))
X_train_mima = pd.DataFrame(mima.fit_transform(X_train_quasi),
                           columns=X_train_quasi.columns,index=X_train_quasi.index)

# evaluate feature scores
scfs_mima_as.fit(X_train_mima)

# lets review the feature scores
fig=go.Figure()
gdata=scfs_mima_as.fscore.sort_values()
fig.add_trace(go.Scatter(x=gdata.index, y=gdata,name='feature score'))
fig.update_xaxes(title="features-->")
fig.update_yaxes(title="scores-->")
fig.show()

fig=go.Figure()
fig.add_trace(go.Scatter(x=scfs_mima_as.dis,y=scfs_mima_as.ind,mode='markers'))
fig.update_xaxes(title="discernibility-->")
fig.update_yaxes(title="independence-->")
fig.show()

CPU times: user 17.6 s, sys: 92.4 ms, total: 17.6 s
Wall time: 17.5 s


In [51]:
# review potential trimmed feature count
(scfs_mima_as.fscore>0.2).sum()

35

since reciproval method produces a better elbow, and returns minimal features<br>
lets choose reciprocal independence method with threshold of 0 scores in log scale

In [52]:
%%time
X_train_SCFS=X_train_mima.copy()

# cur progress
dims=X_train_SCFS.shape

# iteratively reduce features
# using reciprocal method with log threshold of 0
scfs_iter=SCFS(kind='reciprocal')
scfs_iter.fit(X_train_SCFS)
logscore=np.log(scfs_iter.fscore)
thresher=logscore.min()
while thresher<0:
    ind=logscore.argmin()
    feat=scfs_iter.fscore.index[ind]
    X_train_SCFS.drop(feat,axis=1,inplace=True)
    scfs_iter.fit(X_train_SCFS)
    logscore=np.log(scfs_iter.fscore)
    thresher=logscore.min()
    dims=X_train_SCFS.shape

CPU times: user 30min 14s, sys: 3.99 s, total: 30min 18s
Wall time: 30min 10s


In [53]:
# review trimmed shape
X_train_SCFS.shape

(1253, 34)

In [54]:
# review final feature scores
np.log(scfs_iter.fscore).describe()

count    34.000000
mean      0.874429
std       1.385611
min       0.037813
25%       0.203459
50%       0.422168
75%       0.964522
max       7.779985
dtype: float64

all low scored features have been removed leaving 34 features to go ahead

In [55]:
# the above trimmed dataset output is from mimax scalled data
# hence lets obtain same features from original unscaled data
cols=scfs_iter.fscore.index

#filter
X_train_SCFS=X_train_quasi[cols].copy()
X_test_SCFS=X_test_quasi[cols].copy()

In [56]:
# review shape
X_train_SCFS.shape

(1253, 34)

In [57]:
# lets test model performance
dtc_pipe(X_train_SCFS, X_test_SCFS, Y_train, Y_test,"DTC4_SCFS")

Unnamed: 0,accuracy,precision_-1,precision_1,recall_-1,recall_1,fscore_-1,fscore_1
DTC_raw_training,0.950519,0.95273,0.846154,0.99659,0.275,0.974167,0.415094
DTC_raw_test,0.914013,0.925566,0.2,0.986207,0.041667,0.954925,0.068966
DTC2_time_clt_training,0.865922,0.958904,0.221519,0.895141,0.4375,0.925926,0.294118
DTC2_time_clt_test,0.863057,0.942652,0.228571,0.906897,0.333333,0.924429,0.271186
DTC3_quasi_training,0.642458,0.971391,0.119835,0.636829,0.725,0.76931,0.205674
DTC3_quasi_test,0.585987,0.949438,0.110294,0.582759,0.625,0.722222,0.1875
DTC4_SCFS_training,0.705507,0.953725,0.106267,0.720375,0.4875,0.820787,0.174497
DTC4_SCFS_test,0.684713,0.940092,0.113402,0.703448,0.458333,0.804734,0.181818


though the scores seems to have reduced, the divide between training and testing scores have greately reduced, inferring significant reduction in data noise<br>
this justifies the power of SCFS methodology

Let us try to study feature importances from the DTree classifier

In [58]:
%%time
# standardize
scl=StandardScaler()
X_train_std = pd.DataFrame(scl.fit_transform(X_train_SCFS),
                           columns=X_train_SCFS.columns,index=X_train_SCFS.index)
X_test_std = pd.DataFrame(scl.transform(X_test_SCFS),
                          columns=X_test_SCFS.columns,index=X_test_SCFS.index)

# balance training data set
balancer = SMOTE(sampling_strategy='not majority', random_state=129)
X_train_bal, Y_train_bal = balancer.fit_resample(X_train_std,Y_train)

# model learning
dtc=DecisionTreeClassifier(criterion = 'gini', random_state=129)
dtc.fit(X_train_bal,Y_train_bal)

# store feature importances in sequence
fimp = pd.DataFrame(dtc.feature_importances_,index=X_train_bal.columns)
fimp=fimp.sort_values(by=0,ascending=True).index

CPU times: user 266 ms, sys: 12.1 ms, total: 278 ms
Wall time: 69.6 ms


In [59]:
fimp=pd.DataFrame(dtc.feature_importances_,index=X_train_bal.columns).sort_values(by=0,ascending=False)
fig=go.Figure()
y=fimp[0]
x=list(fimp.index)
fig.add_trace(go.Bar(x=x,y=y,name='importance'))
y=np.array(fimp[0]).cumsum()
fig.add_trace(go.Scatter(x=x,y=y,name='cumulative importance'))
fig.update_xaxes(title="features --->")
fig.update_yaxes(title="importance --->")
fig.show()

the above feature importance plot shows a gradual increase/decrease of influence by every next feature<br>
we cannot justify to drop any further features from the above, hence lets move on

-----------------------------------------------------------------------------------------------<br>
by now, the following project statements have been covered in various sections and mentioned here to keep track<br>

1. Import and understand the data.<br>
A. Import ‘signal-data.csv’ as DataFrame.<br>
B. Print 5 point summary and share at least 2 observations.<br>
2. Data cleansing:<br>
A. Write a for loop which will remove all the features with 20%+ Null values and impute rest with mean of the feature.<br>
B. Identify and drop the features which are having same value for all the rows.<br>
C. Drop other features if required using relevant functional knowledge. Clearly justify the same.<br>
   (quasi constants removed, SCFS method applied)<br>
E. Make all relevant modifications on the data using both functional/logical reasoning/assumptions.<br>
   (timestamp features extracted, inherent clusters identified)
4. Data pre-processing:<br>
A. Segregate predictors vs target attributes.<br>
B. Check for target balancing and fix it if found imbalanced.<br>
   (implmented inside pipe method)<br>
C. Perform train-test split and standardise the data or vice versa if required.<br>
   (implmented inside pipe method)
5. Model training, testing and tuning:<br>
A. Use any Supervised Learning technique to train a model.<br>
   (implmented as pipe)

**2.Data cleansing:**<br>
D. Check for multi-collinearity in the data and take necessary action.

In [60]:
# verify correlation in base data
((abs(X_train_quasi.corr())>0.75).sum().sum()-X_train_quasi.shape[1])/2

492.0

there had been 491 cases of multi-colinearity pairs within just 306 features after quasi constant feature elimination

In [61]:
# verify correlation in SCFS data
((abs(X_train_SCFS.corr())>0.75).sum().sum()-X_train_SCFS.shape[1])/2

1.0

there are no case of high correlation, since already SCFS has taken dependence of features in to consideration for feature scoring

In [62]:
z=pd.DataFrame(X_train_SCFS.corr())
fig=ff.create_annotated_heatmap(np.array(z),annotation_text=np.array(z).round(1),
                                colorscale='RdBu',zmin=-1,zmax=1,font_colors = ['Blue','Grey'])
fig.update_layout(height=1000,width=1000)

it could be seen that the maximum correlation is 0.7 or -0.6, hence there is not much multicolinearity, except for countably 2 or 3 pairs

lets investigate further, using Variance Inflation Factors<br>
by definition, the variance inflation factor is a measure for the increase of the variance of the parameter estimates if an additional variable, given by exog_idx is added to the linear regression. It is a measure for multicollinearity of the design matrix, exog.<br>
One recommendation is that if VIF is greater than 5, then the explanatory variable given by exog_idx is highly collinear with the other explanatory variables, and the parameter estimates will have large standard errors because of this.<br>
hence features having VIF above 5 needs to be studied for dropping

In [63]:
%%time
# let us drop features for VIF > 5
X_train_vif=X_train_SCFS.copy()
#obtain vif
cols=X_train_vif.columns
vif = pd.DataFrame(index=cols)
vif["VIF"]=[variance_inflation_factor(X_train_vif.values, i) for i in range(len(cols))]

#lets display the vif summary before trimming
print("for ",X_train_vif.shape[1]," features")
display(vif.describe().T)

while vif.max()[0]>5:
    col=vif.index[np.argmax(vif["VIF"])] # select top vif column
    X_train_vif.drop(col,axis=1,inplace=True)
    #recompute VIF
    del vif
    cols=X_train_vif.columns
    vif = pd.DataFrame(index=cols)
    vif["VIF"]=[variance_inflation_factor(X_train_vif.values, i) for i in range(len(cols))]

# lets review the vif after trimming
print("\nfor ",X_train_vif.shape[1]," features")
display(vif.describe().T)

for  34  features


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VIF,34.0,154.336511,440.343532,1.599582,2.603413,10.50677,23.499655,2349.301543



for  14  features


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VIF,14.0,2.195455,0.677902,1.242474,1.696825,2.069178,2.624982,3.40984


CPU times: user 2.94 s, sys: 72.2 ms, total: 3.02 s
Wall time: 1 s


In [64]:
# lets study the model performance
#test data selection
X_test_vif = X_test_SCFS[X_train_vif.columns]
dtc_pipe(X_train_vif, X_test_vif, Y_train, Y_test,"DTC5_vif")

Unnamed: 0,accuracy,precision_-1,precision_1,recall_-1,recall_1,fscore_-1,fscore_1
DTC_raw_training,0.950519,0.95273,0.846154,0.99659,0.275,0.974167,0.415094
DTC_raw_test,0.914013,0.925566,0.2,0.986207,0.041667,0.954925,0.068966
DTC2_time_clt_training,0.865922,0.958904,0.221519,0.895141,0.4375,0.925926,0.294118
DTC2_time_clt_test,0.863057,0.942652,0.228571,0.906897,0.333333,0.924429,0.271186
DTC3_quasi_training,0.642458,0.971391,0.119835,0.636829,0.725,0.76931,0.205674
DTC3_quasi_test,0.585987,0.949438,0.110294,0.582759,0.625,0.722222,0.1875
DTC4_SCFS_training,0.705507,0.953725,0.106267,0.720375,0.4875,0.820787,0.174497
DTC4_SCFS_test,0.684713,0.940092,0.113402,0.703448,0.458333,0.804734,0.181818
DTC5_vif_training,0.660016,0.95941,0.106818,0.664962,0.5875,0.785498,0.180769
DTC5_vif_test,0.617834,0.938144,0.1,0.627586,0.5,0.752066,0.166667


the recal scores have improved, with significant drop of features<br>
yet going further, let us test the models with both SCFS trimmed & VIF Trimmed data

**2.Data cleansing:**<br>
E. Make all relevant modifications on the data using both functional/logical reasoning/assumptions.

In [65]:
%%time
# lets try skew corrections in the data
rmp=remap() 
# custom class code written at the beginning of the notebook
X_train_vif_rmp=rmp.fit_transform(X_train_vif)
X_test_vif_rmp=rmp.transform(X_test_vif)

# lets study the model performance
dtc_pipe(X_train_vif_rmp, X_test_vif_rmp, Y_train, Y_test,"DTC6_vif+remap")

Unnamed: 0,accuracy,precision_-1,precision_1,recall_-1,recall_1,fscore_-1,fscore_1
DTC_raw_training,0.950519,0.95273,0.846154,0.99659,0.275,0.974167,0.415094
DTC_raw_test,0.914013,0.925566,0.2,0.986207,0.041667,0.954925,0.068966
DTC2_time_clt_training,0.865922,0.958904,0.221519,0.895141,0.4375,0.925926,0.294118
DTC2_time_clt_test,0.863057,0.942652,0.228571,0.906897,0.333333,0.924429,0.271186
DTC3_quasi_training,0.642458,0.971391,0.119835,0.636829,0.725,0.76931,0.205674
DTC3_quasi_test,0.585987,0.949438,0.110294,0.582759,0.625,0.722222,0.1875
DTC4_SCFS_training,0.705507,0.953725,0.106267,0.720375,0.4875,0.820787,0.174497
DTC4_SCFS_test,0.684713,0.940092,0.113402,0.703448,0.458333,0.804734,0.181818
DTC5_vif_training,0.660016,0.95941,0.106818,0.664962,0.5875,0.785498,0.180769
DTC5_vif_test,0.617834,0.938144,0.1,0.627586,0.5,0.752066,0.166667


CPU times: user 423 ms, sys: 8.05 ms, total: 431 ms
Wall time: 146 ms


In [66]:
%%time
# lets try skew corrections in the data
rmp=remap() 
# custom class code written at the beginning of the notebook
X_train_SCFS_rmp=rmp.fit_transform(X_train_SCFS)
X_test_SCFS_rmp=rmp.transform(X_test_SCFS)

# lets study the model performance
dtc_pipe(X_train_SCFS_rmp, X_test_SCFS_rmp, Y_train, Y_test,"DTC7_SCFS+remap")

Unnamed: 0,accuracy,precision_-1,precision_1,recall_-1,recall_1,fscore_-1,fscore_1
DTC_raw_training,0.950519,0.95273,0.846154,0.99659,0.275,0.974167,0.415094
DTC_raw_test,0.914013,0.925566,0.2,0.986207,0.041667,0.954925,0.068966
DTC2_time_clt_training,0.865922,0.958904,0.221519,0.895141,0.4375,0.925926,0.294118
DTC2_time_clt_test,0.863057,0.942652,0.228571,0.906897,0.333333,0.924429,0.271186
DTC3_quasi_training,0.642458,0.971391,0.119835,0.636829,0.725,0.76931,0.205674
DTC3_quasi_test,0.585987,0.949438,0.110294,0.582759,0.625,0.722222,0.1875
DTC4_SCFS_training,0.705507,0.953725,0.106267,0.720375,0.4875,0.820787,0.174497
DTC4_SCFS_test,0.684713,0.940092,0.113402,0.703448,0.458333,0.804734,0.181818
DTC5_vif_training,0.660016,0.95941,0.106818,0.664962,0.5875,0.785498,0.180769
DTC5_vif_test,0.617834,0.938144,0.1,0.627586,0.5,0.752066,0.166667


CPU times: user 475 ms, sys: 8.03 ms, total: 483 ms
Wall time: 192 ms


skew correction has decreased the scores in case of VIF trimmed dataset<br>
where as for SCFS trimmed dataset there is no change in scores

apart from above skew correction,<br>
timestamp feature extraction<br>
and cluster feature extraction<br>
were performed earlier in the notebook

In [67]:
# let us check for duplicate rows
display(X_train_SCFS.duplicated().sum())
display(X_train_vif.duplicated().sum())

0

0

there are no duplicate records

In [68]:
# let us check for duplicate features
display(X_train_SCFS.T.duplicated().sum())
display(X_train_vif.T.duplicated().sum())

0

0

as expected, the SCFC method and VIF methods would have removed any duplicated features out of similarity & collinearity

# -----------------------------------------------------------------------------------------------------------

**3. Data analysis & visualisation:**<br>
A. Perform a detailed univariate Analysis with appropriate detailed comments after each analysis.<br>
B. Perform bivariate and multivariate analysis with appropriate detailed comments after each analysis.<br>

In [69]:
# lets study the columns in order of importance
passind=Y_train.loc[Y_train==-1].index
failind=Y_train.loc[Y_train==1].index

In [70]:
def univar(X,col):
    """display univariate plots"""
    a=X[col].loc[passind]
    b=X[col].loc[failind]
    
    if X[col].nunique()>10:
        fig = make_subplots(rows=2,cols=1)
        fig2 = ff.create_distplot([a,b],['Pass', 'Fail'],curve_type='kde',
                                  bin_size=int(X[col].nunique()/10),show_hist=True,show_rug=False)
        fig.add_trace(go.Histogram(fig2.data[0],marker=dict(color='#1E90FF',opacity=0.4)),1,1)
        fig.add_trace(go.Histogram(fig2.data[1],marker=dict(color='#FF4500',opacity=0.4)),1,1)
        fig.add_trace(go.Scatter(fig2.data[2],line=dict(color='#1E90FF')),1,1)
        fig.add_trace(go.Scatter(fig2.data[3],line=dict(color='#FF4500')),1,1)
        text="continuous feature %s<br>pass mean %.2f, fail mean %.2f\
        <br>pass median %.2f, fail median %.2f"%(col,a.mean(),b.mean(),a.median(),b.median())
        fig.update_layout(title=text)
        
        fig.add_trace(go.Box(x=a,name='Pass',hovertemplate='%{x}',jitter=1,marker=dict(color='#1E90FF')),2,1)
        fig.add_trace(go.Box(x=b,name='Fail',hovertemplate='%{x}',jitter=1,marker=dict(color='#FF4500')),2,1)
        fig.show()
              
    else:
        fig=go.Figure()
        fig2 = ff.create_distplot([a,b],['Pass', 'Fail'],show_curve=True,show_hist=True,show_rug=False)
        fig.add_trace(go.Histogram(fig2.data[0],xbins=dict(start=0,end=X[col].nunique(),size=0.5),
                                   marker=dict(color='#1E90FF',opacity=0.4)))
        fig.add_trace(go.Histogram(fig2.data[1],xbins=dict(start=0,end=X[col].nunique(),size=0.5),
                                   marker=dict(color='#FF4500',opacity=0.4)))
        fig.add_trace(go.Scatter(fig2.data[2],line=dict(color='#1E90FF')))
        fig.add_trace(go.Scatter(fig2.data[3],line=dict(color='#FF4500')))
        text="discrete feature %s<br>pass mean %.2f, fail mean %.2f"%(col,a.mean(),b.mean())
        fig.update_layout(title=text)  
        fig.show()
    return a,b

In [71]:
#lets study the features in VIF trimmed dataset 
cols=X_train_vif.columns
univar(X_train_vif,cols[0]);

the data follows a normal distribution, but with several extreme values on either sides
means of pass & fail classes are not far off compared to the range of the data

In [72]:
univar(X_train_vif,cols[1]);

follows bell curve<br>
difference in target class means found

In [73]:
univar(X_train_vif,cols[2]);

does not follow normal distribution<br>
twin peaks found, one near 0, and another near 400<br>
difference in target class means found

In [74]:
univar(X_train_vif,cols[3]);

does not follow normal distribution<br>
twin peaks found, one near 0, and another near 400<br>
difference in target class means found, but on the opposite direction from previous feature

In [75]:
univar(X_train_vif,cols[4]);

In [76]:
univar(X_train_vif,cols[5]);

above two features exhibit similar distributions<br>
yet SCFS & VIF methods has confimed that these are not related

In [77]:
univar(X_train_vif,cols[6]);

a skewed bell curve found<br>
central tendencies are close for target classes

In [78]:
univar(X_train_vif,cols[7]);

twin peak found

In [79]:
univar(X_train_vif,cols[8]);

heavy peak found close to zero, causing high skewed distribution

In [80]:
univar(X_train_vif,cols[9]);

In [81]:
univar(X_train_vif,cols[10]);

above two features exhibit similar distributions<br>
yet SCFS & VIF methods has confimed that these are not related

In [82]:
univar(X_train_vif,cols[11]);

close to uniform distribution

In [83]:
univar(X_train_vif,cols[12]);

In [84]:
univar(X_train_vif,cols[13]);

above two features exhibite near uniform distribution

last two features are synthesised features of inherent data clusters<br>
doesn't follow any distribution

Let plot a bivariate pair plot and study further

In [86]:
gdata=X_train_vif.merge(Y_train.astype('object'),how='inner',left_index=True,right_index=True)
dims=gdata.columns
fig = px.scatter_matrix(gdata,dimensions=dims,color='Pass/Fail',symbol='Pass/Fail',opacity=0.5,height=800)                        
fig.show()

from the above plot one may not be able to decipher any relations as the datapoints have clouded all over their space<br>
let us study further

In [87]:
passind=Y_train.loc[Y_train==-1].index
failind=Y_train.loc[Y_train==1].index

In [88]:
def bivar(X,a,b):
    x1=X[a].loc[passind]
    x2=X[a].loc[failind]
    y1=X[b].loc[passind]
    y2=X[b].loc[failind]
    
    fig1=go.Scatter(x=x1,y=y1,mode='markers',marker=dict(color="#1E90FF",opacity=1,size=5),name='Pass')
    fig2=go.Scatter(x=x2,y=y2,mode='markers',marker=dict(color="#FF4500",opacity=1,size=6),name='Fail')
    fig3=go.Histogram2dContour(x=x2,y=y2,showscale=False,reversescale=True,ncontours=12,name='FailDensity',
                           colorscale=[[0.0, 'rgb(150,150,150)'], [1.0, 'rgb(255, 255, 255)']],showlegend=True)
    
    fig=go.Figure()
    fig.add_trace(fig1)
    fig.add_trace(fig2)
    fig.add_trace(fig3)
    fig.update_layout(height=650,width=650)
    fig.update_xaxes(title="Feature "+a)
    fig.update_yaxes(title="Feature "+b)
    fig.show()

In [89]:
# let us study multivariate distributions of top 5 important features in VIF trimmed dataset
cols=fimp[fimp.index.isin(X_train_vif.columns)].index
for i,a in enumerate(cols):
    for b in cols[(i+1):5]:
        bivar(X_train_vif,a,b)
        input("comments: ")        

comments: the fail class seems to be spread over the entire region, more concetrated in the etream values


comments: the fail class is concentrated around 0.5 of feature 129, and majorily around 250 of feature 486


comments: triple peaks of failure occurances found around (0,0),(400,0),(250,400)


comments: failure classes occur along the 2 bud shaped zones in a majority


comments: peaking failures around (500,0.5)


comments: two peak zones (0-500,0) and (500,400) found to be more prone to failure


comments: more concetration close to origin


comments: two peaks observed around (0.5,0) and (0.5,400)


comments: very high probabilty around the origin


comments: upto 5 clusters could be found, with prominent ones around origin and around (400,100)


# -----------------------------------------------------------------------------------------------------------

**4. Data pre-processing:**<br>
D. Check if the train and test data have similar statistical characteristics when compared with original data.

In [90]:
# lets create a table of feature summary characteristics
# training predictors
X_train_stats=X_train_SCFS.describe()[1:].T
X_train_stats["skew"]=X_train_SCFS.skew()
X_train_stats["kurt"]=X_train_SCFS.kurt()
# testing predictors
X_test_stats=X_test_SCFS.describe()[1:].T
X_test_stats["skew"]=X_test_SCFS.skew()
X_test_stats["kurt"]=X_test_SCFS.kurt()

In [91]:
# targets
Y_stats=pd.DataFrame(index=["train","test"],columns=["-1 count","1 count","ratio -1:1"])
stat=Y_train.value_counts()
Y_stats.iloc[0]=[stat.loc[-1],stat.loc[1],stat.loc[-1]/stat.loc[1]]
stat=Y_test.value_counts()
Y_stats.iloc[1]=[stat.loc[-1],stat.loc[1],stat.loc[-1]/stat.loc[1]]
display(Y_stats)

Unnamed: 0,-1 count,1 count,ratio -1:1
train,1173,80,14.6625
test,290,24,12.083333


the target class is almost equally distributed in the training & testing dataset

In [92]:
# lets compare the stats with a random column
def ttcomp(col):
    n=col
    fig=make_subplots(rows=1,cols=2)
    #------------------------------
    cols=X_train_stats.columns[:-2]
    fig.add_trace(go.Bar(x=cols,y=X_train_stats.loc[n,cols],
                         marker=dict(color="#87CEEB",opacity=0.8),name="training"),1,1)
    fig.add_trace(go.Bar(x=cols,y=X_test_stats.loc[n,cols],
                         marker=dict(color="#A9A9A9",opacity=0.8),name="testing"),1,1)
    #------------------------------
    cols=X_train_stats.columns[-2:]
    fig.add_trace(go.Bar(x=cols,y=X_train_stats.loc[n,list(cols)],
                         marker=dict(color="#87CEEB",opacity=0.8),name="training",showlegend=False),1,2)
    fig.add_trace(go.Bar(x=cols,y=X_test_stats.loc[n,list(cols)],
                         marker=dict(color="#A9A9A9",opacity=0.8),name="testing",showlegend=False),1,2)
    fig.update_layout(title="comparison of train & test data ditribution w.r.t. column %s"%n)
    fig.show()
    #------------------------------
    # lets visualise the distribution

    fig=go.Figure()
    #------------------------------
    passind=Y_train.loc[Y_train==-1].index
    failind=Y_train.loc[Y_train==1].index
    a=X_train_SCFS.loc[passind,n]
    b=X_train_SCFS.loc[failind,n]
    fig2 = ff.create_distplot([a,b],['Pass', 'Fail'],curve_type='kde',show_hist=False,show_rug=False)
    fig.add_trace(go.Scatter(fig2.data[0],line=dict(color='#1E90FF'),name='train/pass'))
    fig.add_trace(go.Scatter(fig2.data[1],line=dict(color='#FF4500'),name='train/fail'))
    #------------------------------
    passind=Y_test.loc[Y_test==-1].index
    failind=Y_test.loc[Y_test==1].index
    a=X_test_SCFS[n].loc[passind]
    b=X_test_SCFS[n].loc[failind]
    fig2 = ff.create_distplot([a,b],['Pass', 'Fail'],curve_type='kde',show_hist=False,show_rug=False)
    fig.add_trace(go.Scatter(fig2.data[0],line=dict(color='#7B68EE'),name='test/pass'))
    fig.add_trace(go.Scatter(fig2.data[1],line=dict(color='#CD5C5C'),name='test/fail'))
    fig.show()

In [93]:
ttcomp(fimp.index[10])

most of the statistical indices of the feature are closely similar in training & testing dataset

# -----------------------------------------------------------------------------------------------------------

**5. Model training, testing and tuning:**<br>
A. Use any Supervised Learning technique to train a model.<br>
B. Use cross validation techniques.<br>
Hint: Use all CV techniques that you have learnt in the course.<br>
C.
Apply hyper-parameter tuning techniques to get the best accuracy.<br>
Suggestion: Use all possible hyper parameter combinations to extract the best accuracies.<br>
D.
Use any other technique/method which can enhance the model performance.<br>
Hint: Dimensionality reduction, attribute removal, standardisation/normalisation, target balancing etc.<br>
E. Display and explain the classi ication report in detail. <br>
F. Apply the above steps for all possible models that you have learnt so far.

In [123]:
try:
    del scoreLog
    print("scoreLog deleted")
except:
    print("scoreLog undefined")

scoreLog deleted


In [124]:
# lets encode target class to from (pass,fail)=(-1,1) to (0,1)
Y_train.loc[Y_train==-1]=0
Y_test.loc[Y_test==-1]=0

In [125]:
models = [ XGBClassifier(objective='reg:logistic',n_jobs=-1,eval_metric='rmse',use_label_encoder=False),
          DecisionTreeClassifier(criterion = 'gini', max_depth = 3, random_state=1),
          RandomForestClassifier(n_estimators = 50, random_state=1,max_features=12),
          LogisticRegression(solver="liblinear"),
          KNeighborsClassifier(n_neighbors= 5 , weights = 'distance'),
          SVC(gamma=0.025, C=3) 
         ]

In [126]:
xgbcp=dict(eta=np.arange(0.01,0.2,0.01),
           max_depth=np.arange(3,10,1),
           colsample_bytree=np.arange(0.5,1,0.1),
          )
lrp=dict(C=np.logspace(-5, 8, 15))
dtcp= {"max_depth": [3, None],
       "max_features": np.arange(5,15,1),
       "min_samples_leaf": np.arange(1,9,1),
       "criterion": ["gini", "entropy"]}
rdcp=dtcp
kncp=dict(n_neighbors=np.arange(3,9,2),
         weights=['uniform','distance'],
         leaf_size=np.arange(10,100,5))
svcp=dict(C=np.arange(1,10,1),
         kernel=['linear','rbf'])
params=[xgbcp,dtcp,rdcp,lrp,kncp,svcp]

In [127]:
%%time
# learn first model with KFold CrossValidation
kfold=KFold(n_splits=10,random_state=129,shuffle=True)
model=models[0]
results=cross_val_score(model,X_train_vif,Y_train,cv=kfold)

CPU times: user 5.43 s, sys: 36.2 ms, total: 5.46 s
Wall time: 717 ms


In [128]:
# lets review results
print(results)
print("\nMean Accuracy: %.2f\n95%% confidence interval: %.2f"%(
    results.mean(),results.std()))

[0.92063492 0.88095238 0.96825397 0.936      0.928      0.92
 0.944      0.92       0.936      0.92      ]

Mean Accuracy: 0.93
95% confidence interval: 0.02


In [129]:
%%time
# lets try another LOOCV
loocv = LeaveOneOut()
model=models[0]
results=cross_val_score(model,X_train_vif,Y_train,cv=loocv)
# lets review results
print(results)
print("\nMean Accuracy: %.2f\n95%% confidence interval: %.2f"%(
    results.mean(),results.std()))

[1. 1. 1. ... 1. 0. 1.]

Mean Accuracy: 0.93
95% confidence interval: 0.26
CPU times: user 11min 7s, sys: 6.05 s, total: 11min 13s
Wall time: 1min 25s


the LOOCV provides a caution about widened confidence interval,<br>
yet consumes more compute times.<br>
for the upcoming models lets stick to KFold

In [130]:
# before tuning, lets review test scores
model.fit(X_train_vif,Y_train)
print("Test Accuracy : %.2f"%metrics.accuracy_score(Y_test,model.predict(X_test_vif)))

Test Accuracy : 0.92


lower than train data

In [131]:
%%time
tune=RandomizedSearchCV(estimator=model,param_distributions=xgbcp,
                        cv=10,scoring="accuracy");
tune.fit(X_train_vif,Y_train);

CPU times: user 43.3 s, sys: 965 ms, total: 44.3 s
Wall time: 5.59 s


RandomizedSearchCV(cv=10,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           eval_metric='rmse', gamma=0,
                                           gpu_id=-1, importance_type='gain',
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_delta_step=0, max_depth=6,
                                           min_child_weight=1, missing=nan,
                                           monotone_constraints='()',
                                           n_estimators...
                                           reg_lambda=1, scale_pos_weight=1,
                                           subsample=1, tree

In [132]:
tune.best_params_.keys()

dict_keys(['max_depth', 'eta', 'colsample_bytree'])

In [133]:
%%time
#lets fit to tuned parameters
model.set_params(**tune.best_params_)
kfold=KFold(n_splits=10,random_state=129,shuffle=True)
results=cross_val_score(model,X_train_vif,Y_train,cv=kfold)
# lets review results
print(results)
print("\nMean Accuracy: %.2f\n95%% confidence interval: %.2f"%(
    results.mean(),results.std()))

[0.92063492 0.88888889 0.97619048 0.944      0.928      0.928
 0.952      0.936      0.936      0.928     ]

Mean Accuracy: 0.93
95% confidence interval: 0.02
CPU times: user 5.3 s, sys: 32 ms, total: 5.33 s
Wall time: 681 ms


In [134]:
# lets review test scores
model.fit(X_train_vif,Y_train)
print("Test Accuracy : %.2f"%metrics.accuracy_score(Y_test,model.predict(X_test_vif)))

Test Accuracy : 0.92


Test accuracy not improved

In [135]:
%%time
# lets use all other  techniques
# standardize
scl=StandardScaler()
X_train_std = pd.DataFrame(scl.fit_transform(X_train_vif),
                           columns=X_train_vif.columns,index=X_train_vif.index)
X_test_std = pd.DataFrame(scl.transform(X_test_vif),
                          columns=X_test_vif.columns,index=X_test_vif.index)

# balance training data set
balancer = SMOTE(sampling_strategy='not majority', random_state=129)
X_train_bal, Y_train_bal = balancer.fit_resample(X_train_std,Y_train)

#hypertune
tune=RandomizedSearchCV(estimator=model,param_distributions=xgbcp,
                        cv=10,scoring="accuracy");
tune.fit(X_train_bal,Y_train_bal)

#lets fit to tuned parameters
model.set_params(**tune.best_params_)
kfold=KFold(n_splits=10,random_state=129,shuffle=True)
results=cross_val_score(model,X_train_bal,Y_train_bal,cv=kfold)

# lets review results
print(results)
print("\nMean Accuracy: %.2f\n95%% confidence interval: %.2f"%(
    results.mean(),results.std()))

model.fit(X_train_bal,Y_train_bal)
print("Test Accuracy : %.2f"%metrics.accuracy_score(Y_test,model.predict(X_test_std)))

[0.96595745 0.95319149 0.97021277 0.98723404 0.95319149 0.97446809
 0.98717949 0.98717949 0.97435897 0.96153846]

Mean Accuracy: 0.97
95% confidence interval: 0.01
Test Accuracy : 0.92
CPU times: user 1min 30s, sys: 616 ms, total: 1min 31s
Wall time: 11.5 s


test accuracy dropped lower<br>
need more trials<br>
lets build a custome pipe

In [136]:
scoreLog=pd.DataFrame(columns=["model_obj","Train_Acc","Test_Acc"])

In [137]:
def train_test_tune(estimator, p_grid, dset, scaler, skew_corr, mname):
    clf=estimator
    [X_train,Y_train,X_test,Y_test]=dset
    
    # balance
    balancer = SMOTE(sampling_strategy='not majority', random_state=129)
    X_train, Y_train = balancer.fit_resample(X_train,Y_train)
    
    # skew correction
    if skew_corr:
        rmp=remap()
        X_train=rmp.fit_transform(X_train)
        X_test=rmp.transform(X_test)
    
    # scale
    if scaler=="z-score":
        scl=StandardScaler()
    elif scaler=="minmax":
        scl=MinMaxScaler()
    X_train = pd.DataFrame(scl.fit_transform(X_train),columns=X_train.columns,index=X_train.index)
    X_test = pd.DataFrame(scl.transform(X_test),columns=X_test.columns,index=X_test.index)
    
    #hypertune
    tune=RandomizedSearchCV(estimator=clf,param_distributions=p_grid,
                            cv=10,scoring="accuracy")
    tune.fit(X_train,Y_train)
    
    #lets check tuned accuracy
    clf.set_params(**tune.best_params_)
    kfold=KFold(n_splits=10,random_state=129,shuffle=True)
    results=cross_val_score(clf,X_train,Y_train,cv=kfold)
    
    #lets fit the model
    clf.fit(X_train,Y_train)
    testscore=metrics.accuracy_score(Y_test,model.predict(X_test))
    
    #lets store the case
    score=[clf,results.mean(),testscore]
    scoreLog.loc[mname]=score
    return X_test

In [138]:
%%time
#pass dataset
dset=[X_train_vif,Y_train,X_test_vif,Y_test]
# fit model
train_test_tune(models[0],xgbcp,dset,'minmax',True,"XGBC_mima_skew")
#review Scores
scoreLog

CPU times: user 1min 17s, sys: 708 ms, total: 1min 18s
Wall time: 9.91 s


Unnamed: 0,model_obj,Train_Acc,Test_Acc
XGBC_mima_skew,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.967181,0.914013


In [139]:
%%time
#pass dataset
dset=[X_train_vif,Y_train,X_test_vif,Y_test]
# fit model
train_test_tune(models[1],params[1],dset,'z-score',True,"DTC_Z_skew")
#review Scores
scoreLog

CPU times: user 2.58 s, sys: 28.1 ms, total: 2.61 s
Wall time: 1.35 s


Unnamed: 0,model_obj,Train_Acc,Test_Acc
XGBC_mima_skew,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.967181,0.914013
DTC_Z_skew,"DecisionTreeClassifier(criterion='entropy', ma...",0.891737,0.898089


In [140]:
%%time
#pass dataset
dset=[X_train_vif,Y_train,X_test_vif,Y_test]
# fit model
train_test_tune(models[2],params[2],dset,'z-score',True,"RDC_Z_skew")
#review Scores
scoreLog

CPU times: user 28.4 s, sys: 72 ms, total: 28.4 s
Wall time: 27.2 s


Unnamed: 0,model_obj,Train_Acc,Test_Acc
XGBC_mima_skew,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.967181,0.914013
DTC_Z_skew,"DecisionTreeClassifier(criterion='entropy', ma...",0.891737,0.898089
RDC_Z_skew,"(DecisionTreeClassifier(max_features=6, random...",0.964199,0.898089


In [141]:
%%time
#pass dataset
dset=[X_train_vif,Y_train,X_test_vif,Y_test]
# fit model
train_test_tune(models[3],params[3],dset,'minmax',True,"LR_mima_skew")
#review Scores
scoreLog

CPU times: user 1.71 s, sys: 15.9 ms, total: 1.73 s
Wall time: 663 ms


Unnamed: 0,model_obj,Train_Acc,Test_Acc
XGBC_mima_skew,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.967181,0.914013
DTC_Z_skew,"DecisionTreeClassifier(criterion='entropy', ma...",0.891737,0.898089
RDC_Z_skew,"(DecisionTreeClassifier(max_features=6, random...",0.964199,0.898089
LR_mima_skew,"LogisticRegression(C=2275.845926074791, solver...",0.658125,0.914013


In [142]:
%%time
#pass dataset
dset=[X_train_vif,Y_train,X_test_vif,Y_test]
# fit model
train_test_tune(models[4],params[4],dset,'z-score',True,"KNC_Z_skew")
#review Scores
scoreLog

CPU times: user 2.59 s, sys: 43.9 ms, total: 2.64 s
Wall time: 1.51 s


Unnamed: 0,model_obj,Train_Acc,Test_Acc
XGBC_mima_skew,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.967181,0.914013
DTC_Z_skew,"DecisionTreeClassifier(criterion='entropy', ma...",0.891737,0.898089
RDC_Z_skew,"(DecisionTreeClassifier(max_features=6, random...",0.964199,0.898089
LR_mima_skew,"LogisticRegression(C=2275.845926074791, solver...",0.658125,0.914013
KNC_Z_skew,"KNeighborsClassifier(leaf_size=35, n_neighbors...",0.849514,0.898089


In [143]:
%%time
#pass dataset
dset=[X_train_vif,Y_train,X_test_vif,Y_test]
# fit model
train_test_tune(models[5],params[5],dset,'z-score',True,"SVC_Z_skew")
#review Scores
scoreLog

CPU times: user 31.8 s, sys: 35.4 ms, total: 31.9 s
Wall time: 30.8 s


Unnamed: 0,model_obj,Train_Acc,Test_Acc
XGBC_mima_skew,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.967181,0.914013
DTC_Z_skew,"DecisionTreeClassifier(criterion='entropy', ma...",0.891737,0.898089
RDC_Z_skew,"(DecisionTreeClassifier(max_features=6, random...",0.964199,0.898089
LR_mima_skew,"LogisticRegression(C=2275.845926074791, solver...",0.658125,0.914013
KNC_Z_skew,"KNeighborsClassifier(leaf_size=35, n_neighbors...",0.849514,0.898089
SVC_Z_skew,"SVC(C=9, gamma=0.025)",0.928803,0.898089


the best model was XGBClassifer owing to excellent boosting trees

In [144]:
# select best model
ind=scoreLog["Test_Acc"].argmax()
best=scoreLog.loc[scoreLog.index[ind],["model_obj"]]
#pickle it
pickle.dump(best[0],open("best_model.bhar",'wb'))

In [145]:
#pass dataset
dset=[X_train_vif,Y_train,X_test_vif,Y_test]
# fit model
X_tested=train_test_tune(models[0],xgbcp,dset,'minmax',True,"XGBC_mima_skew")

In [146]:
print(metrics.classification_report(Y_test,best[0].predict(X_tested)))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95       290
           1       0.00      0.00      0.00        24

    accuracy                           0.91       314
   macro avg       0.46      0.49      0.48       314
weighted avg       0.85      0.91      0.88       314



the accuracy has come to 91%, but test recall has been very poor, due to over fit

# -----------------------------------------------------------------------------------------------------------