<h2> Problem type: Spot-checking (shotgun approach) ML algorithms for multiclass classification </h2>

Spot-checking algorithms is about getting a quick assessment of a bunch of different algorithms on your machine learning problem so that you know what algorithms to focus on and what to discard.

This Python notebook compares multiple algorithms, one scoring metric at a time, using `cross_validate`.

In [54]:
import warnings

from numpy import mean
from numpy import std
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression,LogisticRegressionCV,RidgeClassifier,RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn import metrics
# NOTE(review): `sklearn.metrics.scorer` appears to be a deprecated module path
# in newer scikit-learn releases; if this import fails, try
# `from sklearn.metrics import make_scorer, jaccard_score` -- confirm against
# the installed version.
from sklearn.metrics.scorer import make_scorer,jaccard_score

In [55]:
# Define multiple algorithms for the multiclass classification problem.
def define_classifiers(classifiers=None):
    """Build the dictionary of classifiers to spot-check.

    Args:
        classifiers: Optional dict to add the models to. It is updated in
            place. A new dict is created when the argument is omitted.

    Returns:
        The dict mapping a short model name to an unfitted classifier
        instance.
    """
    # Create a fresh dict per call: a `dict()` default is evaluated only once
    # and would be shared (and mutated) by every call that omits the argument.
    if classifiers is None:
        classifiers = {}

    # linear models
    classifiers['log_clf'] = LogisticRegression()
    classifiers['logcv_clf'] = LogisticRegressionCV()
    for alpha in (0.1, 1.0):
        classifiers['ridge_clf-' + str(alpha)] = RidgeClassifier(alpha=alpha)
    classifiers['ridgeCV_clf'] = RidgeClassifierCV(cv=3)
    classifiers['lsvc_clf'] = LinearSVC()

    # non-linear models
    # NOTE: range(1, 3, 2) yields only k=1; widen the range to try more
    # neighbourhood sizes.
    for k in range(1, 3, 2):
        classifiers['knn_clf-' + str(k)] = KNeighborsClassifier(n_neighbors=k)
    classifiers['dt_clf'] = DecisionTreeClassifier()
    classifiers['et_clf'] = ExtraTreeClassifier()
    classifiers['gnb_clf'] = GaussianNB()
    classifiers['mlp_clf'] = MLPClassifier(alpha=1, max_iter=1000)

    # ensemble models
    n_trees = 100
    classifiers['rf_ensemble_clf'] = RandomForestClassifier(n_estimators=n_trees)
    classifiers['et_ensemble_clf'] = ExtraTreesClassifier(n_estimators=n_trees)

    print('Defined %d classifiers' % len(classifiers))
    return classifiers

In [56]:
# https://scikit-learn.org/stable/modules/model_evaluation.html#from-binary-to-multiclass-and-multilabel
def scoring_parameter_multiclass(metrics=None):
    """Build the scorers used to compare the classifiers.

    The averaged metrics (f1, precision, recall, jaccard) use 'macro'
    averaging here; see the link above for the other averaging options.

    Args:
        metrics: Optional dict to add the scorers to. It is updated in place.
            A new dict is created when the argument is omitted. (Inside this
            function the name shadows the `sklearn.metrics` module imported
            at file level; it is kept for backward compatibility.)

    Returns:
        The dict mapping a metric name to a value for the `scoring`
        argument: either a predefined scorer name or a scorer object built
        with `make_scorer`.
    """
    # Create a fresh dict per call: a `dict()` default is evaluated only once
    # and would be shared (and mutated) by every call that omits the argument.
    if metrics is None:
        metrics = {}

    metrics['accuracy'] = 'accuracy'
    metrics['balanced_accuracy'] = 'balanced_accuracy'
    metrics['f1'] = 'f1_macro'
    metrics['neg_log_loss'] = 'neg_log_loss'
    metrics['precision'] = 'precision_macro'
    metrics['recall'] = 'recall_macro'
    # Built with make_scorer rather than a predefined scorer name so the
    # averaging mode is passed explicitly.
    metrics['jaccard'] = make_scorer(jaccard_score, average='macro')

    return metrics

In [57]:
# Chain the preprocessing steps and the model into one estimator.
def create_pipeline(model):
    """Wrap *model* in a preprocessing pipeline.

    The steps run in this order: 'standardize' (StandardScaler),
    'normalize' (MinMaxScaler), then 'classifier' (the given model).

    Args:
        model: The estimator to place as the final pipeline step.

    Returns:
        The assembled Pipeline object.
    """
    stages = [
        ('standardize', StandardScaler()),
        ('normalize', MinMaxScaler()),
        ('classifier', model),
    ]
    return Pipeline(steps=stages)

For `cross_validate` (and `cross_val_score`), the `scoring` parameter controls which metric is applied to the estimators being evaluated.
https://scikit-learn.org/stable/modules/model_evaluation.html

In [58]:
# Cross-validate a single model.
def evaluate_model(X, y, model, folds, metric):
    """Cross-validate *model*, wrapped in the preprocessing pipeline.

    Args:
        X: Feature data passed through to `cross_validate`.
        y: Target data passed through to `cross_validate`.
        model: The estimator to evaluate; it is wrapped by `create_pipeline`.
        folds: Value passed as the `cv` argument.
        metric: Value passed as the `scoring` argument.

    Returns:
        The result of `cross_validate`, computed without train scores and
        with `n_jobs=-1`.
    """
    return cross_validate(
        create_pipeline(model),
        X,
        y,
        scoring=metric,
        cv=folds,
        return_train_score=False,
        n_jobs=-1,
    )

In [59]:
# Evaluate a model, trapping errors and hiding warnings.
def handle_warning(X, y, model, folds, metric):
    """Run `evaluate_model` with warnings silenced and errors trapped.

    Args:
        X: Feature data forwarded to `evaluate_model`.
        y: Target data forwarded to `evaluate_model`.
        model: The estimator to evaluate.
        folds: Cross-validation setting forwarded to `evaluate_model`.
        metric: Scoring setting forwarded to `evaluate_model`.

    Returns:
        Whatever `evaluate_model` returns, or None when the evaluation
        raised an exception.
    """
    try:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            return evaluate_model(X, y, model, folds, metric)
    # Only trap ordinary errors: a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit and make a long run impossible to stop.
    except Exception:
        return None

Different metrics are available for multiclass classification:
https://scikit-learn.org/stable/modules/model_evaluation.html#from-binary-to-multiclass-and-multilabel

In [60]:
# Evaluate every classifier against every metric.
def evaluate_models(X, y, classifiers, metrics, folds=10):
    """Cross-validate each classifier once per metric and collect the scores.

    Progress is printed as it goes: the metric name, then one line per
    classifier with its mean test score, or 'error' when evaluation failed.

    Args:
        X: Feature data forwarded to `handle_warning`.
        y: Target data forwarded to `handle_warning`.
        classifiers: Dict of {name: model} to evaluate.
        metrics: Dict of {scoring name: scoring value} to evaluate with.
        folds: Cross-validation setting forwarded to `handle_warning`.

    Returns:
        Dict keyed by the tuple (classifier name, scoring name) holding the
        'test_score' entry of each successful evaluation. Failed
        evaluations are omitted.
    """
    collected = {}
    for scoring_name, scorer in metrics.items():
        print('%s' % scoring_name)
        for clf_name, clf in classifiers.items():
            outcome = handle_warning(X, y, clf, folds, scorer)
            if outcome is None:
                # evaluation failed; report it and move on
                print('>%s: error' % clf_name)
                continue
            fold_scores = outcome['test_score']
            collected[clf_name, scoring_name] = fold_scores
            print('>%s: %.3f' % (clf_name, mean(fold_scores)))
    return collected

In [61]:
def summarize_results(results):
    """Draw one box plot per metric comparing the algorithms' scores.

    Prints 'no results' and returns when *results* is empty; otherwise the
    figure is saved to 'output.png'.

    Args:
        results: Dict keyed by (algorithm name, metric name) holding the
            scores for that pair, as built by `evaluate_models`.
    """
    # check for no results (also covers None)
    if not results:
        print('no results')
        return

    # The tuple keys become a two-level column index; unstacking then gives
    # one row per (algorithm, metric, score position).
    df = pd.DataFrame.from_dict(results).unstack().to_frame()
    # convert index to columns
    df.reset_index(inplace=True)
    df.columns = ['algorithms', 'metrics', 'number', 'score']

    sns.set_context("paper", font_scale=2)  # increase font size
    g = sns.catplot(x='algorithms', y='score', col="metrics", data=df,
                    col_wrap=2, kind="box", height=3, aspect=2)
    for ax in g.fig.axes:  # every axes of the figure
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90)  # rotate x labels
    g.savefig('output.png')