<h2> Problem type: Spot-check (shotgun) of ML algorithms for multiclass classification </h2>

Spot-checking algorithms is about getting a quick assessment of a bunch of different algorithms on your machine learning problem so that you know what algorithms to focus on and what to discard.

This Python notebook compares multiple algorithms using a single scoring metric evaluated with cross_val_score.

In [1]:
import warnings
from numpy import mean
from numpy import std
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression,LogisticRegressionCV,RidgeClassifier,RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [2]:
# Define multiple algorithms for a multiclass classification problem.
# Returns: dictionary mapping algorithm name -> unfitted estimator
def define_classifiers(classifiers=None):
    """Register the spot-check classifiers in a dictionary and return it.

    Parameters
    ----------
    classifiers : dict or None
        Optional dictionary to add the estimators to. It is updated in
        place (existing entries with the same key are overwritten). When
        omitted, a fresh dictionary is created for this call.

    Returns
    -------
    dict
        ``{name: estimator}`` for every registered classifier.
    """
    # A default of ``dict()`` would be created once and shared by every
    # call; use a None sentinel so each call gets its own dictionary.
    if classifiers is None:
        classifiers = {}

    # linear models
    classifiers['log_clf'] = LogisticRegression()
    classifiers['logcv_clf'] = LogisticRegressionCV()

    # one ridge classifier per regularization strength
    alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for a in alpha:
        classifiers['ridge_clf-' + str(a)] = RidgeClassifier(alpha=a)

    classifiers['ridgeCV_clf'] = RidgeClassifierCV(cv=3)
    classifiers['lsvc_clf'] = LinearSVC()

    # non-linear models
    # one k-NN classifier per odd neighbourhood size 1, 3, ..., 29
    for k in range(1, 31, 2):
        classifiers['knn_clf-' + str(k)] = KNeighborsClassifier(n_neighbors=k)

    classifiers['dt_clf'] = DecisionTreeClassifier()
    classifiers['et_clf'] = ExtraTreeClassifier()

    classifiers['gnb_clf'] = GaussianNB()
    classifiers['mlp_clf'] = MLPClassifier(alpha=1, max_iter=1000)

    # ensemble models
    n_trees = 100
    classifiers['rf_ensemble_clf'] = RandomForestClassifier(n_estimators=n_trees)
    classifiers['et_ensemble_clf'] = ExtraTreesClassifier(n_estimators=n_trees)

    print('Defined %d classifiers' % len(classifiers))
    return classifiers

In [3]:
# Chain the preprocessing steps and the model into one estimator.
# Returns: sklearn Pipeline made of (step name, transformer/estimator) pairs
def create_pipeline(model):
    """Wrap ``model`` in a Pipeline behind two scaling steps.

    The steps run in this order: 'standardize' (StandardScaler),
    'normalize' (MinMaxScaler), then 'classifier' (the given model).
    """
    return Pipeline(steps=[
        ('standardize', StandardScaler()),
        ('normalize', MinMaxScaler()),
        ('classifier', model),
    ])

The `scoring` parameter of `cross_val_score` controls which metric is applied to the estimators being evaluated.
https://scikit-learn.org/stable/modules/model_evaluation.html

In [4]:
# Evaluate a single model with cross-validation.
# Returns: whatever cross_val_score returns for the pipeline
def evaluate_model(X, y, model, folds, metric):
    """Cross-validate ``model`` wrapped in the preprocessing pipeline.

    ``metric`` is forwarded as ``scoring`` and ``folds`` as ``cv``;
    ``n_jobs=-1`` is passed so scikit-learn may run folds in parallel.
    """
    return cross_val_score(
        create_pipeline(model),
        X,
        y,
        scoring=metric,
        cv=folds,
        n_jobs=-1,
    )

In [5]:
# Evaluate a model, hiding warnings and trapping errors.
# Returns: cross-validation scores, or None if the evaluation failed
def handle_warning(X, y, model, folds, metric):
    """Run ``evaluate_model`` with warnings suppressed and errors trapped.

    Returns the scores from ``evaluate_model``, or ``None`` when it raises
    an ``Exception``. ``KeyboardInterrupt`` and ``SystemExit`` are
    deliberately not trapped, so a long spot-check run can still be
    interrupted.
    """
    try:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            return evaluate_model(X, y, model, folds, metric)
    except Exception:
        # Best effort: a failing model is reported as None by the caller
        # instead of aborting the whole comparison.
        return None

Different metrics are available for multiclass classification:
https://scikit-learn.org/stable/modules/model_evaluation.html#from-binary-to-multiclass-and-multilabel

In [1]:
# Evaluate every model in a {name: estimator} dictionary.
# Returns: {name: scores} for the models that evaluated successfully
def evaluate_models(X, y, classifiers, folds=10, metric='accuracy'):
    """Score each classifier and print a one-line progress report per model.

    Models for which ``handle_warning`` returns ``None`` are reported as
    errors and left out of the returned dictionary.
    """
    collected = {}
    for label in classifiers:
        estimator = classifiers[label]
        cv_scores = handle_warning(X, y, estimator, folds, metric)
        if cv_scores is None:
            # evaluation failed for this model
            print('>%s: error' % label)
            continue
        # keep the raw scores and show mean / spread
        collected[label] = cv_scores
        avg, spread = mean(cv_scores), std(cv_scores)
        print('>%s: %.3f (+/-%.3f)' % (label, avg, spread))
    return collected

In [7]:
# Print and plot the top n results.
def summarize_results(results, maximize=True, top_n=10,
                      ylabel='Accuracy', filename='modelcomparison.png'):
    """Rank models by mean score, print the best ones and box-plot them.

    Parameters
    ----------
    results : dict
        ``{name: scores}`` where ``scores`` is a sequence of per-fold scores.
    maximize : bool
        If true, higher mean scores rank first; otherwise lower ones do.
    top_n : int
        Maximum number of models to print and plot.
    ylabel : str
        Y-axis label of the box plot. Defaults to 'Accuracy'; pass the name
        of the metric actually used when scoring with something else.
    filename : str
        Path the figure is saved to.

    Returns
    -------
    None
        Prints 'no results' and returns early when ``results`` is empty.
    """
    # check for no results
    if not results:
        print('no results')
        return
    # determine how many results to summarize
    n = min(top_n, len(results))
    # (name, mean score) pairs sorted ascending by mean score
    ranked = sorted(
        ((name, mean(vals)) for name, vals in results.items()),
        key=lambda pair: pair[1],
    )
    # flip for descending order (e.g. for accuracy)
    if maximize:
        ranked.reverse()
    # retrieve the top n for summarization
    top = ranked[:n]
    names = [name for name, _ in top]
    scores = [results[name] for name in names]
    # print the top n
    print()
    for rank, (name, mean_score) in enumerate(top, start=1):
        std_score = std(results[name])
        print('Rank=%d, Name=%s, Score=%.3f (+/- %.3f)' % (rank, name, mean_score, std_score))
    # boxplot for the top n
    plt.boxplot(scores, labels=names)
    plt.title('Compare ML Algorithms')
    _, labels = plt.xticks()
    # model names are long; rotate them so they do not overlap
    plt.setp(labels, rotation=90)
    plt.ylabel(ylabel)
    plt.savefig(filename)