# ML Classification Pipeline
This code is self-contained and explores the performance of a large number of ML algorithms for predicting binary labels from data features.

We start by loading the libraries and functions that we will use in our analysis.

In [1]:
from scipy import stats
from sklearn.metrics import accuracy_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier, Lasso
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn import preprocessing
from sklearn import tree
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, IsolationForest, VotingClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, Normalizer, PolynomialFeatures
from sklearn.decomposition import PCA, KernelPCA, FastICA, SparsePCA
from sklearn.pipeline import Pipeline
from sklearn.cluster import FeatureAgglomeration
from sklearn.kernel_approximation import RBFSampler
from sklearn.preprocessing import Imputer
import random
import pandas as pd
import numpy as np
import warnings
import statsmodels.api as sm
warnings.filterwarnings('ignore')

We now load our data, say in csv format, and look at simple statistics.

In [None]:
# Load the raw table once; `dataset` stays available to later cells.
dataset = pd.read_csv('/home/...')

# Print, in order: summary statistics, per-column dtypes, and the column names.
for report in (dataset.describe(), dataset.dtypes, dataset.columns.values):
    print(report)

def get_data(target='label', path='/home/...'):
    """Load the CSV dataset and split it into features and label.

    Parameters
    ----------
    target : str
        Name of the column holding the label.
    path : str
        Location of the CSV file. Defaults to the path that was previously
        hard-coded in this function, so existing calls behave as before.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the file except ``target``.
    y : pandas.DataFrame
        A single-column frame containing ``target`` multiplied by 1, which
        turns boolean labels into integers and leaves numeric labels as is.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the file.
    """
    dataset = pd.read_csv(path)
    #dataset = dataset.iloc[:500,:]  # uncomment to work on a small sample
    if target not in dataset.columns:
        # Same exception type pandas would raise from drop(), but naming the
        # missing column and the ones that are available.
        raise KeyError("target column %r not found; available columns: %s"
                       % (target, list(dataset.columns)))
    X = dataset.drop([target], axis=1, inplace=False)
    y = dataset[[target]] * 1
    return X, y

An interesting preliminary step to understand the relationships between features and labels is to fit a simple logistic regression model and explore its parameter values.

In [None]:
X, y = get_data(target='label')

# statsmodels does not add an intercept on its own: without a constant column
# the model is forced through the origin and every coefficient in the summary
# is biased. add_constant() leaves the data unchanged if a constant column is
# already present. A separate design matrix is used so that `X` itself is left
# untouched for the cells that follow.
X_design = sm.add_constant(X)
logit = sm.Logit(y, X_design)

# fit the model
result = logit.fit()
# summary report
print(result.summary())

We are now ready to define the main loop that tests our classification algorithms. The code below instantiates each model and computes its cross-validated performance for a number of metrics.

In [3]:
def _make_dropout_net():
    """Build the two-hidden-layer dropout network used for the 'Dropout' entry."""
    # NOTE(review): `Layer` and `Classifier` are not imported anywhere in the
    # visible notebook; selecting 'Dropout' raises NameError until they are.
    layers = [Layer("Rectifier", units=100), Layer("Rectifier", units=100), Layer("Softmax")]
    return Classifier(layers, dropout_rate=0.5)


# Constructors are wrapped in lambdas so that a class name is only looked up
# when its model is actually requested (several optional ones may be missing).
# NOTE(review): IsolationForest is an outlier detector and Lasso a regressor;
# the classification scorers may fail or be meaningless for them - confirm.
_SIMPLE_MODEL_FACTORIES = {
    'Logistic Regression': lambda: LogisticRegression(),
    'SGD': lambda: SGDClassifier(loss="hinge", penalty="l2"),
    'kNN': lambda: KNeighborsClassifier(),
    'Decision Tree': lambda: tree.DecisionTreeClassifier(),
    'Random Forest': lambda: RandomForestClassifier(n_estimators=20),
    'Extra Trees': lambda: ExtraTreesClassifier(n_estimators=20),
    'LDA': lambda: LinearDiscriminantAnalysis(),
    'QDA': lambda: QuadraticDiscriminantAnalysis(),
    # The key keeps the historical spelling so existing model lists still match.
    'Passive Agressive': lambda: PassiveAggressiveClassifier(),
    'AdaBoost': lambda: AdaBoostClassifier(),
    'Bagging': lambda: BaggingClassifier(),
    'Gradient Boosting': lambda: GradientBoostingClassifier(),
    'Isolation Forest': lambda: IsolationForest(),
    'XGBoost': lambda: xgb.XGBClassifier(),
    'LASSO': lambda: Lasso(),
    'Dropout': _make_dropout_net,
}

# Models selected by a (name, variant) pair.
# NOTE(review): `TPOTClassifier` is not imported in the visible notebook, and
# both AutoML variants build the same estimator - looks like a placeholder.
_VARIANT_MODEL_FACTORIES = {
    'Naive Bayes': {
        'Gaussian': lambda: GaussianNB(),
        'Bernoulli': lambda: BernoulliNB(),
        'Multinomial': lambda: MultinomialNB(),
    },
    'SVM': {
        'Linear': lambda: svm.LinearSVC(),
        'Kernel': lambda: svm.SVC(),
    },
    'AutoML': {
        'Evolutionary': lambda: TPOTClassifier(generations=5, population_size=50, verbosity=2),
        'Bayesian': lambda: TPOTClassifier(generations=5, population_size=50, verbosity=2),
    },
}


def _build_model(spec):
    """Return a fresh, unfitted estimator for one ``Model_list`` entry."""
    name = spec[0]
    option = spec[1] if len(spec) > 1 else None

    if name == 'MLP':
        if option is None:
            raise ValueError("'MLP' needs the hidden layer sizes as second element, e.g. ['MLP', (100, 100)]")
        return MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=option, random_state=1)

    if name in _SIMPLE_MODEL_FACTORIES:
        return _SIMPLE_MODEL_FACTORIES[name]()

    if name in _VARIANT_MODEL_FACTORIES:
        variants = _VARIANT_MODEL_FACTORIES[name]
        factory = variants.get(option) if isinstance(option, str) else None
        if factory is None:
            raise ValueError("Unknown variant %r for model %r; expected one of %s"
                             % (option, name, sorted(variants)))
        return factory()

    raise ValueError("Unknown model %r" % (name,))


def _model_label(spec):
    """Return the human-readable label written in front of a model's scores."""
    # Include the variant whenever it is a string so that e.g. the linear and
    # kernel SVM results can be told apart; tuple options (MLP) are left out.
    if len(spec) > 1 and isinstance(spec[1], str):
        return "%s %s" % (spec[1], spec[0])
    return spec[0]


def Run_benchmark_classifiers(X, y, Model_list, Metrics_list, num_folds, results_path="/home/..."):
    """Cross-validate a list of models and report every requested metric.

    For each entry of ``Model_list`` and each metric of ``Metrics_list`` the
    mean score and twice its standard deviation over the folds are printed and
    appended to the results file, one ``**...**`` line per model.

    Parameters
    ----------
    X : array-like
        Feature matrix. It is normalised row-wise with
        ``preprocessing.normalize`` before any model sees it.
    y : array-like
        Labels. A single-column 2-D input is flattened to 1-D.
    Model_list : list of list
        Each entry is ``[name]`` or ``[name, option]``. ``option`` is the
        variant for 'Naive Bayes', 'SVM' and 'AutoML', and the hidden layer
        sizes for 'MLP'.
    Metrics_list : list of str
        Scorer names passed to ``cross_val_score`` as ``scoring``.
    num_folds : int
        Number of splits for ``StratifiedKFold``.
    results_path : str
        File the results are written to (overwritten). Defaults to the path
        that was previously hard-coded.

    Raises
    ------
    ValueError
        If a model name or variant is not recognised. Previously such an entry
        either crashed with an unbound ``model`` or silently re-used the
        estimator of the preceding entry under the wrong label.
    """
    # Resolve every model first: a typo fails immediately, before the results
    # file is truncated or any cross-validation time is spent.
    models = [(_model_label(spec), _build_model(spec)) for spec in Model_list]

    ## Preprocessing step
    # Other options that can be substituted here: preprocessing.scale, a
    # Pipeline of Normalizer + PCA / FastICA / SparsePCA, PolynomialFeatures,
    # FeatureAgglomeration, or RBFSampler (random Fourier features).
    X = preprocessing.normalize(X)  # normalization

    # A single-column label table is flattened to the 1-D shape that
    # scikit-learn estimators are documented to take.
    y = np.asarray(y)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    folds = StratifiedKFold(n_splits=num_folds)
    last_metric = len(Metrics_list) - 1

    # The context manager guarantees the file is flushed and closed even if a
    # model fails part-way through the benchmark.
    with open(results_path, "w") as text_file:
        for label, model in models:
            text_file.write("**Performance of %s:" % label)
            print("Performance of %s:" % label)

            # Compute the set of metrics via K-fold cross validation
            for k, metric in enumerate(Metrics_list):
                scores_val = cross_val_score(model, X, y, cv=folds, scoring=metric)
                summary = "%s = %0.4f (+/- %0.4f)" % (metric, scores_val.mean(), scores_val.std() * 2)
                if k == last_metric:
                    # The last metric closes the model's line in the file.
                    text_file.write(summary + "**\n")
                    print(summary + "\n")
                else:
                    text_file.write(summary + ",")
                    print(summary)
    return


Define the algorithms of interest, the number of cross-validation folds, and the metrics used to measure performance. And RUN!

In [4]:
# Number of cross-validation folds used for every (model, metric) pair.
num_folds = 10

# Models to benchmark. Each entry is [name] or [name, option]; the option is
# the variant for Naive Bayes / SVM and the hidden layer sizes for the MLP.
# 'Passive Agressive' is spelled exactly as the key that
# Run_benchmark_classifiers compares against - do not "fix" it here alone.
Model_list = [
    ['Logistic Regression'],
    ['SGD'],
    ['kNN'],
    ['Decision Tree'],
    ['Naive Bayes', 'Gaussian'],
    ['Naive Bayes', 'Bernoulli'],
    ['Naive Bayes', 'Multinomial'],
    ['SVM', 'Linear'],
    ['SVM', 'Kernel'],
    ['QDA'],
    ['Random Forest'],
    ['Extra Trees'],
    ['LDA'],
    ['Passive Agressive'],
    ['AdaBoost'],
    ['Bagging'],
    ['Gradient Boosting'],
    ['XGBoost'],
    ['MLP', (100, 100)],
]

# Scorer names used to evaluate every model.
Metrics_list = [
    'roc_auc',
    'recall',
    'precision',
    'accuracy',
    'average_precision',
]

# Run the benchmark over the whole model list, using the X and y loaded by an
# earlier cell.
Run_benchmark_classifiers(X, y, Model_list, Metrics_list, num_folds)

## Variable importance
Finally, we estimate variable importance by fitting a chosen model on a single feature at a time and comparing the resulting cross-validated scores.

In [10]:
def variable_importance(X, y):
    """Score each feature by how well it predicts ``y`` on its own.

    A logistic regression is cross-validated (3 stratified folds, ROC AUC) on
    one column of the row-normalised feature matrix at a time.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix. When it has a ``columns`` attribute those names are
        used in the printed report; otherwise features are numbered.
    y : array-like
        Labels. A single-column 2-D input is flattened to 1-D.

    Returns
    -------
    list of float
        Mean ROC AUC of each single-feature model, in column order.
    """
    # Take the names from X itself, before normalisation turns it into a bare
    # array. The previous lookup in the global `dataset` with a fixed offset of
    # 4 mislabelled features and ran past the end of the column list.
    if hasattr(X, "columns"):
        feature_names = list(X.columns)
    else:
        feature_names = ["feature %d" % j for j in range(np.shape(X)[1])]

    X = preprocessing.normalize(X)

    # Flatten a single-column label table to the 1-D shape scikit-learn
    # estimators are documented to take.
    y = np.asarray(y)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    #model = GradientBoostingClassifier()
    model = LogisticRegression()
    folds = StratifiedKFold(n_splits=3)

    scores = []
    for i, feature_name in enumerate(feature_names):
        # Keep the single feature as an (n_samples, 1) matrix for the estimator.
        data = X[:, i]
        data = data[:, np.newaxis]
        scores_val = cross_val_score(model, data, y, cv=folds, scoring='roc_auc')
        scores.append(scores_val.mean())

        print("AUC with %s only = %0.4f (+/- %0.4f)" % (feature_name, scores_val.mean(),
                                                            scores_val.std()))
    return scores

In [None]:
# Reload the data and split off the label column.
# NOTE(review): the earlier cells use target='label' while this one uses
# 'target' - confirm which column name the dataset actually has.
X, y = get_data(target='target')

# Cross-validated AUC of a model fitted on each feature alone.
scores = variable_importance(X, y)