In [35]:
# for classification and regression problems


In [144]:
import matplotlib.pyplot as plt

In [145]:
import warnings
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [146]:
# loading dataset
# def load_dataset():
#     x , y = None , None
#     return x,y

def load_dataset():
    return make_classification(n_samples=1000, n_classes=2, random_state=1)

In [147]:
# linear , non linear , ensemble models

# dict as a list of different models we want to evaluate

# create a dict of standard models to evaluate {name:object}
def define_models(models=dict()):
    # linear models
    models['logistic'] = LogisticRegression()
    alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for a in alpha:
        models['ridge-'+str(a)] = RidgeClassifier(alpha=a)
    models['sgd'] = SGDClassifier(max_iter=1000, tol=1e-3)
    models['pa'] = PassiveAggressiveClassifier(max_iter=1000, tol=1e-3)
    # non-linear models
    n_neighbors = range(1, 21)
    for k in n_neighbors:
        models['knn-'+str(k)] = KNeighborsClassifier(n_neighbors=k)
    models['cart'] = DecisionTreeClassifier()
    models['extra'] = ExtraTreeClassifier()
    models['svml'] = SVC(kernel='linear')
    models['svmp'] = SVC(kernel='poly')
    c_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for c in c_values:
        models['svmr'+str(c)] = SVC(C=c)
    models['bayes'] = GaussianNB()
    # ensemble models
    n_trees = 100
    models['ada'] = AdaBoostClassifier(n_estimators=n_trees)
    models['bag'] = BaggingClassifier(n_estimators=n_trees)
    models['rf'] = RandomForestClassifier(n_estimators=n_trees)
    models['et'] = ExtraTreesClassifier(n_estimators=n_trees)
    models['gbm'] = GradientBoostingClassifier(n_estimators=n_trees)
    print('Defined %d models' % len(models))
    return models

In [148]:
# we may want to transform a data prior to training and testing
# for say standarsization, normalisation, feature selection
# we do this in a blanket way to all models

In [149]:
# make pipeline function takes a defined model and returns a pipeline


In [150]:
def make_pipeline(model):
    steps = list()
    steps.append(('standardise' , StandardScaler()))
    steps.append(('normalize', MinMaxScaler()))
    steps.append(('model',model))
    pipeline = Pipeline(steps=steps)
    return pipeline

In [162]:
#evaluating a single model
#we use k-fold cross validation here
def evaluate_model(x,y,model,folds,metric):
    pipeline = make_pipeline(model)
    scores = cross_val_score(pipeline,x,y,scoring=metric , cv=folds)
    return scores

In [163]:
#trap the errors and ignore the warnings
#for an exception , make the result as none

def robust_evaluate_model(x,y,model,folds,metric):
    scores = None
    try :
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            scores = evaluate_model(x,y,model,folds,metric)
    except:
        scores = None
    return scores

In [164]:
# evaluate dict of models {name:object} , return {name:score}
def evaluate_models(x,y,models,folds=10,metric='accuracy'):
    results = dict()
    for name, model in models.items():
        scores = evaluate_model(x,y,model,folds,metric)
        if scores is not None:
            results[name] = scores
            mean_score , std_score = mean(scores),std(scores)
            print ('>%s: %.3f (+/-%.3f)' % (name, mean_score, std_score))
        else :
            print ('>%s: error' % name)
    return results

In [165]:
#print and plot top n results
def summarize_results(results , maximize = True,top_n =10):
    if len(results)==0:
        print ('no results')
        return
    n = min(top_n,len(results))
    mean_scores = [(k,mean(v)) for k,v in results.items()]
    mean_scores = sorted(mean_scores,key=lambda x : x[1])
    
    if maximize:
        mean_scores = list(reversed(mean_scores))
    names = [x[0] for x in mean_scores[:n]]
    scores = [results[x[0]] for]
    print()

    
    for i in range(n):
        name = names[i]
        mean_score,std_score = mean(results[name]),std(results[name])
        print('Rank=%d, Name=%s, Score=%.3f (+/- %.3f)' % (i+1, name, mean_score, std_score))
    plt.boxplot(scores,labels = names)
    _,labels = pyplot.xticks()
    plt.setp(labels,rotation = 90)
    plt.show()
    

In [166]:
x , y = load_dataset()
models = define_models()
results = evaluate_models(x,y,models)
summarize_results(results)


Defined 53 models
>knn-11: 0.783 (+/-0.041)
>knn-10: 0.767 (+/-0.042)
>knn-13: 0.789 (+/-0.041)
>knn-12: 0.781 (+/-0.045)
>knn-15: 0.799 (+/-0.044)
>knn-14: 0.787 (+/-0.038)
>knn-17: 0.799 (+/-0.027)
>knn-16: 0.791 (+/-0.039)
>knn-19: 0.801 (+/-0.031)
>knn-18: 0.797 (+/-0.030)
>ridge-0.9: 0.848 (+/-0.038)
>ridge-0.8: 0.848 (+/-0.038)
>et: 0.870 (+/-0.029)
>ridge-0.1: 0.847 (+/-0.037)
>svmr0.7: 0.833 (+/-0.030)
>ridge-0.3: 0.847 (+/-0.037)
>ridge-0.2: 0.847 (+/-0.037)
>ridge-0.5: 0.848 (+/-0.038)
>sgd: 0.815 (+/-0.070)
>ridge-0.7: 0.848 (+/-0.038)
>ridge-0.6: 0.848 (+/-0.038)
>svmr1.0: 0.837 (+/-0.032)
>bag: 0.860 (+/-0.040)
>extra: 0.768 (+/-0.031)
>ridge-0.4: 0.848 (+/-0.038)
>rf: 0.865 (+/-0.041)
>pa: 0.769 (+/-0.076)
>ada: 0.850 (+/-0.035)
>svmr0.3: 0.805 (+/-0.032)
>svmr0.2: 0.781 (+/-0.043)
>svmr0.1: 0.797 (+/-0.034)
>ridge-1.0: 0.847 (+/-0.038)
>svmr0.6: 0.828 (+/-0.031)
>svmr0.5: 0.823 (+/-0.029)
>svmr0.4: 0.817 (+/-0.032)
>svml: 0.843 (+/-0.035)
>svmr0.9: 0.838 (+/-0.034)
>svmr

NameError: global name 'scores' is not defined

In [None]:
ii