In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import KFold 
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from catboost import CatBoostClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn import preprocessing
from tqdm import tqdm
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier
from weka.classifiers import Evaluation
from weka.core.classes import Random
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from collections import defaultdict
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [4]:
jvm.start()
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("human_activity.arff")
data.class_is_last()
cls = Classifier(classname="weka.classifiers.functions.SMO")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))

print(evl.percent_correct)
print(evl.summary())
print(evl.class_details())

98.50470919506748

Correctly Classified Instances       10145               98.5047 %
Incorrectly Classified Instances       154                1.4953 %
Kappa statistic                          0.982 
Mean absolute error                      0.2226
Root mean squared error                  0.3107
Relative absolute error                 80.3187 %
Root relative squared error             83.4718 %
Total Number of Instances            10299     

=== Detailed Accuracy By Class ===

                 TP Rate  FP Rate  Precision  Recall   F-Measure  MCC      ROC Area  PRC Area  Class
                 0.960    0.009    0.961      0.960    0.960      0.951    0.991     0.944     STANDING
                 0.958    0.009    0.957      0.958    0.957      0.948    0.986     0.930     SITTING
                 1.000    0.000    1.000      1.000    1.000      1.000    1.000     1.000     LAYING
                 1.000    0.000    0.999      1.000    0.999      0.999    1.000     0.999     WALKING
     

In [3]:
df = pd.read_csv("human_activity.csv").drop('Unnamed: 0',1)

In [4]:
x = df.drop("label", 1).values
encoder = LabelEncoder()
y = encoder.fit_transform(df["label"])

In [15]:
n_splits = 10
kf = KFold(n_splits=n_splits)

In [67]:
def k_fold_test(classifier):
    total_accuracy = 0
    for train_index, test_index in kf.split(x):
        y_train, y_test = y[train_index], y[test_index]
        x_train, x_test = x[train_index], x[test_index]
        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)
        accuracy = accuracy_score(y_pred, y_test)
        print(accuracy)
        total_accuracy += accuracy
    return total_accuracy/n_splits

In [7]:
classifiers = [
        MLPClassifier(alpha=1),
        LogisticRegression(),
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        AdaBoostClassifier(),
        GaussianNB(),
        CatBoostClassifier(iterations=500, learning_rate=0.3, depth=5, loss_function='MultiClass', classes_count=6, logging_level='Silent', l2_leaf_reg=2, thread_count=4),
        RandomForestClassifier(n_estimators=70, max_depth=5, max_features=0.8, n_jobs=4, class_weight='balanced'),
        BernoulliNB(),
    ]
names = [
    "MLP",
    "logistic regression",
    "KNN",
    "SVC",
    "RandomForestClassifier",
    "AdaBoostClassifier",
    "GaussianNB",
    "CatBoostClassifier",
    "RandomForestClassifier",
    "BernoulliNB"
]
for name,classifier in zip(names,classifiers):
    accuracy = k_fold_test(classifier) 
    print(name+ " :")
    print(accuracy)
    
    

MLP :
0.9494145994143637




logistic regression :
0.9612602026622407
KNN :
0.899118952078726
SVC :
0.9531041555661387
RandomForestClassifier :
0.7128909782767578
AdaBoostClassifier :
0.5437415184107659
GaussianNB :
0.727261654964942
CatBoostClassifier :
0.9495117339456896
RandomForestClassifier :
0.8788266525837315
BernoulliNB :
0.8339649749855006


In [5]:
def svc_param_selection(X, y,kernel, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(SVC(kernel=kernel), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_

In [10]:
best_params = svc_param_selection(x,y,'poly',10)

In [13]:
best_params

{'C': 0.001, 'gamma': 1}

In [48]:
poly_accuracy = k_fold_test(SVC(kernel="poly",C=.001,gamma=1))
poly_accuracy

0.9667944181833621

In [11]:
linear_best_params = svc_param_selection(x,y,'linear',10)

In [14]:
linear_best_params

{'C': 10, 'gamma': 0.001}

In [21]:
linear_accuracy = k_fold_test(SVC(kernel="linear",C=10,gamma=.001))
linear_accuracy

0.9609689876790034


In [6]:
rbf_best_params = svc_param_selection(x,y,'rbf',10)

In [7]:
rbf_best_params

{'C': 10, 'gamma': 0.01}

In [16]:
rbf_accuracy = k_fold_test(SVC(kernel="rbf",C=10,gamma=.01))
rbf_accuracy

0.9648527649617407

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [26]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(x, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 65.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 129.9min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'bootstrap': [True, False], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'min_samples_leaf': [1, 2, 4], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [27]:
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 50,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [28]:
random_forest = RandomForestClassifier(bootstrap=False,max_depth=50,max_features='auto',min_samples_leaf=1,min_samples_split=2,n_estimators=1000)
random_forest_accuracy = k_fold_test(random_forest)

0.9395106003566476


In [30]:
final_classifiers = [
    SVC(kernel="poly",C=.001,gamma=1,probability=True),
    RandomForestClassifier(bootstrap=False,max_depth=50,max_features='auto',min_samples_leaf=1,min_samples_split=2,n_estimators=1000),
    LogisticRegression(),
    KNeighborsClassifier(3),
    CatBoostClassifier(iterations=100,depth=5, loss_function='MultiClass', classes_count=6, logging_level='Silent', l2_leaf_reg=2, thread_count=8)
    ]
final_names = ["svc","random forest","logistic regression", "KNN", "Catboost"]

In [28]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

In [37]:
def use_trained_classifier(clf_index,clf,X_te,num_labels,level1,te_idx):
    predictions = clf.predict_proba(X_te)
    for label_index in range(num_labels):
        current = clf_index*num_labels+label_index
        level1[te_idx,current] = predictions[:,label_index]

In [40]:
def get_level_data(X,y,classifiers):
    num_labels = len(set(y))
    kf = KFold(n_splits=10)
    level1 = np.zeros((X.shape[0], len(classifiers)*num_labels))
    split_index = 0
    for tr_idx, te_idx in kf.split(X):
        split_index+=1
        print("Split index {}".format(split_index))
        X_tr, y_tr = X[tr_idx], y[tr_idx]
        X_te, y_te = X[te_idx], y[te_idx]
        for clf_index,clf in enumerate(classifiers):
            print("Clf index {}".format(clf_index))
            clf.fit(X_tr,y_tr)
            use_trained_classifier(clf_index,clf,X_te,num_labels,level1,te_idx)
    return level1

In [41]:
level1_train = get_level_data(X_train,y_train,final_classifiers)
level1_test = get_level_data(X_test,y_test,final_classifiers)

Split index 1
Clf index 0
Clf index 1
Clf index 2
Clf index 3
Clf index 4
Split index 2
Clf index 0
Clf index 1
Clf index 2
Clf index 3
Clf index 4
Split index 3
Clf index 0
Clf index 1
Clf index 2
Clf index 3
Clf index 4
Split index 4
Clf index 0
Clf index 1
Clf index 2
Clf index 3
Clf index 4
Split index 5
Clf index 0
Clf index 1
Clf index 2
Clf index 3
Clf index 4
Split index 6
Clf index 0
Clf index 1
Clf index 2
Clf index 3
Clf index 4
Split index 7
Clf index 0
Clf index 1
Clf index 2
Clf index 3
Clf index 4
Split index 8
Clf index 0
Clf index 1
Clf index 2
Clf index 3
Clf index 4
Split index 9
Clf index 0
Clf index 1
Clf index 2
Clf index 3
Clf index 4
Split index 10
Clf index 0
Clf index 1
Clf index 2
Clf index 3
Clf index 4
Split index 1
Clf index 0
Clf index 1
Clf index 2
Clf index 3
Clf index 4
Split index 2
Clf index 0
Clf index 1
Clf index 2
Clf index 3
Clf index 4
Split index 3
Clf index 0
Clf index 1
Clf index 2
Clf index 3
Clf index 4
Split index 4
Clf index 0
Clf index 1

In [60]:
def test_meta_classifiers(level1_train,y_train,level1_test,y_test):
    meta_clfs = [
        RidgeClassifier(normalize=True, class_weight='balanced')
    ]
    for meta_clf in meta_clfs:
        meta_clf.fit(level1_train, y_train)
        meta_preds = meta_clf.predict(level1_test)
        total_accuracy = accuracy_score(meta_preds, y_test)
        print(total_accuracy)

In [65]:
test_meta_classifiers(level1_train,y_train,level1_test,y_test)

0.962135922330097


In [74]:
train = np.concatenate((level1_train,X_train),axis=1)
test =  np.concatenate((level1_test,X_test),axis=1) 

In [73]:
test_meta_classifiers(train,y_train,test,y_test)

0.9631067961165048


In [81]:
def get_level1_test(X_train,y_train,X_test,classifiers):
    num_labels = len(set(y_train))
    kf = KFold(n_splits=10)
    level1 = np.zeros((X_test.shape[0], len(classifiers)*num_labels))
    split_index = 0
    for clf_index,clf in enumerate(classifiers):
        clf.fit(X_train,y_train)
        predictions = clf.predict_proba(X_test)
        for label_index in range(num_labels):
            current = clf_index*num_labels+label_index
            level1[:,current] = predictions[:,label_index]
    return level1

In [82]:
level1_test_full_train = get_level1_test(X_train,y_train,X_test,final_classifiers)

In [84]:
test_meta_classifiers(level1_train,y_train,level1_test_full_train,y_test)

0.996116504854369


In [43]:
from sklearn import datasets


def make_meshgrid(x, y, h=.02):
    """Create a mesh of points to plot in

    Parameters
    ----------
    x: data to base x-axis meshgrid on
    y: data to base y-axis meshgrid on
    h: stepsize for meshgrid, optional

    Returns
    -------
    xx, yy : ndarray
    """
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy


def plot_contours(ax, clf, xx, yy, **params):
    """Plot the decision boundaries for a classifier.

    Parameters
    ----------
    ax: matplotlib axes object
    clf: a classifier
    xx: meshgrid ndarray
    yy: meshgrid ndarray
    params: dictionary of params to pass to contourf, optional
    """
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

def plot_decision_boundaries(models,titles,meta):    
    # import some data to play with
    iris = datasets.load_iris()
    # Take the first two features. We could avoid this by using a two-dim dataset
    X_iris = iris.data[:, :2]
    y_iris = iris.target

    # we create an instance of SVM and fit out data. We do not scale our
    # data since we want to plot the support vectors
    C = 1.0  # SVM regularization parameter
    models = (clf.fit(X_iris, y_iris) for clf in models)
    # Set-up 2x2 grid for plotting.
    fig, sub = plt.subplots(2, 2)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    X0, X1 = X_iris[:, 0], X_iris[:, 1]
    xx, yy = make_meshgrid(X0, X1)

    for clf, title, ax in zip(models, titles, sub.flatten()):
        plot_contours(ax, clf, xx, yy,
                      cmap=plt.cm.coolwarm, alpha=0.8)
        ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xlabel('Sepal length')
        ax.set_ylabel('Sepal width')
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(title)

    plt.show()

In [None]:
def test_meta_model(names,classifiers,x,y):
    model_to_accuracy = defaultdict(list)
    split_index = 0
    for train_index, test_index in kf.split(x):
        print(split_index)
        split_index+=1
        y_train, y_test = y[train_index], y[test_index]
        x_train, x_test = x[train_index], x[test_index]
        for name,classifier in zip(names,classifiers):
            classifier.fit(x_train, y_train)
            y_pred = classifier.predict(x_test)
            accuracies[name].append(accuracy_score(y_pred, y_test))
        level1_train = get_level_data(x_train,y_train,classifiers)
        level1_test = get_level_data(x_test,y_test,classifiers)
        meta_accuracy = test_meta_classifiers(level1_train,y_train,level1_test,y_test)
        accuracies["meta_learner"].append(meta_accuracy)
    return model_to_accuracy

In [None]:
def test_accuracies(model_to_accuracy,p=.05):
    meta_learner_accuracies = model_to_accuracy["meta_learner"]
    for model in model_to_accuracy:
        if model!="meta_learner":
            p_score = stats.ttest_rel(meta_learner_accuracies,model_to_accuracy[model])[1]
            if p_score<.05:
                print("There is a significant difference between the meta learner and model {}".format(model))
            else:
                print("There is no significant difference between the meta learner and model {}".format(model))

In [44]:
datasetsast_cancer_x,breast_cancer_y = datasets.load_breast_cancer() 
breast_cancer_accuracies = test_meta_model(final_names,final_classifiers,breast_cancer_x,breast_cancer_y)
breast_cancer_accuracies

ValueError: too many values to unpack (expected 2)

In [None]:
test_accuracies(breast_cancer_accuracies)

In [None]:
iris_x,iris_y = datasets.load_iris()
iris_accuracies = test_meta_model(final_names,final_classifiers,breast_cancer_x,breast_cancer_y)
iris_accuracies

In [None]:
test_accuracies(iris_accuracies)