# A hint at bringing some interactivity to machine learning investigations

The following code is reproduced in part from work by Jason Brownlee (see [link](https://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/))

The data studied here come from the Pima Indians Diabetes Database. All patients are females at least 21 years old of Pima Indian heritage, and the dataset contains the following attributes:
   1. Number of times pregnant
   2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
   3. Diastolic blood pressure (mm Hg)
   4. Triceps skin fold thickness (mm)
   5. 2-Hour serum insulin (mu U/ml)
   6. Body mass index (weight in kg/(height in m)^2)
   7. Diabetes pedigree function
   8. Age (years)
   9. Class variable (0 or 1)

Using this Pima Indians Diabetes Database, the goal is to see whether a diabetes diagnosis can be predicted. Six machine learning methods are tested here via cross-validation:
1. Logistic Regression
2. Linear Discriminant Analysis
3. K-Nearest Neighbors
4. Classification and Regression Trees
5. Naive Bayes
6. Support Vector Machines

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks")

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
# Load the Pima Indians Diabetes CSV straight from its public URL.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# Short column labels, in the order of the attribute list in the introduction.
names = [
    'preg',   # number of times pregnant
    'plas',   # plasma glucose concentration
    'pres',   # diastolic blood pressure
    'skin',   # triceps skin fold thickness
    'test',   # 2-hour serum insulin
    'mass',   # body mass index
    'pedi',   # diabetes pedigree function
    'age',    # age in years
    'class',  # class variable (0 or 1)
]

# Supplying names= already makes pandas treat the first CSV row as data;
# header=None just states that explicitly.
dataframe = pandas.read_csv(url, header=None, names=names)

In [None]:
# Preview the first five rows (the default for head()) as a sanity check on the load.
dataframe.head(n=5)

In [None]:
# Pairwise scatter plots of every column except the last ('class'),
# with points coloured by the 'class' column.
feature_columns = dataframe.columns[:-1]
sns.pairplot(dataframe, vars=feature_columns, hue="class")

In [None]:
# Split the raw values into the eight predictor columns (X) and the
# class label in the ninth column (Y).
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

# prepare configuration for cross validation test harness
seed = 7
scoring = 'accuracy'
# random_state only has an effect when shuffling is enabled, and recent
# scikit-learn releases raise ValueError if a seed is passed to a
# non-shuffling KFold, so shuffle explicitly.  With an integer seed the folds
# are identical on every split() call, so one splitter serves all models.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=seed)

# prepare models: (short label, unfitted estimator) pairs
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# evaluate each model in turn
results = []  # one array of per-fold accuracies per model
names = []    # model labels, parallel to `results` (rebinds the CSV column list)
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # label: mean accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
import ipywidgets
from ipywidgets import Output
from IPython.display import clear_output

In [None]:
from IPython.display import display

# Output widget that receives everything splitprobe renders.
out = Output()


def splitprobe(splitnum=10):
    """Cross-validate every model with ``splitnum`` folds and plot the scores.

    Uses the module-level ``models``, ``X``, ``Y`` and ``seed``.  For each
    model the per-fold accuracies are collected and drawn as one box of a
    box plot; the mean and standard deviation of each model are written
    next to the plot.  All output is directed into the ``out`` widget,
    replacing whatever it showed before.

    Parameters
    ----------
    splitnum : int
        Number of cross-validation folds.  KFold requires at least 2.
    """
    scoring = 'accuracy'
    results = []
    names = []
    summary_lines = []
    with out:
        clear_output(wait=True)
        # random_state only takes effect with shuffle=True; recent
        # scikit-learn raises ValueError for a seeded, non-shuffling KFold.
        # An integer seed yields the same folds for every model.
        kfold = model_selection.KFold(n_splits=splitnum, shuffle=True, random_state=seed)
        # evaluate each model in turn
        for name, model in models:
            cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
            results.append(cv_results)
            names.append(name)
            summary_lines.append("%s: %f (%f)\n" % (name, cv_results.mean(), cv_results.std()))
        msg = ''.join(summary_lines)
        # boxplot algorithm comparison
        fig = plt.figure()
        fig.suptitle('Algorithm Comparison')
        ax = fig.add_subplot(111)
        ax.boxplot(results)
        ax.set_xticklabels(names)
        # Figure coordinates: x=1.0 places the summary at the figure's right edge.
        ax.text(1.0, 0.25, msg, fontsize=14, transform=fig.transFigure)
        # Fixed y-range so plots for different fold counts are comparable.
        ax.set_ylim([0.5, 1.0])
        display(fig)
        # Release the figure once rendered so pyplot does not keep it open
        # and it is not drawn a second time outside `out`.
        plt.close(fig)


# The slider starts at 2 because KFold rejects n_splits=1.  `out` is displayed
# alongside the controls so the rendered plot is actually visible.
display(ipywidgets.interactive(splitprobe, splitnum=(2, 20)), out)