In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn import datasets, metrics, svm, tree, neighbors
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## Summary

Important types of datasets to have experience with
* Task
    * Classification (binary and multiclass)
    * Regression
    * Clustering
    * Dimensionality reduction
* Input formats:
    * Numbers
    * Images
    * Time series
    * Text
    * Natural language
* Data properties
    * Sparse data
    * Big data
    * Corrupted data

<a id="table_of_contents"></a>
## Contents

* [Digits](#sec_digits)
* [Iris](#sec_iris)
* [Bottom](#sec_bottom)

<a id="sec_digits"></a>
# Digits dataset
[top](#table_of_contents)

In [None]:
# Load scikit-learn's digits dataset; its structure is inspected in the cells below.
digits = datasets.load_digits()

### Exploratory Data Analysis

First, summarize the dataset's structure so we know how to explore it

In [None]:
# Print one line per entry of the dataset container: key, value type and
# size (array shape or list length; left blank for anything else).
print("Dataset structure")
print('\tObject type = ', type(digits))
for k, v in digits.items():
    t = type(v)
    # isinstance (rather than type equality) also covers ndarray/list subclasses.
    if isinstance(v, np.ndarray):
        s = v.shape
    elif isinstance(v, list):
        s = len(v)
    else:
        s = ""
    print(f"\tKey = {k:15} : {str(t):25} {s}")
print()

In agreement with what is described on the dataset's [documentation page](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits), there are 1797 images of handwritten digits (i.e. 0-9), each consisting of 64 pixels arranged in an 8x8 grid.

In [None]:
print("Target names =", digits.target_names)
X = digits.data
y = digits.target
# One row per sample: the feature columns followed by the label in a final 'target' column.
df = pd.DataFrame(
    data=np.column_stack((X, y)),
    columns=[*digits['feature_names'], 'target'],
)
df.head()

### Overview

In [None]:
# Show the first entry of `digits.images` (last expression, so the notebook displays it).
digits.images[0]

In [None]:
# Running this takes 10s of seconds
n_examples = 3
# NOTE: the figure set-up below and the plot_digit call inside the loop are
# commented out, so as written this cell only walks the examples and draws nothing.
#fig, axs = plt.subplots(10, n_examples)
#fig.set_figheight(30)
#fig.set_figwidth(3*n_examples)

def plot_digit(image, axes=None):
    """Draw `image` as a square heatmap using the 'binary' colormap.

    Annotations and the colour bar are switched off.

    Parameters
    ----------
    image : 2D array-like
        Values to render, passed straight to `sns.heatmap`.
    axes : matplotlib Axes, optional
        Forwarded to `sns.heatmap` as `ax`.
    """
    heatmap_options = dict(annot=False, cbar=False, square=True, cmap='binary')
    sns.heatmap(image, ax=axes, **heatmap_options)

for num in digits.target_names:
    print(f"{num}...", end="")
    # Row labels of every sample whose target equals this digit.
    rows_for_digit = df[df['target'] == num].index
    for ii in range(n_examples):
        entry = rows_for_digit[ii]
        image = digits.images[entry]
        #plot_digit(image, axs[num,ii])

print("Done")

### Fitting and predicting

In [None]:
# n_cv is reused below both as the number of cross-validation folds and to
# size the held-out test split (1/n_cv of the data).
n_cv = 5
clf = svm.SVC(C=100.0, gamma=0.001)

In [None]:
# Cross-validate the classifier on the full digits data and show the per-split test scores.
result = cross_validate(estimator=clf, X=X, y=y, cv=n_cv)
print(result['test_score'])

In [None]:
# Hold out 1/n_cv of the samples for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/n_cv, random_state=0
)
n_total = len(X)
p_train = len(X_train) / n_total * 100
p_test = len(X_test) / n_total * 100
print(f'{p_train:.0f}% ({len(X_train)}) training + {p_test:.0f}% ({len(X_test)}) testing : {len(X)} total')

In [None]:
# Train on the training split, then score the predictions for the held-out split.
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
# scikit-learn metrics take their arguments as (y_true, y_pred).
p_right = metrics.accuracy_score(y_test, pred) * 100
# Count the hits directly so the tally printed below is always a plain integer.
n_right = int(np.count_nonzero(pred == y_test))
n_wrong = len(X_test) - n_right
print(f"{n_wrong}/{len(X_test)} misclassifications ({p_right:.2f}% accurate)")

In [None]:
# Collect the misclassified test samples and tally each "truth => prediction" pair.
wrong_filt = pred != y_test
image_wrong  = X_test[wrong_filt].reshape((-1,8,8))
pred_wrong   = pred[wrong_filt]
target_wrong = y_test[wrong_filt]
wrong_pred_str = [f'{t} => {p}' for t, p in zip(target_wrong, pred_wrong)]
fig, ax = plt.subplots()
# The labels must be passed as `x=`: seaborn's first positional parameter is
# `data`, not the variable to count.
sns.countplot(x=wrong_pred_str, ax=ax)
ax.set_title("Tally of mistakes")
ax.set_xlabel("Truth => Prediction")
plt.show()

In [None]:
# Display one of the misclassified images; `ii` selects which one.
ii = 2
if ii < len(image_wrong):
    fig, ax = plt.subplots()
    sns.heatmap(image_wrong[ii], annot=False, cbar=False, square=True, cmap='binary', ax=ax)
    ax.set_title(f'{target_wrong[ii]} mistaken for {pred_wrong[ii]}')
    plt.show()
else:
    # The number of mistakes depends on the fitted model, so the requested
    # example may not exist; report that instead of raising IndexError.
    print(f"Only {len(image_wrong)} misclassified test images; no example at index {ii}")

<a id="sec_iris"></a>
# Iris Dataset
[top](#table_of_contents)
<img src="../Tutorial-DataScience/iris.png">

In [None]:
# Load scikit-learn's iris dataset; later cells read its data, target,
# feature_names and target_names entries.
iris = datasets.load_iris()

SciKit Learn's summary of the data set can be found 
[here](https://scikit-learn.org/stable/datasets/toy_dataset.html#iris-plants-dataset) 
and documentation on the `load_iris` function 
[here](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html). The main task with this dataset is classification of each plant into one of the 3 possible Iris species based on the dimensions of the plant's sepal and petal. We get a hint from the documentation that "one class is linearly separable from the other 2; the latter are NOT linearly separable from each other". Therefore this demonstration will have the following goals:

* Optimize a classification model with hard-coded linear selections
* Improve performance using more advanced machine learning models
* Compare to classifications achieved by others

To start, confirm the `Bunch` class (i.e. extended dictionary type) loaded in by `load_iris` stores the data as specified in the documentation.

In [None]:
# Print one line per entry of the dataset container: key, value type and
# size (array shape or list length; left blank for anything else).
for k, v in iris.items():
    t = type(v)
    # isinstance (rather than type equality) also covers ndarray/list subclasses.
    if isinstance(v, np.ndarray):
        s = v.shape
    elif isinstance(v, list):
        s = len(v)
    else:
        s = ""
    print(f"Key = {k:15} : {str(t):25} {s}")

Second, extract needed info from the dataset into convenient formats for further study (e.g. DataFrame)

In [None]:
X = iris.data
y = iris.target
# Map each species name to its integer class id (its position in target_names).
target_id = {name: idx for idx, name in enumerate(iris.target_names)}
# One row per plant: the feature columns followed by the class id.
df = pd.DataFrame(
    np.column_stack((X, y)),
    columns=[*iris.feature_names, 'target_id'],
)
df.head()

`target_id` ends up as a float because it was stacked into the same array as the float-valued features, but it only ever holds integer class IDs, so cast it to an integer type. Also, add a column with the actual iris type name corresponding to each ID to improve readability.

In [None]:
df = df.astype({'target_id': 'int64'})
# Look up every species name in a single vectorised indexing step instead of
# calling a Python lambda once per row.
df['target_name'] = iris.target_names[df['target_id'].to_numpy()]
df.head()

Withhold part of the data for the final evaluation to avoid biasing model development

In [None]:
# A single seed is reused for the split and for the models fitted later on.
random_seed = 123
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_seed
)
print(f'{len(X_test)}/{len(X)} entries witheld for final model evaluation')

def build_dataframe(X, y):
    """Combine a feature matrix and its class ids into a labelled DataFrame.

    Parameters
    ----------
    X : array-like, one row per sample
        Feature values; columns are named after `iris.feature_names`.
    y : array-like, one entry per sample
        Integer class ids indexing into `iris.target_names`.

    Returns
    -------
    pandas.DataFrame
        The feature columns plus an int64 `target_id` column and a
        `target_name` column holding the matching species name.
    """
    df = pd.DataFrame(
        data    = np.column_stack((X, y)),
        columns = iris.feature_names + ['target_id']
    )
    # Stacking with the float features made the ids float; restore integers.
    df = df.astype({'target_id': 'int64'})
    # Vectorised name lookup (replaces a per-row apply; also works on empty input).
    df['target_name'] = iris.target_names[df['target_id'].to_numpy()]
    return df

df_train = build_dataframe(X_train, y_train)
df_test  = build_dataframe(X_test, y_test)
df_train.describe()

With only 4 features, a pairplot is a good way to get an overview of the feature distributions

In [None]:
# Pairwise feature plots for the training split only, coloured by species,
# with histograms on the diagonal and the redundant upper triangle omitted.
sns.pairplot(df_train, vars=iris.feature_names, hue='target_name', corner=True, diag_kind='hist')

Clearly, iris-setosa can be distinguished from the other iris types by petal length alone while the others will benefit from more complex considerations. 

In [None]:
# Per-species minimum and maximum of every column in the training split.
species_groups = df_train.groupby(['target_name'])
min_by_type = species_groups.min()
max_by_type = species_groups.max()

# Put the setosa cutoff midway between the largest setosa petal length and
# the smallest versicolor petal length seen in training.
min_length = min_by_type.loc['versicolor', 'petal length (cm)']
max_length = max_by_type.loc['setosa', 'petal length (cm)']
setosa_petal_length_cutoff = (min_length + max_length) / 2

sns.displot(data=df_train, x='petal length (cm)', hue='target_name')
plt.title(f"Setosa Cutoff : petal length < {setosa_petal_length_cutoff:.2f}cm")
plt.axvline(setosa_petal_length_cutoff, color='r', linestyle=':')

Let's first see how accurate a classifier can be made using fixed selections.

In [None]:

def optimize_cutoff(cut_feature, iris1, iris2):
    """Scan for the threshold on one feature that best separates two iris types.

    Reads the notebook-level `df_train`, `min_by_type` and `max_by_type`.
    Twenty evenly spaced candidate cutoffs are tried; for each one, samples
    with `cut_feature >= cutoff` are assigned to the "upper" type and the
    rest to the "lower" type, and the accuracy over the two types is recorded.
    The scan is plotted together with a rug plot of the training values.

    Parameters
    ----------
    cut_feature : str
        Column of `df_train` to cut on.
    iris1, iris2 : str
        The two `target_name` values to separate (order does not matter).

    Returns
    -------
    float
        Midpoint between the smallest and largest candidate cutoff that
        reach the best accuracy. If the best accuracy is reached in two
        disjoint ranges, this midpoint is not guaranteed to be optimal.
    """
    min1 = min_by_type.loc[iris1, cut_feature]
    min2 = min_by_type.loc[iris2, cut_feature]
    max1 = max_by_type.loc[iris1, cut_feature]
    max2 = max_by_type.loc[iris2, cut_feature]

    # The "up" type is the one whose smallest value is larger. Candidates run
    # from the smallest "up" value to the largest "dn" value.
    if min1 > min2:
        iris_up, iris_dn = iris1, iris2
        min_cutoff, max_cutoff = min1, max2
    else:
        iris_up, iris_dn = iris2, iris1
        min_cutoff, max_cutoff = min2, max1

    is_iris_up = df_train['target_name'] == iris_up
    is_iris_dn = df_train['target_name'] == iris_dn
    total = is_iris_up.sum() + is_iris_dn.sum()

    cutoffs = np.linspace(min_cutoff, max_cutoff, 20)
    acc = np.zeros(len(cutoffs))

    feature_values = df_train[cut_feature]
    for i, c in enumerate(cutoffs):
        true_pos = (is_iris_up & (feature_values >= c)).sum()
        true_neg = (is_iris_dn & (feature_values < c)).sum()
        acc[i] = (true_pos + true_neg) / total

    optimal_cutoffs = cutoffs[acc == np.max(acc)]
    optimal_cutoff = (min(optimal_cutoffs) + max(optimal_cutoffs)) / 2

    fig, ax = plt.subplots()
    ax.plot(cutoffs, acc, color='k', label='Cutoff accuracy')
    ax.set_ylabel('Cutoff accuracy')
    ax.set_xlabel(cut_feature)
    # When the two types do not overlap, min_cutoff exceeds max_cutoff. Order
    # the bounds so the x-axis is never inverted, and skip the explicit limits
    # when the scanned range has zero width.
    lo, hi = sorted((min_cutoff, max_cutoff))
    margin = hi - lo
    if margin > 0:
        ax.set_xlim(lo - margin, hi + margin)
    sns.rugplot(data=df_train[is_iris_up | is_iris_dn],
                x=cut_feature,
                hue='target_name',
                linewidth=5,
                ax=ax)
    ax.axvline(optimal_cutoff, color='r')
    ax.set_title(f"Optimal cutoff for {cut_feature} = {optimal_cutoff:.3f} ({np.max(acc):.1%} acc)")
    plt.show()

    return optimal_cutoff

virginica_petal_length_cutoff = optimize_cutoff('petal length (cm)', 'virginica', 'versicolor')
virginica_petal_width_cutoff = optimize_cutoff('petal width (cm)', 'virginica', 'versicolor')
#optimize_cutoff('petal length (cm)', 'setosa', 'versicolor')

Both petal width and length provide equal classification power so sticking with petal length simplifies the model. Putting both selections together:

In [None]:
def manual_predict(df_X, setosa_cutoff=None, virginica_cutoff=None, label_ids=None):
    """Classify samples with two fixed cuts on petal length.

    Parameters
    ----------
    df_X : pandas.DataFrame
        Must contain a 'petal length (cm)' column.
    setosa_cutoff : float, optional
        Samples at or below this length are labelled setosa. Defaults to the
        notebook-level `setosa_petal_length_cutoff`.
    virginica_cutoff : float, optional
        Samples at or above this length are labelled virginica. Defaults to
        the notebook-level `virginica_petal_length_cutoff`.
    label_ids : mapping, optional
        Maps 'setosa' / 'versicolor' / 'virginica' to integer class ids.
        Defaults to the notebook-level `target_id`.

    Returns
    -------
    numpy.ndarray
        Integer class id for each row of `df_X`, in row order.
    """
    # Fall back to the notebook globals only at call time, so the function can
    # also be used (and checked) with explicit values.
    if setosa_cutoff is None:
        setosa_cutoff = setosa_petal_length_cutoff
    if virginica_cutoff is None:
        virginica_cutoff = virginica_petal_length_cutoff
    if label_ids is None:
        label_ids = target_id

    petal_length = df_X['petal length (cm)'].to_numpy()
    is_setosa = petal_length <= setosa_cutoff
    is_virginica = petal_length >= virginica_cutoff

    # Everything that is neither setosa nor virginica is versicolor. Class ids
    # are integers, so the result is built as an integer array.
    y_pred = np.full(len(df_X), label_ids['versicolor'], dtype=int)
    y_pred[is_setosa] = label_ids['setosa']
    # Applied last so that virginica wins if both cuts ever match a sample.
    y_pred[is_virginica] = label_ids['virginica']

    return y_pred

In [None]:
def measure_performance(y, y_pred):
    """Show a confusion-matrix heatmap and print a classification report.

    Parameters
    ----------
    y : array-like
        True class ids.
    y_pred : array-like
        Predicted class ids, in the same order as `y`.

    Tick labels and report row names come from `iris.target_names`.
    """
    conf_matrix = metrics.confusion_matrix(y, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(data=conf_matrix,
                annot=True,
                # Integer format: the default would render counts of 100 or
                # more in scientific notation.
                fmt='d',
                xticklabels=iris.target_names,
                yticklabels=iris.target_names,
                ax=ax)
    ax.set_xlabel('Prediction')
    ax.set_ylabel('Truth')
    plt.show()
    clf_report = metrics.classification_report(y, y_pred, target_names=iris.target_names)
    print(clf_report)

# Evaluate the hand-tuned cuts on the training split first, then on the held-out split.
for split_name, y_split, df_split in (
    ('training', y_train, df_train),
    ('test', y_test, df_test),
):
    print(f'Performance on {split_name} set')
    print('-'*40)
    measure_performance(y_split, manual_predict(df_split))

An accuracy of 93% is obtained on the test set. Now let's try to do better using more sophisticated techniques. The most obvious ML algorithms to follow up with are support vector machines (SVM), specifically a linear SVM, and decision trees. This is because these algorithms essentially try to do what was done above, namely find simple decision rules that best separate different data classes. 

The improvement with decision trees is that the algorithm builds combinations of simple rules (e.g. if x > 1 and y < 3, or x > 1 and z == 4, then apply label A) instead of a single rule for each class. Visually, this looks like carving up the phase space into various (potentially disconnected) boxes.

The improvement with linear SVM is twofold. First, it can apply cuts to linear combinations of input features, which visually looks like angled boundaries in the phase space instead of vertical or horizontal ones. Second, it combines rules in a more nuanced way than simply giving priority to one rule. Visually, this looks like multiple bounded regions with 2 edges as opposed to a single infinitely extending one. 

In [None]:
# Linear SVM fitted on standardised features (the scaler is part of the pipeline).
linear_svc = svm.LinearSVC(C=1.0, dual=False, random_state=random_seed)
clf = make_pipeline(StandardScaler(), linear_svc)
#clf = make_pipeline(MinMaxScaler(), clf)

clf.fit(X_train, y_train)
for split_name, X_split, y_split in (
    ('training', X_train, y_train),
    ('test', X_test, y_test),
):
    print(f'Performance on {split_name} set')
    print('-'*40)
    measure_performance(y_split, clf.predict(X_split))

In [None]:
# SVC with a linear kernel, fitted on the unscaled features (both scaler
# pipelines are left commented out).
clf = svm.SVC(C=1.0, kernel='linear', random_state=random_seed)
#clf = make_pipeline(StandardScaler(), clf)
#clf = make_pipeline(MinMaxScaler(), clf)

clf.fit(X_train, y_train)
for split_name, X_split, y_split in (
    ('training', X_train, y_train),
    ('test', X_test, y_test),
):
    print(f'Performance on {split_name} set')
    print('-'*40)
    measure_performance(y_split, clf.predict(X_split))

In [None]:
# Decision tree with default settings apart from the fixed seed, fitted on
# the unscaled features.
clf = tree.DecisionTreeClassifier(random_state=random_seed)
#clf = make_pipeline(StandardScaler(), clf)
#clf = make_pipeline(MinMaxScaler(), clf)

clf.fit(X_train, y_train)
for split_name, X_split, y_split in (
    ('training', X_train, y_train),
    ('test', X_test, y_test),
):
    print(f'Performance on {split_name} set')
    print('-'*40)
    measure_performance(y_split, clf.predict(X_split))

In [None]:
# Draw the fitted decision tree. `clf` must still be the DecisionTreeClassifier
# from the previous cell, so run this before any later cell rebinds `clf`.
fig, ax = plt.subplots(figsize=(10,10))
tree.plot_tree(
    clf,
    # Label splits and leaves with real names instead of X[i] / class indices.
    feature_names=list(iris.feature_names),
    class_names=[str(name) for name in iris.target_names],
    filled=True,
    ax=ax,
)
plt.show()

In [None]:
# k-nearest-neighbours classifier; k is set to the number of iris classes.
n_classes = len(iris.target_names)
clf = neighbors.KNeighborsClassifier(n_neighbors=n_classes)

clf.fit(X_train, y_train)
for split_name, X_split, y_split in (
    ('training', X_train, y_train),
    ('test', X_test, y_test),
):
    print(f'Performance on {split_name} set')
    print('-'*40)
    measure_performance(y_split, clf.predict(X_split))

<a id="sec_bottom"></a>
# Bottom
[top](#table_of_contents)