# Supervised Learning, Part II

---

_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._

---

## Preamble and Datasets

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer

In [None]:
from adspy_shared_utilities import load_crime_dataset
from adspy_shared_utilities import plot_class_regions_for_classifier
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

In [None]:
# Four-colour bold palette (yellow, green, blue, black) shared across the notebook
cmap_bold = ListedColormap(
    ['#FFFF00', '#00FF00', '#0000FF', '#000000']
)

### Numpy defaults

In [None]:
# Number of decimals NumPy shows when printing floating-point arrays
PRINT_PRECISION = 2
np.set_printoptions(precision=PRINT_PRECISION)

## Datasets

### Helper functions

#### Sanity check dataset format

In [None]:
def format_check(X, y):
    """Sanity-check a feature/target pair and print a short summary.

    Args:
        X: NumPy array of features. Callers in this notebook pass both 2-D
            matrices and single 1-D columns, so the rank is not checked.
        y: NumPy array of targets, expected to have shape ``(n,)``.

    Raises:
        AssertionError: if ``X`` or ``y`` is not a NumPy array.

    Prints the shape and type of both arrays. Targets with more than nine
    distinct values are reported as ``'regression'``; otherwise the set of
    distinct values is shown as the classes.
    """
    import numpy as np

    # isinstance (instead of comparing exact types) also accepts ndarray subclasses
    assert isinstance(X, np.ndarray)
    assert isinstance(y, np.ndarray)

    # Warn about column-vector style targets, but keep going
    if y.ndim > 1:
        print('{} must be of shape: (n,)'.format(y.shape))

    # Flatten first: rows of a 2-D array are unhashable and cannot go into a set
    labels = set(np.ravel(y))
    classes = 'regression' if len(labels) > 9 else labels
    print('X:\t {} {}\ny:\t {} {}\nclasses: {}\n'.format(X.shape, type(X), y.shape, type(y), classes))
    

#### Plot datasets

In [None]:
def plot_dataset(X, y, **kwargs):
    """Scatter-plot ``y`` against ``X`` on the current pyplot figure and show it.

    Args:
        X: values for the horizontal axis.
        y: values for the vertical axis.

    Keyword Args:
        title: plot title (default ``'default Title'``).
        c: optional per-point values used to colour the markers.
        cmap: optional colormap for ``c``; defaults to a bright five-colour
            palette.
    """
    from matplotlib.colors import ListedColormap

    title = kwargs.get('title', 'default Title')
    label = kwargs.get('c', None)

    # Honour a caller-supplied colormap; previously it was read and then overwritten
    cmap = kwargs.get('cmap', None)
    if cmap is None:
        cmap = ListedColormap(['#00570d', '#1dea5a', '#0762f1', '#cb00f7', '#ff02a0'])  # very bright

    scatter_kwargs = {'alpha': 0.6}
    if label is not None:
        # A colormap only applies when there are values to map
        scatter_kwargs.update(c=label, cmap=cmap)

    plt.scatter(X, y, **scatter_kwargs)
    plt.title(title)
    plt.tight_layout()
    plt.show()

## Synthetic datasets

In [None]:
from sklearn.datasets import make_regression, make_classification, make_blobs

### Dataset for simple regression

In [None]:
# Settings for the single-feature regression sample
regression_settings = dict(
    n_samples=100,
    n_features=1,
    n_informative=1,
    bias=150.0,
    noise=30,
    random_state=0,
)
X_R1, y_R1 = make_regression(**regression_settings)

format_check(X_R1, y_R1)
plot_dataset(X_R1, y_R1, title='Sample regression problem with one input variable')

### Dataset for more complex regression

In [None]:
from sklearn.datasets import make_friedman1

X_F1, y_F1 = make_friedman1(n_samples=100, n_features=7, random_state=0)

# Only the feature in column 2 is checked and plotted against the target
friedman_feature = X_F1[:, 2]
format_check(friedman_feature, y_F1)
plot_dataset(friedman_feature, y_F1, title='Complex regression problem with one input variable')

### Dataset for classification (binary) 

In [None]:
X_C2, y_C2 = make_classification(n_samples = 100, n_features=2,
                                n_redundant=0, n_informative=2,
                                n_clusters_per_class=1, flip_y = 0.1,
                                class_sep = 0.5, random_state=0)

# Check the actual (features, labels) pair; passing the second feature column
# as the target made the summary report 'regression' for a binary problem
format_check(X_C2, y_C2)
plot_dataset(X_C2[:, 0], X_C2[:, 1], c=y_C2, title='Sample binary classification problem with two informative features')

### Dataset for non-linearly separable classification (binary)

In [None]:
X_D2, y_D2 = make_blobs(n_samples=100, n_features=2, centers=8,
                       cluster_std=1.3, random_state=4)
# Fold the eight blob ids into two classes (even / odd)
y_D2 = y_D2 % 2

# Check the actual (features, labels) pair rather than two feature columns
format_check(X_D2, y_D2)
plot_dataset(X_D2[:,0], X_D2[:,1], c=y_D2, title='Sample binary classification problem with non-linearly separable classes')

### Blobs for classification and clustering

In [None]:
from sklearn.datasets import make_blobs

X_blob, y_blob = make_blobs(n_samples=500,
                              n_features=2, 
                              centers=2, 
                              cluster_std=1.0,
                              shuffle=True, 
                              random_state=0)

format_check(X_blob, y_blob)
# The previous title was copied from the regression cell and did not describe this data
plot_dataset(X_blob[:,0], X_blob[:,1], c=y_blob, title='Sample blobs dataset with two centers')

### Breast cancer

In [None]:
# Breast cancer dataset for classification
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
# Feature matrix and labels taken from the bunch loaded above
X_cancer = cancer.data
y_cancer = cancer.target

format_check(X_cancer, y_cancer)
plot_dataset(X_cancer[:,0], X_cancer[:,1], c=y_cancer, title='Breast Cancer dataset')

### Crime

In [None]:
def load_crime_dataset(path='../../_data/CommViolPredUnnormalizedData.txt'):
    """Load the Communities and Crime dataset for regression.

    Source: https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime+Unnormalized

    Args:
        path: location of the comma-separated data file, in which ``'?'``
            marks a missing value. Defaults to the notebook's data folder.

    Returns:
        A tuple ``(X_crime, y_crime, columns)``: the DataFrame holding the
        first 88 retained columns, the ``ViolentCrimesPerPop`` Series, and the
        column index of the retained frame (target included).
    """
    crime = pd.read_csv(path, sep=',', na_values='?')

    # remove features with poor coverage or lower relevance, and keep ViolentCrimesPerPop target column
    columns_to_keep = [5, 6] + list(range(11, 26)) + list(range(32, 103)) + [145]
    # rows with any missing value in the retained columns are dropped
    crime = crime.iloc[:, columns_to_keep].dropna()

    X_crime = crime.iloc[:, :88]
    y_crime = crime['ViolentCrimesPerPop']

    return X_crime, y_crime, crime.columns

crime_X_frame, crime_y_series, crime_features = load_crime_dataset()

# Continue with plain NumPy arrays rather than the pandas objects
X_crime = crime_X_frame.values
y_crime = crime_y_series.values

format_check(X_crime, y_crime)
plot_dataset(X_crime[:,0], X_crime[:,1], c=y_crime, title='Crime dataset')

### Dataset fruits

In [None]:
fruits = pd.read_table('../../_data/fruit_data_with_colors.txt')

# Names used for the four-feature matrix and for the fruit labels
feature_names_fruits = ['height', 'width', 'mass', 'color_score']
target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']

# Full feature matrix
X_fruits = fruits[feature_names_fruits].values
y_fruits = fruits['fruit_label'].values

# Two-feature variant (height and width only)
X_fruits_2d = fruits[['height', 'width']].values
y_fruits_2d = fruits['fruit_label'].values

format_check(X_fruits_2d, y_fruits_2d)
plot_dataset(X_fruits[:,0], X_fruits[:,1], c=y_fruits, title='Fruits dataset')

---

#### Helper functions

In [None]:
def print_regression(model, coef, intercept, train_score, test_score, **kwargs):
    """Print a summary of a fitted linear regression model.

    Args:
        model: the model, shown through its string representation.
        coef: the model coefficients.
        intercept: the model intercept.
        train_score: score on the training set.
        test_score: score on the test set.

    Keyword Args:
        non_zero_coef: value shown on the "non-zero coeffs" line (default None).
        features: optional feature names; when given, the coefficients rounded
            to three decimals are listed by decreasing absolute value, leaving
            out those that round to zero.
    """
    non_zero_coef = kwargs.get('non_zero_coef', None)
    features = kwargs.get('features', None)

    summary = '''
    model:\n    {}
    linear model coeff (w):      {}\n
    linear model intercept (b):  {:.3f}
    R-squared score (training):  {:.3f}
    R-squared score (test):      {:.3f}
    non-zero coeffs (test):      {}
    '''
    print(summary.format(model, coef, intercept, train_score, test_score, non_zero_coef))

    # Without feature names there is nothing further to list
    if features is None:
        return

    print('non-zero coeffs sorted by abs weight: ')
    rounded = np.round(coef, 3)
    ranked = sorted(zip(features, rounded), key=lambda item: -abs(item[1]))
    for name, weight in ranked:
        if weight == 0:
            continue
        print('\t', name, '=', weight)
        
def print_accuracy(title, train_score, test_score):
    """Print a title followed by training and test accuracy, two decimals each.

    Args:
        title: heading printed above the two scores.
        train_score: accuracy on the training set.
        test_score: accuracy on the test set.
    """
    report = '''
    {}
    Accuracy on training set: {:.2f}
    Accuracy on test set:     {:.2f}'''
    print(report.format(title, train_score, test_score))

In [None]:
def train_test(**kwargs):
    """Split the data, optionally scale it, fit the model, then report or plot.

    Keyword Args:
        model: estimator providing ``fit`` and ``score``.
        X: feature matrix.
        y: target values.
        scaler: optional scaler *class* (e.g. ``MinMaxScaler``). It is
            instantiated here, fitted on the training split only and applied
            to both splits.
        title: text used as the printed heading or the plot title.
        random_state: seed passed to ``train_test_split`` (default 0).
        plot: when True (default) the class regions are drawn with
            ``plot_class_regions_for_classifier_subplot``; when False the
            training and test accuracies are printed instead.
        ax: axes to draw on when plotting; defaults to the current axes.
    """
    model = kwargs.get('model', None)
    X = kwargs.get('X', None)
    y = kwargs.get('y', None)
    scaler = kwargs.get('scaler', None)
    title = kwargs.get('title', None)
    random_state = kwargs.get('random_state', 0)
    plot = kwargs.get('plot', True)

    # Use the requested seed; it used to be read and then ignored in favour of 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    if scaler:
        scaler = scaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    model.fit(X_train, y_train)

    if not plot:
        print_accuracy(title, model.score(X_train, y_train), model.score(X_test, y_test))
        print('-'*80)
    else:
        # Look up the current axes only when the caller did not supply one
        ax = kwargs.get('ax', None)
        if ax is None:
            ax = plt.gca()
        plot_class_regions_for_classifier_subplot(model, X_train, y_train, X_test, y_test, title, ax)

---

## Naive Bayes classifiers

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# One panel per dataset, each showing a Gaussian naive Bayes classifier
fig, subaxes = plt.subplots(nrows=1, ncols=4, figsize=(20, 6))

datatitles = ['Classifier', 'Blob', 'fruits X0-X1', 'Breast cancer X0-X1']
datasets = [
    (X_C2, y_C2),
    (X_D2, y_D2),
    (X_fruits[:,:2], y_fruits),
    (X_cancer[:,:2], y_cancer),
]

for ax, title, (X, y) in zip(subaxes.ravel(), datatitles, datasets):
    panel_title = 'Gaussian NB classifier: \nDataset {}'.format(title)
    train_test(model=GaussianNB(), X=X, y=y, title=panel_title, ax=ax)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Same four datasets as before, now with MinMaxScaler applied inside train_test
fig, subaxes = plt.subplots(nrows=1, ncols=4, figsize=(20, 6))
panel_axes = subaxes.ravel()

datasets = [(X_C2, y_C2), (X_D2, y_D2), (X_fruits[:,:2], y_fruits), (X_cancer[:,:2], y_cancer)]
datatitles = ['Classifier', 'Blob', 'fruits X0-X1', 'Breast cancer X0-X1']

for (X, y), title, ax in zip(datasets, datatitles, panel_axes):
    train_test(
        model=GaussianNB(),
        X=X,
        y=y,
        scaler=MinMaxScaler,
        title='Gaussian NB classifier with scaler: \nDataset {}'.format(title),
        ax=ax,
    )

## Ensembles of Decision Trees

### Random forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default settings, one panel per dataset
fig, subaxes = plt.subplots(1, 4, figsize=(20, 6))

datatitles = ['Classifier', 'Blob', 'fruits X0-X1', 'Breast cancer X0-X1']
datasets = [(X_C2, y_C2), (X_D2, y_D2), (X_fruits[:,:2], y_fruits), (X_cancer[:,:2], y_cancer)]

forest_title = 'Random Forest Classifier, default settings: \nDataset {}'
for ax, (X, y), title in zip(subaxes.ravel(), datasets, datatitles):
    train_test(model=RandomForestClassifier(), X=X, y=y,
               title=forest_title.format(title), ax=ax)

In [None]:
# Random forest again, this time with MinMaxScaler applied inside train_test
fig, subaxes = plt.subplots(1, 4, figsize=(20, 6))

datasets = [
    (X_C2, y_C2),
    (X_D2, y_D2),
    (X_fruits[:,:2], y_fruits),
    (X_cancer[:,:2], y_cancer),
]
datatitles = ['Classifier', 'Blob', 'fruits X0-X1', 'Breast cancer X0-X1']

for title, ax, (X, y) in zip(datatitles, subaxes.ravel(), datasets):
    scaled_title = 'Random Forest Classifier, scaled: \nDataset {}'.format(title)
    train_test(model=RandomForestClassifier(),
               X=X,
               y=y,
               title=scaled_title,
               scaler=MinMaxScaler,
               ax=ax)

### Random Forest - tuning max. features

In [None]:
max_features = [2, 3, 4]

# Print accuracies on the full fruits feature matrix for each max_features setting.
# The title now reports max_f; it used to interpolate a stale `title` variable
# left over from an earlier loop.
for max_f in max_features:
    train_test(model=RandomForestClassifier(max_features=max_f),
               X=X_fruits,
               y=y_fruits,
               title='Random Forest Classifier, max. {} features: \nDataset: Fruits'.format(max_f),
               plot=False)

In [None]:
max_features = [2, 5, 10, 15, 20, 25, 30]

# Print accuracies on the full breast-cancer feature matrix for each max_features
# setting. The title now reports max_f; it used to interpolate a stale `title`
# variable left over from an earlier loop.
for max_f in max_features:
    train_test(model=RandomForestClassifier(max_features=max_f),
               X=X_cancer,
               y=y_cancer,
               title='Random Forest Classifier, max. {} features: \nDataset: Cancer'.format(max_f),
               plot=False)

### Random forest: Fruit dataset pair-plot

In [None]:
# One random forest per pair of fruit features, laid out on a 2 x 3 grid
fig, subaxes = plt.subplots(nrows=2, ncols=3, figsize=(20, 12))

# Every combination of two of the four column indices, in ascending order
pair_list = [[first, second] for first in range(4) for second in range(first + 1, 4)]
datasets = [(X_fruits, y_fruits) for _ in pair_list]

for ax, pair, (X, y) in zip(subaxes.ravel(), pair_list, datasets):
    pair_title = 'Random Forest Classifier, \nFruits dataset {}'.format(pair)
    train_test(model=RandomForestClassifier(), X=X[:, pair], y=y, title=pair_title, ax=ax)

### Gradient-boosted decision trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with default settings, one panel per dataset
fig, subaxes = plt.subplots(nrows=1, ncols=4, figsize=(20, 6))

datatitles = ['Classifier', 'Blob', 'fruits X0-X1', 'Breast cancer X0-X1']
datasets = [(X_C2, y_C2), (X_D2, y_D2), (X_fruits[:,:2], y_fruits), (X_cancer[:,:2], y_cancer)]

for ax, title, (X, y) in zip(subaxes.ravel(), datatitles, datasets):
    boosting_title = 'Gradient Boosting, default settings \nDataset {}'.format(title)
    train_test(model=GradientBoostingClassifier(), X=X, y=y, title=boosting_title, ax=ax)

#### Gradient boosted decision trees on the fruit dataset

In [None]:
# One gradient-boosted model per pair of fruit features, on a 2 x 3 grid
fig, subaxes = plt.subplots(2, 3, figsize=(20, 12))
panel_axes = subaxes.ravel()

pair_list = [[0,1], [0,2], [0,3], [1,2], [1,3], [2,3]]
datasets = [(X_fruits, y_fruits) for _ in pair_list]

for pair, (X, y), ax in zip(pair_list, datasets, panel_axes):
    train_test(
        model=GradientBoostingClassifier(),
        X=X[:, pair],
        y=y,
        title='Gradient Boosting, default settings \nFruits features: {}'.format(pair),
        ax=ax,
    )

#### Gradient-boosted decision trees on a real-world dataset

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
learning_rate = [0.01, 0.05, 0.1, 0.5, 1]

# Print accuracies for every (max depth, learning rate) combination.
# The model is a GradientBoostingClassifier, so the title names it as such
# (it used to say "Random Forest Classifier").
for max_d in [2, 3, 4]:
    for lr in learning_rate:
        train_test(model=GradientBoostingClassifier(learning_rate=lr, max_depth=max_d),
                   X=X_cancer,
                   y=y_cancer,
                   title='Gradient Boosting Classifier: Breast Cancer dataset\n\tlearning rate: {} \n\tmax. depth: {}'.format(lr, max_d),
                   plot=False)

## Neural networks: Multi-layer Perceptron (MLP)

#### Activation functions

In [None]:
import matplotlib as mpl
# Keep only the left and bottom spines on axes created from here on
mpl.rcParams.update({
    'axes.spines.left': True,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.spines.bottom': True,
})

In [None]:
xrange = np.linspace(-2, 2, 200)

# Legend label paired with the activation values over xrange
activation_curves = [
    ('relu', np.maximum(xrange, 0)),
    ('tanh', np.tanh(xrange)),
    ('logistic', 1 / (1 + np.exp(-xrange))),
]

plt.figure(figsize=(7,6))
for curve_label, curve_values in activation_curves:
    plt.plot(xrange, curve_values, label=curve_label)

plt.legend(frameon=False)
plt.title('Neural network activation functions')
plt.xlabel('Input value (x)')
plt.ylabel('Activation function output')

# No tick marks on either axis
plt.xticks([])
plt.yticks([])

ax = plt.gca()
# Place the left and bottom spines at data coordinate 0
ax.spines['left'].set_position(('data', 0))
ax.spines['bottom'].set_position(('data', 0))
# Axis label positions are set explicitly
ax.xaxis.set_label_coords(0.9, 0.33)
ax.yaxis.set_label_coords(0.49, 0.78)
plt.show();

In [None]:
# Reset axis
# Turn all four spines back on for subsequently created axes
mpl.rcParams.update({
    'axes.spines.left': True,
    'axes.spines.top': True,
    'axes.spines.right': True,
    'axes.spines.bottom': True,
})

### Neural networks: Classification

#### Synthetic dataset 1: single hidden layer

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# One panel per hidden-layer width, each a single-hidden-layer network on X_D2
fig, subaxes = plt.subplots(nrows=2, ncols=3, figsize=(20, 12))

units = [1, 2, 4, 8, 16, 32]
datasets = [(X_D2, y_D2) for _ in units]

# NOTE(review): the loop variable reuses the name `units`, so after this cell
# `units` is the last width (32) and no longer the list. Avoid relying on it.
for ax, units, (X, y) in zip(subaxes.ravel(), units, datasets):
    net = MLPClassifier(hidden_layer_sizes = [units], solver='lbfgs')
    train_test(model=net,
               X=X,
               y=y,
               title='Dataset 1: Neural net classifier, 1 layer, {} units'.format(units),
               ax=ax)

In [None]:
fig, subaxes = plt.subplots(2, 3, figsize=(20, 12))

hidden_layers = [1, 2, 4, 8, 16, 32]
units_layer = 10

# Build each network from units_layer so the model matches its title; the layer
# width used to come from a `units` variable left over from an earlier cell.
# A separate loop name keeps `hidden_layers` bound to the list.
for n_layers, ax in zip(hidden_layers, subaxes.ravel()):
    train_test(model=MLPClassifier(hidden_layer_sizes=[units_layer] * n_layers, solver='lbfgs'),
               X=X_D2,
               y=y_D2,
               title='Dataset 1: Neural Net classifier, {} layers, {} units'.format(n_layers, units_layer),
               ax=ax)

In [None]:
fig, subaxes = plt.subplots(9, 3, figsize=(18, 54))

hidden_layers = [5, 10, 15]
units_layer = 8
activations = ['logistic', 'tanh', 'relu']
alphas = [0.01, 0.1, 1.0]
axes = subaxes.ravel()

# Every (activation, alpha, layer count) combination, activation varying slowest:
# 3 * 3 * 3 = 27 settings, one per axes of the 9 x 3 grid
settings = [(activation, alpha, n_layers)
            for activation in activations
            for alpha in alphas
            for n_layers in hidden_layers]

for ax, (activation, alpha, n_layers) in zip(axes, settings):
    print('.', end='')  # progress marker, one dot per fitted model
    # n_layers hidden layers of units_layer units each. The previous expression
    # multiplied two integers, producing a single number instead of a list of
    # layer sizes.
    train_test(model=MLPClassifier(hidden_layer_sizes=[units_layer] * n_layers,
                                   activation=activation,
                                   alpha=alpha,
                                   solver='lbfgs'),
               X=X_D2,
               y=y_D2,
               title='Dataset 1: Neural Net classifier, {} layers, {} units, \nalpha = {:.3f}, activation = {}'.format(
                   n_layers, units_layer, alpha, activation),
               ax=ax)

In [None]:
# Preview the first rows of one breast-cancer feature pair. The columns are
# spelled out so the cell no longer depends on a loop variable from an earlier cell.
X_cancer[:, [2, 3]][:5]

In [None]:
hidden_layers = [5, 10, 15]
units_layer = 8
activations = ['tanh', 'relu']
alphas = [0.1, 0.5]
pair_list = [[0,1], [1,2], [2,3]]  # sample of a few feature pairs

# Every (activation, alpha, layer count) combination, activation varying slowest
settings = [(activation, alpha, n_layers)
            for activation in activations
            for alpha in alphas
            for n_layers in hidden_layers]

# Size the grid to the number of plots actually drawn (3 pairs * 12 settings = 36)
# instead of allocating 72 axes and leaving half of them empty
n_cols = 3
n_plots = len(pair_list) * len(settings)
n_rows = -(-n_plots // n_cols)  # ceiling division
fig, subaxes = plt.subplots(n_rows, n_cols, figsize=(20, 6.25 * n_rows))
axes = subaxes.ravel()

i_ax = 0
for pair in pair_list:
    X, y = X_cancer[:, pair], y_cancer
    for activation, alpha, n_layers in settings:
        print('.', end='')  # progress marker, one dot per fitted model
        # n_layers hidden layers of units_layer units each. The previous expression
        # multiplied two integers, producing a single number instead of a list of
        # layer sizes.
        train_test(model=MLPClassifier(hidden_layer_sizes=[units_layer] * n_layers,
                                       activation=activation,
                                       alpha=alpha,
                                       solver='lbfgs'),
                   X=X,
                   y=y,
                   title='Breast cancer features {}: Neural Net classifier, {} layers, {} units, \nalpha = {:.3f}, activation = {}'.format(
                       pair, n_layers, units_layer, alpha, activation),
                   ax=axes[i_ax])
        i_ax += 1

### Neural networks: Regression

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Every fifth sample of the simple regression dataset
X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5], y_R1[0::5], random_state = 0)

# Evenly spaced inputs at which each fitted model is evaluated
X_predict_input = np.linspace(-3, 3, 50).reshape(-1,1)

fig, subaxes = plt.subplots(2, 3, figsize=(20, 9), dpi=70)

# One row per activation function, one column per alpha value
for axisrow, activation in zip(subaxes, ['tanh', 'relu']):
    for alpha, ax in zip([0.0001, 1.0, 100], axisrow):
        reg = MLPRegressor(hidden_layer_sizes=[100,100],
                           activation=activation,
                           alpha=alpha,
                           solver='lbfgs').fit(X_train, y_train)

        y_predict_output = reg.predict(X_predict_input)
        ax.set_xlim([-2.5, 0.75])
        ax.plot(X_predict_input, y_predict_output, '^', markersize = 10)
        ax.plot(X_train, y_train, 'o')
        ax.set_xlabel('Input feature')
        ax.set_ylabel('Target value')
        # The title used to end with an unmatched ')'
        ax.set_title('MLP regression\nalpha={}, activation={}'
                     .format(alpha, activation))

# Lay out the figure once, after all panels are drawn
fig.tight_layout()

#### Application to real-world dataset for classification

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=0)

# The scaler is fitted on the training split only and then applied to both splits
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = MLPClassifier(
    hidden_layer_sizes=[100, 100],
    alpha=5.0,
    random_state=0,
    solver='lbfgs',
).fit(X_train_scaled, y_train)

title = 'Breast cancer dataset'
train_accuracy = clf.score(X_train_scaled, y_train)
test_accuracy = clf.score(X_test_scaled, y_test)
print_accuracy(title, train_accuracy, test_accuracy)

In [None]:
from adspy_shared_utilities import load_crime_dataset
from adspy_shared_utilities import plot_class_regions_for_classifier
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

In [None]:
# One panel per regularisation strength on a 2 x 2 grid
fig, subaxes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

regularization_strengths = [0.01, 0.1, 1.0, 5.0]
for ax, alpha in zip(subaxes.ravel(), regularization_strengths):
    net = MLPClassifier(hidden_layer_sizes=[100, 100], alpha=alpha, solver='lbfgs')
    train_test(model=net,
               X=X_D2,
               y=y_D2,
               title='Dataset 2: NN classifier, alpha = {:.3f}'.format(alpha),
               ax=ax)

In [None]:
# X_cancer is the full feature matrix, so print the accuracies instead of drawing
# class regions. NOTE(review): the plotting helper is assumed to need exactly two
# features, as in every other plotted call in this notebook; confirm.
train_test(model=MLPClassifier(hidden_layer_sizes=[100, 100], alpha=5.0, solver='lbfgs'),
           X=X_cancer, 
           y=y_cancer, 
           title='Breast cancer dataset',
           scaler=MinMaxScaler,
           plot=False)