# Supervised Learning, Part I

---

_You are currently looking at **version 1.1** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._

---

In [None]:
# Shell out (IPython `!`) to locate the fruit data file two directories up from the notebook.
!find ../.. | grep -i fruit_data_with_colors.txt

## Import packages

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

### Numpy defaults

In [None]:
# Show NumPy floats with two digits of precision in printed arrays.
np.set_printoptions(precision=2)

## Datasets

### Helper functions

#### Sanity check dataset format

In [None]:
def format_check(X, y):
    """Sanity-check a feature/target pair and print a short summary.

    Parameters
    ----------
    X : numpy.ndarray
        Feature data.
    y : numpy.ndarray
        Target values; expected to be one-dimensional, shape ``(n,)``.

    Raises
    ------
    AssertionError
        If ``X`` or ``y`` is not a NumPy array.

    Notes
    -----
    More than 9 distinct target values are reported as ``'regression'``;
    otherwise the set of distinct values is printed as the classes.
    """
    assert isinstance(X, np.ndarray), 'X must be a numpy.ndarray'
    assert isinstance(y, np.ndarray), 'y must be a numpy.ndarray'

    # Warn (but keep going) when the target has more than one axis.
    if y.ndim > 1:
        print('{} must be of shape: (n,)'.format(y.shape))

    # Flatten before building the set: iterating a 2-D array yields rows,
    # which are unhashable and used to crash right after the warning above.
    distinct = set(y.ravel())
    if len(distinct) > 9:
        classes = 'regression'
    else:
        classes = distinct
    print('X:\t {} {}\ny:\t {} {}\nclasses: {}\n'.format(X.shape, type(X), y.shape, type(y), classes))
    

#### Plot datasets

In [None]:
def plot_dataset(X, y, **kwargs):
    """Scatter-plot ``y`` against ``X`` in a new figure and show it.

    Parameters
    ----------
    X, y : array-like
        Values for the horizontal and vertical axes.
    **kwargs
        title : str, default ``'default Title'``
            Figure title.
        c : array-like or None, default None
            Per-point values used to colour the markers.
        cmap : matplotlib colormap, optional
            Colormap applied to ``c``. Defaults to a fixed five-colour palette.
    """
    from matplotlib.colors import ListedColormap

    title = kwargs.get('title', 'default Title')
    label = kwargs.get('c', None)

    # A caller-supplied colormap is honoured; previously the keyword was read
    # and then unconditionally overwritten by the palette below.
    cmap = kwargs.get('cmap', None)
    if cmap is None:
        cmap = ListedColormap(['#e92929', '#7cf500', '#006990', '#ffa900', '#55c4f6'])

    plt.figure()
    plt.title(title)
    # Only pass a colormap when there are values to map; without `c` it has no effect.
    plt.scatter(X, y, c=label, marker='o', s=50,
                cmap=cmap if label is not None else None)
    plt.show();

## Synthetic datasets

In [None]:
from sklearn.datasets import make_regression, make_classification, make_blobs

### Dataset for simple regression

In [None]:
# Single-feature synthetic regression data with a constant bias and added noise.
X_R1, y_R1 = make_regression(
    n_samples=100,
    n_features=1,
    n_informative=1,
    bias=150.0,
    noise=30,
    random_state=0,
)

format_check(X_R1, y_R1)
plot_dataset(
    X_R1,
    y_R1,
    title='Sample regression problem with one input variable',
)

### Dataset for more complex regression

In [None]:
from sklearn.datasets import make_friedman1

# Seven-feature Friedman #1 data; only the feature at column index 2 is checked and plotted here.
X_F1, y_F1 = make_friedman1(n_samples=100, n_features=7, random_state=0)

format_check(X_F1[:, 2], y_F1)
plot_dataset(
    X_F1[:, 2],
    y_F1,
    title='Complex regression problem with one input variable',
)

### Dataset for classification (binary) 

In [None]:
# Two-feature, two-class synthetic data (both features informative, one cluster per class).
X_C2, y_C2 = make_classification(n_samples = 100, n_features=2,
                                n_redundant=0, n_informative=2,
                                n_clusters_per_class=1, flip_y = 0.1,
                                class_sep = 0.5, random_state=0)

# Validate the feature matrix against the labels; passing the two feature
# columns instead would report the second feature as a regression target.
format_check(X_C2, y_C2)
plot_dataset(X_C2[:, 0], X_C2[:, 1], c=y_C2, title='Sample binary classification problem with two features')

### Dataset for non-linearly separable classification (binary)

In [None]:
# Eight blobs folded into two classes (even/odd blob index).
X_D2, y_D2 = make_blobs(n_samples=100, n_features=2, centers=8,
                       cluster_std=1.3, random_state=4)
y_D2 = y_D2 % 2

# Validate the feature matrix against the labels, not one feature column against the other.
format_check(X_D2, y_D2)
plot_dataset(X_D2[:,0], X_D2[:,1], c=y_D2, title='Sample binary classification problem with non-linearly separable classes')

### Blobs for classification and clustering

In [None]:
from sklearn.datasets import make_blobs

# Two well-separated blobs of 500 points in total.
X_blob, y_blob = make_blobs(n_samples=500,
                              n_features=2, 
                              centers=2, 
                              cluster_std=1.0,
                              shuffle=True, 
                              random_state=0)

format_check(X_blob, y_blob)
# The title used to describe a one-variable regression problem, which this dataset is not.
plot_dataset(X_blob[:,0], X_blob[:,1], c=y_blob, title='Sample blob dataset with two classes')

### Breast cancer

In [None]:
# Breast cancer dataset for classification
from sklearn.datasets import load_breast_cancer

# Keep the full dataset object as well as the plain (X, y) arrays.
cancer = load_breast_cancer()
X_cancer, y_cancer = load_breast_cancer(return_X_y=True)

format_check(X_cancer, y_cancer)
# Plot the first two feature columns, coloured by class label.
plot_dataset(
    X_cancer[:, 0],
    X_cancer[:, 1],
    c=y_cancer,
    title='Breast Cancer dataset',
)

### Crime

In [None]:
def load_crime_dataset():
    """Load the Communities and Crime dataset for regression.

    Source: https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime+Unnormalized

    Returns
    -------
    tuple
        ``(X_crime, y_crime, columns)``: the first 88 retained columns as a
        DataFrame, the ``ViolentCrimesPerPop`` Series, and the column index of
        the retained frame (which still includes the target column).
    """
    raw = pd.read_table('../../_data/CommViolPredUnnormalizedData.txt', sep=',', na_values='?')

    # remove features with poor coverage or lower relevance, and keep ViolentCrimesPerPop target column
    kept_positions = [5, 6, *range(11, 26), *range(32, 103), 145]
    crime = raw.iloc[:, kept_positions].dropna()

    features = crime.iloc[:, range(88)]
    target = crime['ViolentCrimesPerPop']
    return features, target, crime.columns

X_crime, y_crime, crime_features = load_crime_dataset()

# Work with plain NumPy arrays from here on.
X_crime = X_crime.values
y_crime = y_crime.values

format_check(X_crime, y_crime)
plot_dataset(
    X_crime[:, 0],
    X_crime[:, 1],
    c=y_crime,
    title='Crime dataset',
)

### Dataset fruits

In [None]:
fruits = pd.read_table('../../_data/fruit_data_with_colors.txt')

feature_names_fruits = ['height', 'width', 'mass', 'color_score']
target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']

# Full four-feature view of the data.
X_fruits = fruits[feature_names_fruits].values
y_fruits = fruits['fruit_label'].values

# Two-feature (height, width) view used for the 2-D examples.
X_fruits_2d = fruits[['height', 'width']].values
y_fruits_2d = fruits['fruit_label'].values

format_check(X_fruits_2d, y_fruits_2d)
plot_dataset(
    X_fruits[:, 0],
    X_fruits[:, 1],
    c=y_fruits,
    title='Fruits dataset',
)

***

## K-Nearest Neighbors

### Fruits dataset

#### Train-test split

In [None]:
# Split the four-feature fruit data into train and test parts with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X_fruits, y_fruits, random_state=0)

#### Transform / scale

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Learn the scaling from the training split only ...
X_train_scaled = scaler.fit_transform(X_train)

# ... and apply that same, already-fitted scaling to the test split.
X_test_scaled = scaler.transform(X_test)

#### Train

In [None]:
# k-NN classifier with k=5, trained on the scaled training features.
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train_scaled, y_train)

#### Score

In [None]:
# Report accuracy on the scaled training and test splits.
print(
    '''

Accuracy of K-NN classifier on training set: {:.2f}
Accuracy of K-NN classifier on test set: {:.2f}
'''.format(
        knn.score(X_train_scaled, y_train),
        knn.score(X_test_scaled, y_test),
    )
)

#### Predict

In [None]:
from matplotlib.colors import ListedColormap

# One unseen sample, with values in the same column order as feature_names_fruits.
example_fruit = [[5.5, 2.2, 10, 0.70]]
example_fruit_scaled = scaler.transform(example_fruit)

# The predicted label is shifted by one to index target_names_fruits
# (labels appear to start at 1 - confirm against the data file).
print(
    'Predicted fruit type for ',
    example_fruit,
    ' is ',
    target_names_fruits[knn.predict(example_fruit_scaled)[0] - 1],
)

cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF', '#000000'])

***

### KNN - classification dataset

In [None]:
def plot_two_class_knn(X_mat, y_mat, n_neighbors, weights, X_test, y_test):
    """Fit a k-NN classifier on two features and plot its decision regions.

    Parameters
    ----------
    X_mat, y_mat : array-like
        Training features (the first two columns are plotted) and labels.
    n_neighbors : int
        Number of neighbours for the classifier.
    weights : str
        Passed to ``KNeighborsClassifier`` as ``weights``.
    X_test, y_test : array-like or None
        When ``X_test`` is not None, train and test scores are added to the title.
    """
    import matplotlib.patches as mpatches
    from matplotlib.colors import ListedColormap
    from sklearn.neighbors import KNeighborsClassifier

    clf = KNeighborsClassifier(n_neighbors, weights=weights).fit(X_mat, y_mat)

    # Evaluate the classifier on a dense grid reaching `padding` beyond the data range.
    mesh_step_size, padding = .01, 1
    first, second = X_mat[:, 0], X_mat[:, 1]
    x_ticks = np.arange(first.min()-padding, first.max()+padding, mesh_step_size)
    y_ticks = np.arange(second.min()-padding, second.max()+padding, mesh_step_size)
    xx, yy = np.meshgrid(x_ticks, y_ticks)
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = clf.predict(grid_points).reshape(xx.shape)

    # Light colours for the regions, bold colours for the training points.
    cmap_light = ListedColormap(['#FFFFAA', '#AAFFAA', '#AAAAFF','#EFEFEF'])
    cmap_bold  = ListedColormap(['#FFFF00', '#00FF00', '#0000FF','#000000'])

    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    plot_symbol_size = 50
    plt.scatter(first, second, s=plot_symbol_size, c=y_mat, cmap=cmap_bold, edgecolor = 'black')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

    # Title, extended with scores when a test set is supplied.
    title = "Neighbors = {}".format(n_neighbors)
    if X_test is not None:
        train_score = clf.score(X_mat, y_mat)
        test_score  = clf.score(X_test, y_test)
        title += "\nTrain score = {:.2f}, Test score = {:.2f}".format(train_score, test_score)
    plt.title(title)

    # Legend colours are the first and last entries of cmap_bold.
    legend_handles = [
        mpatches.Patch(color='#FFFF00', label='class 0'),
        mpatches.Patch(color='#000000', label='class 1'),
    ]
    plt.legend(handles=legend_handles)

    plt.xlabel('Feature 0')
    plt.ylabel('Feature 1')
    plt.show();

#### Classification

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)

# Same data, increasing neighbourhood size.
for n_neighbors in (1, 3, 11):
    plot_two_class_knn(X_train, y_train, n_neighbors, 'uniform', X_test, y_test)

### KNN Regression

In [None]:
from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state=0)

# Five-neighbour regressor fitted on the training split.
knnreg = KNeighborsRegressor(n_neighbors=5)
knnreg.fit(X_train, y_train)

print(knnreg.predict(X_test))
print('R-squared test score: {:.3f}'.format(knnreg.score(X_test, y_test)))

In [None]:
# Use every fifth sample of the regression data for this illustration.
X_train, X_test, y_train, y_test = train_test_split(
    X_R1[0::5], y_R1[0::5], random_state=0
)
X_predict_input = np.linspace(-3, 3, 50).reshape(-1, 1)

fig, subaxes = plt.subplots(1, 2, figsize=(8, 4))
for ax, K in zip(subaxes, [1, 3]):
    knnreg = KNeighborsRegressor(n_neighbors=K)
    knnreg.fit(X_train, y_train)
    y_predict_output = knnreg.predict(X_predict_input)

    # Predictions over the input grid, then the training points on top.
    ax.plot(X_predict_input, y_predict_output, '+', markersize=8, label='Predicted', alpha=0.8)
    ax.plot(X_train, y_train, 'o', label='True Value', alpha=0.8)

    ax.set_xlim([-2.5, 0.75])
    ax.set_xlabel('Input feature')
    ax.set_ylabel('Target value')
    ax.set_title('KNN regression (K={})'.format(K))
    ax.legend()
plt.tight_layout();

### Regression model complexity as a function of K

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state=0)
X_predict_input = np.linspace(-3, 3, 500).reshape(-1,1)

fig, subaxes = plt.subplots(2, 3, figsize=(20,10))
subaxes = subaxes.ravel()    # the 2x3 grid of axes is 2-D; flatten it so it can be zipped with K
for ax, K in zip(subaxes, [1, 3, 7, 11, 15, 55]):
    knnreg = KNeighborsRegressor(n_neighbors = K).fit(X_train, y_train)

    y_predict_output = knnreg.predict(X_predict_input)
    train_score = knnreg.score(X_train, y_train)
    test_score = knnreg.score(X_test, y_test)

    # Fitted curve first, then the train and test points.
    ax.plot(X_predict_input, y_predict_output)
    ax.plot(X_train, y_train, 'o', alpha=0.9, label='Train')
    ax.plot(X_test, y_test, '+', alpha=0.9, label='Test')
    ax.set_xlabel('Input feature')
    ax.set_ylabel('Target value')
    ax.set_title('KNN Regression (K={})\nTrain $R^2 = {:.3f}$,  Test $R^2 = {:.3f}$'
                 .format(K, train_score, test_score))
    ax.legend()

# Lay the figure out once, after every panel has been drawn, instead of on each iteration.
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0);

---

## Linear models for regression

### Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
def print_regression(model, coef, intercept, train_score, test_score, **kwargs):
    """Print a summary of a fitted linear regression model.

    Parameters
    ----------
    model : object
        Printed as-is in the report header.
    coef : array-like
        Model coefficients.
    intercept : float
        Model intercept.
    train_score, test_score : float
        R-squared scores on the training and test data.
    **kwargs
        non_zero_coef : optional value printed on the "non-zero coeffs" line.
        features : optional iterable of names paired positionally with ``coef``;
            when given, the coefficients that are non-zero after rounding to
            three decimals are listed by decreasing absolute value.
    """
    non_zero_coef = kwargs.get('non_zero_coef', None)
    features = kwargs.get('features', None)

    report = '''
    model:\n    {}
    linear model coeff (w):      {}\n
    linear model intercept (b):  {:.3f}
    R-squared score (training):  {:.3f}
    R-squared score (test):      {:.3f}
    non-zero coeffs (test):      {}
    '''
    print(report.format(model, coef, intercept, train_score, test_score, non_zero_coef))

    if features is None:
        return

    print('non-zero coeffs sorted by abs weight: ')
    pairs = list(zip(features, np.round(coef, 3)))
    # Largest magnitude first; the sort is stable, so ties keep their input order.
    pairs.sort(key=lambda pair: -abs(pair[1]))
    for name, weight in pairs:
        if weight == 0:
            continue
        print('\t', name, '=', weight)
        
def print_accuracy(title, train_score, test_score):
    """Print ``title`` followed by the train and test accuracy (two decimals)."""
    template = '''
    {}
    Accuracy on training set: {:.2f}
    Accuracy on test set:     {:.2f}'''
    print(template.format(title, train_score, test_score))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state=0)

# Ordinary least squares on the one-feature regression data.
linreg = LinearRegression()
linreg.fit(X_train, y_train)

print_regression(
    linreg,
    linreg.coef_,
    linreg.intercept_,
    linreg.score(X_train, y_train),
    linreg.score(X_test, y_test),
)

### Linear regression: example plot 

In [None]:
plt.figure(figsize=(5, 4))

# Data points, then the fitted line w * x + b in red.
plt.scatter(X_R1, y_R1, marker='o', s=50, alpha=0.8)
plt.plot(X_R1, linreg.coef_ * X_R1 + linreg.intercept_, 'r-')

plt.title('Least-squares linear regression')
plt.xlabel('Feature value (x)')
plt.ylabel('Target value (y)')
plt.show();

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state=0)

# Ordinary least squares on the crime data, reported with feature names.
linreg = LinearRegression()
linreg.fit(X_train, y_train)

print_regression(
    linreg,
    linreg.coef_,
    linreg.intercept_,
    linreg.score(X_train, y_train),
    linreg.score(X_test, y_test),
    features=crime_features,
)

### Ridge regression

In [None]:
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state=0)

linreg = Ridge(alpha=20.0).fit(X_train, y_train)

# Count the non-zero coefficients of the model fitted in this cell; the
# previous reference to `linridge` is not defined until a later cell.
print_regression(linreg, linreg.coef_, linreg.intercept_, linreg.score(X_train, y_train), 
                 linreg.score(X_test, y_test), non_zero_coef=np.sum(linreg.coef_ != 0), features=crime_features)

#### Ridge regression with feature normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state=0)

# Fit the scaler on the training split only, then reuse it for the test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linreg = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

# Count the non-zero coefficients of the model fitted in this cell; the
# previous reference to `linridge` is not defined until a later cell.
print_regression(linreg, linreg.coef_, linreg.intercept_, linreg.score(X_train_scaled, y_train), 
                 linreg.score(X_test_scaled, y_test),  non_zero_coef=np.sum(linreg.coef_ != 0), features=crime_features)

#### Ridge regression with regularization parameter: alpha

In [None]:
print('Ridge regression: effect of alpha regularization parameter\n')

for this_alpha in [0, 1, 10, 20, 50, 100, 1000]:
    
    linridge = Ridge(alpha=this_alpha).fit(X_train_scaled, y_train)
    r2_train = linridge.score(X_train_scaled, y_train)
    r2_test = linridge.score(X_test_scaled, y_test)
    
    # Number of coefficients whose magnitude exceeds 1.0.
    num_coeff_bigger = np.sum(abs(linridge.coef_) > 1.0)
    
    # Report the loop variable `this_alpha` (a bare `alpha` is undefined at this
    # point) and label the count for what it measures.
    print('''
    Alpha = {:.2f}
    - |coeff| > 1.0:   {}
    - r2 training:     {:.2f}
    - r2 test:         {:.2f}'''.format(this_alpha, num_coeff_bigger, r2_train, r2_test))

### Lasso regression

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                   random_state = 0)

# Fit the scaler on the training split only, then reuse it for the test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linreg = Lasso(alpha=2.0, max_iter = 10000).fit(X_train_scaled, y_train)

# Count the non-zero coefficients of this Lasso model, not of the last Ridge
# model left in `linridge` by an earlier cell.
print_regression(linreg, linreg.coef_, linreg.intercept_, linreg.score(X_train_scaled, y_train), 
                 linreg.score(X_test_scaled, y_test),  non_zero_coef=np.sum(linreg.coef_ != 0), features=crime_features)

#### Lasso regression with regularization parameter: alpha

In [None]:
print(
    'Lasso regression: effect of alpha regularization\n'
    'parameter on number of features kept in final model\n'
)

# Refit on the scaled crime data for each alpha and report how many coefficients survive.
for alpha in [0.5, 1, 2, 3, 5, 10, 20, 50]:
    linlasso = Lasso(alpha=alpha, max_iter=10000)
    linlasso.fit(X_train_scaled, y_train)
    r2_train = linlasso.score(X_train_scaled, y_train)
    r2_test = linlasso.score(X_test_scaled, y_test)

    print(
        '''
    Alpha = {:.2f}
    - non-zero coeffs: {}
    - r2 training:     {:.2f}
    - r2 test:         {:.2f}'''.format(
            alpha, np.sum(linlasso.coef_ != 0), r2_train, r2_test
        )
    )

### Polynomial regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures

#### Linear regression

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_F1, y_F1, random_state=0)
linreg = LinearRegression().fit(X_train, y_train)

# Count the non-zero coefficients of the model fitted here, not of the
# unrelated `linridge` model left over from an earlier section.
print_regression(linreg, linreg.coef_, linreg.intercept_, linreg.score(X_train, y_train), 
                 linreg.score(X_test, y_test),  non_zero_coef=np.sum(linreg.coef_ != 0))

#### Polynomial features (degree 2)

In [None]:
# Expand the Friedman features with degree-2 polynomial terms.
poly = PolynomialFeatures(degree=2)
X_F1_poly = poly.fit_transform(X_F1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1, random_state=0)
linreg = LinearRegression().fit(X_train, y_train)

# Count the non-zero coefficients of the model fitted here, not of the
# unrelated `linridge` model left over from an earlier section.
print_regression(linreg, linreg.coef_, linreg.intercept_, linreg.score(X_train, y_train), 
                 linreg.score(X_test, y_test),  non_zero_coef=np.sum(linreg.coef_ != 0))

#### Ridge Polynomial (degree 2)

Addition of many polynomial features often leads to overfitting, so we often use polynomial features in combination with regression that has a regularization penalty, like ridge regression.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1, random_state=0)
linreg = Ridge().fit(X_train, y_train)

# Count the non-zero coefficients of the model fitted here, not of the
# unrelated `linridge` model left over from an earlier section.
print_regression(linreg, linreg.coef_, linreg.intercept_, linreg.score(X_train, y_train), 
                 linreg.score(X_test, y_test),  non_zero_coef=np.sum(linreg.coef_ != 0))

## Linear models for classification

### Logistic regression

#### Logistic regression for binary classification on fruits dataset using height, width features (positive class: apple, negative class: others)

In [None]:
from sklearn.linear_model import LogisticRegression
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

In [None]:
# Convert to binary problem: apples=1, other=0
# (the boolean mask is multiplied by 1 to turn True/False into 1/0)
y_fruits_apple = (y_fruits_2d == 1) * 1

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_apple, random_state=0)

clf = LogisticRegression(C=100)
clf.fit(X_train, y_train)

# Classify two hand-picked (height, width) pairs; label 1 maps to 'apple'.
for h, w in [(6, 8), (10, 7)]:
    predicted = ['other', 'apple'][clf.predict([[h, w]])[0]]
    print('height {:2}, width {} predicted fruit: {}'.format(h, w, predicted))

title = 'Logistic regression, binary fruit dataset'
print_accuracy(title, clf.score(X_train, y_train), clf.score(X_test, y_test))

In [None]:
# Single panel showing the class regions of the apple-vs-others classifier on the training split.
fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
title = 'Logistic regression for binary classification\nFruit dataset: Apple vs others'
plot_class_regions_for_classifier_subplot(
    clf, X_train, y_train, None, None, title, subaxes
)
subaxes.set_xlabel('height')
subaxes.set_ylabel('width');

#### Logistic regression on simple synthetic dataset

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)

# Logistic regression with the estimator's default settings.
clf = LogisticRegression()
clf.fit(X_train, y_train)

title = 'Logistic regression, simple synthetic dataset C = {:.3f}'.format(1.0)
print_accuracy(
    title,
    clf.score(X_train, y_train),
    clf.score(X_test, y_test),
)

In [None]:
fig, subaxes = plt.subplots(1, 1, figsize=(6, 6))

# Refit on the current training split, then draw the class regions.
clf = LogisticRegression()
clf.fit(X_train, y_train)

title = 'Logistic regression, simple synthetic dataset C = {:.3f}'.format(1.0)
plot_class_regions_for_classifier_subplot(
    clf, X_train, y_train, None, None, title, subaxes
)

#### Logistic regression regularization: C hyperparameter
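- `C` controls the regularization penalty on the coefficient weights: it is the inverse of the regularization strength, so smaller values of `C` mean stronger regularization.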

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_apple, random_state=0)

fig, subaxes = plt.subplots(1, 3, figsize=(15, 5))

# One panel per value of C, each showing train and test points.
for C, subplot in zip([0.1, 1, 100], subaxes):
    clf = LogisticRegression(C=C)
    clf.fit(X_train, y_train)
    title = 'Logistic regression (apple vs rest), C = {:.3f}'.format(C)

    plot_class_regions_for_classifier_subplot(
        clf, X_train, y_train, X_test, y_test, title, subplot
    )
plt.tight_layout()

#### Application to real dataset

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=0)

# Logistic regression with default settings on the unscaled breast cancer features.
clf = LogisticRegression()
clf.fit(X_train, y_train)

title = 'Breast cancer dataset'
print_accuracy(
    title,
    clf.score(X_train, y_train),
    clf.score(X_test, y_test),
)

### Support Vector Machines

#### Linear Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
# from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)

fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))

# Support vector classifier with a linear kernel.
C = 1.0
clf = SVC(kernel='linear', C=C)
clf.fit(X_train, y_train)

title = 'Linear SVC, C = {:.3f}'.format(C)
plot_class_regions_for_classifier_subplot(
    clf, X_train, y_train, None, None, title, subaxes
)

#### Linear Support Vector Machine: C parameter

In [None]:
# Imported here because this cell runs before the later cell that imports LinearSVC;
# without it a fresh-kernel run fails with a NameError.
from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)
fig, subaxes = plt.subplots(1, 3, figsize=(15, 5))

# One panel per value of C.
for C, subplot in zip([0.00001, 0.1, 100], subaxes):
    clf = LinearSVC(C=C).fit(X_train, y_train)
    title = 'Linear SVC, C = {:.5f}'.format(C)
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None, None, title, subplot)
plt.tight_layout()

#### Application to real dataset

In [None]:
from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=0)

# LinearSVC with default settings on the unscaled breast cancer features.
clf = LinearSVC()
clf.fit(X_train, y_train)

title = 'Breast cancer dataset'
print_accuracy(
    title,
    clf.score(X_train, y_train),
    clf.score(X_test, y_test),
)

### Multi-class classification with linear models

#### LinearSVC with M classes generates M one vs rest classifiers.

In [None]:
from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_2d, random_state=0)

# Multi-class fit on the (height, width) fruit features.
clf = LinearSVC(C=5, random_state=67)
clf.fit(X_train, y_train)

# One row of coefficients and one intercept per class are printed.
print('Coefficients:\n', clf.coef_)
print('Intercepts:\n', clf.intercept_)

#### Multi-class results on the fruit dataset

In [None]:
plt.figure(figsize=(6, 6))
colors = ['r', 'g', 'b', 'y']
cmap_fruits = ListedColormap(['#FF0000', '#00FF00', '#0000FF', '#FFFF00'])

plt.scatter(
    X_fruits_2d[:, 0],
    X_fruits_2d[:, 1],
    c=y_fruits_2d,
    cmap=cmap_fruits,
    edgecolor='black',
    alpha=.7,
)

x_0_range = np.linspace(-10, 15)

# Draw one boundary line per row of clf.coef_ / clf.intercept_.
for w, b, color in zip(clf.coef_, clf.intercept_, colors):
    # A linear model scores a point as w_0 x_0 + w_1 x_1 + b and its decision
    # boundary is where that score is 0, so solving for x_1 gives
    # x_1 = -(w_0 x_0 + b) / w_1.
    plt.plot(x_0_range, -(x_0_range * w[0] + b) / w[1], c=color, alpha=.8)

plt.legend(target_names_fruits)
plt.xlabel('height')
plt.ylabel('width')
plt.xlim(-2, 12)
plt.ylim(-2, 15)
plt.show();

## Kernelized Support Vector Machines

### Classification

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)

fig, subaxes = plt.subplots(1, 3, figsize=(18, 6))

# Default-kernel SVC next to polynomial kernels of degree 2 and 4.
models = [SVC(), SVC(kernel = 'poly', degree=2), SVC(kernel = 'poly', degree=4)]
titles = ['SVC - RBF kernel' , 'SVC - Polynomial kernel degree 2', 'SVC - Polynomial kernel degree 4']

for model, title, ax in zip(models, titles, subaxes):
    clf = model.fit(X_train, y_train)
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None, None, title, ax)

# Lay the figure out once, after all panels are drawn, instead of on every iteration.
plt.tight_layout();

#### Support Vector Machine with RBF kernel: gamma parameter

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)
fig, subaxes = plt.subplots(1, 3, figsize=(15, 5))

# One panel per gamma value.
for this_gamma, ax in zip([0.01, 1.0, 10.0], subaxes):
    clf = SVC(kernel = 'rbf', gamma=this_gamma).fit(X_train, y_train)
    title = 'SVC: \nRBF kernel, gamma = {:.2f}'.format(this_gamma)
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None, None, title, ax)

# Lay the figure out once, after all panels are drawn, instead of on every iteration.
plt.tight_layout();

#### Support Vector Machine with RBF kernel: using both C and gamma parameter 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)
fig, subaxes = plt.subplots(3, 4, figsize=(20, 16), dpi=50)

# Rows iterate over gamma, columns over C.
for gamma, axis in zip([0.01, 1, 5], subaxes):
    for C, ax in zip([0.1, 1, 15, 250], axis):
        title = 'gamma = {:.2f}, C = {:.2f}'.format(gamma, C)
        clf = SVC(kernel = 'rbf', gamma=gamma, C=C).fit(X_train, y_train)
        plot_class_regions_for_classifier_subplot(clf, X_train, y_train, X_test, y_test, title,ax)

# Lay the figure out once for the whole grid, not once per panel.
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

### Application of SVMs to a real dataset: unnormalized data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=0)

# SVC fitted directly on the raw (unscaled) feature values.
clf = SVC(C=10)
clf.fit(X_train, y_train)

print_accuracy(
    'Breast cancer dataset (unnormalized features)',
    clf.score(X_train, y_train),
    clf.score(X_test, y_test),
)

### Application of SVMs to a real dataset: normalized data with feature preprocessing using minmax scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the current training split, then reuse it for the test split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = SVC(C=10)
clf.fit(X_train_scaled, y_train)

title = 'Breast cancer dataset (normalized with MinMax scaling)'
print_accuracy(
    title,
    clf.score(X_train_scaled, y_train),
    clf.score(X_test_scaled, y_test),
)

## Cross-validation

### Example based on k-NN classifier with fruit dataset (2 features)

In [None]:
from sklearn.model_selection import cross_val_score

clf = KNeighborsClassifier(n_neighbors = 5)
X = X_fruits_2d
y = y_fruits_2d
# Request 3 folds explicitly so the run matches the "(3-fold)" labels printed
# below regardless of the library's default number of folds.
cv_scores = cross_val_score(clf, X, y, cv=3)

print('Cross-validation scores (3-fold):', cv_scores)
print('Mean cross-validation score (3-fold): {:.3f}'
     .format(np.mean(cv_scores)))

### A note on performing cross-validation for more advanced scenarios.

In some cases (e.g. when feature values have very different ranges), we've seen the need to scale or normalize the training and test sets before use with a classifier. The proper way to do cross-validation when you need to scale the data is *not* to scale the entire dataset with a single transform, since this will indirectly leak information into the training data about the whole dataset, including the test data (see the lecture on data leakage later in the course).  Instead, scaling/normalizing must be computed and applied for each cross-validation fold separately.  To do this, the easiest way in scikit-learn is to use *pipelines*.  While these are beyond the scope of this course, further information is available in the scikit-learn documentation here:

http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

or the Pipeline section in the recommended textbook: Introduction to Machine Learning with Python by Andreas C. Müller and Sarah Guido (O'Reilly Media).

## Validation curve example

In [None]:
from sklearn.model_selection import validation_curve

# Four gamma values, logarithmically spaced from 1e-3 to 1e3.
param_range = np.logspace(-3, 3, 4)

# Score an SVC for each gamma with 3-fold cross-validation on X, y.
train_scores, test_scores = validation_curve(
    SVC(),
    X,
    y,
    param_name='gamma',
    param_range=param_range,
    cv=3,
)

In [None]:
# Training scores returned by validation_curve for the gamma sweep.
print(train_scores)

In [None]:
# Cross-validation (held-out) scores returned by validation_curve for the gamma sweep.
print(test_scores)

In [None]:
# This code based on scikit-learn validation_plot example
#  See:  http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
plt.figure(figsize=(6,6))

# Mean and spread across the cross-validation folds (axis 1) for each gamma.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title('Validation Curve with SVM')
# Raw string: '\g' is not a valid Python escape sequence and triggers a warning otherwise.
plt.xlabel(r'$\gamma$ (gamma)')
plt.ylabel('Score')
plt.ylim(0.0, 1.1)
lw = 2

plt.semilogx(param_range, train_scores_mean, label='Training score',
            color='darkorange', lw=lw)

# Shaded band: one standard deviation either side of the mean.
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                train_scores_mean + train_scores_std, alpha=0.2,
                color='darkorange', lw=lw)

plt.semilogx(param_range, test_scores_mean, label='Cross-validation score',
            color='navy', lw=lw)

plt.fill_between(param_range, test_scores_mean - test_scores_std,
                test_scores_mean + test_scores_std, alpha=0.2,
                color='navy', lw=lw)

plt.legend(loc='best')
plt.show();