In [1]:
from sklearn import datasets
# Load the iris dataset and check the shape of its feature matrix
iris = datasets.load_iris()
data = iris.data
data.shape

(150, 4)

In [3]:
# Load the digits dataset; `images` stores each sample as a 2-D array,
# so the shape is 3-D (samples, rows, columns)
digits = datasets.load_digits()
digits.images.shape

(1797, 8, 8)

### Reshape digits dataset to (n_samples, n_features)

In [4]:
# Flatten every image into one feature vector; -1 lets NumPy infer the
# number of features per sample from the image size
data = digits.images.reshape((digits.images.shape[0], -1))

## Supervised learning: predicting an output variable from high-dimensional observations 

In [5]:
import numpy as np
from sklearn import datasets

# Feature matrix and class labels of the iris dataset
iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
# Distinct class labels present in the target
np.unique(iris_y)

array([0, 1, 2])

###  k-Nearest neighbors classifier

In [10]:
# Shuffle the sample indices reproducibly, then keep the final 10 shuffled
# samples for testing and everything before them for training.
np.random.seed(0)
indices = np.random.permutation(len(iris_X))
train_idx, test_idx = indices[:-10], indices[-10:]
iris_X_train, iris_y_train = iris_X[train_idx], iris_y[train_idx]
iris_X_test, iris_y_test = iris_X[test_idx], iris_y[test_idx]

In [11]:
# Create and fit a nearest-neighbor classifier
from sklearn.neighbors import KNeighborsClassifier
# No arguments: the estimator is built with its default hyper-parameters
knn = KNeighborsClassifier()
knn.fit(iris_X_train, iris_y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')

In [12]:
# Predicted classes for the 10 held-out iris samples
knn.predict(iris_X_test)

array([1, 2, 1, 0, 0, 0, 2, 1, 2, 0])

In [13]:
# True classes of the same held-out samples, for comparison with the predictions
iris_y_test

array([1, 1, 1, 0, 0, 0, 2, 1, 2, 0])

## Linear model: from regression to sparsity 

### Diabetes dataset

In [14]:
diabetes = datasets.load_diabetes()
# Keep the dataset order: the last 20 samples are held out for testing,
# everything before them is used for training.
diabetes_X_train, diabetes_X_test = diabetes.data[:-20], diabetes.data[-20:]
diabetes_y_train, diabetes_y_test = diabetes.target[:-20], diabetes.target[-20:]

### Linear regression 

In [15]:
from sklearn import linear_model
# Fit a linear regression on the diabetes training split
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)

LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [16]:
# Coefficients learned by the fitted linear model
print(regr.coef_)

[  3.03499549e-01  -2.37639315e+02   5.10530605e+02   3.27736980e+02
  -8.14131709e+02   4.92814588e+02   1.02848452e+02   1.84606489e+02
   7.43519617e+02   7.60951722e+01]


In [20]:
# Root mean square error on the held-out diabetes samples:
# prediction residuals -> squared -> averaged -> square root
residuals = regr.predict(diabetes_X_test) - diabetes_y_test
np.sqrt(np.mean(residuals ** 2))

44.772397776864963

In [23]:
# Coefficient of determination (R^2) of the predictions on the test split,
# also referred to as the explained-variance score
# 1 - perfect prediction
# 0 - no linear relationship
regr.score(diabetes_X_test, diabetes_y_test)

0.58507530226905713

In [27]:
# Two training inputs arranged as a (2, 1) column, with their two targets
X = np.c_[ .5, 1].T
y = [.5, 1]
# Two inputs at which the fitted line is evaluated when plotting
test = np.c_[ 0, 2].T
# Fresh estimator; it is refit on every noisy sample in the plotting loop
regr = linear_model.LinearRegression()

In [30]:
import pylab as pl
# Open a new figure
pl.figure()

<matplotlib.figure.Figure at 0x1073f1cd0>

In [31]:
# Refit the model on six noisy copies of the two training inputs and draw
# each fitted line together with the perturbed points it was trained on.
np.random.seed(0)
for _ in range(6):
    noise = np.random.normal(size=(2, 1))
    this_X = X + .1 * noise
    regr.fit(this_X, y)
    pl.plot(test, regr.predict(test))
    pl.scatter(this_X, y, s=3)

#### Ridge Regression

In [32]:
# Repeat the noisy-refit experiment with a ridge-penalised model
regr = linear_model.Ridge(alpha=.1)
pl.figure()
np.random.seed(0)  # fixed seed so the noise draws are reproducible
for _ in range(6):
    this_X = X + .1*np.random.normal(size=(2,1))
    regr.fit(this_X, y)
    line = regr.predict(test)
    pl.plot(test, line)
    pl.scatter(this_X, y, s=3)

# NOTE: example of bias/variance tradeoff

##### Choose alpha to minimize the left-out error

In [33]:
# A __future__ import has to be the first statement of its compilation unit.
# IPython tolerates it mid-cell only because it compiles statements one by
# one; exported as a script the original ordering is a SyntaxError.
from __future__ import print_function

# Sweep the ridge penalty over six log-spaced values from 1e-4 to 1e-1.
# For each value: set alpha on the existing `regr` estimator, refit on the
# diabetes training split and score on the held-out split.
alphas = np.logspace(-4, -1, 6)
print([regr.set_params(alpha=alpha)
       .fit(diabetes_X_train, diabetes_y_train)
       .score(diabetes_X_test, diabetes_y_test) for alpha in alphas])


[0.58511106838835325, 0.58520730154446765, 0.58546775406984919, 0.58555120365039159, 0.58307170855541623, 0.570589994372801]


##### Terms
 - Bias/variance trade-off
 - Regularization
 - Overfitting

#### Sparsity: LASSO 

### Classification 

In [34]:
# A large C means weak regularization
logistic = linear_model.LogisticRegression(C=1e5)
logistic.fit(iris_X_train, iris_y_train)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, penalty='l2',
          random_state=None, tol=0.0001)

#### Multiclass classification 

If you have several classes to predict, an option often used is to fit one-versus-all classifiers and then use a voting heuristic for the final decision.

#### Shrinkage and sparsity with logistic regression

The C parameter controls the amount of regularization in the LogisticRegression object: a large value for C results in less regularization. penalty="l2" gives Shrinkage (i.e. non-sparse coefficients), while penalty="l1" gives Sparsity.

####  Exercise 1: Classifying the digits dataset

Try classifying the digits dataset with nearest neighbors and a linear model. Leave out the last 10% and test prediction performance on these observations.

In [35]:
from sklearn import datasets, neighbors, linear_model

# Feature matrix and target labels of the digits dataset
digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

##### Nearest neighbors implementation

In [38]:
# Shuffle reproducibly, then hold out the final 10 shuffled samples.
# NOTE(review): the exercise text asks for the last 10% to be left out, but
# this holds out 10 samples. Confirm which is intended before changing it;
# the recorded outputs in this section correspond to the 10-sample split.
np.random.seed(0)
indices = np.random.permutation(len(X_digits))
train_idx, test_idx = indices[:-10], indices[-10:]
X_digits_train, y_digits_train = X_digits[train_idx], y_digits[train_idx]
X_digits_test, y_digits_test = X_digits[test_idx], y_digits[test_idx]

In [43]:
# Create and fit a nearest-neighbor classifier
from sklearn.neighbors import KNeighborsClassifier
# Rebinds `knn`: from here on it is the classifier trained on the digits split
knn = KNeighborsClassifier()
knn.fit(X_digits_train, y_digits_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')

In [44]:
# Keep the predictions for the held-out digits so they can be inspected later
y_digits_pred = knn.predict(X_digits_test)

In [45]:
# Predicted labels for the held-out digits
y_digits_pred

array([1, 4, 8, 4, 5, 3, 3, 7, 7, 8])

In [46]:
# True labels of the held-out digits
y_digits_test

array([1, 4, 8, 4, 5, 3, 3, 7, 7, 8])

In [47]:
# `knn.score` expects the feature matrix and the true labels. Passing the
# predicted labels as its first argument raises a ValueError about mismatched
# dimensions. The accuracy of already-computed predictions is simply the
# fraction of predictions that equal the true labels.
np.mean(y_digits_pred == y_digits_test)

In [49]:
# Mean accuracy on the held-out digits (fraction of correctly classified samples)
knn.score(X_digits_test, y_digits_test)

1.0

#####  Logistic Regression

In [56]:
# Weakly regularized (large C) logistic regression, trained on the digits split
logistic = linear_model.LogisticRegression(C=1e5)
logistic.fit(X_digits_train, y_digits_train)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, penalty='l2',
          random_state=None, tol=0.0001)

In [60]:
# Mean accuracy of the logistic model on the held-out digits
logistic.score(X_digits_test, y_digits_test)

0.90000000000000002

### Support Vector Machine

#### Exercise 2: Use SVC 

In [82]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, svm

In [61]:
# Reload iris; note that this rebinds the global names X and y used earlier
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [72]:
# Keep only the samples of classes 1 and 2 (drop class 0) and only the
# first two feature columns.
keep = y != 0
X = X[keep, :2]
y = y[keep]

In [78]:
# Shuffle samples and labels with the same reproducible permutation.
n_sample = len(X)
np.random.seed(0)
order = np.random.permutation(n_sample)
X = X[order]
# `np.float` was only an alias of the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use the builtin directly.
y = y[order].astype(float)

In [81]:
# Slice bounds must be integers. A float bound such as `.9 * n_sample` only
# produced a deprecation warning on old NumPy and is a TypeError on current
# releases, so truncate the 90% mark to an int once and reuse it.
n_train = int(.9 * n_sample)
X_train = X[:n_train]
y_train = y[:n_train]
X_test = X[n_train:]
y_test = y[n_train:]


In [86]:
# Fit one SVC per kernel and draw its decision regions over the data.
kernels = ('linear', 'rbf', 'poly')

# The evaluation grid depends only on the data, so build it once up front:
# 200 x 200 points spanning the range of the two plotted features.
(x_min, y_min), (x_max, y_max) = X[:, :2].min(axis=0), X[:, :2].max(axis=0)
XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
grid_points = np.c_[XX.ravel(), YY.ravel()]

for fig_num, kernel in enumerate(kernels):
    clf = svm.SVC(kernel=kernel, gamma=10)
    clf.fit(X_train, y_train)

    # One figure per kernel, cleared in case it already exists
    plt.figure(fig_num)
    plt.clf()

    # All samples, coloured by class
    plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired)
    # Circle out the test data
    plt.scatter(X_test[:, 0], X_test[:, 1], s=80, facecolors='none', zorder=10)
    plt.axis('tight')

    # Decision values on the grid, put back into the grid layout
    Z = clf.decision_function(grid_points).reshape(XX.shape)

    # Colour the two sides of the boundary, then overlay the boundary itself
    # (solid, level 0) and the -.5 / .5 levels (dashed)
    plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
    plt.contour(XX, YY, Z, colors=['k', 'k', 'k'],
                linestyles=['--', '-', '--'], levels=[-.5, 0, .5])
    plt.title(kernel)

plt.show()
    
    