In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the result of every expression statement in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Train a perceptron with scikit-learn.
# Only two Iris features are kept: columns 2 and 3 (petal length and petal width).
from sklearn import datasets
import numpy as np

iris = datasets.load_iris()
X, y = iris.data[:, [2, 3]], iris.target

In [3]:
# Class labels are already stored as the integers 0, 1, 2, which many machine
# learning libraries recommend.
np.unique(y)

array([0, 1, 2])

In [4]:
# Hold out 30% of the samples as a test set; random_state fixes the shuffle.
try:
    # current scikit-learn location of train_test_split
    from sklearn.model_selection import train_test_split
except ImportError:
    # fallback for old releases that only ship the since-removed cross_validation module
    from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state=0)

# Standardize the features with StandardScaler from scikit-learn's preprocessing module.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
# The test set is transformed with the scaler fitted on the training set, so
# both sets share the same scaling parameters and stay comparable.
X_test_std = sc.transform(X_test)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [5]:
from sklearn.linear_model import Perceptron
# Create a new perceptron object and train it with the fit method; random_state
# makes the shuffling of the training data after each epoch reproducible.
try:
    # Newer scikit-learn replaced `n_iter` with `max_iter`; tol=None disables
    # early stopping so exactly 40 epochs are run, as `n_iter=40` did.
    ppn = Perceptron(max_iter=40, tol=None, eta0=0.1, random_state=0)
except TypeError:
    # Older scikit-learn releases that do not know `max_iter` / `tol`.
    ppn = Perceptron(n_iter=40, eta0=0.1, random_state=0)
ppn.fit(X_train_std, y_train)

Perceptron(alpha=0.0001, class_weight=None, eta0=0.1, fit_intercept=True,
      n_iter=40, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)

In [6]:
# A fitted scikit-learn estimator produces class labels through its predict method.
y_pred = ppn.predict(X_test_std)
n_misclassified = (y_test != y_pred).sum()  # 4 of the 45 test samples
print("Misclassified Samples: %d" % n_misclassified)

Misclassified Samples: 4


In [7]:
# The metrics module offers a large variety of performance metrics; here we
# compute the classification accuracy of the perceptron on the test set.
from sklearn.metrics import accuracy_score
ppn_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % ppn_accuracy)  # the print is optional in a notebook

Accuracy: 0.91


In [8]:
# We can use the plot_decision_regions functions we created to plot the classification
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import warnings


def versiontuple(v):
    """Convert a dotted version string into a tuple of ints for comparison.

    Only the leading digits of each dot-separated component are used, and
    parsing stops at the first component that carries a non-numeric suffix
    (or has no leading digits at all). Pre-release / development strings such
    as '1.20.0rc1' or '1.9.0.dev0+abc' therefore yield (1, 20, 0) and
    (1, 9, 0) instead of raising ValueError.

    Parameters
    ----------
    v : str
        Version string, e.g. '1.9.0'.

    Returns
    -------
    tuple of int
    """
    import re  # local import keeps the helper self-contained

    parts = []
    for piece in v.split("."):
        match = re.match(r"(\d+)(.*)", piece.strip())
        if match is None:
            # component such as 'dev0+abc': nothing numeric left to compare
            break
        parts.append(int(match.group(1)))
        if match.group(2):
            # component such as '0rc1': keep the number, ignore the rest
            break
    return tuple(parts)


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot a classifier's decision regions over two features, plus the samples.

    Parameters
    ----------
    X : 2-D array
        Feature matrix; columns 0 and 1 are used as the plot axes.
    y : 1-D array
        Class labels aligned with the rows of X (at most 5 distinct labels,
        one per available marker/color).
    classifier : object
        Anything whose ``predict`` accepts an (n_points, 2) array.
    test_idx : iterable of int, optional
        Row indices into X to circle as test samples.
    resolution : float
        Step of the grid on which the classifier is evaluated.
    """
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface: predict on a grid padded by 1 around the data
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot the samples, one scatter call per class
    for idx, cl in enumerate(classes):
        # `color=` rather than `c=<RGBA tuple>`: a bare RGBA tuple passed as `c`
        # is treated as per-point values when a class has exactly 4 samples.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, color=colors[idx],
                    marker=markers[idx], label=cl)

    # highlight test samples
    if test_idx is not None:
        # A list works as an index on every NumPy version and has an
        # unambiguous truth value (unlike a NumPy array).
        test_idx = list(test_idx)
    if test_idx:
        if not versiontuple(np.__version__) >= versiontuple('1.9.0'):
            warnings.warn('Please update to NumPy 1.9.0 or newer')
        X_test = X[test_idx, :]

        # hollow circles: facecolors='none' replaces the empty color string
        # `c=''`, which current matplotlib rejects
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    facecolors='none',
                    edgecolors='black',
                    alpha=1.0,
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')

In [9]:
# Stack the standardized training rows first and the test rows last, so the
# indices 105..149 passed as test_idx select the test samples.
X_combined_std = np.concatenate((X_train_std, X_test_std), axis=0)
y_combined = np.concatenate((y_train, y_test))

plot_decision_regions(
    X=X_combined_std,
    y=y_combined,
    classifier=ppn,
    test_idx=range(105, 150),
)
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')

plt.tight_layout()
# plt.savefig('./figures/iris_perceptron_scikit.png', dpi=300)
plt.show()

<matplotlib.text.Text at 0x7fce640d3208>

<matplotlib.text.Text at 0x7fce640e58d0>

<matplotlib.legend.Legend at 0x7fce70914e80>

In [10]:
# Modelling class probabilities using logistic regression
import matplotlib.pyplot as plt
import numpy as np

# To see what a sigmoid function looks like
def sigmoid(z):
    """Logistic sigmoid 1 / (1 + exp(-z)), evaluated element-wise via NumPy."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

# Evaluate the sigmoid for z from -7 up to (but excluding) 7 in steps of 0.1 and plot it.
z = np.arange(-7, 7, 0.1)
phi_z = sigmoid(z)

plt.plot(z, phi_z)
plt.axvline(0.0, color='k')  # vertical line at z = 0
plt.ylim(-0.1, 1.1)
plt.xlabel('z')
# Raw string: '\p' is not a valid escape sequence, so a plain literal triggers
# an invalid-escape warning on recent Python versions. The text is unchanged.
plt.ylabel(r'$\phi (z)$')

# y axis ticks and gridline
plt.yticks([0.0, 0.5, 1.0])
ax = plt.gca()
ax.yaxis.grid(True)

plt.tight_layout()
# plt.savefig('./figures/sigmoid.png', dpi=300)
plt.show()

[<matplotlib.lines.Line2D at 0x7fce640449e8>]

<matplotlib.lines.Line2D at 0x7fce5c081a20>

(-0.1, 1.1)

<matplotlib.text.Text at 0x7fce5c0852b0>

<matplotlib.text.Text at 0x7fce5c095860>

([<matplotlib.axis.YTick at 0x7fce64019198>,
  <matplotlib.axis.YTick at 0x7fce5c081f60>,
  <matplotlib.axis.YTick at 0x7fce5c095128>],
 <a list of 3 Text yticklabel objects>)

In [11]:
# scikit-learn's logistic regression is highly optimized and supports
# multiclass settings off the shelf.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1000, random_state=0)

# Train on the standardized training set.
lr.fit(X_train_std, y_train)

# Test-set accuracy: about 0.98 here versus 0.91 for the perceptron above.
accuracy_score(y_test, lr.predict(X_test_std))

LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

0.97777777777777775

In [12]:
# Decision regions of the logistic regression model on the combined standardized
# data; rows 105-149 (passed as test_idx) are marked as the test set.
plot_decision_regions(X_combined_std, y_combined,
                      classifier=lr, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/logistic_regression.png', dpi=300)
plt.show()

<matplotlib.text.Text at 0x7fce640210f0>

<matplotlib.text.Text at 0x7fce5400d8d0>

<matplotlib.legend.Legend at 0x7fce70914160>

In [13]:
# Class-membership probabilities are available through predict_proba. It expects
# a 2-D array (one row per sample), so the single test sample is reshaped to
# (1, n_features) instead of being passed as a 1-D array.
lr.predict_proba(X_test_std[0, :].reshape(1, -1))



array([[  2.05743774e-11,   6.31620264e-02,   9.36837974e-01]])

In [14]:
# Tackling overfitting via regularization
# Regularization introduces additional information (bias) to penalize extreme parameter weights.
# The most common form is L2 regularization (also called L2 shrinkage or weight decay); it adds a
# penalty term, scaled by a regularization parameter lambda, to the logistic regression cost function.
# scikit-learn's C is the inverse of that parameter (C = 1/lambda): decreasing C increases the
# regularization strength.
# We visualize this by plotting the L2 regularization path of the two weight coefficients.
weights, params = [], []
for c in np.arange(-5, 5):
    # Float base: NumPy refuses to raise an integer to a negative integer power.
    C_value = 10.0**c
    lr = LogisticRegression(C=C_value, random_state=0)
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])  # coefficients of the class at index 1
    params.append(C_value)

weights = np.array(weights)
plt.plot(params, weights[:, 0],
         label='petal length')
plt.plot(params, weights[:, 1], linestyle='--',
         label='petal width')
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.legend(loc='upper left')
plt.xscale('log')
# plt.savefig('./figures/regression_path.png', dpi=300)
plt.show()

LogisticRegression(C=1.0000000000000001e-05, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression(C=0.10000000000000001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

[<matplotlib.lines.Line2D at 0x7fce4405e438>]

[<matplotlib.lines.Line2D at 0x7fce54013860>]

<matplotlib.text.Text at 0x7fce440a42e8>

<matplotlib.text.Text at 0x7fce44099eb8>

<matplotlib.legend.Legend at 0x7fce44133dd8>

In [15]:
# How does the test-set accuracy change with the inverse regularization strength C?
score = []
for c in np.arange(-5, 5):
    # Float base: NumPy refuses to raise an integer to a negative integer power.
    lr = LogisticRegression(C=10.0**c, random_state=0)
    lr.fit(X_train_std, y_train)
    score.append(accuracy_score(y_test, lr.predict(X_test_std)))
    
# np.hstack(np.arange(-5, 5), np.array(score)) fails because hstack takes ONE
# sequence of arrays, e.g. np.hstack((a, b)); the two arrays are shown separately.

np.arange(-5, 5)
np.array(score) 

LogisticRegression(C=1.0000000000000001e-05, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression(C=0.10000000000000001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

array([-5, -4, -3, -2, -1,  0,  1,  2,  3,  4])

array([ 0.6       ,  0.6       ,  0.6       ,  0.6       ,  0.6       ,
        0.8       ,  0.95555556,  0.97777778,  0.97777778,  0.97777778])

In [16]:
# Maximum margin classification with support vector machines
# SVM can be considered to be an extension of the perceptron. While in perceptron, we minimized the misclassification errors,
# in SVM, we will aim to maximize the margin
# Margin: the distance between the separating hyperplane and the training samples closest to this hyperplane (called support vectors)
# the idea is that models with large margins have lower generalization errors and are less prone to overfitting than models
# with small margins. The margin is the objective function and we maximize it under the constraint that the samples are 
# classified correctly

In [17]:
# Dealing with non-linearly-separable cases using slack variables.
# Large values of C correspond to large error penalties, small values to small penalties,
# so C can be used to tune the width of the margin and thus the bias-variance trade-off.
# Create a linear SVM model (it is trained in the next cell).
from sklearn.svm import SVC
svm = SVC(kernel='linear', C=1, random_state=0)

In [18]:
# Train the linear SVM and report its accuracy on the test set.
svm.fit(X_train_std, y_train)
accuracy_score(y_test, svm.predict(X_test_std)) # performs very similarly to logistic regression
# In most practical cases, linear logistic regression and linear SVM generate similar results.

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

0.97777777777777775

In [19]:
# Decision regions of the linear SVM on the combined standardized data;
# rows 105-149 (passed as test_idx) are marked as the test set.
plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_linear.png', dpi=300)
plt.show()

<matplotlib.text.Text at 0x7fce54018f60>

<matplotlib.text.Text at 0x7fce440132b0>

<matplotlib.legend.Legend at 0x7fce3f8c6320>

In [20]:
# scikit-learn also offers alternative implementations through the SGDClassifier class,
# which additionally supports online learning via the partial_fit method.
# NOTE(review): these assignments rebind ppn, lr and svm to new estimators that are not fitted here.
# NOTE(review): newer scikit-learn releases appear to name the logistic loss 'log_loss'
# instead of 'log' -- confirm against the installed version.
from sklearn.linear_model import SGDClassifier
ppn = SGDClassifier(loss='perceptron')
lr = SGDClassifier(loss='log')
svm = SGDClassifier(loss='hinge')

In [21]:
# Solving nonlinear problems using a kernel SVM.
# Build a toy XOR data set to see what a nonlinear classification problem looks like.
np.random.seed(0)
X_xor = np.random.randn(200, 2)
# Label is 1 when exactly one of the two coordinates is positive, otherwise -1.
y_xor = np.where(np.logical_xor(X_xor[:, 0] > 0, X_xor[:, 1] > 0), 1, -1)

# Transposing the selected rows yields the x and y coordinate arrays for scatter.
plt.scatter(*X_xor[y_xor == 1].T, c='b', marker='x', label='1')
plt.scatter(*X_xor[y_xor == -1].T, c='r', marker='s', label='-1')

plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('./figures/xor.png', dpi=300)
plt.show()

<matplotlib.collections.PathCollection at 0x7fce3f89a828>

<matplotlib.collections.PathCollection at 0x7fce3f89a7f0>

(-3, 3)

(-3, 3)

<matplotlib.legend.Legend at 0x7fce3f88aa58>

In [22]:
# The data is not linearly separable. To solve a nonlinear problem using SVM, the training data is
# transformed into a higher-dimensional feature space via a mapping function (here the RBF kernel).
svm = SVC(kernel='rbf', random_state=0, gamma=.1, C=10)
svm.fit(X_xor, y_xor)
plot_decision_regions(X_xor, y_xor, classifier=svm)
plt.legend(loc='upper left')
plt.show() # Pretty good for the training set

# The gamma parameter acts as a cut-off parameter for the Gaussian sphere. Increasing gamma SHRINKS the
# region of influence of each training point and leads to a tighter, bumpier decision boundary (more
# prone to overfitting); decreasing gamma widens the influence and gives a softer decision boundary.

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

<matplotlib.legend.Legend at 0x7fce3f8e0c88>

In [23]:
# Applying the RBF-kernel SVM to the flower (iris) data set with a small gamma.
svm = SVC(kernel='rbf', random_state=0, gamma=.2, C=1)
svm.fit(X_train_std, y_train)
accuracy_score(y_test, svm.predict(X_test_std)) # Similar accuracy to the linear models above
plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_iris_1.png', dpi=300)
plt.show()

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.2, kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

0.97777777777777775

<matplotlib.text.Text at 0x7fce3f8606d8>

<matplotlib.text.Text at 0x7fce3f38f080>

<matplotlib.legend.Legend at 0x7fce3f20dfd0>

In [24]:
# Increase the value of gamma (100 instead of .2) to see its effect on the test accuracy.
svm = SVC(kernel='rbf', random_state=0, gamma=100, C=1)
svm.fit(X_train_std, y_train)
accuracy_score(y_test, svm.predict(X_test_std)) # Test accuracy drops to 80%
plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# Distinct file name so this figure would not overwrite the gamma=.2 one if saving is enabled.
# plt.savefig('./figures/support_vector_machine_rbf_iris_2.png', dpi=300)
plt.show()

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=100, kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

0.80000000000000004

<matplotlib.text.Text at 0x7fce3f8adc50>

<matplotlib.text.Text at 0x7fce3f1bf128>

<matplotlib.legend.Legend at 0x7fce3f3fa898>

In [None]:
# Decision Tree
