# Classification

### Train and predict knn on Modified congress voting dataset

In [None]:
# Import KNeighborsClassifier from sklearn.neighbors
from sklearn.neighbors import KNeighborsClassifier

# Create arrays for the features and the response variable.
# The axis is passed by keyword: the positional form df.drop('party', 1)
# is deprecated in recent pandas releases and rejected by pandas 2.x.
# NOTE(review): df is not created in this notebook; presumably it is the
# preloaded voting DataFrame with a 'party' column -- confirm.
y = df['party'].values
X = df.drop('party', axis=1)

# Create a k-NN classifier with 6 neighbors: knn
knn = KNeighborsClassifier(n_neighbors=6)

# Fit the classifier to the data
knn.fit(X, y)

# Predict the labels for the training data X
y_pred = knn.predict(X)

# Predict and print the label for the new data point X_new
# NOTE(review): X_new is not defined anywhere in this notebook; presumably it
# is preloaded with the same feature columns as X -- confirm before running.
new_prediction = knn.predict(X_new)
print("Prediction: {}".format(new_prediction))

### Load the scikit-learn digits dataset (a small MNIST-like dataset)

In [None]:
# Bring in the dataset loaders and the plotting library
from sklearn import datasets
import matplotlib.pyplot as plt

# Load scikit-learn's bundled digits dataset into `digits`
digits = datasets.load_digits()

# Show the available keys, followed by the dataset description text
print(digits.keys())
print(digits.DESCR)

# Report the shapes of the `images` array and of the `data` array
images_shape = digits.images.shape
data_shape = digits.data.shape
print(images_shape)
print(data_shape)

# Render the image stored at index 1010 in reversed grayscale
sample_image = digits.images[1010]
plt.imshow(sample_image, cmap=plt.cm.gray_r, interpolation='nearest')
plt.show()

### Fit, predict and calculate accuracy using knn on the digits dataset

In [None]:
# Estimator and splitting utility used in this cell
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Features come from digits.data, labels from digits.target
X, y = digits.data, digits.target

# Hold out 20% of the samples for testing; the split is seeded and
# stratified on the labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Build a 7-neighbor k-NN classifier (knn) and fit it on the training split
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

# Score the fitted classifier on the held-out split and print the result
holdout_accuracy = knn.score(X_test, y_test)
print(holdout_accuracy)

# Methods of evaluating Model performance

## Accuracy

Accuracy is mainly used to measure model performance in classification problems.

Observe overfitting and underfitting by fitting models with different values of `n_neighbors`. Plot the model accuracy scores on both the training and the testing data.

However, accuracy is not always an informative metric.

Class imbalance in a dataset makes accuracy as a measure of model performance useless.

In [None]:
# NumPy is used below but is not imported by any earlier cell of this notebook
import numpy as np

# Setup arrays to store train and test accuracies
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

# Loop over different values of k
for i, k in enumerate(neighbors):
    # Setup a k-NN Classifier with k neighbors: knn
    knn = KNeighborsClassifier(n_neighbors=k)

    # Fit the classifier to the training data
    knn.fit(X_train, y_train)

    # Compute accuracy on the training set
    train_accuracy[i] = knn.score(X_train, y_train)

    # Compute accuracy on the testing set
    test_accuracy[i] = knn.score(X_test, y_test)

# Generate plot: one accuracy curve per data split, against k
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()

![Class Imbalance](class_imbalance.png)

## Confusion Matrix

![Confusion Matrix](confusion_matrix.png)

![Metrics from a Confusion Matrix](confusion_matrix_metrics.png)

A confusion matrix helps you get a better understanding of your model's performance.

[Confusion matrix explanation](https://www.geeksforgeeks.org/confusion-matrix-machine-learning/)

### fit, predict and evaluate knn using confusion matrix

In [None]:
# Metrics used to evaluate the classifier
from sklearn.metrics import classification_report, confusion_matrix

# Seeded split that keeps 40% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Build a 6-neighbor k-NN classifier (knn) and fit it on the training split
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)

# Predicted labels for the test split: y_pred
y_pred = knn.predict(X_test)

# Print the confusion matrix first, then the classification report
for build_report in (confusion_matrix, classification_report):
    print(build_report(y_test, y_pred))

### fit, predict and evaluate Logistic Regression (for Binary Classification) using confusion matrix 

In [None]:
# Estimator and evaluation metrics for this cell
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Seeded 60/40 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.4
)

# Logistic regression with default settings: logreg
logreg = LogisticRegression()

# Learn from the training split, then label the test split: y_pred
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Show the confusion matrix followed by the per-class report
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print(test_confusion)
print(test_report)

## ROC curves provide a way to visually evaluate models.

In [None]:
# Import necessary modules
from sklearn.metrics import roc_curve

# Compute predicted probabilities: y_pred_prob
# Column 1 of predict_proba is the probability of the second class listed
# in logreg.classes_, which roc_curve treats as the positive class here.
y_pred_prob = logreg.predict_proba(X_test)[:, 1]

# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve; the dashed diagonal is the reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
# Without a legend the label given to the curve above is never displayed
plt.legend()
plt.show()

## Precision-recall curve to visually evaluate model performance

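Like the ROC curve, the precision-recall curve is traced out by varying the classification threshold: each threshold yields one (recall, precision) point, just as it yields one (false positive rate, true positive rate) point on the ROC curve.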

## Area Under the ROC curve.

The bigger the area, the better the model.

If the AUC is greater than 0.5, the model is better than random guessing.

In [None]:
# Scoring helpers used in this cell
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

# Predicted probabilities taken from column 1 of predict_proba: y_pred_prob
test_probabilities = logreg.predict_proba(X_test)
y_pred_prob = test_probabilities[:, 1]

# AUC of the fitted model on the test split
test_auc = roc_auc_score(y_test, y_pred_prob)
print(f"AUC: {test_auc}")

# AUC scores from 5-fold cross-validation over the full data: cv_auc
cv_auc = cross_val_score(logreg, X, y, cv=5, scoring='roc_auc')

# Show the array of per-fold AUC scores
print(f"AUC scores computed using 5-fold cross-validation: {cv_auc}")

# Hyperparameter Tuning

Like the alpha parameter of lasso and ridge regularization that you saw earlier, logistic regression also has a regularization parameter: C. C controls the inverse of the regularization strength, and this is what you will tune in this exercise. A large C can lead to an overfit model, while a small C can lead to an underfit model.

## GridSearchCV
Aim is to use GridSearchCV and logistic regression to find the optimal C in this hyperparameter space.

In [None]:
# Estimator and exhaustive search utility
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Candidate C values: 15 points spaced logarithmically from 1e-5 to 1e8
c_space = np.logspace(-5, 8, num=15)
param_grid = dict(C=c_space)

# Base estimator whose C will be tuned: logreg
logreg = LogisticRegression()

# 5-fold cross-validated grid search over param_grid: logreg_cv
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Run the search on the full data
logreg_cv.fit(X, y)

# Report the best parameter setting and its cross-validated score
best_params = logreg_cv.best_params_
best_score = logreg_cv.best_score_
print(f"Tuned Logistic Regression Parameters: {best_params}")
print(f"Best score is {best_score}")

## RandomizedSearchCV

GridSearchCV can be computationally expensive, especially if you are searching over a large hyperparameter space and dealing with multiple hyperparameters. A solution to this is to use RandomizedSearchCV, in which not all hyperparameter values are tried out. Instead, a fixed number of hyperparameter settings is sampled from specified probability distributions.

In [None]:
# Distribution, estimator and randomized search utility
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

# Values and distributions the search samples from: param_dist
param_dist = dict(
    max_depth=[3, None],
    max_features=randint(1, 9),
    min_samples_leaf=randint(1, 9),
    criterion=["gini", "entropy"],
)

# Decision tree with default settings: tree
tree = DecisionTreeClassifier()

# 5-fold cross-validated randomized search: tree_cv
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5)

# Run the search on the full data
tree_cv.fit(X, y)

# Report the best sampled setting and its cross-validated score
print(f"Tuned Decision Tree Parameters: {tree_cv.best_params_}")
print(f"Best score is {tree_cv.best_score_}")

## Hold out set for final evaluation

![Hold out set for final evaluation](hold_out_set.png)

You want to be absolutely certain about your model's ability to generalize to unseen data.

In addition to C, logistic regression has a 'penalty' hyperparameter which specifies whether to use 'l1' or 'l2' regularization

In [None]:
# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Create the hyperparameter grid
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}

# Instantiate the logistic regression classifier: logreg
# The grid contains both 'l1' and 'l2'. The default solver in current
# scikit-learn ('lbfgs') does not support the L1 penalty, so every 'l1'
# candidate would fail to fit; 'liblinear' supports both penalties.
logreg = LogisticRegression(solver='liblinear')

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Fit it to the training data only, so the test split stays unseen
logreg_cv.fit(X_train, y_train)

# Print the optimal parameters and best (cross-validated) score
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))

# Final evaluation: score the tuned model on the hold-out set, which was
# not used during the search
print("Hold-out set accuracy: {}".format(logreg_cv.score(X_test, y_test)))

# Pipeline for classification

In [None]:
# Import the pipeline building blocks; no earlier cell of this notebook
# imports StandardScaler, SVC or Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Setup the pipeline: standardize the features, then fit an SVM
steps = [('scaler', StandardScaler()),
         ('SVM', SVC())]

pipeline = Pipeline(steps)

# Specify the hyperparameter space; the 'SVM__' prefix routes each
# parameter to the pipeline step named 'SVM'
parameters = {'SVM__C': [1, 10, 100],
              'SVM__gamma': [0.1, 0.01]}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

# Instantiate the GridSearchCV object: cv
cv = GridSearchCV(pipeline, parameters)

# Fit to the training set
cv.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = cv.predict(X_test)

# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(cv.best_params_))

# Linear Classifiers

## Running LogisticRegression and SVC

In [None]:
from sklearn import datasets
# SVC is used below; the notebook's only import of it appears in a later cell
from sklearn.svm import SVC

# Load the digits dataset and split it into training and test sets
# (default split proportions, no fixed seed, so scores vary between runs)
digits = datasets.load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)

# Apply logistic regression and print scores (training first, then test)
lr = LogisticRegression()
lr.fit(X_train, y_train)
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

# Apply SVM and print scores (training first, then test)
svm = SVC()
svm.fit(X_train, y_train)
print(svm.score(X_train, y_train))
print(svm.score(X_test, y_test))

0.9955456570155902

0.9622222222222222

1.0

0.48

### Sentiment analysis for movie reviews

In this exercise you'll explore the probabilities outputted by logistic regression on a subset of the Large Movie Review Dataset.

The variables X and y are already loaded into the environment. X contains features based on the number of times words appear in the movie reviews, and y contains labels for whether the review sentiment is positive (+1) or negative (-1).

In [None]:
# Instantiate logistic regression and train
lr = LogisticRegression()
lr.fit(X, y)

# Predict sentiment for a glowing review
# NOTE(review): get_features is not defined in this notebook; presumably it
# converts raw text into the same feature layout as X -- confirm.
review1 = "LOVED IT! This movie was amazing. Top 10 this year."
review1_features = get_features(review1)
print("Review:", review1)
print("Probability of positive review:", lr.predict_proba(review1_features)[0,1])

# Predict sentiment for a poor review
review2 = "Total junk! I'll never watch a film by that director again, no matter how good the reviews."
review2_features = get_features(review2)
# The recorded output below lists both reviews, so print the second one too
print("Review:", review2)
print("Probability of positive review:", lr.predict_proba(review2_features)[0,1])

Review: LOVED IT! This movie was amazing. Top 10 this year.
    
Probability of positive review: 0.8079007873616059
    
Review: Total junk! I'll never watch a film by that director again, no matter how good the reviews.
    
Probability of positive review: 0.5855117402793947

The second probability would have been even lower, but the word "good" trips it up a bit, since that's considered a "positive" word.

![Definitions](definitions.png)

![Linear decision boundary](linear_decision_boundary.png)

![linearly_separable_data](linearly_separable_data.png)

In their basic forms LogisticRegression and SVC are linear classifiers i.e they learn linear decision boundaries.

## Visualizing decision boundaries

The `plot_4_classifiers()` function is similar to the code [here](https://scikit-learn.org/stable/auto_examples/svm/plot_iris_svc.html).

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

# One default-configured instance of each of the four classifier types
classifiers = [
    estimator_cls()
    for estimator_cls in (LogisticRegression, LinearSVC, SVC, KNeighborsClassifier)
]

# Train every classifier on the same data
for clf in classifiers:
    clf.fit(X, y)

# Draw the four fitted classifiers and display the figure
plot_4_classifiers(X, y, classifiers)
plt.show()

![decision_boundaries](decision_boundaries.svg)

As you can see, logistic regression and linear SVM are linear classifiers whereas the default SVM and KNN are not.

![dot_product](dot_product.png)

![linear_classifier_prediction](linear_classifier_prediction.png)

## How Logistic Regression makes predictions

![how_lr_makes_predictions](how_lr_makes_predictions.png)

## Visually look at prediction equation

![example_decision_boundary](example_decision_boundary.png)

Values of coefficients and intercepts determine the boundary.

We change the intercept and get the below boundary.

![intercept_decision_boundary](intercept_decision_boundary.png)

To change the orientation of the boundary, we can change the coefficients.

![coefficient_decision_boundary](coefficient_decision_boundary.png)


Since logistic regression and SVMs are both linear classifiers, the raw model output is a linear function of x.

### Changing the model coefficients

When you call fit with scikit-learn, the logistic regression coefficients are automatically learned from your dataset. In this exercise you will explore how the decision boundary is represented by the coefficients. To do so, you will change the coefficients manually (instead of with fit), and visualize the resulting classifiers.

- Set the two coefficients and the intercept to various values and observe the resulting decision boundaries.
- Try to build up a sense of how the coefficients relate to the decision boundary.
- Set the coefficients and intercept such that the model makes no errors on the given training data.

In [None]:
# Overwrite the model's intercept and coefficients by hand
model.intercept_ = np.array([0])
model.coef_ = np.array([[0, 1]])

# Draw the data together with the resulting decision boundary
plot_classifier(X, y, model)

# Count the points whose prediction differs from the true label
mismatches = y != model.predict(X)
num_err = np.sum(mismatches)
print("Number of errors:", num_err)

![initial.svg](initial.svg)

In [None]:
# Overwrite the model's intercept and coefficients by hand
model.intercept_ = np.array([-4])
model.coef_ = np.array([[-1, 1]])

# Draw the data together with the resulting decision boundary
plot_classifier(X, y, model)

# Count the points whose prediction differs from the true label
mismatches = y != model.predict(X)
num_err = np.sum(mismatches)
print("Number of errors:", num_err)

![final](final.svg)

The coefficients determine the slope of the boundary and the intercept shifts it.

## Loss function

Tells us how well/poorly our model is doing in the training set.

**.fit** runs code that minimizes the loss.

Classification uses loss functions that are different from regression loss functions.

An example is the **no. of errors (0 1 Loss)**. 0 if prediction is correct, 1 if wrong. 

By summing the function over the number of training examples, we get the number of mistakes made in the training set.

It's however very hard to minimize in practice hence not used by Logistic Regression and SVMs.

**scipy.optimize.minimize** - for minimizing loss functions.

![least_squares](least_squares.png)

## Minimizing a loss function
In this exercise you'll implement linear regression "from scratch" using scipy.optimize.minimize.

We'll train a model on the Boston housing price data set, which is already loaded into the variables X and y. For simplicity, we won't include an intercept in our regression model.

In [None]:
# The squared error, summed over training examples
def my_loss(w):
    """Return the sum of squared errors of the linear model ``X @ w``.

    Reads the notebook-level arrays ``X`` (one row per training example)
    and ``y`` (one target per example). Computed for all rows at once
    rather than in a per-example Python loop, since the optimizer calls
    this function many times.

    Assumes ``X`` is a 2-D NumPy array and ``y`` a 1-D NumPy array of
    matching length.
    """
    # Predicted target for every example, then the per-example residual
    residuals = y - X @ w
    # Dot product of the residual vector with itself = sum of squares
    return residuals @ residuals

# Neither name is imported by any earlier cell of this notebook
from scipy.optimize import minimize
from sklearn.linear_model import LinearRegression

# Returns the w that makes my_loss(w) smallest
# (the first row of X serves as the optimizer's starting point)
w_fit = minimize(my_loss, X[0]).x
print(w_fit)

# Compare with scikit-learn's LinearRegression coefficients
# (no intercept, to match the intercept-free model in my_loss)
lr = LinearRegression(fit_intercept=False).fit(X,y)
print(lr.coef_)

![linear_regression_loss](linear_regression_loss.png)

![logistic_loss](logistic_loss.png)

Logistic Loss is used by Logistic Regression.

![hinge_loss](hinge_loss.png)

Hinge Loss is used by Support Vector Machines.

### Comparing the logistic and hinge losses

![log_hinge](log_hinge.svg)

#### Create a plot of the logistic and hinge losses using their mathematical expressions.

In [None]:
# Mathematical functions for logistic and hinge losses
def log_loss(raw_model_output):
    """Return the logistic loss log(1 + exp(-z)) for raw model output z.

    Accepts a scalar or a NumPy array and works element-wise. Evaluated
    as logaddexp(0, -z), which is the same quantity but does not overflow
    when z is a large negative number (exp(-z) alone would become inf).
    """
    return np.logaddexp(0, -raw_model_output)


def hinge_loss(raw_model_output):
    """Return the hinge loss max(0, 1 - z) for raw model output z.

    Accepts a scalar or a NumPy array and works element-wise.
    """
    return np.maximum(0, 1 - raw_model_output)

# Evaluate both losses on 1000 evenly spaced raw outputs between -2 and 2
grid = np.linspace(-2, 2, 1000)

# One labelled curve per loss function, logistic first
for loss_fn, curve_label in ((log_loss, 'logistic'), (hinge_loss, 'hinge')):
    plt.plot(grid, loss_fn(grid), label=curve_label)

plt.legend()
plt.show()

![log_hinge_1](log_hinge_1.svg)

These match up with the loss function diagrams we saw.

### Implementing logistic regression

This is very similar to implementing linear regression "from scratch" using scipy.optimize.minimize. However, this time we'll minimize the logistic loss and compare with scikit-learn's LogisticRegression (we've set C to a large value to disable regularization; more on this later).

In [None]:
# The logistic loss, summed over training examples
def my_loss(w):
    """Return the total logistic loss of the linear model ``X @ w``.

    Reads the notebook-level arrays ``X`` and ``y`` and the ``log_loss``
    function defined in an earlier cell. For each example the raw model
    output is multiplied by its label before the loss is applied.
    Computed for all rows at once rather than in a per-example Python
    loop, since the optimizer calls this function many times.

    Assumes ``X`` is a 2-D NumPy array and ``y`` a 1-D NumPy array of
    matching length.
    """
    # Raw model output for every example, scaled by that example's label
    signed_outputs = y * (X @ w)
    # Element-wise loss, then the total over all examples
    return log_loss(signed_outputs).sum()

# minimize is not imported by any earlier cell of this notebook
from scipy.optimize import minimize

# Returns the w that makes my_loss(w) smallest
# (the first row of X serves as the optimizer's starting point)
w_fit = minimize(my_loss, X[0]).x
print(w_fit)

# Compare with scikit-learn's LogisticRegression
# (no intercept, and a very large C so regularization is negligible)
lr = LogisticRegression(fit_intercept=False, C=1000000).fit(X,y)
print(lr.coef_)

[ 1.03592182 -1.65378492  4.08331342 -9.40923002 -1.06786489  0.07892114
 -0.85110344 -2.44103305 -0.45285671  0.43353448]
 
[[ 1.03731085 -1.65339037  4.08143924 -9.40788356 -1.06757746  0.07895582
  -0.85072003 -2.44079089 -0.45271     0.43334997]]

As you can see, logistic regression is just minimizing the loss function we've been looking at. 

## Regularization

Combats overfitting by punishing large model weights.

Hyperparameter **C** is the inverse of the regularization strength, i.e. a larger **C** means less regularization whereas a smaller **C** means more regularization.

regularized loss = original loss + coefficient penalty (high or low)

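Regularization:
- increases training error (lower training accuracy), because the penalty pulls the fit away from the training-optimal coefficients
- usually improves test error (higher test accuracy)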

### Logistic Regression with default regularization term
![lr_default_regularization](lr_default_regularization.png)

### Logistic Regression with high and low regularization (low c and high c)
![lr_regularized](lr_regularized.png)

### L1 VS L2 Regularization
![l1_l2](l1_l2.png)

![l2_l1_regularization](l2_l1_regularization.png)

**L1 regularization** drives some coefficients to exactly 0, hence performing feature selection.

**L2 regularization** just shrinks the coefficients to be smaller.

## Regularized logistic regression

We'll explore the effect of L2 regularization.

- Loop over the different values of C_value, creating and fitting a LogisticRegression model each time.

- Save the error on the training set and the validation set for each model.

- Create a plot of the training and testing error as a function of the regularization parameter, C.

- Looking at the plot, what's the best value of C?

In [None]:
# Train and validation errors initialized as empty lists
train_errs = list()
valid_errs = list()

# Regularization settings to try; kept in a named list because the plot
# below needs the same values for its x-axis
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Loop over values of C_value
for C_value in C_values:
    # Create LogisticRegression object and fit
    lr = LogisticRegression(C=C_value)
    lr.fit(X_train, y_train)

    # Evaluate error rates (1 - accuracy) and append to lists
    # NOTE(review): X_valid / y_valid are not created in this notebook;
    # presumably a preloaded validation split -- confirm.
    train_errs.append( 1.0 - lr.score(X_train, y_train) )
    valid_errs.append( 1.0 - lr.score(X_valid, y_valid) )

# Plot results on a logarithmic C axis
plt.semilogx(C_values, train_errs, C_values, valid_errs)
plt.legend(("train", "validation"))
plt.show()

![train_valid](train_valid.svg)

As you can see, too much regularization (small C) doesn't work well - due to underfitting - and too little regularization (large C) doesn't work well either - due to overfitting.

## Logistic regression and feature selection

Perform feature selection on the movie review sentiment data set using L1 regularization.

Search for the best value of C using scikit-learn's GridSearchCV()

In [None]:
# Specify L1 regularization
# The default solver in current scikit-learn ('lbfgs') does not support the
# L1 penalty, so a solver that does ('liblinear') is selected explicitly.
lr = LogisticRegression(penalty='l1', solver='liblinear')

# Instantiate the GridSearchCV object and run the search
searcher = GridSearchCV(lr, {'C':[0.001, 0.01, 0.1, 1, 10]})
searcher.fit(X_train, y_train)

# Report the best parameters
print("Best CV params", searcher.best_params_)

# Find the number of nonzero coefficients (selected features)
best_lr = searcher.best_estimator_
coefs = best_lr.coef_
print("Total number of features:", coefs.size)
print("Number of selected features:", np.count_nonzero(coefs))

    Best CV params {'C': 1}
    Total number of features: 2500
    Number of selected features: 1220

### Identifying the most positive and negative words

We'll try to interpret the coefficients of a logistic regression fit on the movie review sentiment dataset.

The words corresponding to the different features are loaded into the variable vocab. 

For example, since vocab[100] is "think", that means feature 100 corresponds to the number of times the word "think" appeared in that movie review.


In [None]:
# Feature indices ordered by coefficient value, smallest first
flat_coefs = lr.coef_.flatten()
inds_ascending = np.argsort(flat_coefs)
# The same ordering reversed, so the largest coefficients come first
inds_descending = inds_ascending[::-1]

# Print the five words with the largest coefficients on a single line
print("Most positive words: ", end="")
for rank in range(5):
    word_index = inds_descending[rank]
    print(vocab[word_index], end=", ")
print("\n")

# Print the five words with the smallest coefficients on a single line
print("Most negative words: ", end="")
for rank in range(5):
    word_index = inds_ascending[rank]
    print(vocab[word_index], end=", ")
print("\n")

Most positive words: favorite, superb, noir, knowing, loved, 

Most negative words: disappointing, waste, worst, boring, lame,

## Logistic Regression and probabilities

![lr_probabilites_unregularized](lr_probabilites_unregularized.png)

The figure above shows the predicted probabilities of the red class. 

If we're more than 50% sure it's red, we predict red.

If we're less than 50% sure it's red, we predict blue.

We become more confident as we move away from the decision boundary.

![lr_probabilities](lr_probabilities.png)

Regularization makes the coefficients smaller.

The probabilities are closer to 0.5 (We don't get to the very dark red and blue).

This means we're less confident in our predictions, hence preventing overconfidence/overfitting.

Ratio of the coefficients gives us the slope of the line.

Magnitude of the coefficients gives us the confidence level.

Regularization affects the confidence and slope of boundary.

![computing_probabilities](computing_probabilities.png)

Sigmoid function used to squash raw model outputs to be between 0 and 1.

### Regularization and probabilities

Observe the effects of changing the regularization strength on the predicted probabilities.

- Compute the maximum predicted probability.
- Run the provided code and take a look at the plot.

In [None]:
# Logistic regression with C=1
model = LogisticRegression(C=1)

# Train on the data, then plot the classifier with probabilities shown
model.fit(X, y)
plot_classifier(X, y, model, proba=True)

# Class probabilities for every training point; report the largest entry
prob = model.predict_proba(X)
largest_probability = prob.max()
print("Maximum predicted probability", largest_probability)

Maximum predicted probability 0.9761229966765974

![prob_1](prob_1.svg)

- Create a model with C=0.1 and examine how the plot and probabilities change.

![prob_2](prob_2.svg)

As you probably noticed, smaller values of C lead to less confident predictions. That's because smaller C means more regularization, which in turn means smaller coefficients, which means raw model outputs closer to zero and, thus, probabilities closer to 0.5 after the raw model output is squashed through the sigmoid function. That's quite a chain of events!

### Visualizing easy and difficult examples

You'll visualize the examples that the logistic regression model is most and least confident about by looking at the largest and smallest predicted probabilities.

The show_digit function takes in an integer index and plots the corresponding image, with some extra information displayed above the image.

- Fill in the first blank with the index of the digit that the model is most confident about.
- Fill in the second blank with the index of the digit that the model is least confident about.

In [None]:
# Fit a default logistic regression on the data
lr = LogisticRegression()
lr.fit(X, y)

# Class probabilities for every example
proba = lr.predict_proba(X)

# Highest class probability per example, then the example indices ordered
# from the lowest to the highest of those values
top_probability = np.max(proba, axis=1)
proba_inds = np.argsort(top_probability)

# Last index in the ordering: the most confident (least ambiguous) digit
most_confident_index = proba_inds[-1]
show_digit(most_confident_index, lr)

# First index in the ordering: the least confident (most ambiguous) digit
least_confident_index = proba_inds[0]
show_digit(least_confident_index, lr)

![most_confident](most_confident.svg)

![least_confident](least_confident.svg)

As you can see, the least confident example looks like a weird 4, and the most confident example looks like a very typical 0.

## Multiclass Logistic Regression

### Combining binary classifier with one vs rest 

![one_vs_rest](one_vs_rest.png)

Take the classifier that has the highest raw model output/ decision function.

In the case above, it would be classifier 0 i.e it's more confident that the class is 0 than the other classes, so we predict class 0.

One-vs-rest was the default multiclass behaviour of scikit-learn's `LogisticRegression` before version 0.22; newer versions default to the multinomial (softmax) formulation. In those older versions the default solver (`liblinear`) only handled the one-vs-rest case, so moving to a multinomial approach required changing the solver (e.g. to `lbfgs`) and, additionally, setting the `multi_class` parameter.

### Multinomial or softmax

![one_rest_vs_multinomial](one_rest_vs_multinomial.png)

Explanation:
- Simple and modular: You can reuse your binary class implementation instead of creating a new one.
- Tackle the problem directly: better accuracy since its loss is directly aligned with accuracy.

"Multinomial or softmax" is standard in the field of neural networks.

#### Counting the coefficients

If you fit a logistic regression model on a classification problem with 3 classes and 100 features, how many coefficients would you have, including intercepts? **303**

## Fitting multi-class logistic regression

Fit the two types of multi-class logistic regression, one-vs-rest and softmax/multinomial, on the handwritten digits data set and compare the results. 

In [None]:
# OneVsRestClassifier makes the one-vs-rest scheme explicit. A bare
# LogisticRegression() is multinomial for multiclass targets in current
# scikit-learn, so it would not be a one-vs-rest model.
from sklearn.multiclass import OneVsRestClassifier

# Fit one-vs-rest logistic regression classifier
lr_ovr = OneVsRestClassifier(LogisticRegression())
lr_ovr.fit(X_train, y_train)

print("OVR training accuracy:", lr_ovr.score(X_train, y_train))
print("OVR test accuracy    :", lr_ovr.score(X_test, y_test))

# Fit softmax classifier
# With the lbfgs solver, multiclass targets are fitted with the multinomial
# (softmax) loss by default; the deprecated multi_class argument is omitted.
lr_mn = LogisticRegression(solver="lbfgs")
lr_mn.fit(X_train, y_train)

print("Softmax training accuracy:", lr_mn.score(X_train, y_train))
print("Softmax test accuracy    :", lr_mn.score(X_test, y_test))

    OVR training accuracy: 0.9948032665181886
    OVR test accuracy    : 0.9644444444444444
    Softmax training accuracy: 1.0
    Softmax test accuracy    : 0.9688888888888889
    
As you can see, the accuracies of the two methods are fairly similar on this data set.

## Visualizing multi-class logistic regression

In this exercise we'll continue with the two types of multi-class logistic regression, but on a toy 2D data set specifically designed to break the one-vs-rest scheme.

The data set is loaded into X_train and y_train. The two logistic regression objects, lr_mn and lr_ovr, are already instantiated (with C=100), fit, and plotted, as shown below.

Notice that lr_ovr never predicts the dark blue class... yikes! 

![softmax_1](softmax_1.svg)

![ovr_1](ovr_1.svg)

Let's explore why this happens by plotting one of the binary classifiers that it's using behind the scenes.

In [None]:
# Training accuracy of each multi-class model, softmax first
softmax_accuracy = lr_mn.score(X_train, y_train)
print("Softmax     training accuracy:", softmax_accuracy)
ovr_accuracy = lr_ovr.score(X_train, y_train)
print("One-vs-rest training accuracy:", ovr_accuracy)

# Binary target: True where the label equals 1, False for every other class
is_class_1 = y_train == 1

# Fit the binary classifier (class 1 vs. rest) with C=100
lr_class_1 = LogisticRegression(C=100)
lr_class_1.fit(X_train, is_class_1)

# Plot the binary classifier (class 1 vs. rest)
plot_classifier(X_train, is_class_1, lr_class_1)

    Softmax     training accuracy: 0.996
    One-vs-rest training accuracy: 0.916

![ovr_2](ovr_2.svg)

As you can see, the binary classifier incorrectly labels almost all points in class 1 (shown as red triangles in the final plot)! 

Thus, this classifier is not a very effective component of the one-vs-rest classifier. In general, though, one-vs-rest often works well.

## One-vs-rest SVM

We'll repeat the previous exercise with a non-linear SVM.

Instead of using LinearSVC, we'll now use scikit-learn's SVC object, which is a non-linear "kernel" SVM.

In [None]:
# From here on SVC replaces LinearSVC
from sklearn.svm import SVC

# Binary target: True where the label equals 1, False for every other class
is_class_1 = y_train == 1

# Fit a default SVC on the class-1-vs-rest problem, then plot it
svm_class_1 = SVC()
svm_class_1.fit(X_train, is_class_1)
plot_classifier(X_train, is_class_1, svm_class_1)

![svc](svc.svg)

The non-linear SVM works fine with one-vs-rest on this dataset because it learns to "surround" class 1.

## Support Vectors

![svm](svm.png)

![svm_ctd](svm_ctd.png)

![max_margin](max_margin.png)

If the regularization strength is not too large, SVMs maximize the margin of linearly separable datasets.

Which of the following is a true statement about support vectors? 
1. All support vectors are classified correctly.
2. All support vectors are classified incorrectly.
3. All correctly classified points are support vectors.
4. All incorrectly classified points are support vectors.

Answer: **4** (Also points close to the decision boundary)

### Effect of removing examples

**Support vectors** are defined as training examples that influence the decision boundary. 

In this exercise, you'll observe this behavior by removing non-support vectors from the training set.

- Train a linear SVM on the whole data set.

In [None]:
# Fit an SVM with a linear kernel on the complete data set
# (the name `svm` is reused by the following cell, which reads svm.support_)
svm = SVC(kernel="linear").fit(X, y)

# Plot the fitted model; the reduced-data plot further down passes the same
# lims value, so the two figures can be compared directly
plot_limits = (11, 15, 0, 6)
plot_classifier(X, y, svm, lims=plot_limits)

![svm_1](svm_1.svg)

- Create a new data set containing only the support vectors.
- Train a new linear SVM on the smaller data set.

In [None]:
# Report how many examples there are and how many the fitted SVM kept as
# support vectors (svm.support_ holds their indices)
print("Number of original examples", len(X))
support_idx = svm.support_
print("Number of support vectors", len(support_idx))

# Reduced data set: only the support vectors and their labels
X_small, y_small = X[support_idx], y[support_idx]

# Refit a linear-kernel SVM on the reduced data and plot it with the same
# lims as the full-data plot
svm_small = SVC(kernel="linear").fit(X_small, y_small)
plot_classifier(X_small, y_small, svm_small, lims=(11, 15, 0, 6))

![svm_2](svm_2.svg)

    Number of original examples 178
    Number of support vectors 81
    
Compare the decision boundaries of the two trained models: are they the same? By the definition of support vectors, they should be!

## Kernel SVM

![feature_transformation](feature_transformation.png)

![transformation_decision_boundaries](transformation_decision_boundaries.png)

Fitting a linear model in a transformed space is the same as fitting a non-linear model in an untransformed space.

**Kernel SVM** performs feature transformation in a computationally efficient way.

You can control the shape of the boundary using hyperparameters.

**C** - controls regularization

**kernel** - default is **rbf** (radial basis function)

**gamma** - influences the smoothness of the boundary:
- decreasing the value leads to a smoother boundary
- larger values of gamma lead to more complex decision boundaries

### GridSearchCV warm-up
We saw that increasing the RBF kernel hyperparameter gamma increases training accuracy. 

In this exercise we'll search for the gamma that maximizes cross-validation accuracy using scikit-learn's GridSearchCV. 

A binary version of the handwritten digits dataset, in which you're just trying to predict whether or not an image is a "2", is already loaded into the variables X and y.

In [None]:
# Base estimator: SVC with its default (RBF) kernel
svm = SVC()

# Candidate gamma values, one per decade from 1e-5 to 1e-1
parameters = {'gamma': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]}

# Cross-validate every candidate; fit() returns the fitted search object
searcher = GridSearchCV(svm, parameters).fit(X, y)

# Show the gamma that scored best in cross-validation
print("Best CV params", searcher.best_params_)

    Best CV params {'gamma': 0.001}

Larger values of gamma are better for training accuracy, but cross-validation helped us find something different (and better!).

### Jointly tuning gamma and C with GridSearchCV
In the previous exercise the best value of gamma was 0.001 using the default value of C, which is 1. In this exercise you'll search for the best combination of C and gamma using GridSearchCV.

As in the previous exercise, the 2-vs-not-2 digits dataset is already loaded, but this time it's split into the variables X_train, y_train, X_test, and y_test. Even though cross-validation already splits the training set into parts, it's often a good idea to hold out a separate test set to make sure the cross-validation results are sensible.

In [None]:
# Base estimator: SVC with its default (RBF) kernel
svm = SVC()

# Search grid: every combination of C and gamma is cross-validated
parameters = {
    'C': [0.1, 1, 10],
    'gamma': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
}

# Run the search on the training split only; fit() returns the search object
searcher = GridSearchCV(svm, parameters).fit(X_train, y_train)

# Best hyperparameter combination and its cross-validation score
print("Best CV params", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)

# Score the search object on the held-out test split
print("Test accuracy of best grid search hypers:", searcher.score(X_test, y_test))

    Best CV params {'C': 10, 'gamma': 0.0001}
    Best CV accuracy 0.9988864142538976
    Test accuracy of best grid search hypers: 0.9988876529477196

Note that the best value of gamma, 0.0001, is different from the value of 0.001 that we got in the previous exercise, when we fixed C=1. Hyperparameters can affect each other!

## Comparing Logistic Regression and SVM (and beyond)

![logistic_svm](logistic_svm.png)

**Advantage of Logistic Regression over SVM?** 
It naturally outputs meaningful probabilities.

**Advantage of SVM over Logistic Regression?**
It is faster when used with kernels. Having a limited number of support vectors makes kernel SVMs computationally efficient.


## Stochastic Gradient Descent Classifier

![SGDClassifier](SGDClassifier.png)

### Using SGDClassifier

You'll do a hyperparameter search over the regularization type, regularization strength, and the loss (logistic regression vs. linear SVM) using SGDClassifier().

In [None]:
# SGD-trained linear classifier; random_state=0 makes the run reproducible
linear_classifier = SGDClassifier(random_state=0)

# Search grid: regularization strength (alpha), loss (hinge = linear SVM,
# log = logistic regression) and regularization type (penalty)
# NOTE(review): newer scikit-learn releases spell the logistic loss
# 'log_loss'; 'log' is kept so the cell matches the recorded output below.
# Confirm against the installed scikit-learn version.
parameters = {
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1],
    'loss': ['hinge', 'log'],
    'penalty': ['l1', 'l2'],
}

# Cross-validate the whole grid with cv=10; fit() returns the search object
searcher = GridSearchCV(linear_classifier, parameters, cv=10).fit(X_train, y_train)

# Best combination, its cross-validation score, and the test-split score
print("Best CV params", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)
print("Test accuracy of best grid search hypers:", searcher.score(X_test, y_test))

    Best CV params {'alpha': 0.0001, 'loss': 'hinge', 'penalty': 'l1'}
    Best CV accuracy 0.94351630867144
    Test accuracy of best grid search hypers: 0.9592592592592593
    
**One advantage of SGDClassifier is that:** it's very fast - this would have taken a lot longer with LogisticRegression or LinearSVC.