In [1]:
from scipy.io import loadmat
    
mnist_path = "./mnist-original.mat"
mnist_raw = loadmat(mnist_path)
mnist = {
    "data": mnist_raw["data"].T,
    "target": mnist_raw["label"][0],
    "COL_NAMES": ["label", "data"],
    "DESCR": "mldata.org dataset: mnist-original",
}

In [2]:
mnist

{'COL_NAMES': ['label', 'data'],
 'DESCR': 'mldata.org dataset: mnist-original',
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'target': array([0., 0., 0., ..., 9., 9., 9.])}

In [3]:
X, y = mnist['data'], mnist['target']
X.shape

(70000, 784)

In [4]:
y.shape

(70000,)

In [5]:
#Sample image
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
arr = X[40000].reshape(28,28)
plt.imshow(arr, interpolation="nearest", cmap=matplotlib.cm.binary)
plt.axis("off")
plt.show()

<Figure size 640x480 with 1 Axes>

In [6]:
y[40000]

6.0

# Train test split

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(X, test_size = 10000/70000)
y_train, y_test = train_test_split(y, test_size = 10000/70000)

## Shuffling dataset 

In [8]:
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

## Training a binary classifier (Classifies as 5 or not 5)

In [9]:
# Targets
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

### Stochastic Gradient Descent
Handles large datasets efficiently, deals with training instances independently, one at a time.  
Suited for online learining

In [10]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(max_iter=5, random_state = 42) #random_state for consistent results

sgd = sgd.fit(X_train, y_train_5)

prediction = sgd.predict(X[40000].reshape(1,-1))

In [11]:
print(prediction) #Predicition is true it is a 6, so False

[False]


## Performace measures (for a classifier)

### Accuracy

Custom fuction to perform K-fold cross validation by cloning classifier at each fold and returning ratio of correct predictions to total predictions (accuracy).

In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, random_state=42)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd)
    X_train_folds = X_train[train_index]
    y_train_folds = (y_train_5[train_index])
    X_test_fold = X_train[test_index]
    y_test_fold = (y_train_5[test_index])
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred)) 

0.8913554322283885
0.88545
0.8761938096904845


In [13]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(sgd, X_train, y_train_5, cv=3, scoring="accuracy")
print(scores)

[0.89135543 0.88545    0.87619381]


A dumb classifier(that says every digit is not 5) has 90% accuracy because about 10% of images are 5 (skewed dataset). Any classifier can beat that. This is why ** accuracy is not the right performace measure **

In [14]:
from sklearn.base import BaseEstimator
import numpy as np

class Never5Estimator(BaseEstimator):
    def fit(self, X, y=None):
        pass
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)

dumb = Never5Estimator()
#dumb.fit(X_train, y_train_5)

scores_dumb = cross_val_score(dumb, X_train, y_train_5, cv=3, scoring="accuracy")   
print(scores_dumb)

[0.9116  0.91025 0.90825]


### Confusion Matrix
A much better way to evaluate the performance of a classifier is to look at the confusion matrix. The
general idea is to count the number of times instances of class A are classified as class B. For example, to
know the number of times the classifier confused images of 5s with 3s, you would look in the 5 th row and
3 rd column of the confusion matrix.  
  
```[[True Negatives, False Positives],
[False Negatives, True Positives]]```
  
Ideal confusion matrix will only have elements in its diagonal.

In [15]:
from sklearn.model_selection import cross_val_predict #returns prediction

y_pred_5 = cross_val_predict(sgd, X_train, y_train_5, cv=3)

from sklearn.metrics import confusion_matrix

mat = confusion_matrix(y_train_5, y_pred_5)

mat

array([[52889,  1713],
       [ 5227,   171]])

### Precision (accuracy of positive prediction) and recall or true positive rate or sensetivity  

```precision = True Positive / True Positive + True Negative ```  
```recall = True Negative / True Positive + False Negative ```  

recall is ratio of postive instances that are correctly predicted by classifier

In [16]:
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_train_5, y_pred_5)

print(precision)

recall = recall_score(y_train_5, y_pred_5)

print(recall)

0.09076433121019108
0.03167839940718785


Precision and Recall scores tells us that our model is not that good as we expected it to be

#### F1 Score (The harmonic mean of Precision and Recall)
It is often convenient to combine precision and recall into a single metric called the F 1 score, in
particular if you need a simple way to compare two classifiers. The F 1 score is the harmonic mean of
precision and recall. Whereas the regular mean treats all values equally, the harmonic
mean gives much more weight to low values. As a result, the classifier will only get a high F 1 score if
both recall and precision are high.

In [17]:
from sklearn.metrics import f1_score

f1 = f1_score(y_train_5, y_pred_5)

f1

0.046965119472672345

Some cases need high precision while some needs high recall, **metrics should be chosed according to case.** Increasing precision will decrease recall and vice versa, this is precision-recall tradeoff 

#### Decision Threshold

Decision Threshold is the function using precision and recall to classify as positve class or negative class.

In [18]:
y_score = cross_val_predict(sgd, X_train, y_train_5, cv=5, method="decision_function")
y_score

array([ -83349.10980628,  104313.40461317,   15321.56249261, ...,
       -143318.13426164, -124345.00518496,  100475.49571959])

In [19]:
#Precision-Recall Curve 
"""from sklearn.metrics import precision_recall_curve

precision, recall, threshold = precision_recall_curve(y_train_5, y_score)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(threshold, precision[:-1], "b--", label="Precision")
    plt.plot(threshold, recall[:-1], "g-", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    plt.show()
    
plot_precision_recall_vs_threshold(precision, recall, threshold)    """

'from sklearn.metrics import precision_recall_curve\n\nprecision, recall, threshold = precision_recall_curve(y_train_5, y_score)\n\ndef plot_precision_recall_vs_threshold(precisions, recalls, thresholds):\n    plt.plot(threshold, precision[:-1], "b--", label="Precision")\n    plt.plot(threshold, recall[:-1], "g-", label="Recall")\n    plt.xlabel("Threshold")\n    plt.legend(loc="upper left")\n    plt.ylim([0, 1])\n    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)\n    plt.show()\n    \nplot_precision_recall_vs_threshold(precision, recall, threshold)    '

#### ROC Curve

Reciever Operating Charact curve is a graph between **True Positive Rate Vs. False Positive Rate**.
  
```False Positive Rate = 1 - True Negative Rate```  
  
True Negative Rate is also knows as Specificity. So ROC curve is the curve between sensetivity(recall) and 1-specificity.
  
Different classifiers can be compared using Area Under Curve (which should be 1 in ideal case).

In [20]:
from sklearn.metrics import roc_curve

tpr, fpr, threshold = roc_curve(y_train_5, y_score)

from sklearn.metrics import roc_auc_score

print(roc_auc_score(y_train_5, y_score))

0.500210577674961


**Use Precision Ratio curve when you care more about false positives then false negatives.**

### Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier()

y_probas_forest = cross_val_predict(forest, X_train, y_train_5, cv=3, method="predict_proba") #Returns probablities

#Probablities of positive class is score

y_scores_forest = y_probas_forest[:,-1]

tpr, fpr, threshold = roc_curve(y_train_5, y_scores_forest)

print(roc_auc_score(y_train_5, y_scores_forest))

0.49669217031721574


## Training a Multiclass Classifier

### One Versus One Classifier

In [22]:
from sklearn.multiclass import OneVsOneClassifier

ovo_clf = OneVsOneClassifier(SGDClassifier())

ovo_clf = ovo_clf.fit(X_train, y_train)

print(ovo_clf.predict(X_train[40000].reshape(1,-1)))







[0.]


In [23]:
len(ovo_clf.estimators_)

45

In [24]:
forest = RandomForestClassifier()
forest = forest.fit(X_train, y_train)

print(forest.predict(X[40000].reshape(1,-1)))


[9.]


## Before Standard scaling

In [25]:
sgd_new = SGDClassifier()
sgd_score = cross_val_score(sgd_new, X_train, y_train, cv=3, scoring="accuracy")
print(sgd_score)



[0.1004799  0.1009     0.10387077]


## After Standard Scaling

In [26]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train.astype(np.float64))
scores_scaled = cross_val_score(sgd_new, X_scaled, y_train, cv=3, scoring="accuracy")

scores_scaled



array([0.09633073, 0.1009    , 0.09621924])

## Multilabel Classification

In [33]:
from sklearn.neighbors import KNeighborsClassifier

y_large =  (y_train >= 7)
y_odd = (y_train % 2 !=0)
y_multilabel = np.c_[y_large, y_odd]

neigh = KNeighborsClassifier()
neigh = neigh.fit(X_train, y_multilabel)
print(neigh.predict(X_train[40000].reshape(1,-1)))

MemoryError: 

** IF MNIST CANNOT BE FETCHED **

In [None]:
"""from six.moves import urllib
from sklearn.datasets import fetch_mldata
try:
    mnist = fetch_mldata('MNIST original')
except urllib.error.HTTPError as ex:
    print("Could not download MNIST data from mldata.org, trying alternative...")

    # Alternative method to load MNIST, if mldata.org is down
    from scipy.io import loadmat
    mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
    mnist_path = "./mnist-original.mat"
    response = urllib.request.urlopen(mnist_alternative_url)
    with open(mnist_path, "wb") as f:
        content = response.read()
        f.write(content)
    mnist_raw = loadmat(mnist_path)
    mnist = {
        "data": mnist_raw["data"].T,
        "target": mnist_raw["label"][0],
        "COL_NAMES": ["label", "data"],
        "DESCR": "mldata.org dataset: mnist-original",
    }
    print("Success!")"""