# Machine Learning with SciKit-Learn

## SciKits
SciKits are add-on packages that build on SciPy/NumPy but are not included in the default installation because they are not "general" enough.

In [None]:
from IPython.display import HTML

# Markup for an inline frame that points at the scikit-learn "stable" site.
scikit_learn_frame = '<iframe src="http://scikit-learn.org/stable/" width=800 height=500></iframe>'
# Kept as the cell's final expression so the notebook displays the HTML object.
HTML(scikit_learn_frame)

## An example: Handwriting recognition!

We will:
* load some data that contains a list of images of handwritten digits and the correct digit as a label.
* split the data into two sets: **training** and **testing**
* train a classifier on the training set
* use that classifier to predict the value of the images in the testing set!



In [None]:
from sklearn import datasets, svm, metrics
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the handwritten-digits dataset; `digits.images` holds the images and
# `digits.target` the matching labels.
digits = datasets.load_digits()

n_samples = len(digits.images)

# what is in the data?
images_and_labels = list(zip(digits.images, digits.target))
# A single-argument print(...) produces the same output whether it is parsed
# as a Python 2 print statement or a Python 3 function call.
print("There are {} images in the dataset".format(n_samples))

# Preview the first ten images on a 2x5 grid, each titled with its label.
for index, (image, label) in enumerate(images_and_labels[:10]):
    plt.subplot(2, 5, index + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)


To apply a classifier to this data, we need to flatten each image, to
turn the data into a (samples, features) matrix:

In [None]:
# Shape of a single image before flattening.
print(digits.images[0].shape)
labels = digits.target
# Keep one row per sample; -1 lets reshape collapse the remaining axes.
data = digits.images.reshape((n_samples, -1)) # flattens each image

# Each print takes one pre-formatted string so the output is the same on
# Python 2 (print statement) and Python 3 (print function).
print(n_samples)
print("DATA\n{}".format(data[0:3]))
print("LABELS: {}".format(labels[0:3]))


Split the data set into two: half for training, half for testing.


In [None]:
# Index of the first test sample. Floor division keeps this an int on both
# Python 2 and Python 3; with `/`, Python 3 yields a float, which is rejected
# as a slice bound.
n_train = n_samples // 2

# the training data: first half of the samples
training_data = data[:n_train]
training_labels = labels[:n_train]

# the testing data: the remaining samples
testing_labels = digits.target[n_train:]
testing_data = data[n_train:]

### Training:
Create a classifier (in this case a Support Vector Machine classifier).

In [None]:
# Build a support vector classifier; only gamma is set explicitly, every other
# SVC option keeps its library default.
classifier = svm.SVC(gamma=0.0005)
# Fit on the training half only; the testing half is held back for evaluation.
classifier.fit( training_data, training_labels )  # train it!

That's it! **The classifier is now trained on the training data!**

Now predict the value of the digit on the second half:

### Classification!

In [None]:
# Ask the trained classifier for a label for every row of the test data.
predicted_labels = classifier.predict( testing_data )

We now have predicted labels for each of the images in the test data set. We can compare those to the testing labels (which are the correct labels).

Let's print a report, and display the predictions for some of the test data:

In [None]:
# Show the first 50 (expected, predicted) pairs and whether they agree.
# The inputs are sliced before zipping: on Python 3 zip() returns an iterator,
# so subscripting its result raises a TypeError.
for expected, predicted in zip(testing_labels[:50], predicted_labels[:50]):
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print("EXPECTED: {} PREDICTED: {} {}".format(
        expected, predicted, expected == predicted))

In [None]:
# Text summary of the classifier's results on the test set. Each print takes a
# single string so the output is the same on Python 2 and Python 3.
print("Classification report for classifier {0}:".format(classifier))
print(metrics.classification_report(testing_labels, predicted_labels))

print("Confusion matrix:\n{}".format(
    metrics.confusion_matrix(testing_labels, predicted_labels)))

# Pair each test image (second half of the dataset) with its predicted label.
# Floor division keeps the slice bound an int on Python 3 as well as Python 2.
images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted_labels))

# Show the first six test images, each titled with the predicted digit.
plt.figure(figsize=(10,4))
for index, (image, prediction) in enumerate(images_and_predictions[:6]):
    plt.subplot(2, 6, index+1)
    #plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Predicted: %i' % prediction)

plt.show()

## Cool, but let's go further: I'll load some of my own handwriting!

<img src="handwrite-me.jpg">
<img src="handwrite-large.png" width="50%"> 

In [None]:
from scipy import misc, ndimage
# Read the handwriting sample from disk.
# NOTE(review): scipy.misc.imread was deprecated in SciPy 1.0 and removed in
# SciPy 1.2, so this line needs a replacement loader on newer SciPy.
im = misc.imread("handwrite-small.png")
# Invert the pixel values and divide by 16.0 -- presumably to match the
# intensity range of the digits dataset the classifier was trained on; assumes
# the file loads as 8-bit (0-255) grayscale. TODO confirm.
im = (255-im)/16.0

In [None]:
# Make reversed grayscale the current colormap; the later imshow calls on the
# cropped digits pass no cmap, so they presumably rely on this setting.
plt.set_cmap(plt.cm.gray_r)
# Display the prepared image with nearest-neighbour interpolation.
plt.imshow(im, interpolation='nearest')


In [None]:
# Every digit is cut out as a strip of the same width, keeping all rows.
glyph_width = 8
four = im[:, 6:6 + glyph_width]
two = im[:, 14:14 + glyph_width]
six = im[:, 25:25 + glyph_width]
# Copy before editing so the blanking below does not write into `im`.
one = im[:, 20:20 + glyph_width].copy()
one[:, 6:] = 0 # the next letter was too close so have to overwrite that part

# Draw the four crops side by side in the order one, two, four, six.
nearest = {'interpolation': 'nearest'}
for position, glyph in enumerate((one, two, four), 1):
    plt.subplot(1, 4, position)
    plt.imshow(glyph, **nearest)
# The last panel stays outside the loop so the cell still ends with the
# imshow expression, as before.
plt.subplot(1, 4, 4)
plt.imshow(six, **nearest)



In [None]:
# Flatten each cropped digit into one feature row (order: one, two, four, six)
# and classify them all in a single call.
handwritten_rows = [glyph.flatten() for glyph in (one, two, four, six)]
classifier.predict(handwritten_rows)

Not bad!  My handwriting isn't so great! (the 4 and the 6 get confused with 9 and 4)





# Another demo:
Visualizing what an SVM does!
(see console demo)
