
<br>
===================================================<br>
Label Propagation digits: Demonstrating performance<br>
===================================================<br>
This example demonstrates the power of semi-supervised learning by<br>
training a Label Spreading model to classify handwritten digits<br>
with sets of very few labels.<br>
The handwritten digit dataset has 1797 total points. The model will<br>
be trained on a random subset of 340 of them, of which only 40 will be labeled. Results<br>
in the form of a confusion matrix and a series of metrics over each<br>
class will be very good.<br>
At the end, the top 10 most uncertain predictions will be shown.<br>


In [None]:
# Echo the module docstring.
# NOTE(review): inside a notebook `__doc__` may not hold the description
# above -- confirm this cell still prints what is intended.
print(__doc__)

Authors: Clay Woolam <clay@woolam.org><br>
License: BSD

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from scipy import stats

In [None]:
from sklearn import datasets
from sklearn.semi_supervised import LabelSpreading

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the digits dataset and build a reproducible random ordering of its rows.
digits = datasets.load_digits()
rng = np.random.RandomState(2)  # fixed seed keeps the ordering repeatable
indices = np.arange(digits.data.shape[0])
rng.shuffle(indices)  # permutes the row indices in place

In [None]:
# Restrict the example to the first 340 samples of the shuffled ordering;
# features, targets and images are sliced with the same index array so
# they stay aligned.
subset = indices[:340]
X = digits.data[subset]
y = digits.target[subset]
images = digits.images[subset]

In [None]:
# How many points are treated as labeled, and the size of the working set.
n_labeled_points = 40
n_total_samples = y.shape[0]

In [None]:
# Rebind `indices` to the positions 0 .. n_total_samples - 1 of the working set.
indices = np.arange(n_total_samples)

In [None]:
# Every position from `n_labeled_points` onward is treated as unlabeled.
unlabeled_set = indices[n_labeled_points:]

#############################################################################<br>
Mask the labels of the unlabeled points (-1 marks a point as unlabeled)

In [None]:
# Training targets: a copy of `y` (so the true labels stay intact) in which
# every unlabeled position is overwritten with -1.
y_train = y.copy()
y_train[unlabeled_set] = -1

#############################################################################<br>
Learn with LabelSpreading

In [None]:
# Ground truth for the points whose labels were hidden from the model.
true_labels = y[unlabeled_set]

# Fit on every point (labeled and masked alike), then read back the labels
# the model assigned to the masked points.
lp_model = LabelSpreading(gamma=0.25, max_iter=20)
lp_model.fit(X, y_train)
predicted_labels = lp_model.transduction_[unlabeled_set]

In [None]:
# Confusion matrix over the unlabeled points only; `labels` fixes the
# row/column order to the classes known to the fitted model.
cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)

In [None]:
# One-line summary of how the working set is split.
n_unlabeled_points = n_total_samples - n_labeled_points
summary = "Label Spreading model: %d labeled & %d unlabeled points (%d total)"
print(summary % (n_labeled_points, n_unlabeled_points, n_total_samples))

In [None]:
# Per-class metrics, evaluated on the unlabeled points only.
print(classification_report(true_labels, predicted_labels))

In [None]:
# Heading and matrix on separate lines, emitted with a single call.
print("Confusion matrix", cm, sep="\n")

#############################################################################<br>
Calculate uncertainty values for each transduced distribution

In [None]:
# Entropy of each point's predicted label distribution. The transpose puts
# one point per column, so the entropy (taken along the first axis) yields
# one value per point. `stats.entropy` is the documented public entry point
# for the function previously reached through `stats.distributions`.
pred_entropies = stats.entropy(lp_model.label_distributions_.T)

#############################################################################<br>
Pick the top 10 most uncertain labels

In [None]:
# argsort is ascending, so the last 10 positions index the 10 largest
# entropies (the most uncertain point comes last).
uncertainty_index = np.argsort(pred_entropies)[-10:]

#############################################################################<br>
Plot

In [None]:
# Draw the selected digits on a 2 x 5 grid, each titled with the label the
# model assigned and the true label.
f = plt.figure(figsize=(7, 5))
for slot, sample in enumerate(uncertainty_index, start=1):
    sub = f.add_subplot(2, 5, slot)
    sub.imshow(images[sample], cmap=plt.cm.gray_r)
    # Remove the tick marks on both axes of this subplot.
    sub.set_xticks([])
    sub.set_yticks([])
    predicted, actual = lp_model.transduction_[sample], y[sample]
    sub.set_title('predict: %i\ntrue: %i' % (predicted, actual))

In [None]:
# Add a figure-level title, then display the figure.
f.suptitle('Learning with small amount of labeled data')
plt.show()