Please refer to the accompanying blog post for a step-by-step explanation.

In [1]:
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

from sklearn import datasets
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.semi_supervised import LabelSpreading

In [2]:
# Load scikit-learn's bundled digits dataset; later cells read its `data`,
# `target` and `images` attributes.
digits = datasets.load_digits()

In [3]:
# Fixed seed so the shuffled sample order, and everything derived from it,
# is reproducible from a fresh kernel.
rng = np.random.RandomState(0)
indices = rng.permutation(len(digits.data))

In [4]:
# Work on the first 330 shuffled samples only. Features, targets and images
# are sliced with the same index array so they stay row-aligned.
subset = indices[:330]
X, y, images = digits.data[subset], digits.target[subset], digits.images[subset]

In [7]:
# Experiment sizes.
n_total_samples = len(y)  # number of samples in the working subset
n_labeled_points = 40  # size of the initial labeled pool; grows as points are queried
max_iterations = 5  # NOTE(review): not read by any cell visible here; the five iteration sections below are written out by hand

In [6]:
# Every sample after the first `n_labeled_points` starts out unlabeled.
unlabeled_indices = np.arange(n_labeled_points, n_total_samples)

In [8]:
# Training targets: a copy of the true labels in which every unlabeled sample
# is overwritten with -1. `y` itself is left intact because later cells use
# it as ground truth.
y_train = np.copy(y)
y_train[unlabeled_indices] = -1

## Iteration 1

In [9]:
# Fit label spreading on all samples at once; y_train carries -1 for the
# samples whose labels were hidden above.
lp_model = LabelSpreading(gamma=0.25, max_iter=20)
lp_model.fit(X, y_train)

In [10]:
# Ground truth and model output, both restricted to the unlabeled samples so
# the evaluation never scores points whose labels the model was given.
true_labels = y[unlabeled_indices]
predicted_labels = lp_model.transduction_[unlabeled_indices]

In [11]:
# Confusion matrix over the unlabeled samples, with rows/columns ordered as
# lp_model.classes_ (rows: true class, columns: predicted class).
cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)

In [12]:
# Banner for the first iteration plus the current labeled/unlabeled split.
n_unlabeled_points = n_total_samples - n_labeled_points
print("Iteration %i %s" % (1, "_" * 70))
print(
    "Label Spreading model: %d labeled & %d unlabeled (%d total)"
    % (n_labeled_points, n_unlabeled_points, n_total_samples)
)


Iteration 1 ______________________________________________________________________
Label Spreading model: 40 labeled & 290 unlabeled (330 total)


In [13]:
# Per-class precision / recall / F1, computed on the unlabeled samples only.
print(classification_report(true_labels, predicted_labels))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        22
           1       0.78      0.69      0.73        26
           2       0.93      0.93      0.93        29
           3       1.00      0.89      0.94        27
           4       0.92      0.96      0.94        23
           5       0.96      0.70      0.81        33
           6       0.97      0.97      0.97        35
           7       0.94      0.91      0.92        33
           8       0.62      0.89      0.74        28
           9       0.73      0.79      0.76        34

    accuracy                           0.87       290
   macro avg       0.89      0.87      0.87       290
weighted avg       0.88      0.87      0.87       290



In [14]:
# Show the confusion matrix computed above for the unlabeled samples.
print("Confusion matrix")
print(cm)

Confusion matrix
[[22  0  0  0  0  0  0  0  0  0]
 [ 0 18  2  0  0  0  1  0  5  0]
 [ 0  0 27  0  0  0  0  0  2  0]
 [ 0  0  0 24  0  0  0  0  3  0]
 [ 0  1  0  0 22  0  0  0  0  0]
 [ 0  0  0  0  0 23  0  0  0 10]
 [ 0  1  0  0  0  0 34  0  0  0]
 [ 0  0  0  0  0  0  0 30  3  0]
 [ 0  3  0  0  0  0  0  0 25  0]
 [ 0  0  0  0  2  1  0  2  2 27]]


In [15]:
# label_distributions_ has one row per sample and one column per class.
lp_model.label_distributions_.shape

(330, 10)

In [22]:
# Inspect the class distribution of sample 0. It lies in the labeled pool
# (index < n_labeled_points), and practically all mass sits on one class.
lp_model.label_distributions_[0]

array([6.76139169e-148, 5.72305415e-087, 1.00000000e+000, 8.01745566e-102,
       1.50947021e-133, 1.55792738e-166, 6.69030793e-141, 7.12758726e-105,
       4.00562208e-100, 1.32090505e-125])

In [23]:
# Sanity check: a one-hot distribution carries no uncertainty, so its entropy is 0.
stats.entropy([0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

0.0

In [24]:
# Sanity check: a uniform distribution over 10 classes is the most uncertain
# case; its entropy is ln(10), roughly 2.303.
stats.entropy([0.1] * 10)

2.3025850929940455

In [25]:
# Entropy of every sample's predicted class distribution. The transpose puts
# samples on the columns, so one entropy value is produced per sample.
pred_entropies = stats.entropy(lp_model.label_distributions_.T)

# Rank all samples from most to least uncertain, drop the ones that already
# have a label, and keep at most the 5 most uncertain of the remainder.
ranked = np.argsort(pred_entropies)[::-1]
is_unlabeled = np.isin(ranked, unlabeled_indices)
uncertainty_index = ranked[is_unlabeled][:5]

In [26]:
# Move the queried samples into the labeled pool.
#
# `delete_indices` holds the positions inside `unlabeled_indices` (not the
# sample ids) of the samples chosen in `uncertainty_index`; np.delete removes
# those positions in one vectorised step.
delete_indices = np.flatnonzero(np.isin(unlabeled_indices, uncertainty_index))
unlabeled_indices = np.delete(unlabeled_indices, delete_indices)

# Derive the labeled count from the pool instead of incrementing it. That way
# re-running this cell (when nothing is left to remove) cannot inflate it.
n_labeled_points = n_total_samples - len(unlabeled_indices)

## Iteration 2

In [27]:
# Re-mask the labels of every sample that is still unlabeled; `y` keeps the
# ground truth untouched.
y_train = np.copy(y)
y_train[unlabeled_indices] = -1

# Refit from scratch on the current labeled pool.
lp_model = LabelSpreading(gamma=0.25, max_iter=20)
lp_model.fit(X, y_train)

# Evaluate on the samples that are still unlabeled.
predicted_labels = lp_model.transduction_[unlabeled_indices]
true_labels = y[unlabeled_indices]

cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)
print("Confusion matrix")
print(cm)
print(classification_report(true_labels, predicted_labels))

# Entropy of every sample's predicted class distribution (one value per sample).
pred_entropies = stats.entropy(lp_model.label_distributions_.T)

# Select up to 5 still-unlabeled samples, most uncertain first.
uncertainty_index = np.argsort(pred_entropies)[::-1]
uncertainty_index = uncertainty_index[
    np.isin(uncertainty_index, unlabeled_indices)
][:5]

# Move the queried samples into the labeled pool. `delete_indices` holds
# positions inside `unlabeled_indices`, not sample ids.
delete_indices = np.flatnonzero(np.isin(unlabeled_indices, uncertainty_index))
unlabeled_indices = np.delete(unlabeled_indices, delete_indices)

# Derive the labeled count from the pool instead of incrementing it, so the
# counter always agrees with `unlabeled_indices`.
n_labeled_points = n_total_samples - len(unlabeled_indices)

Confusion matrix
[[22  0  0  0  0  0  0  0  0  0]
 [ 0 22  0  0  0  0  0  0  0  0]
 [ 0  0 27  0  0  0  0  0  2  0]
 [ 0  0  0 26  0  0  0  0  0  0]
 [ 0  1  0  0 22  0  0  0  0  0]
 [ 0  0  0  0  0 23  0  0  0 10]
 [ 0  1  0  0  0  0 34  0  0  0]
 [ 0  0  0  0  0  0  0 30  3  0]
 [ 0  4  0  0  0  0  0  0 24  0]
 [ 0  0  0  0  2  1  0  2  2 27]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        22
           1       0.79      1.00      0.88        22
           2       1.00      0.93      0.96        29
           3       1.00      1.00      1.00        26
           4       0.92      0.96      0.94        23
           5       0.96      0.70      0.81        33
           6       1.00      0.97      0.99        35
           7       0.94      0.91      0.92        33
           8       0.77      0.86      0.81        28
           9       0.73      0.79      0.76        34

    accuracy                           0.90       285
  

## Iteration 3

In [28]:
# Re-mask the labels of every sample that is still unlabeled; `y` keeps the
# ground truth untouched.
y_train = np.copy(y)
y_train[unlabeled_indices] = -1

# Refit from scratch on the current labeled pool.
lp_model = LabelSpreading(gamma=0.25, max_iter=20)
lp_model.fit(X, y_train)

# Evaluate on the samples that are still unlabeled.
predicted_labels = lp_model.transduction_[unlabeled_indices]
true_labels = y[unlabeled_indices]

cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)
print("Confusion matrix")
print(cm)
print(classification_report(true_labels, predicted_labels))

# Entropy of every sample's predicted class distribution (one value per sample).
pred_entropies = stats.entropy(lp_model.label_distributions_.T)

# Select up to 5 still-unlabeled samples, most uncertain first.
uncertainty_index = np.argsort(pred_entropies)[::-1]
uncertainty_index = uncertainty_index[
    np.isin(uncertainty_index, unlabeled_indices)
][:5]

# Move the queried samples into the labeled pool. `delete_indices` holds
# positions inside `unlabeled_indices`, not sample ids.
delete_indices = np.flatnonzero(np.isin(unlabeled_indices, uncertainty_index))
unlabeled_indices = np.delete(unlabeled_indices, delete_indices)

# Derive the labeled count from the pool instead of incrementing it, so the
# counter always agrees with `unlabeled_indices`.
n_labeled_points = n_total_samples - len(unlabeled_indices)

Confusion matrix
[[22  0  0  0  0  0  0  0  0  0]
 [ 0 22  0  0  0  0  0  0  0  0]
 [ 0  0 28  0  0  0  0  0  0  0]
 [ 0  0  0 26  0  0  0  0  0  0]
 [ 0  0  0  0 20  0  0  0  0  0]
 [ 0  0  0  0  0 23  0  0  0 10]
 [ 0  1  0  0  0  0 34  0  0  0]
 [ 0  0  0  0  0  0  0 32  0  0]
 [ 0  3  0  0  1  0  0  0 24  0]
 [ 0  0  0  0  2  1  0  2  2 27]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        22
           1       0.85      1.00      0.92        22
           2       1.00      1.00      1.00        28
           3       1.00      1.00      1.00        26
           4       0.87      1.00      0.93        20
           5       0.96      0.70      0.81        33
           6       1.00      0.97      0.99        35
           7       0.94      1.00      0.97        32
           8       0.92      0.86      0.89        28
           9       0.73      0.79      0.76        34

    accuracy                           0.92       280
  

## Iteration 4

In [29]:
# Re-mask the labels of every sample that is still unlabeled; `y` keeps the
# ground truth untouched.
y_train = np.copy(y)
y_train[unlabeled_indices] = -1

# Refit from scratch on the current labeled pool.
lp_model = LabelSpreading(gamma=0.25, max_iter=20)
lp_model.fit(X, y_train)

# Evaluate on the samples that are still unlabeled.
predicted_labels = lp_model.transduction_[unlabeled_indices]
true_labels = y[unlabeled_indices]

cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)
print("Confusion matrix")
print(cm)
print(classification_report(true_labels, predicted_labels))

# Entropy of every sample's predicted class distribution (one value per sample).
pred_entropies = stats.entropy(lp_model.label_distributions_.T)

# Select up to 5 still-unlabeled samples, most uncertain first.
uncertainty_index = np.argsort(pred_entropies)[::-1]
uncertainty_index = uncertainty_index[
    np.isin(uncertainty_index, unlabeled_indices)
][:5]

# Move the queried samples into the labeled pool. `delete_indices` holds
# positions inside `unlabeled_indices`, not sample ids.
delete_indices = np.flatnonzero(np.isin(unlabeled_indices, uncertainty_index))
unlabeled_indices = np.delete(unlabeled_indices, delete_indices)

# Derive the labeled count from the pool instead of incrementing it, so the
# counter always agrees with `unlabeled_indices`.
n_labeled_points = n_total_samples - len(unlabeled_indices)

Confusion matrix
[[22  0  0  0  0  0  0  0  0  0]
 [ 0 22  0  0  0  0  0  0  0  0]
 [ 0  0 27  0  0  0  0  0  0  0]
 [ 0  0  0 26  0  0  0  0  0  0]
 [ 0  0  0  0 20  0  0  0  0  0]
 [ 0  0  0  0  0 27  0  0  0  4]
 [ 0  1  0  0  0  0 34  0  0  0]
 [ 0  0  0  0  0  0  0 31  0  0]
 [ 0  3  0  0  1  0  0  0 24  0]
 [ 0  0  0  0  2  1  0  0  2 28]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        22
           1       0.85      1.00      0.92        22
           2       1.00      1.00      1.00        27
           3       1.00      1.00      1.00        26
           4       0.87      1.00      0.93        20
           5       0.96      0.87      0.92        31
           6       1.00      0.97      0.99        35
           7       1.00      1.00      1.00        31
           8       0.92      0.86      0.89        28
           9       0.88      0.85      0.86        33

    accuracy                           0.95       275
  

## Iteration 5

In [30]:
# Re-mask the labels of every sample that is still unlabeled; `y` keeps the
# ground truth untouched.
y_train = np.copy(y)
y_train[unlabeled_indices] = -1

# Refit from scratch on the current labeled pool.
lp_model = LabelSpreading(gamma=0.25, max_iter=20)
lp_model.fit(X, y_train)

# Evaluate on the samples that are still unlabeled.
predicted_labels = lp_model.transduction_[unlabeled_indices]
true_labels = y[unlabeled_indices]

cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)
print("Confusion matrix")
print(cm)
print(classification_report(true_labels, predicted_labels))

# Entropy of every sample's predicted class distribution (one value per sample).
pred_entropies = stats.entropy(lp_model.label_distributions_.T)

# Select up to 5 still-unlabeled samples, most uncertain first.
uncertainty_index = np.argsort(pred_entropies)[::-1]
uncertainty_index = uncertainty_index[
    np.isin(uncertainty_index, unlabeled_indices)
][:5]

# Move the queried samples into the labeled pool. `delete_indices` holds
# positions inside `unlabeled_indices`, not sample ids.
delete_indices = np.flatnonzero(np.isin(unlabeled_indices, uncertainty_index))
unlabeled_indices = np.delete(unlabeled_indices, delete_indices)

# Derive the labeled count from the pool instead of incrementing it, so the
# counter always agrees with `unlabeled_indices`.
n_labeled_points = n_total_samples - len(unlabeled_indices)

Confusion matrix
[[22  0  0  0  0  0  0  0  0  0]
 [ 0 22  0  0  0  0  0  0  0  0]
 [ 0  0 26  1  0  0  0  0  0  0]
 [ 0  0  0 25  0  0  0  0  0  0]
 [ 0  0  0  0 19  0  0  0  0  0]
 [ 0  0  0  0  0 27  0  0  0  4]
 [ 0  1  0  0  0  0 34  0  0  0]
 [ 0  0  0  0  0  0  0 31  0  0]
 [ 0  0  0  0  1  0  0  0 24  0]
 [ 0  0  0  0  2  1  0  0  2 28]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        22
           1       0.96      1.00      0.98        22
           2       1.00      0.96      0.98        27
           3       0.96      1.00      0.98        25
           4       0.86      1.00      0.93        19
           5       0.96      0.87      0.92        31
           6       1.00      0.97      0.99        35
           7       1.00      1.00      1.00        31
           8       0.92      0.96      0.94        25
           9       0.88      0.85      0.86        33

    accuracy                           0.96       270
  

## Baseline

In [31]:
# Baseline without active querying: simply label the first 60 samples.
# 60 is the labeled-pool size the Iteration 5 model was fitted on (40 initial
# plus 4 rounds of 5 queries), so both reports cover 270 unlabeled samples.
# NOTE: this overwrites `unlabeled_indices`; re-run from the top before
# resuming the active-learning cells above.
n_baseline_labeled = 60
unlabeled_indices = np.arange(n_baseline_labeled, n_total_samples)

# Hide the labels of everything outside the baseline labeled pool.
y_train = np.copy(y)
y_train[unlabeled_indices] = -1

lp_model = LabelSpreading(gamma=0.25, max_iter=20)
lp_model.fit(X, y_train)

# Score only the samples whose labels were hidden.
true_labels = y[unlabeled_indices]
predicted_labels = lp_model.transduction_[unlabeled_indices]

cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)
print("Confusion matrix")
print(cm)
print(classification_report(true_labels, predicted_labels))

Confusion matrix
[[19  0  0  0  0  0  0  0  0  0]
 [ 0 20  2  0  0  0  1  0  0  0]
 [ 0  0 26  0  0  0  0  0  2  0]
 [ 0  0  0 23  0  0  0  0  3  0]
 [ 0  2  0  0 21  0  0  0  0  0]
 [ 0  0  0  0  0 29  0  0  0  3]
 [ 0  1  0  0  0  0 30  0  0  0]
 [ 0  0  0  0  0  0  0 26  3  0]
 [ 0  3  0  0  0  0  0  0 24  0]
 [ 0  1  0  0  2  1  0  2  1 25]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.74      0.87      0.80        23
           2       0.93      0.93      0.93        28
           3       1.00      0.88      0.94        26
           4       0.91      0.91      0.91        23
           5       0.97      0.91      0.94        32
           6       0.97      0.97      0.97        31
           7       0.93      0.90      0.91        29
           8       0.73      0.89      0.80        27
           9       0.89      0.78      0.83        32

    accuracy                           0.90       270
  