In [1]:
# install modAL library
! pip install modAL



In [2]:
pip install modAL

Note: you may need to restart the kernel to use updated packages.


In [None]:
from collections import Counter
from IPython.display import Image

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix    
from modAL.models import ActiveLearner
from modAL.uncertainty import entropy_sampling
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [4]:
'''Create useful functions'''

# create classification metrics function
def get_classification_metrics(y_real, y_pred):
    """Print and return quality metrics for a binary classification result.

    The positive class is the second label in sorted order (label 1 for 0/1
    data), which is how scikit-learn lays out a binary confusion matrix:
    rows are true labels, columns are predicted labels, i.e.
    ``[[tn, fp], [fn, tp]]``.

    Parameters
    ----------
    y_real : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'sensitivity'``, ``'specifity'`` and
        ``'mean_sensitivity_and_specifity'`` (key spelling kept for callers).

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2 (the problem is not binary).
    """
    cm = confusion_matrix(y_real, y_pred)
    print('Confusion Matrix:\n', cm)

    # Flattened 2x2 matrix is ordered tn, fp, fn, tp. The previous indexing
    # treated cm[0, 0] as TP, which swapped sensitivity and specificity.
    tn, fp, fn, tp = cm.ravel()

    accuracy = (tp + tn) / cm.sum()
    print(f'Accuracy: {round(accuracy, 4)}.')

    # Sensitivity (recall of the positive class): TP / (TP + FN).
    sensitivity = tp / (tp + fn)
    print(f'Sensitivity: {round(sensitivity, 4)}.')

    # Specificity (recall of the negative class): TN / (TN + FP).
    specifity = tn / (tn + fp)
    print(f'Specifity: {round(specifity, 4)}.')

    mean_sensitivity_and_specifity = (sensitivity + specifity) / 2
    print(f'Mean Sensitivity and Specifity: {round(mean_sensitivity_and_specifity, 4)}.')

    return {'accuracy': accuracy, 'sensitivity': sensitivity, 'specifity' : specifity, 
            'mean_sensitivity_and_specifity': mean_sensitivity_and_specifity}

In [5]:
# classification metrics function
# NOTE: this cell defines get_classification_metrics a second time; when the
# cells run in order, this later definition is the one in effect.
def get_classification_metrics(y_real, y_pred):
    """Print and return quality metrics for a binary classification result.

    The positive class is the second label in sorted order (label 1 for 0/1
    data), matching scikit-learn's binary confusion-matrix layout
    ``[[tn, fp], [fn, tp]]`` (rows: true labels, columns: predicted labels).

    Parameters
    ----------
    y_real : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'sensitivity'``, ``'specifity'`` and
        ``'mean_sensitivity_and_specifity'`` (key spelling kept for callers).

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2 (the problem is not binary).
    """
    cm = confusion_matrix(y_real, y_pred)
    print('Confusion Matrix:\n', cm)
    # Flattened 2x2 matrix is ordered tn, fp, fn, tp. Reading cm[0, 0] as TP
    # (as before) swapped the reported sensitivity and specificity.
    tn, fp, fn, tp = cm.ravel()

    accuracy = (tp + tn) / cm.sum()
    print(f'Accuracy: {round(accuracy, 4)}.')

    # Sensitivity: share of true positives that were detected.
    sensitivity = tp / (tp + fn)
    print(f'Sensitivity: {round(sensitivity, 4)}.')

    # Specificity: share of true negatives that were correctly rejected.
    specifity = tn / (tn + fp)
    print(f'Specifity: {round(specifity, 4)}.')

    mean_sensitivity_and_specifity = (sensitivity + specifity) / 2
    print(f'Mean Sensitivity and Specifity: {round(mean_sensitivity_and_specifity, 4)}.')

    return {'accuracy': accuracy, 'sensitivity': sensitivity, 'specifity' : specifity,
            'mean_sensitivity_and_specifity': mean_sensitivity_and_specifity}

In [8]:
# create artificial dataset function

def get_artificial_dataset(use_scaling):
    """Generate the synthetic, heavily imbalanced binary dataset used in this notebook.

    The data comes from ``make_classification`` with a fixed ``random_state``,
    30000 samples, 10 features and class weights of 0.98 / 0.02.

    Parameters
    ----------
    use_scaling : bool
        When truthy, the feature matrix is passed through
        ``StandardScaler().fit_transform`` before being returned.

    Returns
    -------
    tuple
        ``(features, labels)`` as returned by ``make_classification``, with the
        features optionally standardized.
    """
    generator_settings = {
        'n_samples': 30000,
        'n_features': 10,
        'n_informative': 3,
        'n_redundant': 1,
        'n_classes': 2,
        'n_clusters_per_class': 3,
        'weights': [0.98, 0.02],
        'class_sep': 1.2,
        'flip_y': 0,
        'random_state': 10,
    }
    features, labels = make_classification(**generator_settings)

    # Guard clause: hand back the raw features when no scaling was requested.
    if not use_scaling:
        return features, labels

    return StandardScaler().fit_transform(features), labels



In [None]:
'''Create_artificial_datasets'''

# Build the standardized synthetic dataset, then hold out 30% of it as the test set.
# random_state=500 keeps the split reproducible across runs.
# NOTE(review): with use_scaling=True the scaler is fitted on the full dataset
# before the split, so test-set statistics influence the training features.
x_array, y_array = get_artificial_dataset(use_scaling=True)
x_train, x_test, y_train, y_test = train_test_split(x_array, y_array, test_size=0.3, random_state=500)

In [10]:
'''active_learning'''

# Display an illustration loaded from a shortened URL
# (presumably a diagram of the active-learning loop — verify the link target).
Image(url='https://bit.ly/3szYeTF', width=900, height=300)

In [11]:
# classification uncertainty
# (the image is presumably the uncertainty formula — verify the link target)
Image(url='https://bit.ly/3sGJF5G', height=80)
# where x is the observation being predicted and x^ is the most probable predicted value

In [12]:
# Toy example ("obserwacje_predykcje" = observations/predictions): each row holds
# the predicted class probabilities of one observation.
obserwacje_predykcje = np.array([[0.85,0.15],
                                 [0.3,0.7],
                                 [0.61, 0.39]])
# Row-wise minimum as a column vector. With two probabilities that sum to 1,
# this equals 1 - (largest probability), i.e. the classification uncertainty.
obserwacje_predykcje.min(axis=1).reshape(3,1)

array([[0.15],
       [0.3 ],
       [0.39]])

In [None]:
# Select the initial training set: all observations from the minority class
# plus 50 distinct, randomly chosen observations from the majority class.
minority_indices = np.where(y_train == 1)[0]
np.random.seed(123)  # fixed seed so the majority sample is reproducible
# replace=False samples without replacement; the default (replace=True) could
# pick the same observation more than once and yield fewer than 50 unique rows.
majority_indices = np.random.choice(np.where(y_train == 0)[0], size=50, replace=False)
starting_training_indices = np.concatenate([minority_indices, majority_indices])

x_train_al = x_train[starting_training_indices]
y_train_al = y_train[starting_training_indices]
print(Counter(y_train_al))

# Isolate the non-training examples we'll be querying.
x_pool = np.delete(x_train, starting_training_indices, axis=0)
y_pool = np.delete(y_train, starting_training_indices, axis=0)
x_pool.shape

In [None]:
# Wrap a k-nearest-neighbours classifier (default settings) in a modAL ActiveLearner,
# fitted on the initial training set and using entropy sampling as the query strategy.
knn = KNeighborsClassifier()
learner = ActiveLearner(estimator=knn, X_training=x_train_al, y_training=y_train_al, query_strategy=entropy_sampling)

# Record our learner's score on the test set before any queries are made
al_predicted = learner.predict(x_test)
al_classification_metrics = get_classification_metrics(y_test, al_predicted)

# Number of query rounds to run; the history starts with the baseline
# (pre-query) mean of sensitivity and specificity.
N_QUERIES = 25
classification_performance_history = [al_classification_metrics['mean_sensitivity_and_specifity']]

In [None]:
# Active-learning loop: in each of the N_QUERIES rounds the learner picks
# `n_of_queried_instances` observations from the pool, is taught their labels,
# and is re-evaluated on the test set.
n_of_queried_instances = 10
for index in range(N_QUERIES):
    # Let the learner's query strategy choose which pool observations to label.
    query_index, query_instance = learner.query(x_pool, n_instances=n_of_queried_instances)

    # Teach the learner the queried observations together with their true labels.
    temp_x_al, temp_y_al = x_pool[query_index].reshape(n_of_queried_instances, -1), y_pool[query_index]
    learner.teach(X=temp_x_al, y=temp_y_al)

    # Remove the queried observations from the pool so they cannot be selected again.
    x_pool, y_pool = np.delete(x_pool, query_index, axis=0), np.delete(y_pool, query_index)

    temp_predicted = learner.predict(x_test)
    temp_mean_sensitivity_and_specifity = get_classification_metrics(y_test, temp_predicted)['mean_sensitivity_and_specifity']
    print(f'Mean sensitivity and specifity after query {index+1}: {temp_mean_sensitivity_and_specifity:0.4f}.')

    # Store the metric VALUE; the previous code appended the string literal
    # 'mean_sensitivity_and_specifity', which left the history unusable for plotting.
    classification_performance_history.append(temp_mean_sensitivity_and_specifity)

In [None]:
# final training set: class counts of the labels the learner currently holds
Counter(learner.y_training)

In [None]:
# changes in classification quality metric during active learning training iterations
# x-axis: training-set size after each round. The initial set is all minority
# observations plus the sampled majority ones (len(y_train_al)), not a fixed 20;
# each round adds n_of_queried_instances observations.
training_set_sizes = [len(y_train_al) + n_of_queried_instances * index for index in range(N_QUERIES + 1)]
plt.plot(training_set_sizes, classification_performance_history)
plt.xlabel('Number of training observations')
plt.ylabel('Mean sensitivity and specificity')
plt.title('Classification quality during active learning')
plt.show()