In [1]:
import cv2
import numpy as np
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from keras.datasets import mnist
import warnings

# Ignore every warning raised after this point so the training output stays
# readable.
# NOTE(review): this is a blanket filter, so deprecation warnings from the
# imported libraries are hidden as well; consider narrowing it by category.
warnings.filterwarnings('ignore')

def rotate_image(image, angle):
    """
    Return ``image`` rotated by ``angle`` about its centre using OpenCV.

    The output has the same width and height as the input and no scaling
    is applied.
    """
    height, width = image.shape[:2]
    centre = (width / 2, height / 2)
    transform = cv2.getRotationMatrix2D(centre, angle, 1)
    return cv2.warpAffine(image, transform, (width, height))

def augment_dataset(dataset, oversample_factor, rotation_angle=None):
    """
    Augments a dataset by appending rotated copies of randomly chosen images.

    Args:
        dataset: Tuple ``(X, y)`` of numpy arrays; ``X`` holds the images
            along axis 0 and ``y`` the matching labels.
        oversample_factor: Size of the result as a multiple of the input
            size; ``int(len(X) * (oversample_factor - 1))`` rotated samples
            are appended.
        rotation_angle: Angle in degrees applied to every new sample. When
            ``None``, each sample gets an angle drawn at random from
            ``[-30, -20, -10, 10, 20, 30]``.

    Returns:
        Tuple ``(X, y)`` with the original samples first, followed by the
        rotated ones. When no additional samples are requested the inputs
        are returned unchanged.
    """
    X, y = dataset
    sample_count = X.shape[0]

    # Determine the number of images to generate
    additional_samples = int(sample_count * (oversample_factor - 1))
    if additional_samples <= 0:
        # Nothing to add. Without this guard an empty 1-D float array would
        # be concatenated onto the N-D image array, which raises ValueError.
        return X, y

    augmented_images = []
    augmented_labels = []
    for _ in range(additional_samples):
        # Sample source images with replacement.
        index = random.randint(0, sample_count - 1)
        if rotation_angle is not None:
            angle = rotation_angle
        else:
            angle = random.choice([-30, -20, -10, 10, 20, 30])
        augmented_images.append(rotate_image(X[index], angle))
        augmented_labels.append(y[index])

    # Append the augmented samples after the original dataset
    X = np.concatenate([X, np.array(augmented_images)], axis=0)
    y = np.concatenate([y, np.array(augmented_labels)], axis=0)

    return X, y

def train_model(dataset):
    """
    Fits a random forest on the dataset and returns the best estimator.

    Images are flattened to one row per sample, 80% of the rows are kept for
    training (fixed split seed), and a small grid search picks the forest's
    hyperparameters.
    """
    images, labels = dataset
    flattened = images.reshape(images.shape[0], -1)

    # Only the training portion of the split is used; the held-out 20% is
    # discarded.
    split = train_test_split(flattened, labels, test_size=0.2, random_state=42)
    X_train, y_train = split[0], split[2]

    # Grid-search a RandomForest classifier with 2-fold cross-validation
    search = GridSearchCV(
        RandomForestClassifier(),
        {'n_estimators': [10, 25], 'criterion': ['gini', 'entropy']},
        cv=2,
        verbose=3,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_

def evaluate_performance(model, dataset, performance_threshold):
    """
    Reports whether the model's accuracy on the dataset exceeds a threshold.

    Images are flattened to one row per sample before prediction. The result
    is truthy only when accuracy is strictly greater than
    ``performance_threshold``.
    """
    images, labels = dataset
    n_samples = images.shape[0]
    predictions = model.predict(images.reshape(n_samples, -1))
    meets_threshold = accuracy_score(labels, predictions) > performance_threshold
    return meets_threshold

# Load the MNIST dataset. The first tuple is used as the training data and the
# second as the "ground truth" set the model is evaluated against.
original_data, ground_truth_data = mnist.load_data()

# Train the initial model
model = train_model(original_data)

# Evaluate model performance
threshold = 0.95  # accuracy must be strictly above this to count as satisfactory
# NOTE(review): this baseline flag is overwritten inside the loop below before
# it is ever read or printed.
performance_flag = evaluate_performance(model, ground_truth_data, threshold)

# React to model drift by augmenting the dataset and retraining if necessary
train_imgs, train_labels = original_data
gt_imgs, gt_labels = ground_truth_data

# Introduce rotated samples one angle at a time and retrain whenever the
# current model no longer clears the threshold.
for angle in [-30, -20, -10, 10, 20, 30]:
    # Reset for every angle; incremented after each retraining attempt below.
    oversample_rate = 2
    print(f'Augmenting ground truth data with images rotated by {angle} degrees.')
    # gt_imgs/gt_labels are reassigned, so rotated samples accumulate across
    # angles: the ground truth set doubles on every outer iteration.
    gt_imgs, gt_labels = augment_dataset((gt_imgs, gt_labels), oversample_rate, angle)
    performance_flag = evaluate_performance(model, (gt_imgs, gt_labels), threshold)
    print(f'Model performance satisfactory after augmentation: {performance_flag}.')

    # Retrain until accuracy on the augmented ground truth exceeds the threshold.
    # NOTE(review): the number of attempts is unbounded, and each attempt
    # augments the already-augmented training set, so its size is multiplied
    # by oversample_rate every pass. Confirm this growth is intended.
    while not performance_flag:
        print('-' * 80)
        print(f'Augmenting training data with oversample rate = {oversample_rate}.')
        train_imgs, train_labels = augment_dataset((train_imgs, train_labels), oversample_rate, angle)
        model = train_model((train_imgs, train_labels))
        performance_flag = evaluate_performance(model, (gt_imgs, gt_labels), threshold)
        print(f'Model performance for oversample rate {oversample_rate} and angle {angle}: {performance_flag}.')
        print('-' * 80, '\n')
        oversample_rate += 1


Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END ...criterion=gini, n_estimators=10;, score=0.930 total time=   6.8s
[CV 2/2] END ...criterion=gini, n_estimators=10;, score=0.929 total time=   6.9s
[CV 1/2] END ...criterion=gini, n_estimators=25;, score=0.951 total time=  18.0s
[CV 2/2] END ...criterion=gini, n_estimators=25;, score=0.951 total time=  17.2s
[CV 1/2] END criterion=entropy, n_estimators=10;, score=0.934 total time=   8.1s
[CV 2/2] END criterion=entropy, n_estimators=10;, score=0.931 total time=   8.1s
[CV 1/2] END criterion=entropy, n_estimators=25;, score=0.951 total time=  20.6s
[CV 2/2] END criterion=entropy, n_estimators=25;, score=0.950 total time=  20.8s
Augmenting ground truth data with images rotated by -30 degrees.
Model performance satisfactory after augmentation: False.
--------------------------------------------------------------------------------
Augmenting training data with oversample rate = 2.
Fitting 2 folds for each of 4 candidat