In [1]:
import numpy as np
import joblib
import matplotlib.pyplot as plt
from numpy.linalg import norm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from PIL import Image, ImageEnhance, ImageOps, ImageFilter
from numpy import random


#### Import Data

In [2]:
data_train = joblib.load('train.joblib')
data_test = joblib.load('eval1.joblib')

In [3]:
#Train images and labels
train_images = data_train['data']
train_labels = data_train['target']
#Test images and labels
test_images = data_test['data']
test_labels = data_test['target']

### Augmentation

In [4]:
def hflip(image):
    image_aug = ImageOps.mirror(image)
    return image_aug

def brightness(image1, image2, min_factor=0.5, max_factor=1.5):
    enhancer1 = ImageEnhance.Brightness(image1)
    enhancer2 = ImageEnhance.Brightness(image2)
    factor = np.random.uniform(min_factor, max_factor)
    image1_aug = enhancer1.enhance(factor)
    image2_aug = enhancer2.enhance(factor)
    return image1_aug, image2_aug

def contrast(image1, image2, min_factor=0.5, max_factor=1.5):
    enhancer1 = ImageEnhance.Contrast(image1)
    enhancer2 = ImageEnhance.Contrast(image2)
    factor = np.random.uniform(min_factor, max_factor)
    image1_aug = enhancer1.enhance(factor)
    image2_aug = enhancer2.enhance(factor)
    return image1_aug, image2_aug

def rotate(image1, image2, angles=15):
    angle = np.random.uniform(-angles, angles)
    image1_aug = image1.rotate(angle)
    image2_aug = image2.rotate(angle)
    return image1_aug, image2_aug

def vflip(image):
    image_aug = ImageOps.flip(image)
    return image_aug

def edges(image):
    image_aug = image.filter(ImageFilter.FIND_EDGES)
    return image_aug

def blur(image, radius=0.5):
    image_aug = image.filter(ImageFilter.GaussianBlur(radius))
    return image_aug

def invert_colours(image):
    image_aug = ImageOps.invert(image)
    return image_aug

In [5]:
def augment_images_set(train_images, train_labels, augmentation, augmented_sets):

    indices = np.random.choice(len(train_images), size=augmented_sets, replace=False)

    augmented_dataset = []
    augmented_labels = []

    for idx in indices:
        pair = train_images[idx]
        label = train_labels[idx]
        
        image1 = Image.fromarray(pair[:2914].reshape(62, 47) * 255).convert("L")
        image2 = Image.fromarray(pair[2914:].reshape(62, 47) * 255).convert("L")

        # Apply augmentation (e.g., rotate both images by the same angle)
        if augmentation in [rotate, brightness, contrast]:
            image1_aug, image2_aug = augmentation(image1, image2)
        else:
            image1_aug = augmentation(image1)
            image2_aug = augmentation(image2)


        image1_aug_flat = np.array(image1_aug).flatten()
        image2_aug_flat = np.array(image2_aug).flatten()
        
        augmented_pair = np.hstack([image1_aug_flat, image2_aug_flat])
        augmented_dataset.append(augmented_pair)
        augmented_labels.append(label)
    
    # Convert lists to numpy arrays
    augmented_dataset = np.array(augmented_dataset)
    augmented_labels = np.array(augmented_labels)
    return augmented_dataset, augmented_labels

# augmented_dataset_vflip, augmented_labels_vflip = augment_images_set(train_images, train_labels,vflip, 300)
augmented_dataset_hflip, augmented_labels_hflip = augment_images_set(train_images, train_labels,hflip, 200)
augmented_dataset_brightness, augmented_labels_brightness = augment_images_set(train_images, train_labels,brightness, 300)
augmented_dataset_contrast, augmented_labels_contrast = augment_images_set(train_images, train_labels,contrast, 300)
# augmented_dataset_edges, augmented_labels_edges = augment_images_set(train_images, train_labels,edges, 300)
augmented_dataset_rotate, augmented_labels_rotate = augment_images_set(train_images, train_labels,rotate, 200)
# augmented_dataset_blur, augmented_labels_blur = augment_images_set(train_images, train_labels,blur, 300)
# augmented_dataset_invert_colours, augmented_labels_invert_colours = augment_images_set(train_images, train_labels,invert_colours, 300)

augmented_dataset_combined = np.vstack([
    # augmented_dataset_vflip,
    augmented_dataset_hflip,
    augmented_dataset_brightness,
    augmented_dataset_contrast,
    # augmented_dataset_edges,
    augmented_dataset_rotate,
    # augmented_dataset_blur,
    # augmented_dataset_invert_colours
])

augmented_labels_combined = np.hstack([
    # augmented_labels_vflip,
    augmented_labels_hflip,
    augmented_labels_brightness,
    augmented_labels_contrast,
    # augmented_labels_edges,
    augmented_labels_rotate,
    # augmented_labels_blur,
    # augmented_labels_invert_colours
])

train_images_aug = np.vstack([train_images, augmented_dataset_combined])
train_labels_aug = np.hstack([train_labels, augmented_labels_combined])

#### Feature Engineering Train

In [6]:
# Split the images into two parts
train_images1_aug = train_images_aug[:, :5828 // 2]
train_images2_aug = train_images_aug[:, 5828 // 2:]
# Pixel + euc distance
train_pixel_diff_aug = np.abs(train_images1_aug - train_images2_aug)
train_euc_dist_aug = np.linalg.norm(train_images1_aug - train_images1_aug, axis=1).reshape(-1, 1)
#feature stacking
train_features_aug = np.hstack([train_pixel_diff_aug, train_euc_dist_aug])

#### Feature Engineering Test

In [7]:
# Test Data Prep
test_images1 = test_images[:, :5828 // 2]
test_images2 = test_images[:, 5828 // 2:]
test_pixel_diff = np.abs(test_images1 - test_images2)
test_euc_dist = np.linalg.norm(test_images1 - test_images2, axis=1).reshape(-1, 1)
test_features = np.hstack([test_pixel_diff, test_euc_dist])

#### Standardise + PCA of Feature Engineer Data

In [8]:
#Distance PCA
#Standardise
scaler = StandardScaler()
train_features_scaled_aug = scaler.fit_transform(train_features_aug)
test_features_scaled = scaler.transform(test_features)
#PCA
pca = PCA(n_components=50)
train_features_pca_aug = pca.fit_transform(train_features_scaled_aug)
test_features_pca = pca.transform(test_features_scaled)

#### SVM

In [9]:
#SVM pipeline Augmented and Features Engineered
# Define the parameter grid
param_grid_svm = {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': [0.001, 0.01, 0.1, 1],
    }
svm = SVC(random_state=1)
# Perform Grid Search
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, cv=5, verbose=1, n_jobs=-1)
grid_search_svm.fit(train_features_pca_aug, train_labels_aug)

# Best parameters and score
print("Best Parameters:", grid_search_svm.best_params_)
print("Best Cross-Validation Accuracy:", grid_search_svm.best_score_)

#SVM result Features Engineered
y_train_pred = grid_search_svm.best_estimator_.predict(train_features_pca_aug)
train_accuracy = accuracy_score(train_labels_aug, y_train_pred)
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")

y_test_pred = grid_search_svm.best_estimator_.predict(test_features_pca)
test_accuracy = accuracy_score(test_labels, y_test_pred)
print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
Best Cross-Validation Accuracy: 0.6043749999999999
Training Accuracy: 76.47%
Testing Accuracy: 66.40%


#### Random Forest

In [10]:
# Random Forest pipeline Aug+features

param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [4, 5],
    'min_samples_split': [10, 15, 20],
    'min_samples_leaf': [10, 15, 20]
}
rf = RandomForestClassifier(random_state=1)
# Perform Grid Search
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, verbose=1, n_jobs=-1)
grid_search_rf.fit(train_features_pca_aug, train_labels_aug)

# Best parameters and score
print("Best Parameters:", grid_search_rf.best_params_)
print("Best Cross-Validation Accuracy:", grid_search_rf.best_score_)

#RF Aug+Features
y_train_pred = grid_search_rf.best_estimator_.predict(train_features_pca_aug)
train_accuracy = accuracy_score(train_labels_aug, y_train_pred)
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")

y_test_pred = grid_search_rf.best_estimator_.predict(test_features_pca)
test_accuracy = accuracy_score(test_labels, y_test_pred)
print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 20, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.61875
Training Accuracy: 74.53%
Testing Accuracy: 67.00%


In [11]:
# Best parameters from Grid Search
best_rf_params = grid_search_rf.best_params_
best_svm_params = grid_search_svm.best_params_

# Define base models with best parameters
rf_best = RandomForestClassifier(**best_rf_params, random_state=1)
svm_best = SVC(**best_svm_params, random_state=1)

# Define the meta-classifier
meta_classifier = LogisticRegression(random_state=1)

# Create the Stacking Classifier
stacking_clf = StackingClassifier(
    estimators=[
        ('rf', rf_best),
        ('svm', svm_best)
    ],
    final_estimator=meta_classifier,
    cv=5,
    n_jobs=-1, 
    verbose=1
)

# Train the Stacking Classifier
print("Training Stacking Classifier")
stacking_clf.fit(train_features_pca_aug, train_labels_aug)

# Evaluate on Train Data
train_y_pred = stacking_clf.predict(train_features_pca_aug)
train_accuracy = accuracy_score(train_labels_aug, train_y_pred)
print(f"Stacking Classifier Train Accuracy: {train_accuracy*100:.2f}%")

# Evaluate on Test Data
y_pred = stacking_clf.predict(test_features_pca)
test_accuracy = accuracy_score(test_labels, y_pred)
print(f"Stacking Classifier Test Accuracy: {test_accuracy*100:.2f}%")

Training Stacking Classifier


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.1s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.1s finished


Stacking Classifier Train Accuracy: 78.12%
Stacking Classifier Test Accuracy: 66.40%
