Let's play around with feature extraction and some supervised learning models.

We'll use the same dataset as before, but this time we'll extract features from the images using a pre-trained CNN and then use those features to train a supervised learning model.


In [1]:
import warnings
warnings.filterwarnings('ignore')

import keras
from keras import layers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import random
from pathlib import Path

data_dir = '/Users/bharathg/workspace/mscs/csca5642/week3'

In [2]:
def load_training_labels(data_dir='./', num_samples=None, balanced=False):
    training_labels = pd.read_csv(os.path.join(data_dir, 'train_labels.csv'))
    training_labels['label'] = training_labels['label'].astype('bool')
    if num_samples is None:
        return training_labels.sample(frac=1).reset_index(drop=True)
    
    if balanced:
        pos = training_labels[training_labels['label']].sample(num_samples // 2)
        neg = training_labels[~training_labels['label']].sample(num_samples // 2)
        training_labels = pd.concat([pos, neg]).sample(frac=1).reset_index(drop=True)
    else:
        training_labels = training_labels.sample(num_samples).reset_index(drop=True)

    return training_labels


def get_training_images(training_labels, data_dir='./', color_mode='rgb', crop=1.0):
    images = np.array(
        [tf.image.central_crop(
            keras.utils.img_to_array(keras.utils.load_img(os.path.join(data_dir, 'train', f'{id}.tif'), color_mode=color_mode)),
            crop).numpy()
         for id in training_labels['id']])
    return images


def batch_image_generator(training_labels, batch_size):
    inp = list(training_labels.index)
    while True:
        random.shuffle(inp)
        for i in range(0, len(inp), batch_size):
            inds = inp[i:i+batch_size]
            batch_labels = training_labels.loc[inds].reset_index(drop=True)
            batch_images = get_training_images(batch_labels)
            yield batch_images, batch_labels['label']


def stochastic_batch_image_generator(training_labels, batch_size):
    while True:
        batch_labels = training_labels.sample(batch_size).reset_index(drop=True)
        ids = batch_labels['id']
        batch_images = get_training_images(batch_labels)
        yield batch_images, batch_labels['label']


def get_test_images(data_dir='./', color_mode='rgb', crop=1.0):
    test_image_files = [f for f in os.listdir(os.path.join(data_dir, "test")) if f.endswith(".tif")]
    test_ids = [Path(f).stem for f in test_image_files]
    test_images = np.array(
        [tf.image.central_crop(
            keras.utils.img_to_array(keras.utils.load_img(os.path.join(data_dir, 'test', f), color_mode=color_mode)),
            crop).numpy()
         for f in test_image_files])
    return test_images, test_ids


def generate_submission(model, test_images, test_ids, color_mode='rgb', output_file='submission.csv'):
    test_predictions = model.predict(test_images)
    submission = pd.DataFrame({"id": test_ids, "label": test_predictions.flatten()})
    submission.to_csv(output_file, index=False)


def plot_training_history(history):
    # Plot the training and validation loss and accuracy
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Loss')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Accuracy')
    plt.legend()
    plt.show()

In [3]:
training_labels = load_training_labels(data_dir)
print(training_labels.shape)
print(training_labels.head())
print(training_labels['label'].value_counts())

(220025, 2)
                                         id  label
0  82852a2f1c2e0ec27e37969109e52a40dc264e89  False
1  b19345a3dbb8a0c65da08726e8b2307da25e572a  False
2  41240890435148a75b1bfbc5305ff4dff40e9c3e   True
3  81fec7e69d7f8c3794feb38ec3187768d195d007  False
4  7c71d3a26e829d5f7ed7d8c6d7295da39df4c5f1  False
label
False    130908
True      89117
Name: count, dtype: int64


In [4]:
X = get_training_images(training_labels, data_dir, crop=1/3)
y = training_labels['label'].values

print(X.shape)
print(y.shape)


2025-04-10 11:21:50.206859: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-04-10 11:21:50.207012: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-04-10 11:21:50.207026: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
I0000 00:00:1744264310.207466 1877426 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1744264310.207523 1877426 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


(220025, 32, 32, 3)
(220025,)


In [5]:
test_images, test_ids = get_test_images(data_dir, crop=1/3)
print(test_images.shape)

(57458, 32, 32, 3)


In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [20]:
base_model = keras.applications.EfficientNetV2B0(
    weights='imagenet', 
    include_top=False, 
    input_shape=(32, 32, 3))
model = keras.Sequential([
    base_model,
    layers.Flatten()
])

In [21]:
X_train = model.predict(X_train)
X_test = model.predict(X_test)
print(X_train.shape)
print(X_test.shape)

[1m5157/5157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 28ms/step
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 27ms/step
(165018, 1280)
(55007, 1280)


In [22]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000, solver='liblinear')
model.fit(X_train, y_train)
print(model.score(X_test, y_test))


0.7998072972530769


In [23]:
del X
del y

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=500, random_state=42, max_depth=5, n_jobs=-1)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.7776828403657716


In [24]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


parameters = {'gamma': np.logspace(-5, 5, num=6, base=2), 'C': np.logspace(-5, 5, num=6, base=2)}
clf = SVC()

grid = GridSearchCV(clf, parameters, cv=3, n_jobs=-1).fit(X_train, y_train)

print(f'Best parameters: {grid.best_params_}')
print(f'Best score: {grid.best_score_}')

print(f'Test score: {grid.best_estimator_.score(X_test, y_test)}')

KeyboardInterrupt: 