# CSCE 5222 Feature Engineering — Gabor Filter Bank on SOCOFing

Group 7  
Members: Amir Naderian, Alireza Mohammadshafie

---

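This notebook reproduces the complete pipeline for fingerprint alteration detection on the SOCOFing dataset using Gabor features: image loading and preprocessing, Gabor filter-bank feature extraction, k-NN and SVM training with cross-validated hyperparameter search, and evaluation on a held-out test set.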


In [None]:
# Setup & Config
import os
from pathlib import Path
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from skimage import io, color, transform
from skimage.filters import gabor

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

# Reproducibility: seed NumPy's global RNG and the stdlib RNG with the same value.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# Paths
PROJECT_ROOT = Path.cwd()
# Walk from the working directory up to the filesystem root and use the first
# directory that contains a SOCOFing folder (the notebook may live in notebooks/).
for possible_root in (PROJECT_ROOT, *PROJECT_ROOT.parents):
    if (possible_root / 'SOCOFing').exists():
        break
else:
    # Nothing found: fall back to the working directory so that later error
    # messages point at a meaningful location instead of the filesystem root.
    possible_root = PROJECT_ROOT
    print('WARNING: no SOCOFing folder found in', PROJECT_ROOT, 'or any of its parent directories')
DATASET_ROOT = possible_root / 'SOCOFing'
print('Dataset root:', DATASET_ROOT)

# Parameters
IMG_SIZE = (128, 128)
# Orientations cover the half-open range [0°, 180°). Rotating a Gabor kernel by
# 180° only conjugates it, so the response magnitude at 180° equals the one at
# 0°; listing 180° would merely duplicate the 0° feature columns.
THETAS_DEG = [0, 45, 90, 135]
FREQS = [0.1, 0.2, 0.3]
TEST_SIZE = 0.2
CV_FOLDS = 5
N_JOBS = -1

# Optional subsampling (None for full dataset)
MAX_PER_CLASS = None  # e.g., 100

sns.set_theme(style='whitegrid', context='notebook')


Dataset root: /Users/alireza.mohammadshafie/Documents/Projects/Feature_engineering/Project/notebooks/SOCOFing


In [2]:
# Dataset loading & labeling
from typing import List

# File extensions (lower-case, with leading dot) that count as images.
SUPPORTED_EXTS = {'.bmp', '.png', '.jpg', '.jpeg', '.tif', '.tiff'}
real_dir = DATASET_ROOT / 'Real'
altered_dir = DATASET_ROOT / 'Altered'

# Fail fast and name the exact folder(s) that are missing, so a wrong dataset
# root can be diagnosed from the error message alone.
missing_dirs = [d for d in (real_dir, altered_dir) if not d.is_dir()]
if missing_dirs:
    raise FileNotFoundError(
        'SOCOFing Real/Altered folders not found at expected location: '
        + ', '.join(str(d) for d in missing_dirs)
    )


def find_images(dir_path: Path, exts=None) -> List[Path]:
    """Recursively collect the image files below ``dir_path``.

    Parameters
    ----------
    dir_path : Path
        Directory to search (all sub-directories are included).
    exts : iterable of str, optional
        Accepted file suffixes including the leading dot (e.g. ``'.bmp'``);
        compared case-insensitively. Defaults to the module-level
        ``SUPPORTED_EXTS``.

    Returns
    -------
    List[Path]
        Matching regular files, in the (unspecified) order ``rglob`` yields them.
    """
    allowed = SUPPORTED_EXTS if exts is None else {e.lower() for e in exts}
    # is_file() keeps out directories that merely carry an image-like suffix.
    return [
        p for p in dir_path.rglob('*')
        if p.suffix.lower() in allowed and p.is_file()
    ]

# Sort so the file order (and therefore any subsample and the seeded shuffle
# below) does not depend on filesystem enumeration order.
real_files = sorted(find_images(real_dir))
altered_files = sorted(find_images(altered_dir))

# Optional per-class cap; a falsy MAX_PER_CLASS (None or 0) keeps everything.
if MAX_PER_CLASS:
    real_files = real_files[:MAX_PER_CLASS]
    altered_files = altered_files[:MAX_PER_CLASS]

print(f'Found {len(real_files)} real and {len(altered_files)} altered images')

# Both classes are required by the stratified split further down; stop here
# with a clear message rather than failing later in an unrelated cell.
if not real_files or not altered_files:
    raise FileNotFoundError(
        f'No images found for at least one class under {real_dir} / {altered_dir}'
    )

# One record per image: label 0 = Real, 1 = Altered.
rows = [{'path': p, 'label': 0, 'label_str': 'Real'} for p in real_files] + \
       [{'path': p, 'label': 1, 'label_str': 'Altered'} for p in altered_files]

# Shuffle reproducibly so the two classes are interleaved.
df = pd.DataFrame(rows).sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
df.head()


FileNotFoundError: SOCOFing Real/Altered folders not found at expected location

In [None]:
# Preprocessing helpers and sample visualisation
from math import ceil

def load_gray_and_resized(path: Path, size=IMG_SIZE):
    """Load an image as a grayscale float array and resize it.

    Every supported input layout is brought onto the same intensity scale so
    that downstream code can rely on it:

    * RGBA / RGB images go through ``rgba2rgb`` / ``rgb2gray``, which return
      floating-point intensities in [0, 1].
    * Integer grayscale images are divided by the maximum value of their
      dtype (255 for uint8), giving [0, 1] as well.
    * Non-integer grayscale images are only cast to float32; they are assumed
      to be in [0, 1] already.

    Parameters
    ----------
    path : Path
        Image file to read.
    size : tuple of int
        Output shape ``(rows, cols)`` passed to ``transform.resize``.

    Returns
    -------
    numpy.ndarray
        2-D grayscale image of shape ``size``.
    """
    img = io.imread(path.as_posix())

    # A trailing singleton channel is a plain grayscale image in disguise.
    if img.ndim == 3 and img.shape[2] == 1:
        img = img[..., 0]

    if img.ndim == 3:
        if img.shape[2] == 4:
            img = color.rgba2rgb(img)
        img_gray = color.rgb2gray(img)
    elif np.issubdtype(img.dtype, np.integer):
        # Without this rescale, integer grayscale files stay on e.g. 0-255
        # while colour files end up in [0, 1].
        img_gray = img.astype(np.float32) / np.iinfo(img.dtype).max
    else:
        img_gray = img.astype(np.float32)

    # preserve_range=True keeps the [0, 1] scale established above.
    return transform.resize(img_gray, size, anti_aliasing=True, mode='reflect', preserve_range=True)

# Visualise a few samples: one row per class, every panel titled with its class
classes = ['Real', 'Altered']
n_cols = 4
fig, axes = plt.subplots(len(classes), n_cols, figsize=(12, 6))
for i, cls in enumerate(classes):
    class_rows = df[df['label_str'] == cls]
    # Never ask for more samples than the class has (e.g. with a small MAX_PER_CLASS).
    subset = class_rows.sample(n=min(n_cols, len(class_rows)), random_state=RANDOM_STATE)
    # Hide the axes of every panel, including any left empty.
    for ax in axes[i]:
        ax.axis('off')
    for ax, img_path in zip(axes[i], subset['path']):
        ax.imshow(load_gray_and_resized(img_path), cmap='gray')
        ax.set_title(cls)
plt.tight_layout()
plt.show()


In [None]:
# Gabor feature utilities
def compute_gabor_features(image_gray: np.ndarray, thetas_deg, freqs):
    """Summarise Gabor filter-bank responses of a grayscale image.

    For every orientation in ``thetas_deg`` (degrees) and every frequency in
    ``freqs`` the image is filtered with ``gabor``; the magnitude of the
    complex response is reduced to its mean and standard deviation.

    Returns
    -------
    numpy.ndarray
        1-D float32 vector holding ``(mean, std)`` pairs, ordered by
        orientation first and frequency second.
    """
    stats = []
    for angle_deg in thetas_deg:
        angle_rad = np.deg2rad(angle_deg)
        for frequency in freqs:
            resp_re, resp_im = gabor(image_gray, frequency=frequency, theta=angle_rad)
            # Magnitude of the complex response (real and imaginary parts).
            magnitude = np.sqrt(np.square(resp_re) + np.square(resp_im))
            stats.append(magnitude.mean())
            stats.append(magnitude.std())
    return np.asarray(stats, dtype=np.float32)

def path_to_feature(path: Path):
    """Turn one image file into its Gabor feature vector.

    The image is loaded with ``load_gray_and_resized``, cast to float32,
    clipped to [0, 1] and passed to ``compute_gabor_features`` with the
    module-level ``THETAS_DEG`` and ``FREQS``.
    """
    gray = load_gray_and_resized(path)
    gray = gray.astype(np.float32)
    # NOTE(review): the clip presumes intensities already lie in [0, 1];
    # an image on a 0-255 scale would be saturated to almost all ones here.
    gray = np.clip(gray, 0, 1)
    return compute_gabor_features(gray, THETAS_DEG, FREQS)


In [None]:
# Feature extraction (may take a while!)
# Two statistics (mean, std) per orientation/frequency pair.
num_features = 2 * len(THETAS_DEG) * len(FREQS)
print('Each image ->', num_features, 'features')

# Pre-allocate the feature matrix and fill it one image (row) at a time.
total = len(df)
X = np.empty((total, num_features), dtype=np.float32)
for done, img_path in enumerate(df['path'], start=1):
    X[done - 1] = path_to_feature(img_path)
    # Progress message every 500 images.
    if done % 500 == 0:
        print(f'Processed {done}/{total}')

y = df['label'].values


In [None]:
# Train/test split & scaling
# Stratified hold-out split so both subsets keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Learn the scaling statistics from the training data only, then apply them
# to both subsets.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
print('Train size:', len(X_train), 'Test size:', len(X_test))


In [None]:
# Model training — k-NN and SVM
# k-NN: search over the neighbourhood size, scored by cross-validated accuracy.
knn_params = {'n_neighbors': [3, 5, 7, 9]}
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    scoring='accuracy',
    cv=CV_FOLDS,
    n_jobs=N_JOBS,
)
knn_grid.fit(X_train_s, y_train)
print('Best k-NN:', knn_grid.best_params_)

# RBF-kernel SVM: search over the penalty C and the kernel width gamma.
svm_params = {
    'C': [1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.001],
}
svm_grid = GridSearchCV(
    estimator=SVC(kernel='rbf'),
    param_grid=svm_params,
    scoring='accuracy',
    cv=CV_FOLDS,
    n_jobs=N_JOBS,
)
svm_grid.fit(X_train_s, y_train)
print('Best SVM:', svm_grid.best_params_)


In [None]:
# Evaluation & results
# One metrics record per evaluated model; reset whenever this cell is re-run.
results = []

def evaluate(name, model):
    """Score ``model`` on the scaled test set and record its metrics.

    Uses the module-level ``X_test_s`` / ``y_test``. Prints the confusion
    matrix and appends a dict with accuracy, precision, recall and F1 to the
    module-level ``results`` list. Returns nothing.
    """
    y_pred = model.predict(X_test_s)
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    record = {'Model': name}
    for metric_name, scorer in scorers:
        record[metric_name] = scorer(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    print(f"\n{name} confusion matrix:\n", matrix)
    results.append(record)

# Evaluate the best estimator found by each grid search, k-NN first.
for model_name, grid in (('k-NN', knn_grid), ('SVM', svm_grid)):
    evaluate(model_name, grid.best_estimator_)

# Summary table: one row per model.
pd.DataFrame(results)


## Discussion

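The table above compares k-NN and SVM on Gabor-based texture features. An RBF-kernel SVM typically excels on this kind of feature because the kernel can capture non-linear decision boundaries, whereas k-NN offers a simpler baseline; check this expectation against the measured values in the table. Future work could expand the filter bank, balance the classes (with unequal class sizes, accuracy alone can be misleading, so precision, recall and F1 should be read alongside it), or explore deep learning approaches.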
