In [None]:
import os
import sys
import time
import joblib
import numpy as np
import cv2
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from skimage.feature import local_binary_pattern, hog
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import lightgbm as lgb
from sklearn.base import ClassifierMixin
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression  
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA

def is_kaggle_image_fire(image_path):
    parent_dir = os.path.basename(os.path.dirname(image_path))
    return parent_dir == 'fire_images'

def extract_color_histograms(img_processed, color_space, bins):
    histograms = []
    ranges = {
        'hsv': {'float': ([0, 1], [0, 1], [0, 1]), 'uint8': ([0, 180], [0, 256], [0, 256])},
        'ycbcr': {'float': ([0, 1], [-0.5, 0.5], [-0.5, 0.5]), 'uint8': ([0, 256], [0, 256], [0, 256])}
    }
    channel_indices = {'hsv': [0, 1], 'ycbcr': [1, 2]}
    dtype_key = 'float' if img_processed.dtype in [np.float32, np.float64] else 'uint8'

    if color_space in ranges and color_space in channel_indices:
        for i in channel_indices[color_space]:
            current_range = ranges[color_space][dtype_key][i]
            if img_processed.dtype != np.float32 and img_processed.dtype != np.uint8:
                 img_processed = img_processed.astype(np.float32)
                 dtype_key = 'float'
                 current_range = ranges[color_space][dtype_key][i]

            hist = cv2.calcHist([img_processed], [i], None, [bins], current_range)
            histograms.append(hist.flatten())

    if histograms: return np.concatenate(histograms)
    else: return np.array([])

def extract_lbp_features(img_gray, radius, n_points, method):

    if img_gray is None or img_gray.size == 0: return np.array([])
    if n_points is None: n_points = 8 * radius

    if img_gray.dtype != np.uint8 and img_gray.dtype != np.float64:
         img_gray = img_gray.astype(np.float64)

    try:
        lbp_image = local_binary_pattern(img_gray, n_points, radius, method=method)
        if method == 'uniform' or method == 'nri_uniform':
            n_bins = int(n_points + 2)
            hist_range = (0, n_bins)
        elif method == 'ror':
            n_bins = int(n_points / radius + 2)
            hist_range = (0, n_bins)
        else:
            n_bins = int(2**n_points)
            hist_range = (0, n_bins)
        lbp_hist, _ = np.histogram(lbp_image.ravel(), bins=n_bins, range=hist_range)
        lbp_hist = lbp_hist.astype(np.float32)
        if lbp_hist.sum() > 0: lbp_hist /= lbp_hist.sum()
        return lbp_hist.flatten()

    except Exception as e:

        return np.array([])

def extract_hog_features(img_gray, orientations, pixels_per_cell, cells_per_block, block_norm):
    if img_gray is None or img_gray.size == 0: return np.array([])
    if img_gray.dtype != np.uint8 and img_gray.dtype != np.float64:
         img_gray = img_gray.astype(np.float64)
    img_h, img_w = img_gray.shape
    cell_h, cell_w = pixels_per_cell
    block_h, block_w = cells_per_block
    min_img_h = cell_h * block_h
    min_img_w = cell_w * block_w

    if img_h < min_img_h or img_w < min_img_w:
        return np.array([])
    try:
        hog_features = hog(img_gray, orientations=orientations,
                           pixels_per_cell=pixels_per_cell,
                           cells_per_block=cells_per_block,
                           block_norm=block_norm,
                           visualize=False, feature_vector=True)
        return hog_features.flatten()
    except Exception as e:
        return np.array([])

def combine_features(img_dict, feature_params):
    all_features = []
    if 'hsv' in img_dict and img_dict['hsv'] is not None:
        hsv_hist = extract_color_histograms(img_dict['hsv'], 'hsv', bins=feature_params.get('hist_bins', 100))
        if hsv_hist.size > 0: all_features.append(hsv_hist)
    if 'ycbcr' in img_dict and img_dict['ycbcr'] is not None:
        ycbcr_hist = extract_color_histograms(img_dict['ycbcr'], 'ycbcr', bins=feature_params.get('hist_bins', 100))
        if ycbcr_hist.size > 0: all_features.append(ycbcr_hist)
    if 'gray' in img_dict and img_dict['gray'] is not None:
        img_gray_processed = img_dict['gray']

        lbp_features = extract_lbp_features(img_gray_processed,
                                            radius=feature_params.get('lbp_radius', 3),
                                            n_points=feature_params.get('lbp_n_points', None),
                                            method=feature_params.get('lbp_method', 'uniform'))
        if lbp_features.size > 0: all_features.append(lbp_features)

        hog_features = extract_hog_features(img_gray_processed,
                                           orientations=feature_params.get('hog_orientations', 9),
                                           pixels_per_cell=feature_params.get('hog_pixels_per_cell', (8, 8)),
                                           cells_per_block=feature_params.get('hog_cells_per_block', (2, 2)),
                                           block_norm=feature_params.get('hog_block_norm', 'L2-Hys'))
        if hog_features.size > 0: all_features.append(hog_features)

    if all_features:
        all_features = [f.astype(np.float32) for f in all_features]
        combined_vector = np.concatenate(all_features)
        return combined_vector
    else:
        return np.array([])

def load_and_extract_features_memory_safe(config, feature_params):
    dataset_choice = config.get('dataset_choice', 'kaggle')
    data_root = config.get('data_root')
    target_size = config.get('target_img_size')
    color_spaces_to_load = config.get('color_spaces_to_load', ['bgr', 'hsv', 'ycbcr'])
    normalize_pixels = config.get('normalize_pixels', 1)

    if not data_root or not target_size: return np.array([]), np.array([])

    image_label_pairs = []
    img_extensions = ['.jpg', '.jpeg', '.png']
    fire_dir = os.path.join(data_root, 'fire_images')
    non_fire_dir = os.path.join(data_root, 'non_fire_images')

    if not os.path.isdir(fire_dir) or not os.path.isdir(non_fire_dir):
        print(f"Error: 'fire_images' or 'non_fire_images' directory not found in {data_root}")
        return np.array([]), np.array([])

    print("Determining Labels...")
     
    fire_image_files = [os.path.join(fire_dir, f) for f in os.listdir(fire_dir) if os.path.splitext(f)[1].lower() in img_extensions]
    for img_path in tqdm(fire_image_files, desc="Processing Fire Images", leave=False):
        image_label_pairs.append((img_path, 1))  

     
    non_fire_image_files = [os.path.join(non_fire_dir, f) for f in os.listdir(non_fire_dir) if os.path.splitext(f)[1].lower() in img_extensions]
    for img_path in tqdm(non_fire_image_files, desc="Processing Non-Fire Images", leave=False):
        image_label_pairs.append((img_path, 0))  

    print("Label determination complete.")

    if not image_label_pairs:
        print("No images with labels found.")
        return np.array([]), np.array([])

    all_features_list = []
    all_labels_list = []
    total_images_processed = 0
    total_images_skipped_reading = 0
    total_images_skipped_feature = 0

    print("Loading images and extracting features...")
    for image_path, label in tqdm(image_label_pairs, desc="Memory-safe Feature Extraction", leave=False):
        img_bgr = cv2.imread(image_path)

        if img_bgr is None:
            total_images_skipped_reading += 1
            continue

        try:
            img_resized = cv2.resize(img_bgr, target_size, interpolation=cv2.INTER_LINEAR)
            img_dict_single = {}  
            img_processed_bgr = None
            if normalize_pixels:
                img_processed_bgr = img_resized.astype(np.float32) / 255.0 
                img_resized_uint8 = img_resized.astype(np.uint8)
                img_gray = cv2.cvtColor(img_resized_uint8, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255.0  
                if 'hsv' in color_spaces_to_load:
                    img_dict_single['hsv'] = cv2.cvtColor(img_resized_uint8, cv2.COLOR_BGR2HSV).astype(np.float32) / np.array([180, 255, 255], dtype=np.float32)   
                if 'ycbcr' in color_spaces_to_load:
                    img_dict_single['ycbcr'] = cv2.cvtColor(img_resized_uint8, cv2.COLOR_BGR2YCrCb).astype(np.float32) / 255.0  
            else:  
                img_processed_bgr = img_resized.astype(np.uint8)
                img_gray = cv2.cvtColor(img_processed_bgr, cv2.COLOR_BGR2GRAY)
                if 'hsv' in color_spaces_to_load:
                    img_dict_single['hsv'] = cv2.cvtColor(img_processed_bgr, cv2.COLOR_BGR2HSV)
                if 'ycbcr' in color_spaces_to_load:
                    img_dict_single['ycbcr'] = cv2.cvtColor(img_processed_bgr, cv2.COLOR_BGR2YCrCb)

            img_dict_single['gray'] = img_gray
            if 'bgr' in color_spaces_to_load:
                img_dict_single['bgr'] = img_processed_bgr

            features_single = combine_features(img_dict_single, feature_params)

            if features_single.size > 0:
                all_features_list.append(features_single)
                all_labels_list.append(label)
                total_images_processed += 1
            else:
                total_images_skipped_feature += 1

        except Exception as e:
            total_images_skipped_feature += 1

    print("Feature extraction complete.")
    print(f"Total images initially found: {len(image_label_pairs)}")
    print(f"Images skipped (read error): {total_images_skipped_reading}")
    print(f"Images skipped (feature error): {total_images_skipped_feature}")
    print(f"Images successfully processed: {total_images_processed}")
    if not all_features_list:
        print("No features extracted from any image.")
        return np.array([]), np.array([])
    features_array = np.array(all_features_list, dtype=np.float32)
    labels_array = np.array(all_labels_list, dtype=np.int32)
    print(f"Final features array shape: {features_array.shape}")
    print(f"Final labels array shape: {labels_array.shape}")

    return features_array, labels_array

def get_config(dataset_choice):
    config = {}
    if dataset_choice == 'kaggle':
        config['dataset_choice'] = 'kaggle'
        config['data_root'] = os.path.join('..', 'data_subsets', 'fire_dataset')   
        config['target_img_size'] = (128, 128)  
        config['color_spaces_to_load'] = ['bgr', 'hsv', 'ycbcr']  
        config['normalize_pixels'] = 1  
        config['fire_class_ids'] = None   
    elif dataset_choice == 'dfire':
        config['dataset_choice'] = 'dfire'
        config['dfire_root'] = os.path.join('..', 'data_subsets', 'D-Fire')  
        config['split_name'] = "train"  
        config['data_root'] = os.path.join(config['dfire_root'], config['split_name'])
        config['target_img_size'] = (128, 128)  
        config['color_spaces_to_load'] = ['bgr', 'hsv', 'ycbcr']  
        config['normalize_pixels'] = 1  
        config['fire_class_ids'] = [0, 1]  
    else:
        raise ValueError(f"Unknown dataset choice: {dataset_choice}. Choose 'kaggle' or 'dfire'.")

    print(f"Using dataset: {config['dataset_choice']}")
    print(f"Data root: {config.get('data_root')}")
    print(f"Target image size: {config['target_img_size']}")
    print(f"Color spaces loaded: {config['color_spaces_to_load']}")
    print(f"Normalize pixels: {bool(config['normalize_pixels'])}")
    if dataset_choice == 'dfire':
         print(f"D-Fire Split: {config['split_name']}")
         print(f"D-Fire Fire Class IDs: {config['fire_class_ids']}")
    return config

def get_feature_params():
    feature_params = {
        'hist_bins': 100,  
        'lbp_radius': 3,  
        'lbp_n_points': None,  
        'lbp_method': 'uniform',  
        'hog_orientations': 9,  
        'hog_pixels_per_cell': (8, 8),  
        'hog_cells_per_block': (2, 2),  
        'hog_block_norm': 'L2-Hys'  
    }
    print("\nFeature extraction parameters:", feature_params)
    return feature_params

def split_data(features_array, labels_array, test_size=0.2, random_state=42):
    if features_array.shape[0] == 0:
        print("Feature array is empty, cannot split.")
        return None, None, None, None

    print(f"\nSplitting data: training ({1-test_size:.0%}) testing ({test_size:.0%})")

    X_train, X_test, y_train, y_test = train_test_split(
        features_array,
        labels_array,
        test_size=test_size,
        random_state=random_state,
        stratify=labels_array
    )

    print(f"Training features shape: {X_train.shape}")
    print(f"Testing features shape: {X_test.shape}")
    print(f"Training labels shape: {y_train.shape}")
    print(f"Testing labels shape: {y_test.shape}")

    train_labels, train_counts = np.unique(y_train, return_counts=True)
    test_labels, test_counts = np.unique(y_test, return_counts=True)
    print(f"Train label distribution: {dict(zip(train_labels, train_counts))}")
    print(f"Test label distribution: {dict(zip(test_labels, test_counts))}")

    return X_train, X_test, y_train, y_test

def scale_features(X_train, X_test):
    if X_train is None or X_test is None or X_train.shape[0] == 0:
         print("Scaling skipped: train or test data is empty.")
         return None, None, None
    print("\nScaling features...")
    scaler = StandardScaler()

    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Scaling complete.")
    return X_train_scaled, X_test_scaled, scaler

def perform_correlation_selection(X_train, y_train, X_test, k_features):
    if X_train is None or X_test is None or X_train.shape[0] == 0:
         print("Correlation Selection skipped: train or test data is empty.")
         return X_train, X_test, None  

    n_total_features = X_train.shape[1]
    k_features_int = k_features  
    percentage = None  
    percentage_str = None  

    if isinstance(k_features, str) and k_features.endswith('%'):
        try:
            percentage_str = k_features  
            percentage = float(k_features[:-1]) / 100.0
            k_features_int = max(1, int(n_total_features * percentage))
            print(f"Selecting top {k_features_int} features based on {percentage_str} percentage using Correlation...")
        except ValueError:
            print(f"Invalid percentage string for k_features: {k_features}. Skipping selection.")
            return X_train, X_test, None
    elif k_features == 'all':
         print("Selecting all features (no correlation selection)...")
         return X_train, X_test, None   
    elif isinstance(k_features, int) and k_features > 0:
        k_features_int = min(k_features, n_total_features)
        print(f"Selecting top {k_features_int} features by correlation...")
    else:
        print(f"Invalid k_features value: {k_features}. Must be int > 0, 'all', or percentage string (e.g., '75%'). Skipping selection.")
        return X_train, X_test, None


    if k_features_int <= 0 or k_features_int > n_total_features:
         print(f"Calculated number of features to select ({k_features_int}) is invalid. Skipping selection.")
         return X_train, X_test, None
    if k_features_int == n_total_features:
         print("Number of features to select is equal to total features. Skipping selection.")
         return X_train, X_test, None

    selector = SelectKBest(score_func=f_classif, k=k_features_int)
    selector.fit(X_train, y_train)  

    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)

    print(f"Original feature shape: {X_train.shape}")
    print(f"Selected feature shape: {X_train_selected.shape}")

    return X_train_selected, X_test_selected, selector

def perform_rfe_selection(X_train, y_train, X_test, n_features_to_select, step=0.1, estimator=None):
    if X_train is None or X_test is None or X_train.shape[0] == 0:
         print("RFE Selection skipped: train or test data is empty.")
         return X_train, X_test, None  

    n_total_features = X_train.shape[1]
    n_features_int = n_features_to_select  
    percentage = None  
    percentage_str = None  

    if estimator is None:
        estimator = LogisticRegression(solver='liblinear', random_state=42, max_iter=2000)

    if isinstance(n_features_to_select, str) and n_features_to_select.endswith('%'):
        try:
            percentage_str = n_features_to_select  
            percentage = float(n_features_to_select[:-1]) / 100.0
            n_features_int = max(1, int(n_total_features * percentage))  
            print(f"Selecting top {n_features_int} features based on {percentage_str} percentage using RFE...")
        except ValueError:
            print(f"Invalid percentage string for n_features_to_select: {n_features_to_select}. Skipping RFE.")
            return X_train, X_test, None
    elif isinstance(n_features_to_select, int) and n_features_to_select > 0:
        n_features_int = min(n_features_to_select, n_total_features)  
        print(f"Selecting {n_features_int} features using RFE...")
    elif n_features_to_select == 'auto':
        print("RFE with 'auto' feature selection requires RFECV, which is not implemented in this helper. Skipping selection.")
        return X_train, X_test, None  
    else:
        print(f"Invalid n_features_to_select value: {n_features_to_select}. Skipping RFE.")
        return X_train, X_test, None

    if n_features_int <= 0 or n_features_int > n_total_features:
        print(f"Calculated number of features for RFE ({n_features_int}) is invalid. Skipping selection.")
        return X_train, X_test, None
    if n_features_int == n_total_features:
        print("Number of features to select is equal to total features. Skipping selection.")
        return X_train, X_test, None

    try:         
        if n_features_int >= n_total_features:
            print("Number of features to select is >= total features. Skipping RFE selection.")
            return X_train, X_test, None  

        rfe = RFE(estimator=estimator, n_features_to_select=n_features_int, step=step)
        rfe.fit(X_train, y_train)  

        X_train_selected = rfe.transform(X_train)
        X_test_selected = rfe.transform(X_test)

        print(f"Original feature shape: {X_train.shape}")
        print(f"Selected feature shape: {X_train_selected.shape}")

        return X_train_selected, X_test_selected, rfe
    except Exception as e:
        print(f"Error during RFE fit/transform: {e}")
        return X_train, X_test, None  

def tune_model_hyperparameters(model_estimator, X_train, y_train, param_grid, cv_strategy, scoring='f1', search_method='RandomSearch', n_iter=20):
    if X_train is None or y_train is None or X_train.shape[0] == 0:
        print("Hyperparameter tuning skipped: training data is empty.")
        return None

    print(f"\nPerforming {search_method} tuning (scoring='{scoring}')... with {n_iter} iterations")
    start_time = time.time()

    if search_method == 'RandomSearch':
         search_cv = RandomizedSearchCV(
            estimator=model_estimator,
            param_distributions=param_grid,  
            n_iter=n_iter,  
            cv=cv_strategy,
            scoring=scoring,
            n_jobs=-1,  
            verbose=1,
            random_state=42  
         )
    else:
        print(f"Unknown search_method: {search_method}. Use 'GridSearch' or 'RandomSearch'.")
        return None

    search_cv.fit(X_train, y_train)

    end_time = time.time()
    print(f"{search_method} duration: {end_time - start_time:.2f} seconds")
    print("\nBest parameters found:")
    print(search_cv.best_params_)
    print("\nBest CV score:")
    print(search_cv.best_score_)

    return search_cv

def evaluate_model(model, X_test, y_test, model_name="Model", feature_set_name="Unknown Feature Set"):
    if model is None or X_test is None or y_test is None or X_test.shape[0] == 0:
        print(f"{model_name} evaluation skipped on {feature_set_name}: model not trained or test data is empty.")
        return {}

    print(f"\nEvaluating {model_name} on the test set using {feature_set_name}...")
    start_time = time.time()
    y_pred = model.predict(X_test)
    end_time = time.time()
    print(f"Prediction duration: {end_time - start_time:.4f} seconds")

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    print(f"\nConfusion Matrix ({model_name} on {feature_set_name}):")
    print(conf_matrix)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'confusion_matrix': conf_matrix.tolist()  
    }

def perform_pca_dimension_reduction(X_train, X_test, n_components):
    if X_train is None or X_test is None or X_train.shape[0] == 0:
        print("PCA skipped bc the data is empty..")
        return None, None, None

    try:
        n_total_features = X_train.shape[1]
        if isinstance(n_components, float) and 0 < n_components < 1:
            print(f"Applying PCA to retain {n_components:.0%} of variance...")
        elif isinstance(n_components, int) and 0 < n_components < n_total_features:
            print(f"Applying PCA to reduce to {n_components} components...")
        else: return X_train, X_test, None
        pca = PCA(n_components=n_components, random_state=42) # didn't wanna ruin a convention that came from a pop cultural joke
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)

        print(f"Original feature shape: {X_train.shape}")
        print(f"PCA transformed feature shape: {X_train_pca.shape}")
        print(f"Explained variance ratio with {pca.n_components_} components: {np.sum(pca.explained_variance_ratio_):.4f}")

        return X_train_pca, X_test_pca, pca
    except Exception as e:
        print(f"Error during PCA: {e}")
        return X_train, X_test, None

print("Imports and helper functions loaded.")


Imports and helper functions loaded.


In [2]:
dataset_choice = 'kaggle'

try:
    config = get_config(dataset_choice)
    feature_params = get_feature_params()
    features_array_orig, labels_array = load_and_extract_features_memory_safe(config, feature_params)

except ValueError as e:
    print(f"Configuration Error: {e}")
    features_array_orig = np.array([])
    labels_array = np.array([])
except FileNotFoundError as e:
    print(f"File Not Found Error: {e}. Please check data paths in get_config.")
    features_array_orig = np.array([])
    labels_array = np.array([])
except Exception as e:
    print(f"An unexpected error occurred during data loading and feature extraction: {e}")
    features_array_orig = np.array([])
    labels_array = np.array([])
    
X_train_orig, X_test_orig, y_train, y_test = None, None, None, None
X_train_scaled, X_test_scaled, scaler = None, None, None
feature_sets = {}
feature_transformers = {}

if features_array_orig.shape[0] > 0:
    print("\n--- Splitting Data ---")
    X_train_orig, X_test_orig, y_train, y_test = split_data(features_array_orig, labels_array, test_size=0.2, random_state=42)
    print("\n--- Scaling Features (Initial) ---")
    X_train_scaled, X_test_scaled, scaler = scale_features(X_train_orig, X_test_orig)
    feature_sets['Scaled_All'] = (X_train_scaled, X_test_scaled)
    feature_transformers['Scaled_All'] = scaler

else:
    print("No features loaded. Cannot proceed with splitting, scaling, or modeling.")

Using dataset: kaggle
Data root: ..\data_subsets\fire_dataset
Target image size: (128, 128)
Color spaces loaded: ['bgr', 'hsv', 'ycbcr']
Normalize pixels: True

Feature extraction parameters: {'hist_bins': 100, 'lbp_radius': 3, 'lbp_n_points': None, 'lbp_method': 'uniform', 'hog_orientations': 9, 'hog_pixels_per_cell': (8, 8), 'hog_cells_per_block': (2, 2), 'hog_block_norm': 'L2-Hys'}
Determining Labels...


                                                                   

Label determination complete.
Loading images and extracting features...


                                                                                 

Feature extraction complete.
Total images initially found: 999
Images skipped (read error): 1
Images skipped (feature error): 0
Images successfully processed: 998
Final features array shape: (998, 8526)
Final labels array shape: (998,)

--- Splitting Data ---

Splitting data: training (80%) testing (20%)
Training features shape: (798, 8526)
Testing features shape: (200, 8526)
Training labels shape: (798,)
Testing labels shape: (200,)
Train label distribution: {np.int32(0): np.int64(194), np.int32(1): np.int64(604)}
Test label distribution: {np.int32(0): np.int64(49), np.int32(1): np.int64(151)}

--- Scaling Features (Initial) ---

Scaling features...
Scaling complete.




In [3]:
if X_train_scaled is not None and y_train is not None:
    print("\n--- Performing Feature Engineering (Selection) ---")
    original_feature_count = X_train_scaled.shape[1]
    print(f"Starting with {original_feature_count} features after scaling.")

     
    corr_feature_percentages = ['75%', '50%']
    for percentage_str in corr_feature_percentages:
        print(f"\nAttempting Correlation Selection with {percentage_str}...")
        try:
            X_train_corr, X_test_corr, corr_selector = perform_correlation_selection(
                X_train_scaled, y_train, X_test_scaled, k_features=percentage_str
            )
            if X_train_corr is not None and X_train_corr.shape[1] < original_feature_count:
                feature_sets[f'Scaled_Corr{percentage_str}'] = (X_train_corr, X_test_corr)
                feature_transformers[f'Scaled_Corr{percentage_str}'] = corr_selector
            else:
                 print(f"Correlation Selection with {percentage_str} did not reduce features or failed. Skipping adding this set.")
        except Exception as e:
            print(f"Error during Correlation Selection ({percentage_str}): {e}")
     
    rfe_feature_percentages = ['75%', '50%']     
    rfe_step_val = 0.1  
    rfe_estimator = LogisticRegression(solver='liblinear', random_state=42, max_iter=2000)

    for percentage_str in rfe_feature_percentages:
         print(f"\nAttempting RFE Selection with {percentage_str} (step={rfe_step_val})...")
         try:
            X_train_rfe, X_test_rfe, rfe_selector = perform_rfe_selection(
                X_train_scaled, y_train, X_test_scaled, n_features_to_select=percentage_str, step=rfe_step_val, estimator=rfe_estimator
            )
             
            if X_train_rfe is not None and X_train_rfe.shape[1] < original_feature_count:
                feature_sets[f'Scaled_RFE{percentage_str}'] = (X_train_rfe, X_test_rfe)
                feature_transformers[f'Scaled_RFE{percentage_str}'] = rfe_selector
            else:
                 print(f"RFE Selection with {percentage_str} did not reduce features or failed. Skipping adding this set.")
         except Exception as e:
            print(f"Error during RFE Selection ({percentage_str}): {e}")

    print("\n--- Available Feature Sets for Tuning ---")
    for name, (X_train_fs, _) in feature_sets.items():
        print(f"- {name}: {X_train_fs.shape[1]} features, before PCA..")
    
    pca_components= [0.95, 500] # aiming for .95 variance retention & 500 feature for first trial
    for n_comp in pca_components:
        print(f"\nPCA with n_components={n_comp}...")
        try:
            X_train_pca, X_test_pca, pca_transformer = perform_pca_dimension_reduction(X_train_scaled, X_test_scaled, n_components=n_comp)
            if X_train_pca is not None and X_train_pca.shape[1] < original_feature_count:
                fs_name_suffix = f"{int(n_comp*100)}%" if isinstance(n_comp, float) else str(n_comp)
                fs_name = f'Scaled_PCA_{fs_name_suffix}'
                feature_sets[fs_name] = (X_train_pca, X_test_pca)
                feature_transformers[fs_name] = pca_transformer
            else:
                print(f"n_components={n_comp} failed..")
        except Exception as e:
            print(f"error during n_components={n_comp}: {e}")

    print("\n--- AFTER PCA: ---")
    for name, (X_train_fs, _) in feature_sets.items():
        print(f"- {name}: {X_train_fs.shape[1]} features")

else:
    print("Skipping feature engineering as scaled data is not available.")


--- Performing Feature Engineering (Selection) ---
Starting with 8526 features after scaling.

Attempting Correlation Selection with 75%...
Selecting top 6394 features based on 75% percentage using Correlation...
Original feature shape: (798, 8526)
Selected feature shape: (798, 6394)

Attempting Correlation Selection with 50%...
Selecting top 4263 features based on 50% percentage using Correlation...
Original feature shape: (798, 8526)
Selected feature shape: (798, 4263)

Attempting RFE Selection with 75% (step=0.1)...
Selecting top 6394 features based on 75% percentage using RFE...
Original feature shape: (798, 8526)
Selected feature shape: (798, 6394)

Attempting RFE Selection with 50% (step=0.1)...
Selecting top 4263 features based on 50% percentage using RFE...
Original feature shape: (798, 8526)
Selected feature shape: (798, 4263)

--- Available Feature Sets for Tuning ---
- Scaled_All: 8526 features, before PCA..
- Scaled_Corr75%: 6394 features, before PCA..
- Scaled_Corr50%: 42

In [4]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, LeakyReLU, Input
from scikeras.wrappers import KerasClassifier
def create_custom_mlp(hidden_layer_1_neurons=128, hidden_layer_2_neurons=64,
                        dropout_rate=0.3, activation='leaky_relu', learning_rate=0.001,
                        meta=None):
    n_features_in = meta["n_features_in_"]
    
    model = Sequential()
    model.add(Input(shape=(n_features_in,)))
    model.add(Dense(hidden_layer_1_neurons))
    model.add(BatchNormalization())
    if activation == 'leaky_relu': model.add(LeakyReLU(alpha=0.1))
    else: model.add(tf.keras.layers.ReLU())
    model.add(Dropout(dropout_rate))
    model.add(Dense(hidden_layer_2_neurons))
    model.add(BatchNormalization())
    if activation == 'leaky_relu': model.add(LeakyReLU(alpha=0.1))
    else: model.add(tf.keras.layers.ReLU())
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [None]:
if not feature_sets or y_train is None:
    print("Skipping model training and tuning: No feature sets available or labels are missing.")
else:
    print("\n--- Starting Model Training and Hyperparameter Tuning (RandomizedSearchCV) ---")     
    models_to_tune = {
        'SVM': {
            'estimator': SVC(random_state=42),
            'param_grid': {
                'C': [0.1, 1, 10], # 50, 100
                'gamma': ['scale', 'auto', 0.01, 0.1, 1], # 0.001
                'kernel': ['rbf', 'linear']
            }
        },
        'LightGBM': {
            'estimator': lgb.LGBMClassifier(random_state=42, objective='binary', metric='binary_logloss', verbosity=-1),  
            'param_grid': {
                'n_estimators': [50, 100, 150],      
                'learning_rate': [0.01, 0.05, 0.1],   
                'max_depth': [-1, 10, 20],           
                'num_leaves': [31, 50, 70],          
                'subsample': [0.8, 0.9],              
                'colsample_bytree': [0.8, 0.9, 1.0],
                'min_split_gain': [0.1],    
                'min_child_samples': [5]   
            }
        },
        """'MLP': {  
            'estimator': MLPClassifier(random_state=42, max_iter=500, early_stopping=True, n_iter_no_change=10),  
            'param_grid': {
                'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],  
                'activation': ['relu'], # tf.keras.layers.LeakyReLU() dense katmanında eklenebilirmis. custom mlpye geciyorum customu unutmusum
                'solver': ['adam'],  
                'alpha': [0.0001, 0.001],  
                'learning_rate_init': [0.001, 0.005, 0.01],  
            }
        },"""
        'Custom_MLP': {
            'estimator': KerasClassifier(
                model=create_custom_mlp,
                loss=tf.keras.losses.BinaryCrossentropy,
                **ClassifierMixin.__sklearn_tags__,
                epochs=100,
                batch_size=32,
                verbose=0, # çıktı ayarı
                callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=15, verbose=0, restore_best_weights=True)]
            ),
            'param_grid': {
                'model__hidden_layer_configs': [(64, 32), (64, 64), (128, 32), (128, 64)],
                'model__dropout_rate': [0.2, 0.4],
                'model__activation': ['relu', 'leaky_relu'],
                'optimizer__learning_rate': [0.001, 0.005, 0.01]
            }
        }
    }

    # todo: degisiklikleri dfirea da uygula
    cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) #bunu daha onceki notebookta acıklamıstım. raporda/sunumda acıklamam iyi olur
    scoring_metric = 'f1'   
    all_results = {}

    best_overall_test_score = -np.inf
    best_overall_combination = None
    best_overall_trained_model = None
    best_overall_X_test = None
    best_overall_transformer = None

    for model_name, model_config in models_to_tune.items():
        all_results[model_name] = {}
        estimator = model_config['estimator']
        param_distributions = model_config['param_grid']
        n_iter_search = model_config.get('n_iter', 20)  

        print(f"\n\n=== Training and Tuning {model_name} ===")

        for fs_name in sorted(feature_sets.keys()):
            X_train_fs, X_test_fs = feature_sets[fs_name]
            print(f"\n--- Tuning {model_name} on Feature Set: {fs_name} ({X_train_fs.shape[1]} features) ---")

            if X_train_fs is None or X_train_fs.shape[0] == 0:
                print(f"Skipping tuning for {model_name} on {fs_name}: Training data is empty.")
                continue

            tuned_search = tune_model_hyperparameters(
                estimator,
                X_train_fs,
                y_train,
                param_grid=param_distributions,  
                cv_strategy=cv_strategy,
                scoring=scoring_metric,
                search_method='RandomSearch',  
                n_iter=n_iter_search
            )

            if tuned_search:
                best_model_for_combination = tuned_search.best_estimator_
                best_cv_score = tuned_search.best_score_
                best_params = tuned_search.best_params_
                print(f"Best CV {scoring_metric} for {model_name} on {fs_name}: {best_cv_score:.4f}")

                test_metrics = evaluate_model(best_model_for_combination, X_test_fs, y_test, model_name, fs_name)

                all_results[model_name][fs_name] = {
                    'best_cv_score': best_cv_score,
                    'best_params': best_params,
                    'test_metrics': test_metrics,
                    'trained_model': best_model_for_combination,
                    'transformer': feature_transformers.get(fs_name)   
                }

                if test_metrics and test_metrics.get('f1_score', -np.inf) > best_overall_test_score:
                    best_overall_test_score = test_metrics['f1_score']
                    best_overall_combination = (model_name, fs_name)
                    best_overall_trained_model = best_model_for_combination
                    best_overall_X_test = X_test_fs
                    best_overall_transformer = feature_transformers.get(fs_name)
    print("\n--- Model Training and Hyperparameter Tuning Complete ---")



--- Starting Model Training and Hyperparameter Tuning (RandomizedSearchCV) ---


=== Training and Tuning SVM ===

--- Tuning SVM on Feature Set: Scaled_All (8526 features) ---

Performing RandomSearch tuning (scoring='f1')... with 20 iterations
Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomSearch duration: 90.58 seconds

Best parameters found:
{'kernel': 'linear', 'gamma': 0.1, 'C': 10}

Best CV score:
0.9511085592239832
Best CV f1 for SVM on Scaled_All: 0.9511

Evaluating SVM on the test set using Scaled_All...
Prediction duration: 0.2819 seconds
Accuracy: 0.9250
Precision: 0.9533
Recall (Sensitivity): 0.9470
F1 Score: 0.9502

Confusion Matrix (SVM on Scaled_All):
[[ 42   7]
 [  8 143]]

--- Tuning SVM on Feature Set: Scaled_Corr50% (4263 features) ---

Performing RandomSearch tuning (scoring='f1')... with 20 iterations
Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomSearch duration: 34.08 seconds

Best parameters found:
{'kernel': 'linear', 

AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [None]:
print("\n\n=== Results Summary Across Models and Feature Sets ===")

if 'all_results' not in locals() or not all_results:
    print("No results available to summarize.")
else:
    print("\nCross-Validation Results (Best CV F1 Score):")
    print("-------------------------------------------------")
    for model_name, fs_results in all_results.items():
        print(f"\n{model_name}:")
        if fs_results:
            # Sort feature sets for display
            for fs_name in sorted(fs_results.keys()):
                 result = fs_results[fs_name]
                 cv_score = result.get('best_cv_score', float('nan'))
                 print(f"  - {fs_name}: {cv_score:.4f}")
        else:
            print("  No results for this model.")

    print("\nTest Set Results (F1 Score):")
    print("----------------------------")
    best_f1_per_model = {}

    for model_name, fs_results in all_results.items():
        print(f"\n{model_name}:")
        if fs_results:
            best_test_f1_for_model = -np.inf
            best_fs_name_for_model = None

            for fs_name in sorted(fs_results.keys()):
                 result = fs_results[fs_name]
                 test_f1 = result.get('test_metrics', {}).get('f1_score', float('nan'))
                 print(f"  - {fs_name}: {test_f1:.4f}")

                 if not np.isnan(test_f1) and test_f1 > best_test_f1_for_model:
                     best_test_f1_for_model = test_f1
                     best_fs_name_for_model = fs_name

            if best_fs_name_for_model:
                best_f1_per_model[model_name] = (best_fs_name_for_model, best_test_f1_for_model)
        else:
            print("  No test results for this model.")

    print("\n=== Overall Best Combination on Test Set (Based on F1 Score) ===")
    if best_overall_combination:
        model_name, fs_name = best_overall_combination
        best_result = all_results[model_name][fs_name]
        test_metrics = best_result['test_metrics']

        print(f"Best Model: {model_name}")
        actual_feature_count = feature_sets[fs_name][0].shape[1] if fs_name in feature_sets and feature_sets[fs_name][0] is not None else 'N/A'
        print(f"Best Feature Set: {fs_name} ({actual_feature_count} features)")

        print(f"Best CV F1 Score: {best_result['best_cv_score']:.4f}")
        print(f"Test F1 Score: {test_metrics['f1_score']:.4f}")
        print(f"Test Accuracy: {test_metrics['accuracy']:.4f}")
        print(f"Test Precision: {test_metrics['precision']:.4f}")
        print(f"Test Recall: {test_metrics['recall']:.4f}")
        print(f"Best Parameters: {best_result['best_params']}")
        print(f"Confusion Matrix:\n{np.array(test_metrics['confusion_matrix'])}")
    else:
        print("No successful model tuning and evaluation completed to determine the overall best combination.")




=== Results Summary Across Models and Feature Sets ===

Cross-Validation Results (Best CV F1 Score):
-------------------------------------------------

SVM:
  - Scaled_All: 0.9511
  - Scaled_Corr50%: 0.9577
  - Scaled_Corr75%: 0.9600
  - Scaled_RFE50%: 0.9853
  - Scaled_RFE75%: 0.9707

LightGBM:
  - Scaled_All: 0.9761
  - Scaled_Corr50%: 0.9728
  - Scaled_Corr75%: 0.9769
  - Scaled_RFE50%: 0.9776
  - Scaled_RFE75%: 0.9769

MLP:
  - Scaled_All: 0.9174
  - Scaled_Corr50%: 0.9349
  - Scaled_Corr75%: 0.9170
  - Scaled_RFE50%: 0.9667
  - Scaled_RFE75%: 0.9375

Test Set Results (F1 Score):
----------------------------

SVM:
  - Scaled_All: 0.9502
  - Scaled_Corr50%: 0.9412
  - Scaled_Corr75%: 0.9467
  - Scaled_RFE50%: 0.9431
  - Scaled_RFE75%: 0.9467

LightGBM:
  - Scaled_All: 0.9539
  - Scaled_Corr50%: 0.9571
  - Scaled_Corr75%: 0.9574
  - Scaled_RFE50%: 0.9574
  - Scaled_RFE75%: 0.9542

MLP:
  - Scaled_All: 0.9252
  - Scaled_Corr50%: 0.9384
  - Scaled_Corr75%: 0.9356
  - Scaled_RFE50%: 0

In [None]:
MODEL_SAVE_DIR = os.path.join('..', 'models')
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

print("\n--- Saving Best Model Per Algorithm (Based on Test F1) ---")

%store -r best_f1_per_model

if 'all_results' not in locals() or not all_results:
     print("No results found from model training and tuning. Nothing to save.")
elif 'best_f1_per_model' not in locals() or not best_f1_per_model:
     print("Could not determine best feature set per model. Skipping saving.")
else:
     
    initial_scaler_for_saving = feature_transformers.get('Scaled_All')
    SCALER_FILENAME = 'scaler_initial.pkl'
    SCALER_SAVE_PATH = os.path.join(MODEL_SAVE_DIR, SCALER_FILENAME)

    if initial_scaler_for_saving:
        if not os.path.exists(SCALER_SAVE_PATH):
            try:
                joblib.dump(initial_scaler_for_saving, SCALER_SAVE_PATH)
                print(f"Saved initial scaler: {SCALER_SAVE_PATH}")
            except Exception as e:
                print(f"Error saving initial scaler: {e}")
        else:
            print(f"Initial scaler already exists at {SCALER_SAVE_PATH}. Skipping save.")
    else:
        print("Initial scaler ('Scaled_All') not found in feature_transformers. Cannot save initial scaler.")

    print("\nSaving best model and transformer for each algorithm...")
    for model_name, (best_fs_name_for_model, best_test_f1_for_model) in best_f1_per_model.items():
        print(f"\nProcessing {model_name}...")
        if best_fs_name_for_model and model_name in all_results and best_fs_name_for_model in all_results[model_name]:
            best_combination_results = all_results[model_name][best_fs_name_for_model]
            model_to_save = best_combination_results.get('trained_model')
            transformer_to_save = best_combination_results.get('transformer')

            if model_to_save:
                model_filename = f'kaggle{model_name.lower()}_best_model_{best_fs_name_for_model}.pkl'
                MODEL_SAVE_PATH_ALG = os.path.join(MODEL_SAVE_DIR, model_filename)
                try:
                    joblib.dump(model_to_save, MODEL_SAVE_PATH_ALG)
                    print(f"   Saved model: {MODEL_SAVE_PATH_ALG}")
                except Exception as e:
                     print(f"   Error saving {model_name} model to {MODEL_SAVE_PATH_ALG}: {e}")
            else:
                 print(f"   No trained model found for {model_name} on {best_fs_name_for_model}.")

             
            if transformer_to_save and best_fs_name_for_model != 'Scaled_All':
                 transformer_filename = f'selector_{best_fs_name_for_model}.pkl'
                 TRANSFORMER_SAVE_PATH = os.path.join(MODEL_SAVE_DIR, transformer_filename)
                 try:
                     joblib.dump(transformer_to_save, TRANSFORMER_SAVE_PATH)
                     print(f"   Saved feature selection transformer: {TRANSFORMER_SAVE_PATH}")
                 except Exception as e:
                    print(f"   Error saving transformer for {best_fs_name_for_model} to {TRANSFORMER_SAVE_PATH}: {e}")
            elif best_fs_name_for_model != 'Scaled_All':
                print(f"   Warning: Feature selection transformer not found for {best_fs_name_for_model}. Cannot save it.")

        else:
            print(f"No valid results found for the best feature set '{best_fs_name_for_model}' for model {model_name}.")

print("\n--- Saving Process Complete ---")



--- Saving Best Model Per Algorithm (Based on Test F1) ---
no stored variable or alias best_f1_per_model
Initial scaler already exists at ..\models\scaler_initial.pkl. Skipping save.

Saving best model and transformer for each algorithm...

Processing SVM...
   Saved model: ..\models\kagglesvm_best_model_Scaled_All.pkl

Processing LightGBM...
   Saved model: ..\models\kagglelightgbm_best_model_Scaled_Corr75%.pkl
   Saved feature selection transformer: ..\models\selector_Scaled_Corr75%.pkl

Processing MLP...
   Saved model: ..\models\kagglemlp_best_model_Scaled_Corr50%.pkl
   Saved feature selection transformer: ..\models\selector_Scaled_Corr50%.pkl

--- Saving Process Complete ---
