# Lung Histopathology Classification: ACA / N / SCC
## Multi-CNN + Channel Attention + GA + KNN/SVM/RF + Fusion

This notebook implements a comprehensive lung histopathology classification system that combines:
- Multiple CNN backbones (DenseNet121, ResNet50, VGG16)
- Channel attention mechanism (SE-style blocks driven by both average- and max-pooled channel descriptors)
- Genetic Algorithm for feature selection
- Ensemble of classical ML classifiers (KNN, SVM, Random Forest)
- Majority voting fusion

In [None]:
# Import required libraries
import os, random, json
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.applications import DenseNet121, ResNet50, VGG16
from tensorflow.keras.applications.densenet import preprocess_input as pre_densenet
from tensorflow.keras.applications.resnet import preprocess_input as pre_resnet
from tensorflow.keras.applications.vgg16 import preprocess_input as pre_vgg
from tensorflow.keras.layers import (Input, GlobalAveragePooling2D, GlobalMaxPooling2D,
                                     Concatenate, Dense, Reshape, Multiply, Lambda)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import mode

# GA (DEAP)
from deap import base, creator, tools

print("All libraries imported successfully!")

In [None]:
# Pipeline-wide settings. DATA_DIR must be edited to point at the folder that
# holds the lung class sub-directories before the notebook is run.
SEED = 42
BATCH_SIZE = 24
IMG_SIZE = (224, 224)
DATA_DIR = "/path/to/lung_colon_image_set/lung_image_sets"  # << set this

# Seed the Python, NumPy and TensorFlow random generators with the same value.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# Echo the active configuration in one write.
print("\n".join([
    f"Configuration set:",
    f"Data Directory: {DATA_DIR}",
    f"Image Size: {IMG_SIZE}",
    f"Batch Size: {BATCH_SIZE}",
    f"Random Seed: {SEED}",
]))

In [None]:
# Data Generators Setup
# Only lung classes will be present in this directory if you set DATA_DIR as above:
# expected subfolders: lung_aca / lung_n / lung_scc

# One ImageDataGenerator serves both subsets: it reserves 20% of the files for
# the 'validation' subset and applies random rotation and horizontal flips.
# IMPORTANT: no rescale here, since we feed raw to model-specific preprocessors
# NOTE(review): because make_gen() always uses this generator, the random
# rotation/flip is applied to the 'validation' subset as well.
_DATAGEN_OPTIONS = {
    "validation_split": 0.20,
    "rotation_range": 20,
    "horizontal_flip": True,
}
train_datagen = ImageDataGenerator(**_DATAGEN_OPTIONS)


def make_gen(subset):
    """Return a shuffled directory iterator over DATA_DIR for the given subset.

    Args:
        subset: Subset name forwarded to ``flow_from_directory``
            (the notebook uses 'training' and 'validation').

    Returns:
        The iterator produced by ``train_datagen.flow_from_directory``; it
        yields batches of BATCH_SIZE images resized to IMG_SIZE together with
        categorical labels.
    """
    flow_options = {
        "target_size": IMG_SIZE,
        "class_mode": 'categorical',
        "batch_size": BATCH_SIZE,
        "subset": subset,
        "seed": SEED,
        "shuffle": True,
    }
    return train_datagen.flow_from_directory(DATA_DIR, **flow_options)

# Build the two iterators (training first, then validation) and derive the
# label lookup tables from the training iterator.
train_gen, val_gen = [make_gen(part) for part in ('training', 'validation')]

class_indices = train_gen.class_indices
num_classes = train_gen.num_classes
# Invert folder-name -> index into index -> folder-name for reporting.
id2label = {index: folder for folder, index in class_indices.items()}

print("Classes:", class_indices)
print(f"Number of classes: {num_classes}")
print(f"Training samples: {train_gen.samples}")
print(f"Validation samples: {val_gen.samples}")

In [None]:
# Channel Attention (SE Block) Implementation
def se_block(x, reduction=16, name=None):
    """Re-weight the channels of a feature map with a learned attention vector.

    The feature map is summarised per channel by global average pooling and by
    global max pooling. Both summaries go through the *same* two-layer MLP
    (bottleneck of ``ch // reduction`` units, ReLU, then ``ch`` units), the two
    outputs are added, squashed with a sigmoid and multiplied back onto ``x``.

    Args:
        x: Symbolic Keras tensor whose last axis is the channel axis; the two
            global pooling layers require it to be 4-D.
        reduction: Bottleneck ratio of the MLP. The hidden width is clamped to
            at least 1 so that inputs with fewer than ``reduction`` channels
            still work.
        name: Optional prefix for the layers created here. When omitted, Keras
            assigns automatic names.

    Returns:
        A tensor with the same shape as ``x``, scaled channel-wise.

    Raises:
        ValueError: If the channel dimension of ``x`` is not statically known.

    NOTE(review): the Dense weights created here are randomly initialised;
    nothing in the visible notebook trains them before features are extracted —
    confirm that this is intended.
    """
    ch = x.shape[-1]
    if ch is None:
        raise ValueError("se_block needs a statically known channel dimension")

    def layer_name(suffix):
        # Only name layers when the caller supplied a prefix.
        return f"{name}_{suffix}" if name else None

    gap = GlobalAveragePooling2D(name=layer_name("gap"))(x)
    gmp = GlobalMaxPooling2D(name=layer_name("gmp"))(x)

    # shared MLP: instantiate each Dense once and call it on both descriptors
    squeeze = Dense(max(1, ch // reduction), activation='relu', name=layer_name("fc1"))
    excite = Dense(ch, name=layer_name("fc2"))
    from_gap = excite(squeeze(gap))
    from_gmp = excite(squeeze(gmp))

    # Use Keras layers rather than raw tf ops: symbolic Keras tensors cannot be
    # passed to tf.nn.* functions on Keras 3.
    summed = tf.keras.layers.Add(name=layer_name("add"))([from_gap, from_gmp])
    scale = tf.keras.layers.Activation('sigmoid', name=layer_name("sigmoid"))(summed)
    scale = Reshape((1, 1, ch), name=layer_name("reshape"))(scale)
    return Multiply(name=layer_name("scale"))([x, scale])

print("SE block function defined successfully!")

In [None]:
# Preprocessing Lanes (one per backbone)
def lane(tensor, backbone="resnet"):
    """Build one feature branch: preprocess -> ImageNet backbone -> attention -> pooled vector.

    Args:
        tensor: Symbolic image tensor shared by all branches.
        backbone: "resnet" or "densenet" select ResNet50 or DenseNet121; any
            other value selects VGG16.

    Returns:
        The globally average-pooled output of the attention-weighted backbone
        feature map.
    """
    # (preprocessing function, Lambda layer name, backbone constructor)
    known_branches = {
        "resnet": (pre_resnet, "pre_resnet", ResNet50),
        "densenet": (pre_densenet, "pre_densenet", DenseNet121),
    }
    vgg_branch = (pre_vgg, "pre_vgg", VGG16)
    preprocess, pre_name, build_backbone = known_branches.get(backbone, vgg_branch)

    prepared = Lambda(preprocess, name=pre_name)(tensor)
    # Only the selected backbone is instantiated (and its weights loaded).
    fmap = build_backbone(include_top=False, weights='imagenet')(prepared)

    attended = se_block(fmap, reduction=16, name=f"se_{backbone}")
    return GlobalAveragePooling2D(name=f"gap_{backbone}")(attended)

print("Lane function defined successfully!")

In [None]:
# Build Feature Extractor Model
# One shared input feeds three backbone lanes; their pooled outputs are
# concatenated into a single feature vector per image.
print("Building multi-backbone feature extractor...")

# Derive the input shape from IMG_SIZE so the model always matches the size the
# data generators resize to (previously hard-coded to 224x224).
inp = Input(shape=(*IMG_SIZE, 3))
feat_d = lane(inp, "densenet")
feat_r = lane(inp, "resnet")
feat_v = lane(inp, "vgg")
# Column order of the final feature vector: DenseNet121, ResNet50, VGG16.
concat_feat = Concatenate(name="concat_feats")([feat_d, feat_r, feat_v])
feature_model = Model(inp, concat_feat)
feature_dim = feature_model.output_shape[-1]

print(f"Feature extractor built successfully!")
print(f"Feature dimension: {feature_dim}")
feature_model.summary()

In [None]:
# Extract Deep Features
def extract_features(generator):
    """Run one full pass of ``generator`` through the global ``feature_model``.

    Args:
        generator: Batch iterator that supports ``len()`` (number of batches
            per pass) and ``next()``, yielding ``(images, labels)`` pairs.

    Returns:
        Tuple ``(X, y)``: the per-batch feature arrays stacked row-wise and the
        per-batch label arrays stacked row-wise, in the order the batches were
        drawn.

    Raises:
        ValueError: If the generator reports zero batches.
    """
    steps = len(generator)
    if steps == 0:
        raise ValueError("generator yields no batches; check that DATA_DIR contains images")

    X, y = [], []
    for i in range(steps):
        # Use the iterator protocol: the ``.next()`` method is not available on
        # the iterators shipped with Keras 3.
        imgs, labels = next(generator)
        # predict_on_batch avoids the per-call setup cost of predict() when it
        # is invoked once per batch inside a Python loop.
        feats = feature_model.predict_on_batch(imgs)
        X.append(np.asarray(feats))
        y.append(labels)
        if (i + 1) % 10 == 0:
            print(f"Processed {i + 1}/{steps} batches")
    return np.vstack(X), np.vstack(y)

print("Feature extraction function defined!")

In [None]:
# Extract Training Features
# Runs every batch of the training iterator through the feature extractor.
# X_tr holds the stacked feature vectors; Y_tr_ohe holds the labels exactly as
# the iterator yields them (class_mode='categorical').
# NOTE(review): train_gen applies random rotation/flip, so these features come
# from one randomly augmented view of each image.
print("Extracting training features …")
X_tr, Y_tr_ohe = extract_features(train_gen)
print(f"Training features shape: {X_tr.shape}")
print(f"Training labels shape: {Y_tr_ohe.shape}")

In [None]:
# Extract Validation Features
# Same procedure as for the training subset, applied to the 'validation'
# iterator. X_va / Y_va_ohe are the stacked features and labels.
# NOTE(review): val_gen is built from the same augmenting generator as
# train_gen, so these images are randomly rotated/flipped too.
print("Extracting validation features …")
X_va, Y_va_ohe = extract_features(val_gen)
print(f"Validation features shape: {X_va.shape}")
print(f"Validation labels shape: {Y_va_ohe.shape}")

In [None]:
# Pool both subsets into one feature matrix and collapse the one-hot label
# rows into integer class ids (training rows first, validation rows after).
X_full = np.concatenate((X_tr, X_va), axis=0)
y_full = np.concatenate((Y_tr_ohe, Y_va_ohe), axis=0).argmax(axis=1)

print(f"Total features shape: {X_full.shape}")
print(f"Total labels shape: {y_full.shape}")
print(f"Classes present: {np.unique(y_full)}")
print(f"Class distribution: {np.bincount(y_full)}")

In [None]:
# GA-based Feature Selection Setup (DEAP)
# Hyperparameters of the genetic algorithm that searches over binary feature masks.
POP_SIZE = 40        # number of individuals in the population
N_GEN    = 10        # start smaller; increase later
CX_PROB  = 0.8       # chance that a pair of offspring is crossed over
MUT_PROB = 0.1       # chance that an offspring is handed to the mutation operator
INDPB    = 0.05      # per-gene flip probability passed to the mutation operator

# One gene per column of the pooled feature matrix.
n_features = X_full.shape[1]

print(f"GA Parameters:")
print(f"Population Size: {POP_SIZE}")
print(f"Generations: {N_GEN}")
print(f"Crossover Probability: {CX_PROB}")
print(f"Mutation Probability: {MUT_PROB}")
print(f"Total Features: {n_features}")

In [None]:
# Define GA Components
# Create the DEAP classes only if an earlier run of this cell has not already
# registered them on the creator module.
_registered = vars(creator)
if "FitnessMax" not in _registered:
    # Single-objective fitness that is maximised.
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if "Individual" not in _registered:
    # An individual is a plain list of genes carrying a FitnessMax instance.
    creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# Gene: a random 0/1 flag saying whether a feature column is kept.
toolbox.register("attr_bool", random.randint, 0, 1)
# Individual: n_features genes; population: a list of individuals.
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

def eval_fitness(individual):
    """Score a binary feature mask by penalised cross-validated KNN accuracy.

    Args:
        individual: Sequence of 0/1 genes, one per column of the global
            ``X_full``; genes equal to 1 mark the columns that are kept.

    Returns:
        A one-element tuple holding the fitness. Masks that keep fewer than
        two features score ``0.0``. Otherwise the score is the mean 3-fold
        accuracy of a 5-nearest-neighbour classifier on the kept columns,
        minus ``0.1 * (kept / n_features)``.

    NOTE(review): scoring uses all of ``X_full`` / ``y_full``; in this notebook
    the hold-out split is taken from the same arrays afterwards, so the
    selection has already seen the rows later used for testing.
    """
    kept = np.flatnonzero(np.asarray(individual) == 1)
    if kept.size < 2:
        return (0.0,)

    fold_scores = cross_val_score(
        KNeighborsClassifier(n_neighbors=5),
        X_full[:, kept],
        y_full,
        cv=3,
        scoring='accuracy',
    )
    # Small size penalty so that compact subsets win ties.
    size_penalty = 0.1 * (kept.size / n_features)
    return (float(fold_scores.mean() - size_penalty),)

# Plug the fitness function and the variation/selection operators into the toolbox.
toolbox.register("evaluate", eval_fitness)                   # penalised CV accuracy of a feature mask
toolbox.register("mate", tools.cxTwoPoint)                   # two-point crossover
toolbox.register("mutate", tools.mutFlipBit, indpb=INDPB)    # bit-flip mutation, per-gene probability INDPB
toolbox.register("select", tools.selTournament, tournsize=3) # tournament selection among 3 candidates

print("GA components defined successfully!")

In [None]:
# Initialize GA Population
# Draw POP_SIZE random 0/1 masks. The individuals are created without fitness
# values; evaluation happens in the evolution cell.
pop = toolbox.population(n=POP_SIZE)
print(f"GA initialized: pop={POP_SIZE}, feats={n_features}")
print("Starting genetic algorithm evolution...")

In [None]:
# Run GA Evolution
# Generational GA: tournament selection -> crossover -> mutation -> evaluation,
# with the best individual of each generation carried over unchanged.

# Evaluate the starting population before the first selection. Tournament
# selection compares fitness values, so selecting among unevaluated
# individuals would not reflect their quality.
unevaluated = [ind for ind in pop if not ind.fitness.valid]
if unevaluated:
    print(f"Evaluating {len(unevaluated)} initial individuals...")
    for ind, fit in zip(unevaluated, map(toolbox.evaluate, unevaluated)):
        ind.fitness.values = fit

for gen in range(N_GEN):
    print(f"\nGeneration {gen+1}/{N_GEN}")

    # Remember the current best so that variation cannot lose it (elitism).
    elite = toolbox.clone(tools.selBest(pop, 1)[0])

    offspring = toolbox.select(pop, len(pop))
    # Clone so that in-place crossover/mutation never touches the parents.
    offspring = list(map(toolbox.clone, offspring))

    # Crossover: neighbouring pairs are mated in place; their cached fitness
    # is dropped so they get re-evaluated below.
    for c1, c2 in zip(offspring[::2], offspring[1::2]):
        if random.random() < CX_PROB:
            toolbox.mate(c1, c2)
            del c1.fitness.values
            del c2.fitness.values

    # Mutation
    for ind in offspring:
        if random.random() < MUT_PROB:
            toolbox.mutate(ind)
            del ind.fitness.values

    # Evaluation: only individuals whose fitness was invalidated above.
    invalid = [ind for ind in offspring if not ind.fitness.valid]
    print(f"  Evaluating {len(invalid)} individuals...")
    fits = list(map(toolbox.evaluate, invalid))
    for ind, fit in zip(invalid, fits):
        ind.fitness.values = fit

    # Elitism: if the previous best beats the worst offspring, it takes that
    # slot, so the best fitness in the population never decreases.
    worst = tools.selWorst(offspring, 1)[0]
    if elite.fitness > worst.fitness:
        worst_pos = next(i for i, ind in enumerate(offspring) if ind is worst)
        offspring[worst_pos] = elite

    pop[:] = offspring
    gen_fits = [ind.fitness.values[0] for ind in pop]
    print(f"  Max fitness: {np.max(gen_fits):.4f}")
    print(f"  Avg fitness: {np.mean(gen_fits):.4f}")

print("\nGA evolution completed!")

In [None]:
# Take the fittest individual of the final population and convert its 0/1
# genes into the column indices of the features it keeps.
best = tools.selBest(pop, 1)[0]
sel_idx = np.flatnonzero(np.asarray(best) == 1).astype(int)

print(f"Selected {len(sel_idx)} / {n_features} features")
print(f"Feature selection ratio: {len(sel_idx)/n_features:.3f}")
print(f"Best fitness: {best.fitness.values[0]:.4f}")

In [None]:
# Prepare Selected Features for Training
# Keep only the GA-selected columns, then hold out a stratified 20% as the
# test set for the classical classifiers.
# NOTE(review): the GA fitness function scored candidate subsets on all of
# X_full / y_full, so the rows held out here already influenced which columns
# were selected. The reported test accuracy is therefore optimistic; splitting
# before feature selection would remove that leakage.
X_sel = X_full[:, sel_idx]
X_train, X_test, y_train, y_test = train_test_split(
    X_sel, y_full, test_size=0.20, random_state=SEED, stratify=y_full
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training class distribution: {np.bincount(y_train)}")
print(f"Test class distribution: {np.bincount(y_test)}")

In [None]:
# Initialize Classifiers
# Three classical models that are later fused by voting.
# NOTE(review): the selected features are fed in unscaled; KNN and the RBF SVM
# are distance-based, so standardising the features may be worth evaluating.
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
svm = SVC(kernel='rbf', probability=True, C=1.0, gamma='scale', random_state=SEED)
rf  = RandomForestClassifier(n_estimators=300, random_state=SEED, n_jobs=-1)

# The hyperparameters below are typed out by hand; keep them in sync with the
# constructor calls above.
print("Classifiers initialized:")
print(f"  KNN: k=5, weights='distance'")
print(f"  SVM: RBF kernel, C=1.0, gamma='scale'")
print(f"  Random Forest: 300 trees")

In [None]:
# Train Classifiers
# Fit each model on the same training split of the GA-selected features.
print("Training classifiers …")

print("  Training KNN...")
knn.fit(X_train, y_train)

print("  Training SVM...")
svm.fit(X_train, y_train)

print("  Training Random Forest...")
rf.fit(X_train, y_train)

print("All classifiers trained successfully!")

In [None]:
# Make Predictions
# Hard class predictions of each fitted model on the held-out test split;
# these three vectors are the votes used by the ensemble.
print("Making predictions...")

knn_pred = knn.predict(X_test)
svm_pred = svm.predict(X_test)
rf_pred  = rf.predict(X_test)

print("Predictions completed!")

In [None]:
# Accuracy and per-class report for each classifier on the test split.
print("Individual Classifier Accuracies:")
knn_acc, svm_acc, rf_acc = [
    accuracy_score(y_test, predicted)
    for predicted in (knn_pred, svm_pred, rf_pred)
]

print(f"  KNN: {knn_acc:.4f}")
print(f"  SVM: {svm_acc:.4f}")
print(f"  RF : {rf_acc:.4f}")

# Class names ordered by class index, used as row labels in the reports.
target_names = [id2label[class_id] for class_id in range(num_classes)]

print("\n=== KNN Classification Report ===")
knn_report = classification_report(y_test, knn_pred, target_names=target_names)
print(knn_report)

print("\n=== SVM Classification Report ===")
svm_report = classification_report(y_test, svm_pred, target_names=target_names)
print(svm_report)

print("\n=== Random Forest Classification Report ===")
rf_report = classification_report(y_test, rf_pred, target_names=target_names)
print(rf_report)

In [None]:
# Ensemble Fusion (Majority Voting)
# Row i of `preds` holds classifier i's vote for every test sample.
preds = np.stack([knn_pred, svm_pred, rf_pred], axis=0)
# Copy into a fresh array so the tie-break below can write into it.
ens = np.array(mode(preds, axis=0, keepdims=False).mode)

# With three voters and three classes, all votes can differ. scipy's mode then
# returns a single value (the smallest label in current SciPy), which would
# silently favour the lowest class index. For those samples fall back to soft
# voting: sum the three classifiers' predicted probabilities and take the
# highest-scoring class.
no_majority = (preds == ens).sum(axis=0) < 2
if no_majority.any():
    summed_proba = sum(
        model.predict_proba(X_test[no_majority]) for model in (knn, svm, rf)
    )
    # All three models were fitted on the same y_train, so their classes_
    # arrays share one ordering.
    ens[no_majority] = knn.classes_[np.argmax(summed_proba, axis=1)]

ens_acc = accuracy_score(y_test, ens)

print(f"Ensemble Accuracy (Majority Voting): {ens_acc:.4f}")
print(f"\nImprovement over best individual: {ens_acc - max(knn_acc, svm_acc, rf_acc):.4f}")

print("\n=== Ensemble Classification Report ===")
print(classification_report(y_test, ens, target_names=target_names))

In [None]:
# Summary Results
# Recap of the run: data volume, GA outcome and the accuracy of every model.
print("\n" + "="*60)
print("FINAL RESULTS SUMMARY")
print("="*60)
print(f"Total samples processed: {len(y_full)}")
print(f"Features selected by GA: {len(sel_idx)} / {n_features} ({len(sel_idx)/n_features:.1%})")
print(f"Test set size: {len(y_test)}")
print("\nClassifier Accuracies:")

# Mark whichever model(s) actually scored highest instead of always labelling
# the ensemble as the best.
_summary_scores = {
    "KNN": knn_acc,
    "SVM": svm_acc,
    "Random Forest": rf_acc,
    "Ensemble (Fusion)": ens_acc,
}
_top_score = max(_summary_scores.values())
for _model_name, _model_acc in _summary_scores.items():
    _marker = " ← BEST" if _model_acc == _top_score else ""
    print(f"  {_model_name + ':':<19}{_model_acc:.4f}{_marker}")

print("\nClass Labels:")
for i, label in id2label.items():
    print(f"  {i}: {label}")
print("="*60)