In [4]:
# --- 1. IMPORTS (with TensorFlow) ---
import os
import cv2
import numpy as np
import pandas as pd
import joblib
import warnings
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

warnings.filterwarnings('ignore')
print("Libraries imported successfully.")

# --- 2. LOAD AND PREPROCESS DATA ---
MAX_IMAGES_PER_CLASS = 400
IMG_SIZE = 64

def load_images_for_sklearn(folder_path):
    images_flat = []
    labels = []
    for label_name in ['NORMAL', 'PNEUMONIA']:
        class_path = os.path.join(folder_path, label_name)
        count = 0
        for filename in os.listdir(class_path):
            if count >= MAX_IMAGES_PER_CLASS: break
            img_path = os.path.join(class_path, filename)
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            img_resized = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
            images_flat.append(img_resized.flatten())
            labels.append(0 if label_name == 'NORMAL' else 1)
            count += 1
    return np.array(images_flat), np.array(labels)

def load_images_for_cnn(folder_path):
    images_cnn = []
    labels = []
    for label_name in ['NORMAL', 'PNEUMONIA']:
        class_path = os.path.join(folder_path, label_name)
        count = 0
        for filename in os.listdir(class_path):
            if count >= MAX_IMAGES_PER_CLASS: break
            img_path = os.path.join(class_path, filename)
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            img_resized = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
            images_cnn.append(img_resized)
            labels.append(0 if label_name == 'NORMAL' else 1)
            count += 1
    # Normalize and reshape for CNN
    images_cnn = np.array(images_cnn) / 255.0
    images_cnn = images_cnn.reshape(-1, IMG_SIZE, IMG_SIZE, 1)
    return images_cnn, np.array(labels)

# --- 3. LOAD DATA AND SPLIT ---
train_data_path = '../data/chest_xray/train'
print("Loading data for Scikit-learn models...")
X_flat, y_flat = load_images_for_sklearn(train_data_path)
X_train_flat, X_test_flat, y_train, y_test = train_test_split(X_flat, y_flat, test_size=0.2, random_state=42, stratify=y_flat)

print("Loading data for CNN model...")
X_cnn, y_cnn = load_images_for_cnn(train_data_path)
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(X_cnn, y_cnn, test_size=0.2, random_state=42, stratify=y_cnn)

# --- 4. TRAIN AND SAVE SKLEARN MODELS ---
sklearn_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(probability=True), # probability=True is needed for predict_proba
    "Random Forest": RandomForestClassifier()
}

os.makedirs('../models', exist_ok=True)
print("\n--- Training and Saving Scikit-learn Models ---")
for name, model in sklearn_models.items():
    print(f"Training {name}...")
    model.fit(X_train_flat, y_train)
    y_pred = model.predict(X_test_flat)
    print(f"Accuracy for {name}: {accuracy_score(y_test, y_pred):.4f}")
    
    # Save model
    filename = f"../models/{name.replace(' (', '_').replace(')', '').replace(' ', '_')}.joblib"
    joblib.dump(model, filename)
    print(f"Saved {name} to {filename}")

# --- 5. DEFINE, TRAIN, AND SAVE CNN MODEL ---
print("\n--- Defining and Training CNN Model ---")
cnn_model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid') # Sigmoid for binary classification
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.summary()

# Train the CNN
history = cnn_model.fit(X_train_cnn, y_train_cnn, epochs=10, validation_data=(X_test_cnn, y_test_cnn), batch_size=32)

# Save the CNN model
cnn_filename = '../models/CNN.keras'
cnn_model.save(cnn_filename)
print(f"Saved CNN model to {cnn_filename}")

print("\n--- All models trained and saved. ---")

Libraries imported successfully.
Loading data for Scikit-learn models...
Loading data for CNN model...

--- Training and Saving Scikit-learn Models ---
Training Logistic Regression...
Accuracy for Logistic Regression: 0.9625
Saved Logistic Regression to ../models/Logistic_Regression.joblib
Training KNN (k=5)...
Accuracy for KNN (k=5): 0.8875
Saved KNN (k=5) to ../models/KNN_k=5.joblib
Training Naive Bayes...
Accuracy for Naive Bayes: 0.8688
Saved Naive Bayes to ../models/Naive_Bayes.joblib
Training SVM...
Accuracy for SVM: 0.9500
Saved SVM to ../models/SVM.joblib
Training Random Forest...
Accuracy for Random Forest: 0.9437
Saved Random Forest to ../models/Random_Forest.joblib

--- Defining and Training CNN Model ---


Epoch 1/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 127ms/step - accuracy: 0.5969 - loss: 0.6548 - val_accuracy: 0.7437 - val_loss: 0.5707
Epoch 2/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 123ms/step - accuracy: 0.8125 - loss: 0.4507 - val_accuracy: 0.9062 - val_loss: 0.2908
Epoch 3/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 122ms/step - accuracy: 0.8875 - loss: 0.2880 - val_accuracy: 0.9438 - val_loss: 0.1980
Epoch 4/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 88ms/step - accuracy: 0.9266 - loss: 0.2317 - val_accuracy: 0.9187 - val_loss: 0.1991
Epoch 5/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 108ms/step - accuracy: 0.9344 - loss: 0.1926 - val_accuracy: 0.9312 - val_loss: 0.1536
Epoch 6/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 74ms/step - accuracy: 0.9344 - loss: 0.1863 - val_accuracy: 0.9563 - val_loss: 0.1420
Epoch 7/10
[1m20/20[0m [32m