## 1. Import Needed Libraries

In [None]:
# --- General Libraries ---
import os
import warnings

# --- Data Handling ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# --- Image Handling and Visualization ---
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# --- PyTorch and Deep Learning ---
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

# --- Warning Suppression ---
warnings.filterwarnings('ignore')


## 2. Device Configuration

In this section, we configure the device for PyTorch computations. If a CUDA-enabled GPU is available, it will be used for faster processing. Otherwise, the computations will fall back to the CPU. Additional details about the CUDA device are also printed if available.

In [None]:
# Pick the compute backend once: a CUDA GPU when PyTorch can see one,
# otherwise the CPU.
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda") if cuda_ready else torch.device("cpu")

# Report which backend was selected.
print(f"Using {device} device")
print(f"CUDA Available: {cuda_ready}")

# Extra GPU details are only meaningful when CUDA is usable.
if cuda_ready:
    active_index = torch.cuda.current_device()
    print(f"Number of CUDA Devices: {torch.cuda.device_count()}")
    print(f"Current CUDA Device: {active_index}")
    print(f"CUDA Device Name: {torch.cuda.get_device_name(active_index)}")

## 3. Data Loading and Visualization

In [None]:
# Root folder of the dataset; the class sub-folders listed below are
# resolved relative to it.
root_dir = "lung_colon_image_set"
image_paths = []
binary_labels = []
multi_labels = []

# Five-way label: one integer id per class folder.
label_mapping = {
    'lung_n': 0, 'lung_aca': 1, 'lung_scc': 2,
    'colon_n': 3, 'colon_aca': 4
}

# Two-way label: the '*_n' folders map to 0, every other folder maps to 1.
binary_mapping = {
    'lung_n': 0, 'colon_n': 0,
    'lung_aca': 1, 'lung_scc': 1, 'colon_aca': 1
}

# Only entries with one of these (case-insensitive) extensions are treated as
# images, so stray files such as notebook checkpoints or OS metadata are not
# handed to the image loader later on.
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')

class_subfolders = [
    'lung_image_sets/lung_n', 'lung_image_sets/lung_aca', 'lung_image_sets/lung_scc',
    'colon_image_sets/colon_n', 'colon_image_sets/colon_aca',
]

for subfolder in class_subfolders:
    class_dir = os.path.join(root_dir, subfolder)
    class_name = subfolder.split('/')[-1]

    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and therefore the seeded train/test split) reproducible.
    for img_file in sorted(os.listdir(class_dir)):
        img_path = os.path.join(class_dir, img_file)

        # Skip sub-directories and anything that is not an image file.
        if not os.path.isfile(img_path):
            continue
        if not img_file.lower().endswith(image_extensions):
            continue

        image_paths.append(img_path)
        multi_labels.append(label_mapping[class_name])
        binary_labels.append(binary_mapping[class_name])

print(f"Total Image: {len(image_paths)}")

In [None]:
# Augmenting transform for the training split: resize to the 224x224 input
# size, apply random geometric and colour perturbations, then convert to a
# tensor and normalise with the same mean/std as the evaluation transform.
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),  # Random color jitter
    # torchvision.transforms has no RandomZoom (that is a Keras layer name);
    # a +/-10% random zoom is expressed as a scale-only RandomAffine.
    transforms.RandomAffine(degrees=0, scale=(0.9, 1.1)),
    transforms.RandomVerticalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalization
])

# Deterministic preprocessing for the test/validation split: no augmentation,
# just resize, tensor conversion and per-channel normalisation.
test_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
class FeatureDataset(Dataset):
    """Dataset of image files paired with a binary and a multiclass label.

    Each item is a tuple ``(image, binary_label, multi_label)`` where the
    image is loaded from disk, converted to RGB and, if a transform was
    given, passed through it.

    Args:
        image_paths: Sequence of image file paths.
        binary_labels: Sequence of binary labels, aligned with ``image_paths``.
        multi_labels: Sequence of multiclass labels, aligned with
            ``image_paths``.
        transform: Optional callable applied to the RGB PIL image.

    Raises:
        ValueError: If the three sequences do not have the same length.
    """

    def __init__(self, image_paths, binary_labels, multi_labels, transform=None):
        # Fail early on misaligned inputs instead of raising an IndexError
        # (or silently ignoring trailing labels) in the middle of iteration.
        if not (len(image_paths) == len(binary_labels) == len(multi_labels)):
            raise ValueError(
                "image_paths, binary_labels and multi_labels must have the same "
                f"length, got {len(image_paths)}, {len(binary_labels)} and "
                f"{len(multi_labels)}"
            )
        self.image_paths = image_paths
        self.binary_labels = binary_labels
        self.multi_labels = multi_labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, binary_label, multi_label)``."""
        # Image.open keeps the underlying file open until the image is closed;
        # convert() returns an independent copy, so the file can be released
        # immediately instead of waiting for garbage collection.
        with Image.open(self.image_paths[idx]) as img:
            image = img.convert("RGB")
        # Compare against None explicitly so a transform object that happens
        # to be falsy (e.g. an empty container) is still applied.
        if self.transform is not None:
            image = self.transform(image)
        return image, self.binary_labels[idx], self.multi_labels[idx]


In [None]:
# Hold out 30% of the samples for testing. Paths and both label lists are
# split together so they stay aligned; stratification uses the multiclass
# labels.
(
    X_train, X_test,
    y_bin_train, y_bin_test,
    y_multi_train, y_multi_test,
) = train_test_split(
    image_paths,
    binary_labels,
    multi_labels,
    test_size=0.3,
    stratify=multi_labels,
    random_state=42,
)

# Wrap each split in a dataset: augmenting transform for training,
# plain preprocessing for testing.
train_dataset = FeatureDataset(
    image_paths=X_train,
    binary_labels=y_bin_train,
    multi_labels=y_multi_train,
    transform=train_transform,
)
test_dataset = FeatureDataset(
    image_paths=X_test,
    binary_labels=y_bin_test,
    multi_labels=y_multi_test,
    transform=test_transform,
)

# Batch both splits; only the training loader shuffles.
loader_batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size, shuffle=False)

In [None]:
# Pretrained EfficientNet-B3 used as a frozen feature extractor.
pretrained_weights = models.EfficientNet_B3_Weights.DEFAULT
model = models.efficientnet_b3(weights=pretrained_weights)
# Replace the classification head with a pass-through so the forward pass
# returns the features that would have been fed to the classifier.
model.classifier = torch.nn.Identity()
model = model.to(device)
# Inference mode for layers such as dropout and batch norm.
model.eval()


In [None]:
def _labels_to_numpy(labels):
    """Convert one batch of labels (tensor or sequence) to a NumPy array."""
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()
    return np.atleast_1d(np.asarray(labels))


def extract_features(dataloader, feature_model=None, target_device=None):
    """Run every batch through the feature model and collect the results.

    Args:
        dataloader: Iterable yielding ``(inputs, binary_labels, multi_labels)``
            batches.
        feature_model: Callable applied to each input batch. Defaults to the
            module-level ``model``.
        target_device: Device the inputs are moved to before the forward pass.
            Defaults to the module-level ``device``.

    Returns:
        A tuple ``(features, binary_labels, multi_labels)`` of NumPy arrays,
        each concatenated over all batches in iteration order.

    Raises:
        ValueError: If ``dataloader`` yields no batches.

    Note:
        The model's train/eval mode is not changed here; the caller is
        responsible for putting it in eval mode.
    """
    net = model if feature_model is None else feature_model
    dev = device if target_device is None else target_device

    feature_batches = []
    bin_batches = []
    multi_batches = []

    # Gradients are not needed for feature extraction.
    with torch.no_grad():
        for inputs, bin_labels, multi_labels in dataloader:
            features = net(inputs.to(dev))
            feature_batches.append(features.cpu().numpy())
            # Convert whole batches at once rather than iterating the label
            # tensors element by element.
            bin_batches.append(_labels_to_numpy(bin_labels))
            multi_batches.append(_labels_to_numpy(multi_labels))

    if not feature_batches:
        raise ValueError("extract_features received a dataloader with no batches")

    return (
        np.concatenate(feature_batches),
        np.concatenate(bin_batches),
        np.concatenate(multi_batches),
    )

In [None]:
# Run both splits through the feature extractor. The label arrays returned
# here replace the earlier split labels, so they follow the order in which
# the loaders produced the samples.
train_outputs = extract_features(train_loader)
test_outputs = extract_features(test_loader)

X_train_feat, y_bin_train, y_multi_train = train_outputs
X_test_feat, y_bin_test, y_multi_test = test_outputs

In [None]:
# Random forest for the two-class task, trained on the extracted features.
rf_binary = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_binary.fit(X=X_train_feat, y=y_bin_train)

In [None]:
# Score the binary classifier on the held-out features.
binary_class_names = ["Benign", "Malignant"]
y_bin_pred = rf_binary.predict(X_test_feat)

print("🔎 Binary Classification Report:")
binary_report = classification_report(
    y_bin_test,
    y_bin_pred,
    target_names=binary_class_names,
)
print(binary_report)

In [None]:
# Random forest for the five-class task, trained on the same features.
rf_multi = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_multi.fit(X=X_train_feat, y=y_multi_train)

In [None]:
# Score the multiclass classifier on the held-out features. The display
# names are listed in label-id order (0 through 4).
multi_class_names = [
    "Lung Benign", "Lung ACA", "Lung SCC", "Colon Benign", "Colon ACA"
]
y_multi_pred = rf_multi.predict(X_test_feat)

print("🔎 Multiclass Classification Report:")
multi_report = classification_report(
    y_multi_test,
    y_multi_pred,
    target_names=multi_class_names,
)
print(multi_report)