In [1]:
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2
import pydicom
from PIL import Image

In [2]:
import os
import pydicom

def load_dicom_images_with_labels(root_dir):
    patient_data = {}
    tract_labels = {}

    # Iterate over 'cancer' and 'non_cancer' directories
    for label_dir in ['cancer', 'non_cancer']:
        label_path = os.path.join(root_dir, label_dir)

        # Check if the label_path is indeed a directory (ignore if not)
        if not os.path.isdir(label_path):
            continue

        label = 1 if label_dir == 'cancer' else 0

        # Iterate over patients within each label directory
        for patient in os.listdir(label_path):
            patient_path = os.path.join(label_path, patient)

            # Check if the patient_path is indeed a directory
            if not os.path.isdir(patient_path):
                continue

            tracts = patient_data.get(patient, {})

            # Iterate over tracts for each patient
            for tract in os.listdir(patient_path):
                tract_path = os.path.join(patient_path, tract)

                # Check if the tract_path is indeed a directory
                if not os.path.isdir(tract_path):
                    continue

                images = []

                # Load DICOM images
                for filename in os.listdir(tract_path):
                    if filename.endswith('.dcm'):
                        file_path = os.path.join(tract_path, filename)
                        if os.path.isfile(file_path):  # Ensure the path is a file
                            ds = pydicom.dcmread(file_path)
                            images.append(ds.pixel_array)
                            print(f"Loaded DICOM image from {file_path}")  # Debug: Print loaded file path

                if not images:
                    print(f"No DICOM images loaded for tract at {tract_path}")  # Debug: Notify if no images loaded

                # Save images and labels
                tracts[tract] = images
                tract_key = f"{label_dir}/{patient}/{tract}"
                tract_labels[tract_key] = label

            # Update the patient data with new or modified tracts
            patient_data[patient] = tracts

    return patient_data, tract_labels


In [3]:
import cv2
import numpy as np

def preprocess_image(image):
    # Normalize the input image to 8-bit if not already
    if image.dtype != np.uint8:
        norm_image = cv2.normalize(image, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)
    else:
        norm_image = image

    # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    clahe_img = clahe.apply(norm_image)

    # Re-normalize to 0-1 range if necessary for your application
    final_image = clahe_img.astype(np.float32) / 255

    return final_image



In [4]:
from skimage.feature import graycomatrix, graycoprops

def extract_features(image):
    # Ensure image is scaled back to uint8 for GLCM
    image_uint8 = (image * 255).astype(np.uint8)

    # Compute GLCM and extract features
    glcm = graycomatrix(image_uint8, [1], [0, np.pi/4, np.pi/2, 3*np.pi/4], 256, symmetric=True, normed=True)
    contrast = graycoprops(glcm, 'contrast').flatten()
    energy = graycoprops(glcm, 'energy').flatten()

    # Histogram features
    hist = cv2.calcHist([image_uint8], [0], None, [256], [0, 256]).flatten()

    return np.concatenate([contrast, energy, hist])



In [5]:
def aggregate_features(tracts):
    tract_features = []
    for tract, images in tracts.items():
        if images:  # Check if there are images
            all_features = [extract_features(preprocess_image(image)) for image in images]
            aggregated = np.mean(all_features, axis=0)  # Calculate mean of features
        else:
            print(f"No images to process for tract: {tract}")
            aggregated = np.nan  # Use NaN or a suitable default to indicate no data

        tract_features.append(aggregated)
    return tract_features



In [6]:
from sklearn.decomposition import PCA

def reduce_dimensions(features):
    pca = PCA(n_components=10)  # reduce to 10 dimensions
    reduced_features = pca.fit_transform(features)
    return reduced_features, pca


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def train_classifier(features, labels):
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)
    return clf, X_test, y_test


In [8]:
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, confusion_matrix, classification_report


def evaluate_classifier_extended(clf, X_test, y_test):
    # Predict probabilities for AUC
    y_probs = clf.predict_proba(X_test)[:, 1]  # probabilities for the positive class
    # Predict class labels for other metrics
    y_pred = clf.predict(X_test)

    # Calculate metrics
    auc = roc_auc_score(y_test, y_probs)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    
    # Print metrics
    print(f'Accuracy: {clf.score(X_test, y_test)}')
    print(f'AUC: {auc}')
    print(f'Recall: {recall}')
    print(f'Precision: {precision}')
    print(f'F1 Score: {f1}')
    print("Confusion Matrix:")
    print(confusion)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))


In [9]:
# Load data and labels
root_directory = '/Users/arjunmoorthy/Desktop/Research_Capstone/ImageData/new_train_data'
all_data, all_labels = load_dicom_images_with_labels(root_directory)

# Prepare features and labels for ML
features_list = []
labels_list = []

for patient, tracts in all_data.items():
    for tract, images in tracts.items():
        # Check if there are no images in the current tract
        if not images:
            print(f"Skipping empty tract for patient {patient}: {tract}")
            continue  # Skip this tract entirely as there are no images to process

        # Determine the correct label directory for each tract
        label_dir = next((key.split('/')[0] for key in all_labels.keys() if key.endswith(f"{patient}/{tract}")), None)
        
        # Check if the label directory was found
        if label_dir is None:
            print(f"No label directory found for {patient}/{tract}")
            continue

        # Form the correct key with label_dir
        key = f"{label_dir}/{patient}/{tract}"

        # Extract and aggregate features for each tract
        tract_features = aggregate_features({tract: images})
        features_list.extend(tract_features)

        # Append the label for each tract
        labels_list.append(all_labels[key])  # Using the correctly prefixed key

# Convert lists to numpy arrays
features_array = np.array(features_list)
labels_array = np.array(labels_list)

# Apply PCA if necessary
if features_array.shape[1] > 10:  # Check if PCA is necessary
    features_array, pca = reduce_dimensions(features_array)

# Train and evaluate the classifier
clf, X_test, y_test = train_classifier(features_array, labels_array)
evaluate_classifier_extended(clf, X_test, y_test)


Loaded DICOM image from /Users/arjunmoorthy/Desktop/Research_Capstone/ImageData/new_train_data/cancer/Prostate-MRI-US-Biopsy-1135/biopsy_9129_cancer/1-33.dcm
Loaded DICOM image from /Users/arjunmoorthy/Desktop/Research_Capstone/ImageData/new_train_data/cancer/Prostate-MRI-US-Biopsy-1135/biopsy_9129_cancer/1-27.dcm
Loaded DICOM image from /Users/arjunmoorthy/Desktop/Research_Capstone/ImageData/new_train_data/cancer/Prostate-MRI-US-Biopsy-1135/biopsy_9129_cancer/1-26.dcm
Loaded DICOM image from /Users/arjunmoorthy/Desktop/Research_Capstone/ImageData/new_train_data/cancer/Prostate-MRI-US-Biopsy-1135/biopsy_9129_cancer/1-32.dcm
Loaded DICOM image from /Users/arjunmoorthy/Desktop/Research_Capstone/ImageData/new_train_data/cancer/Prostate-MRI-US-Biopsy-1135/biopsy_9129_cancer/1-30.dcm
Loaded DICOM image from /Users/arjunmoorthy/Desktop/Research_Capstone/ImageData/new_train_data/cancer/Prostate-MRI-US-Biopsy-1135/biopsy_9129_cancer/1-31.dcm
Loaded DICOM image from /Users/arjunmoorthy/Desktop/

In [None]:
import os
import pydicom

def load_dicom_images_with_labels(root_dir):
    patient_data = {}
    tract_labels = {}

    # Iterate over 'cancer' and 'non_cancer' directories
    for label_dir in ['cancer', 'non_cancer']:
        label_path = os.path.join(root_dir, label_dir)
        label = 1 if label_dir == 'cancer' else 0

        # Iterate over patients within each label directory
        for patient in os.listdir(label_path):
            patient_path = os.path.join(label_path, patient)
            tracts = {}

            # Iterate over tracts for each patient
            for tract in os.listdir(patient_path):
                tract_path = os.path.join(patient_path, tract)
                images = []

                # Load DICOM images
                for filename in os.listdir(tract_path):
                    if filename.endswith('.dcm'):
                        file_path = os.path.join(tract_path, filename)
                        ds = pydicom.dcmread(file_path)
                        images.append(ds.pixel_array)

                # Save images and labels
                tracts[tract] = images
                tract_labels[os.path.join(label_dir, patient, tract)] = label

            # If patient already exists in data (cross-label scenario), update it
            if patient in patient_data:
                patient_data[patient].update(tracts)
            else:
                patient_data[patient] = tracts

    return patient_data, tract_labels


import cv2
import numpy as np

def preprocess_image(image):
    # Normalize the input image to 8-bit if not already
    if image.dtype != np.uint8:
        norm_image = cv2.normalize(image, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)
    else:
        norm_image = image

    # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    clahe_img = clahe.apply(norm_image)

    # Re-normalize to 0-1 range if necessary for your application
    final_image = clahe_img.astype(np.float32) / 255

    return final_image

from skimage.feature import greycomatrix, greycoprops

def extract_features(image):
    # Compute GLCM and extract contrast
    glcm = greycomatrix(image, [1], [0, np.pi/4, np.pi/2, 3*np.pi/4], 256, symmetric=True, normed=True)
    contrast = greycoprops(glcm, 'contrast').flatten()
    energy = greycoprops(glcm, 'energy').flatten()

    # Histogram features
    hist = cv2.calcHist([image], [0], None, [256], [0, 256]).flatten()

    return np.concatenate([contrast, energy, hist])
def aggregate_features(tracts):
    tract_features = []
    for tract, images in tracts.items():
        all_features = [extract_features(preprocess_image(image)) for image in images]
        aggregated = np.mean(all_features, axis=0)  # Example of using mean
        tract_features.append(aggregated)
    return tract_features
from sklearn.decomposition import PCA

def reduce_dimensions(features):
    pca = PCA(n_components=10)  # reduce to 10 dimensions
    reduced_features = pca.fit_transform(features)
    return reduced_features, pca
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def train_classifier(features, labels):
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)
    return clf, X_test, y_test
def evaluate_classifier(clf, X_test, y_test):
    accuracy = clf.score(X_test, y_test)
    print(f'Accuracy: {accuracy}')
# Load data and labels
root_directory = 'path_to_data_directory'
all_data, all_labels = load_dicom_images_with_labels(root_directory)

# Prepare features and labels for ML
features_list = []
labels_list = []

for patient, tracts in all_data.items():
    for tract, images in tracts.items():
        # Extract and aggregate features for each tract
        tract_features = aggregate_features({tract: images})
        features_list.extend(tract_features)
        # Append the label for each tract
        labels_list.append(all_labels[patient + '/' + tract])

# Convert lists to numpy arrays
features_array = np.array(features_list)
labels_array = np.array(labels_list)

# Apply PCA if necessary
if features_array.shape[1] > 10:  # Check if PCA is necessary
    features_array = reduce_dimensions(features_array)

# Train and evaluate the classifier
clf, X_test, y_test = train_classifier(features_array, labels_array)
evaluate_classifier(clf, X_test, y_test)
