# Feature Extraction

### The code below is too heavy to run on a Mac (needs a higher-end system)

In [None]:
import os
import numpy as np
import pandas as pd

# Import feature extraction functions from the available modules.
# Ensure these modules are in your PYTHONPATH or in the same directory.
from modules.shape_features import extract_nodule_features
from modules.texture_features import extract_texture_features
from modules.artifact_noise_features import extract_artifact_noise_features, get_bounding_box
from modules.statistic_features import extract_intensity_features

# Extract shape, texture, artifact/noise and intensity features for every
# nodule mask found under the processed-data root and write them to one CSV.
#
# Expected layout (as read by the code below):
#   <data_processed_root>/<class>/<patient_id>/<patient_id>_full_volume.npy
#   <data_processed_root>/<class>/<patient_id>/<noduleNumber>_asrg_mask.npy
#   <data_processed_root>/<class>/<patient_id>/<noduleNumber>_cmw_mask.npy

# Base directory for processed data.
data_processed_root = "../data_processed"

# The two classes; each one is a sub-folder of the processed-data root.
classes = ["RealCancerous", "FakeAddedCancer"]

# One feature dictionary per nodule, accumulated across classes and patients.
all_features = []

for cls in classes:
    class_dir = os.path.join(data_processed_root, cls)
    if not os.path.isdir(class_dir):
        print(f"Directory for class {cls} not found: {class_dir}")
        continue

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the resulting CSV reproducible between runs and machines.
    patient_dirs = [
        os.path.join(class_dir, entry)
        for entry in sorted(os.listdir(class_dir))
        if os.path.isdir(os.path.join(class_dir, entry))
    ]
    for patient_dir in patient_dirs:
        patient_id = os.path.basename(patient_dir)
        # Full CT scan file is expected to be named "[patient_id]_full_volume.npy".
        ct_scan_path = os.path.join(patient_dir, f"{patient_id}_full_volume.npy")
        if not os.path.isfile(ct_scan_path):
            print(f"CT volume not found for patient {patient_id} at {ct_scan_path}")
            continue

        # List the patient folder once. Prefer ASRG masks; fall back to CMW
        # masks only when the patient has no ASRG mask at all.
        patient_files = sorted(os.listdir(patient_dir))
        mask_files = [f for f in patient_files if f.endswith("_asrg_mask.npy")]
        if not mask_files:
            mask_files = [f for f in patient_files if f.endswith("_cmw_mask.npy")]

        if not mask_files:
            print(f"Warning: No nodule masks found for patient {patient_id}. Skipping patient.")
            continue

        # Load the (large) CT volume only once we know there is a mask to
        # process, so patients without masks cost no volume load.
        ct_volume = np.load(ct_scan_path)

        for mask_file in mask_files:
            # Mask files are named "<noduleNumber>_asrg_mask.npy" or
            # "<noduleNumber>_cmw_mask.npy"; the prefix is the nodule number.
            nodule_number = mask_file.split("_")[0]
            mask = np.load(os.path.join(patient_dir, mask_file))

            # The bounding box of the mask is used directly as an index into
            # the CT volume; a None bounding box is treated as an empty mask.
            bbox = get_bounding_box(mask)
            if bbox is None:
                print(f"Warning: No nodule found in mask {mask_file} for patient {patient_id}. Skipping this mask.")
                continue
            roi = ct_volume[bbox]

            # Shape features use the mask only (unit spacing on every axis),
            # texture and intensity features use the cropped ROI, and the
            # artifact/noise features use the full volume together with the mask.
            shape_feats = extract_nodule_features(mask, spacing=(1, 1, 1))
            texture_feats = extract_texture_features(roi)  # module defaults
            artifact_noise_feats = extract_artifact_noise_features(ct_volume, mask)
            intensity_feats = extract_intensity_features(roi)

            # Merge in the same order as before: on a key clash the later
            # feature group wins, and the identifiers always win.
            features = {
                **shape_feats,
                **texture_feats,
                **artifact_noise_feats,
                **intensity_feats,
                "nodule_number": nodule_number,
                "patient_id": patient_id,
                "class": cls,
            }
            all_features.append(features)

# Create a DataFrame from the collected features.
df_features = pd.DataFrame(all_features)

# Save the DataFrame as a CSV file in the processed data root.
output_csv_path = os.path.join(data_processed_root, "nodule_features.csv")
df_features.to_csv(output_csv_path, index=False)
print(f"Feature extraction complete. CSV saved at: {output_csv_path}")


## Feature Extraction Broken Down by Feature Type

## Shape-Based Features

In [2]:
import os
import numpy as np
import pandas as pd

# Import the shape-based feature extraction function.
from modules.shape_features import extract_nodule_features

# Extract shape-based features for every nodule mask and write the initial
# "nodule_features.csv" (one row per nodule: identifiers + shape features).

# Base directory for processed data.
data_processed_root = "../data_processed"

# The two classes; each one is a sub-folder of the processed-data root.
classes = ["RealCancerous", "FakeAddedCancer"]

# One dictionary (identifiers + shape features) per nodule.
all_features = []

for cls in classes:
    class_dir = os.path.join(data_processed_root, cls)
    if not os.path.isdir(class_dir):
        print(f"Directory for class {cls} not found: {class_dir}")
        continue

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the CSV written below reproducible between runs.
    patient_dirs = [
        os.path.join(class_dir, entry)
        for entry in sorted(os.listdir(class_dir))
        if os.path.isdir(os.path.join(class_dir, entry))
    ]
    for patient_dir in patient_dirs:
        patient_id = os.path.basename(patient_dir)
        # The CT volume is not read here, but patients without one are
        # skipped so no row is written for them.
        ct_scan_path = os.path.join(patient_dir, f"{patient_id}_full_volume.npy")
        if not os.path.isfile(ct_scan_path):
            print(f"CT volume not found for patient {patient_id} at {ct_scan_path}. Skipping patient.")
            continue

        # List the patient folder once. Prefer ASRG masks; fall back to CMW
        # masks only when the patient has no ASRG mask at all.
        patient_files = sorted(os.listdir(patient_dir))
        mask_files = [f for f in patient_files if f.endswith("_asrg_mask.npy")]
        if not mask_files:
            mask_files = [f for f in patient_files if f.endswith("_cmw_mask.npy")]

        if not mask_files:
            print(f"Warning: No nodule masks found for patient {patient_id}. Skipping patient.")
            continue

        for mask_file in mask_files:
            # Mask files are named like "1_asrg_mask.npy" or "1_cmw_mask.npy";
            # the prefix before the first underscore is the nodule number.
            nodule_number = mask_file.split("_")[0]
            mask_path = os.path.join(patient_dir, mask_file)
            try:
                mask = np.load(mask_path)
            except Exception as e:
                # Best effort: an unreadable mask is reported and skipped so
                # one bad file does not abort the whole run.
                print(f"Error loading mask {mask_file} for patient {patient_id}: {e}")
                continue

            # Unit spacing on every axis; adjust the tuple if the voxel
            # spacing of the scans differs.
            shape_feats = extract_nodule_features(mask, spacing=(1, 1, 1))

            # Identifiers come first; a shape feature with the same key
            # would overwrite the identifier value (as in the update() form).
            feature_dict = {
                "nodule_number": nodule_number,
                "patient_id": patient_id,
                "class": cls,
                **shape_feats,
            }
            all_features.append(feature_dict)

# Create a DataFrame from the collected shape features.
df_shape = pd.DataFrame(all_features)

# Save the shape-based features to a CSV file.
output_csv_path = os.path.join(data_processed_root, "nodule_features.csv")
df_shape.to_csv(output_csv_path, index=False)
print(f"Shape-based feature extraction complete. CSV saved at: {output_csv_path}")


Shape-based feature extraction complete. CSV saved at: ../data_processed/nodule_features.csv


## Texture-Based Features

In [5]:
import os
import numpy as np
import pandas as pd

# Import the texture-based feature extraction function and the bounding box helper.
from modules.texture_features import extract_texture_features
from modules.artifact_noise_features import get_bounding_box

# Add texture-based feature columns to the existing "nodule_features.csv".
# Each CSV row identifies one nodule; its mask and CT volume are located on
# disk from the row's class, patient_id and nodule_number.

# Base directory for processed data.
data_processed_root = "../data_processed"

# Path to the existing features CSV (from the shape-based step).
csv_path = os.path.join(data_processed_root, "nodule_features.csv")

# Read the identifier columns as text. With dtype inference an ID such as
# "0123" would be parsed as the integer 123 and the folder/file names rebuilt
# below would no longer match what is on disk.
df = pd.read_csv(
    csv_path,
    dtype={"patient_id": str, "nodule_number": str, "class": str},
)

# One texture-feature dictionary per CSV row, in row order. Rows that cannot
# be processed get an empty dict so positions stay aligned with `df`.
texture_feature_list = []

# Most recently loaded CT volume; consecutive rows of the same patient reuse
# it instead of reloading the whole volume for every nodule.
loaded_ct_path = None
ct_volume = None

for index, row in df.iterrows():
    patient_id = str(row['patient_id'])
    nodule_number = str(row['nodule_number'])
    cls = str(row['class'])

    patient_dir = os.path.join(data_processed_root, cls, patient_id)
    ct_scan_path = os.path.join(patient_dir, f"{patient_id}_full_volume.npy")

    if not os.path.isfile(ct_scan_path):
        print(f"CT scan not found for patient {patient_id}. Skipping row.")
        texture_feature_list.append({})
        continue

    # Look for the nodule mask: first try ASRG, then fall back to CMW.
    mask_path = None
    for mask_kind in ("asrg", "cmw"):
        candidate = os.path.join(patient_dir, f"{nodule_number}_{mask_kind}_mask.npy")
        if os.path.isfile(candidate):
            mask_path = candidate
            break
    if mask_path is None:
        print(f"Mask not found for patient {patient_id}, nodule {nodule_number}. Skipping row.")
        texture_feature_list.append({})
        continue

    if ct_scan_path != loaded_ct_path:
        ct_volume = np.load(ct_scan_path)
        loaded_ct_path = ct_scan_path
    mask = np.load(mask_path)

    # The bounding box of the mask is used directly as an index into the CT
    # volume; a None bounding box is treated as an empty mask.
    bbox = get_bounding_box(mask)
    if bbox is None:
        print(f"No nodule found in mask for patient {patient_id}, nodule {nodule_number}. Skipping row.")
        texture_feature_list.append({})
        continue
    roi = ct_volume[bbox]

    # Extract texture-based features from the ROI (module default parameters).
    texture_feature_list.append(extract_texture_features(roi))

# Convert the texture feature dictionaries into a DataFrame.
df_texture = pd.DataFrame(texture_feature_list)

# If this cell was run before, the CSV already contains these columns; drop
# the stale copies so a re-run replaces them instead of adding duplicates.
stale_columns = [col for col in df_texture.columns if col in df.columns]

# Append the new texture columns to the original DataFrame (row-aligned).
df_updated = pd.concat([df.drop(columns=stale_columns), df_texture], axis=1)

# Save the updated DataFrame back to CSV.
output_csv_path = os.path.join(data_processed_root, "nodule_features.csv")
df_updated.to_csv(output_csv_path, index=False)
print(f"Texture-based feature extraction complete. Updated CSV saved at: {output_csv_path}")




Texture-based feature extraction complete. Updated CSV saved at: ../data_processed/nodule_features.csv


## Artifact-Noise-Based Features

### Too Heavy to Run on Mac

In [None]:
import os
import numpy as np
import pandas as pd

# Import the artifact-noise feature extraction function.
from modules.artifact_noise_features import extract_artifact_noise_features

# Add artifact/noise feature columns to the existing "nodule_features.csv".
# Each CSV row identifies one nodule; its mask and CT volume are located on
# disk from the row's class, patient_id and nodule_number.

# Base directory for processed data.
data_processed_root = "../data_processed"

# Path to the existing CSV file (from previous steps).
csv_path = os.path.join(data_processed_root, "nodule_features.csv")

# Read the identifier columns as text. With dtype inference an ID such as
# "0123" would be parsed as the integer 123 and the folder/file names rebuilt
# below would no longer match what is on disk.
df = pd.read_csv(
    csv_path,
    dtype={"patient_id": str, "nodule_number": str, "class": str},
)

# One artifact/noise feature dictionary per CSV row, in row order. Rows that
# cannot be processed get an empty dict so positions stay aligned with `df`.
artifact_feature_list = []

# Most recently loaded CT volume; consecutive rows of the same patient reuse
# it instead of reloading the whole volume for every nodule.
loaded_ct_path = None
ct_volume = None

for index, row in df.iterrows():
    patient_id = str(row['patient_id'])
    nodule_number = str(row['nodule_number'])
    cls = str(row['class'])

    patient_dir = os.path.join(data_processed_root, cls, patient_id)
    ct_scan_path = os.path.join(patient_dir, f"{patient_id}_full_volume.npy")

    if not os.path.isfile(ct_scan_path):
        print(f"CT scan not found for patient {patient_id}. Skipping row.")
        artifact_feature_list.append({})
        continue

    # Look for the corresponding mask: first try ASRG, then fall back to CMW.
    mask_path = None
    for mask_kind in ("asrg", "cmw"):
        candidate = os.path.join(patient_dir, f"{nodule_number}_{mask_kind}_mask.npy")
        if os.path.isfile(candidate):
            mask_path = candidate
            break
    if mask_path is None:
        print(f"Mask not found for patient {patient_id}, nodule {nodule_number}. Skipping row.")
        artifact_feature_list.append({})
        continue

    if ct_scan_path != loaded_ct_path:
        ct_volume = np.load(ct_scan_path)
        loaded_ct_path = ct_scan_path
    mask = np.load(mask_path)

    # Artifact/noise features are computed from the full CT volume plus the
    # mask. A failure on one nodule is reported and recorded as an empty row
    # so the remaining nodules are still processed.
    try:
        artifact_feats = extract_artifact_noise_features(ct_volume, mask)
    except Exception as e:
        print(f"Error processing artifact-noise features for patient {patient_id}, nodule {nodule_number}: {e}")
        artifact_feature_list.append({})
    else:
        artifact_feature_list.append(artifact_feats)

# Convert the artifact feature dictionaries to a DataFrame.
df_artifact = pd.DataFrame(artifact_feature_list)

# If this cell was run before, the CSV already contains these columns; drop
# the stale copies so a re-run replaces them instead of adding duplicates.
stale_columns = [col for col in df_artifact.columns if col in df.columns]

# Append the new artifact-noise features to the original DataFrame (row-aligned).
df_updated = pd.concat([df.drop(columns=stale_columns), df_artifact], axis=1)

# Save the updated DataFrame back to CSV.
output_csv_path = os.path.join(data_processed_root, "nodule_features.csv")
df_updated.to_csv(output_csv_path, index=False)
print(f"Artifact-noise-based feature extraction complete. Updated CSV saved at: {output_csv_path}")


## Statistics-Based Features

In [5]:
import os
import numpy as np
import pandas as pd

# Import intensity-based feature extraction function.
from modules.statistic_features import extract_intensity_features
# Reuse the bounding box helper from the artifact_noise module.
from modules.artifact_noise_features import get_bounding_box

# Add intensity (statistics-based) feature columns to the existing
# "nodule_features.csv". Each CSV row identifies one nodule; its mask and CT
# volume are located on disk from the row's class, patient_id and nodule_number.

# Base directory for processed data.
data_processed_root = "../data_processed"

# Path to the existing features CSV file (from the previous steps).
csv_path = os.path.join(data_processed_root, "nodule_features.csv")

# Read the identifier columns as text. With dtype inference an ID such as
# "0123" would be parsed as the integer 123 and the folder/file names rebuilt
# below would no longer match what is on disk.
df = pd.read_csv(
    csv_path,
    dtype={"patient_id": str, "nodule_number": str, "class": str},
)

# One intensity-feature dictionary per CSV row, in row order. Rows that
# cannot be processed get an empty dict so positions stay aligned with `df`.
intensity_feature_list = []

# Most recently loaded CT volume; consecutive rows of the same patient reuse
# it instead of reloading the whole volume for every nodule.
loaded_ct_path = None
ct_volume = None

for index, row in df.iterrows():
    patient_id = str(row['patient_id'])
    nodule_number = str(row['nodule_number'])
    cls = str(row['class'])

    # Build the patient directory and CT scan path.
    patient_dir = os.path.join(data_processed_root, cls, patient_id)
    ct_scan_path = os.path.join(patient_dir, f"{patient_id}_full_volume.npy")

    if not os.path.isfile(ct_scan_path):
        print(f"CT scan not found for patient {patient_id}. Skipping row.")
        intensity_feature_list.append({})
        continue

    # Locate the nodule mask (prefer ASRG, fall back to CMW).
    mask_path = None
    for mask_kind in ("asrg", "cmw"):
        candidate = os.path.join(patient_dir, f"{nodule_number}_{mask_kind}_mask.npy")
        if os.path.isfile(candidate):
            mask_path = candidate
            break
    if mask_path is None:
        print(f"Mask not found for patient {patient_id}, nodule {nodule_number}. Skipping row.")
        intensity_feature_list.append({})
        continue

    if ct_scan_path != loaded_ct_path:
        ct_volume = np.load(ct_scan_path)
        loaded_ct_path = ct_scan_path
    mask = np.load(mask_path)

    # The bounding box of the mask is used directly as an index into the CT
    # volume; a None bounding box is treated as an empty mask.
    bbox = get_bounding_box(mask)
    if bbox is None:
        print(f"No nodule found in mask for patient {patient_id}, nodule {nodule_number}. Skipping row.")
        intensity_feature_list.append({})
        continue
    roi = ct_volume[bbox]

    # Extract intensity-based features from the ROI, with num_bins=50.
    intensity_feature_list.append(extract_intensity_features(roi, num_bins=50))

# Convert the list of intensity feature dictionaries into a DataFrame.
df_intensity = pd.DataFrame(intensity_feature_list)

# If this cell was run before, the CSV already contains these columns; drop
# the stale copies so a re-run replaces them instead of adding duplicates.
stale_columns = [col for col in df_intensity.columns if col in df.columns]

# Append the new intensity feature columns to the original DataFrame (row-aligned).
df_updated = pd.concat([df.drop(columns=stale_columns), df_intensity], axis=1)

# Save the updated DataFrame back to CSV.
output_csv_path = os.path.join(data_processed_root, "nodule_features.csv")
df_updated.to_csv(output_csv_path, index=False)
print(f"Intensity-based feature extraction complete. Updated CSV saved at: {output_csv_path}")


Intensity-based feature extraction complete. Updated CSV saved at: ../data_processed/nodule_features.csv
