# Simple CTA Processing Pipeline

Clean and straightforward CTA processing with:
- Raw CTA export
- Preprocessed CTA export  
- Multi-patient support
- Kaggle compatibility

## 1. Setup and Configuration

In [1]:
# Setup and Configuration
from pathlib import Path
import numpy as np
from PIL import Image
import pydicom
from skimage.transform import resize

# Directory configuration - two candidate dataset roots; BASE_DIR below
# selects which one the rest of the notebook reads from.
LOCAL_DIR = "mRs-prediction-3patient"  # dataset root for local runs (relative to the working directory)
KAGGLE_DIR = "/kaggle/input/your-dataset"  # placeholder path - update before running on Kaggle

# Use local for now, change to KAGGLE_DIR for Kaggle
BASE_DIR = Path(LOCAL_DIR)

# Processing settings
TARGET_SIZE = (256, 256)  # output shape handed to skimage `resize` for preprocessed slices
MIN_IMAGES = 100  # a series folder needs strictly more candidate files than this to be considered

# Sanity check: show which root is active and whether it is reachable.
print(f"Base directory: {BASE_DIR}")
print(f"Directory exists: {BASE_DIR.exists()}")

Base directory: mRs-prediction-3patient
Directory exists: True


## 2. Find Patients and CTA Series

In [2]:
# Find Patient Folders
def find_patient_folders(base_dir):
    """Find all patient folders that contain CTA data.

    A direct child directory of ``base_dir`` counts as a patient folder when
    its own name contains "cta" (case-insensitive) or when at least one of its
    immediate sub-directories does. Each match is reported with a
    ``Found patient: <name>`` line on stdout.

    Args:
        base_dir: Dataset root, given as a ``Path`` or a path string.

    Returns:
        list[Path]: Matching folders, sorted by path so that the result (and
        therefore the order in which patients are processed later) does not
        depend on the filesystem's directory-listing order.

    Raises:
        FileNotFoundError / NotADirectoryError: If ``base_dir`` does not exist
        or is not a directory (propagated from ``Path.iterdir``).
    """
    patients = []

    # iterdir() yields entries in arbitrary order; sort for reproducible runs.
    for folder in sorted(Path(base_dir).iterdir()):
        if not folder.is_dir():
            continue

        # Check for CTA subfolders
        has_cta_subfolder = any(
            sub.is_dir() and "cta" in sub.name.lower() for sub in folder.iterdir()
        )
        if has_cta_subfolder or "cta" in folder.name.lower():
            patients.append(folder)
            print(f"Found patient: {folder.name}")

    return patients

# Find CTA Series
def find_cta_series(patient_folder, min_images=None):
    """Find the best CTA series folder for a patient (Angio THIN preferred).

    The search runs in two steps:

    1. Locate the CTA folder: the first descendant directory (in sorted path
       order) whose name contains "cta", case-insensitive. If there is none
       but the patient folder's own name contains "cta", the patient folder
       itself is used.
    2. Inspect the CTA folder and every directory below it. A directory is a
       candidate series when it directly contains more than ``min_images``
       files with a ``.dcm``/``.dicom``/empty suffix. Hidden files such as
       ``.DS_Store`` also have an empty suffix and are therefore excluded
       explicitly. The header of the first file (by name) supplies the
       ``SeriesDescription``.

    Candidates are ranked by ``(is_angio_thin, file_count)``: a series whose
    description contains both "angio" and "thin" always beats one that does
    not, and ties are broken by the larger file count.

    Args:
        patient_folder: Patient directory (``Path`` or path string).
        min_images: Minimum file count a series must exceed. Defaults to the
            module-level ``MIN_IMAGES``.

    Returns:
        Path | None: The selected series directory, or ``None`` when no CTA
        folder or no readable, large-enough series exists.
    """
    threshold = MIN_IMAGES if min_images is None else min_images
    patient_folder = Path(patient_folder)

    # Navigate to CTA folder. min() over the matches gives the first one in
    # sorted path order, which is deterministic (rglob order is not).
    cta_dirs = [
        d for d in patient_folder.rglob("*")
        if d.is_dir() and "cta" in d.name.lower()
    ]
    cta_folder = min(cta_dirs, default=None)
    if cta_folder is None and "cta" in patient_folder.name.lower():
        # Patient folders may be recognised by their own name alone.
        cta_folder = patient_folder
    if cta_folder is None:
        return None

    best_series = None
    best_rank = None  # (is_angio_thin, file_count) of best_series

    # rglob("*") does not yield the root, so add the CTA folder explicitly.
    candidates = [cta_folder] + sorted(
        d for d in cta_folder.rglob("*") if d.is_dir()
    )
    for series_folder in candidates:
        # Count DICOM files
        dicom_files = sorted(
            f for f in series_folder.iterdir()
            if f.is_file()
            and not f.name.startswith(".")
            and f.suffix.lower() in (".dcm", ".dicom", "")
        )
        if len(dicom_files) <= threshold:
            continue

        # Check series description (header only, pixel data is not needed)
        try:
            ds = pydicom.dcmread(str(dicom_files[0]), stop_before_pixels=True)
        except Exception as exc:
            # Best effort: an unreadable header disqualifies only this folder.
            print(f"Skipping {series_folder}: cannot read DICOM header ({exc})")
            continue

        series_desc = str(getattr(ds, "SeriesDescription", "")).lower()
        is_angio_thin = "angio" in series_desc and "thin" in series_desc
        rank = (is_angio_thin, len(dicom_files))
        if best_rank is None or rank > best_rank:
            best_series, best_rank = series_folder, rank

    return best_series

# Scan patients: discover the patient folders under BASE_DIR once. The
# resulting `patients` list is what the export loops and the summary cell
# iterate over / count.
patients = find_patient_folders(BASE_DIR)
print(f"\nFound {len(patients)} patients")

Found patient: sayyad habib CTA

Found 1 patients


## 3. Raw CTA Export

In [3]:
# Raw CTA Export
def export_raw_cta(patient_folder, series_folder, output_dir="images/cta_raw"):
    """Export one CTA series as 8-bit PNG slices (no windowing, no resizing).

    Every readable DICOM file in ``series_folder`` is loaded, the stored
    pixel values are min-max normalised over the whole series to 0-255, and
    each slice is written to ``<output_dir>/<patient_name>/`` as
    ``<patient_name>-slice<NNN>_z.png``. ``patient_name`` is the patient
    folder name with spaces replaced by underscores.

    Slice order: by DICOM ``InstanceNumber`` when every loaded slice carries
    one, otherwise by filename. Filenames alone do not guarantee acquisition
    order (e.g. "IM10" sorts before "IM2").

    Args:
        patient_folder: ``Path`` of the patient directory (only its name is used).
        series_folder: ``Path`` of the directory holding the DICOM files.
        output_dir: Root directory for the exported PNGs.

    Returns:
        int: Number of slices written; 0 when the series contains no
        readable DICOM file.
    """
    patient_name = patient_folder.name.replace(" ", "_")
    out_path = Path(output_dir) / patient_name
    out_path.mkdir(parents=True, exist_ok=True)

    # Get DICOM files. Hidden files such as ".DS_Store" have an empty suffix
    # and would otherwise be mistaken for extension-less DICOM files.
    dicom_files = sorted(
        f for f in series_folder.iterdir()
        if f.is_file()
        and not f.name.startswith(".")
        and f.suffix.lower() in (".dcm", ".dicom", "")
    )

    print(f"Exporting {len(dicom_files)} raw CTA slices for {patient_name}...")

    # Load DICOM files, remembering each slice's InstanceNumber (if any)
    loaded = []
    for dicom_file in dicom_files:
        try:
            ds = pydicom.dcmread(str(dicom_file))
            pixels = ds.pixel_array.astype(np.float32)
        except Exception as exc:
            # Best effort: report and skip files that cannot be decoded.
            print(f"Skipping unreadable file {dicom_file.name}: {exc}")
            continue
        loaded.append((getattr(ds, "InstanceNumber", None), pixels))

    if not loaded:
        # min()/max() below are undefined for an empty series.
        print(f"No readable DICOM slices found for {patient_name}")
        return 0

    # list.sort is stable, so equal instance numbers keep filename order.
    if all(number is not None for number, _ in loaded):
        loaded.sort(key=lambda item: int(item[0]))
    slices = [pixels for _, pixels in loaded]

    # Normalize to 0-255 for PNG, using one global range for the whole series.
    # Working slice by slice avoids stacking, so differing slice shapes and
    # the memory cost of a second full-volume copy are not a problem.
    vol_min = min(s.min() for s in slices)
    vol_max = max(s.max() for s in slices)
    value_range = vol_max - vol_min

    # Save slices
    for i, slice_data in enumerate(slices):
        if value_range > 0:
            slice_uint8 = ((slice_data - vol_min) / value_range * 255).astype(np.uint8)
        else:
            # Constant-valued series: nothing to scale, export black images.
            slice_uint8 = np.zeros_like(slice_data, dtype=np.uint8)
        filename = f"{patient_name}-slice{i:03d}_z.png"
        Image.fromarray(slice_uint8).save(out_path / filename)

    print(f"Saved {len(slices)} raw slices to {out_path}")
    return len(slices)

# Export raw CTA for all patients.
# raw_results collects one (patient folder name, slice count) tuple per
# patient for which a CTA series was found and exported.
raw_results = []
for patient_folder in patients:
    series_folder = find_cta_series(patient_folder)
    if not series_folder:
        # Nothing to export for this patient; report it and move on.
        print(f"No CTA series found for {patient_folder.name}")
        continue
    count = export_raw_cta(patient_folder, series_folder)
    raw_results.append((patient_folder.name, count))

print(f"\nRaw CTA export complete: {len(raw_results)} patients")

Exporting 454 raw CTA slices for sayyad_habib_CTA...
Saved 454 raw slices to images/cta_raw/sayyad_habib_CTA

Raw CTA export complete: 1 patients
Saved 454 raw slices to images/cta_raw/sayyad_habib_CTA

Raw CTA export complete: 1 patients


In [None]:
## 4. Preprocessed CTA Export


🔍 DETAILED DIRECTORY STRUCTURE EXPLORATION

🏥 Exploring: mRs-prediction-3patient
📄 .DS_Store
📁 sayyad habib CTA/
  📄 .DS_Store
  📁 sayyad habib CTA/
    📄 .DS_Store
    📁 1.3.46.670589.61.128.0.20250519182905567/
  📁 sayyad habib MRI/
    📄 .DS_Store
    📁 1.3.46.670589.54.2.39595856851039510601.30159219094231274361/
    📁 segmented sayyad habib/

📋 PATIENT FOLDERS FOUND:
 1. sayyad habib CTA
    📁 sayyad habib CTA
    📁 sayyad habib MRI



In [4]:
# Preprocessed CTA Export
def export_preprocessed_cta(patient_folder, series_folder, output_dir="images/cta_preprocessed",
                            window_center=50, window_width=350, target_size=None):
    """Export one CTA series as windowed, resized 8-bit PNG slices.

    For every readable DICOM file in ``series_folder`` the stored pixel
    values are converted to Hounsfield units with the file's
    ``RescaleSlope`` / ``RescaleIntercept`` (defaults 1 and 0 when absent),
    clipped to the CT window, scaled to [0, 1], resized and written to
    ``<output_dir>/<patient_name>/`` as ``<patient_name>-slice<NNN>_z.png``.

    The HU conversion is required because the window is expressed in HU;
    applying it to stored values clips the wrong intensity range whenever the
    rescale intercept is non-zero.

    Slice order: by DICOM ``InstanceNumber`` when every loaded slice carries
    one, otherwise by filename.

    Args:
        patient_folder: ``Path`` of the patient directory (only its name is used).
        series_folder: ``Path`` of the directory holding the DICOM files.
        output_dir: Root directory for the exported PNGs.
        window_center: Window centre in HU.
        window_width: Window width in HU; must be positive.
        target_size: ``(rows, cols)`` of the output images. Defaults to the
            module-level ``TARGET_SIZE``.

    Returns:
        int: Number of slices written; 0 when the series contains no
        readable DICOM file.

    Raises:
        ValueError: If ``window_width`` is not positive.
    """
    if window_width <= 0:
        raise ValueError(f"window_width must be positive, got {window_width}")

    patient_name = patient_folder.name.replace(" ", "_")
    out_path = Path(output_dir) / patient_name
    out_path.mkdir(parents=True, exist_ok=True)

    # Get DICOM files. Hidden files such as ".DS_Store" have an empty suffix
    # and would otherwise be mistaken for extension-less DICOM files.
    dicom_files = sorted(
        f for f in series_folder.iterdir()
        if f.is_file()
        and not f.name.startswith(".")
        and f.suffix.lower() in (".dcm", ".dicom", "")
    )

    print(f"Preprocessing {len(dicom_files)} CTA slices for {patient_name}...")

    # Load DICOM files as Hounsfield-unit arrays
    loaded = []
    for dicom_file in dicom_files:
        try:
            ds = pydicom.dcmread(str(dicom_file))
            slope = float(getattr(ds, "RescaleSlope", None) or 1)
            intercept = float(getattr(ds, "RescaleIntercept", None) or 0)
            hu = ds.pixel_array.astype(np.float32) * slope + intercept
        except Exception as exc:
            # Best effort: report and skip files that cannot be decoded.
            print(f"Skipping unreadable file {dicom_file.name}: {exc}")
            continue
        loaded.append((getattr(ds, "InstanceNumber", None), hu))

    if not loaded:
        print(f"No readable DICOM slices found for {patient_name}")
        return 0

    # list.sort is stable, so equal instance numbers keep filename order.
    if all(number is not None for number, _ in loaded):
        loaded.sort(key=lambda item: int(item[0]))

    # Apply CT windowing (for vessel visualization)
    win_min = window_center - window_width / 2
    win_max = window_center + window_width / 2
    size = TARGET_SIZE if target_size is None else target_size

    # Process and save each slice
    saved = 0
    for i, (_, slice_hu) in enumerate(loaded):
        # Apply windowing, then normalize to [0,1]
        windowed = np.clip(slice_hu, win_min, win_max)
        normalized = (windowed - win_min) / (win_max - win_min)

        # Resize to target size
        resized = resize(normalized, size, anti_aliasing=True, preserve_range=True)

        # Convert to uint8; clip first so interpolation round-off cannot wrap
        processed = (np.clip(resized, 0.0, 1.0) * 255).astype(np.uint8)

        filename = f"{patient_name}-slice{i:03d}_z.png"
        Image.fromarray(processed).save(out_path / filename)
        saved += 1

    print(f"Saved {saved} preprocessed slices to {out_path}")
    return saved

# Export preprocessed CTA for all patients.
# preprocessed_results collects one (patient folder name, slice count) tuple
# per patient for which a CTA series was found and exported.
preprocessed_results = []
for patient_folder in patients:
    series_folder = find_cta_series(patient_folder)
    if series_folder:
        count = export_preprocessed_cta(patient_folder, series_folder)
        preprocessed_results.append((patient_folder.name, count))
    else:
        # Report skipped patients instead of dropping them silently, the same
        # way the raw export loop does.
        print(f"No CTA series found for {patient_folder.name}")

print(f"\nPreprocessed CTA export complete: {len(preprocessed_results)} patients")

Preprocessing 454 CTA slices for sayyad_habib_CTA...
Saved 454 preprocessed slices to images/cta_preprocessed/sayyad_habib_CTA

Preprocessed CTA export complete: 1 patients
Saved 454 preprocessed slices to images/cta_preprocessed/sayyad_habib_CTA

Preprocessed CTA export complete: 1 patients


In [None]:
## 5. Summary

🔍 ANALYZING CTA 'RAW' EXPORT PROCESSING

❓ QUESTION: Are the CTA 'raw' exports truly raw?

Let's examine what processing is applied in the CTA export function:

📊 CTA EXPORT FUNCTION ANALYSIS:

🔧 PROCESSING STEPS IN process_patient_cta_export():

1. 📄 DICOM Reading:
   - ds = pydicom.dcmread(str(dicom_file))
   - pixel_array = ds.pixel_array.astype(np.float32)
   ✅ This gets the raw pixel values from DICOM

2. 🏥 CT Windowing (CONDITIONAL):
   - IF WindowCenter and WindowWidth exist in DICOM:
     • window_center = ds.WindowCenter
     • window_width = ds.WindowWidth
     • Apply: np.clip(pixel_array, min_val, max_val)
   ❓ This applies windowing if DICOM headers contain window settings

3. 📐 Normalization for Storage:
   - volume_min, volume_max = volume.min(), volume.max()
   - volume_normalized = (volume - volume_min) / (volume_max - volume_min)
   - volume_uint8 = (volume_normalized * 255).astype(np.uint8)
   ❌ This is NOT raw - it's normalized to the 0-255 range

In [5]:
# Summary: assemble the whole report first, then emit it with a single print.
# Entries starting with "\n" produce the blank line between sections.
summary_lines = [
    "\nCTA PROCESSING SUMMARY",
    "=" * 50,
    f"Patients processed: {len(patients)}",
    f"Raw CTA exports: {len(raw_results)}",
    f"Preprocessed CTA exports: {len(preprocessed_results)}",
    "\nOutput directories:",
    "- images/cta_raw/[patient_name]/",
    "- images/cta_preprocessed/[patient_name]/",
    "\nFor Kaggle deployment:",
    "1. Change BASE_DIR to Path(KAGGLE_DIR)",
    "2. Update KAGGLE_DIR path",
    "3. Run all cells",
    "\nProcessing complete!",
]
print("\n".join(summary_lines))


CTA PROCESSING SUMMARY
Patients processed: 1
Raw CTA exports: 1
Preprocessed CTA exports: 1

Output directories:
- images/cta_raw/[patient_name]/
- images/cta_preprocessed/[patient_name]/

For Kaggle deployment:
1. Change BASE_DIR to Path(KAGGLE_DIR)
2. Update KAGGLE_DIR path
3. Run all cells

Processing complete!
