# Data Cleaning Pipeline

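This notebook cleans the LC25000 dataset using the final clustering results from manual annotation. In `clean` mode, one representative image is kept per cluster, which removes the duplicates and contaminated samples identified by the clustering analysis performed in the previous notebooks. In `organize` mode, all images are kept and their filenames are prefixed with the cluster ID.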

In [3]:
# Import required libraries
import os
import json
import shutil
import pandas as pd
from pathlib import Path
from collections import defaultdict, Counter
import numpy as np
from tqdm import tqdm

# Import project constants
from source.constants import (
    PROJECT_PATH, DATA_DIR, ALL_CANCER_TYPES, 
    ANNOTATIONS_SAVE_DIR, NUM_CLASS_PROTOTYPES
)

In [4]:
# Configuration
# Feature extractor and image normalization whose clustering results are used.
# Both values are path components of the annotation directory read in Step 1.
EXTRACTOR_NAME = "UNI"
IMG_NORM = "resize_only"

# 'clean'    -> de-duplicate: keep one image per cluster
# 'organize' -> keep all images and annotate file names with the cluster ID
CLEANING_MODE = 'clean'

# Reject unsupported modes before any path is derived from them.
if CLEANING_MODE not in ('clean', 'organize'):
    raise ValueError("CLEANING_MODE must be either 'clean' or 'organize'")

# Each mode writes into its own folder below the project root.
OUTPUT_DIR = os.path.join(
    PROJECT_PATH,
    "LC25000_Clean" if CLEANING_MODE == 'clean' else "LC25000_Organized",
)

# Echo the effective settings so the run is self-describing.
print(f"Project path: {PROJECT_PATH}")
print(f"Data directory: {DATA_DIR}")
print(f"Annotations directory: {ANNOTATIONS_SAVE_DIR}")
print(f"Cleaning Mode: {CLEANING_MODE}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Cancer types to process: {ALL_CANCER_TYPES}")

Project path: /Users/agnieszka/Poznan/project-2-imaging/lc-25k-cleaning/LC25000-clean
Data directory: /Users/agnieszka/Poznan/project-2-imaging/lc-25k-cleaning/LC25000-clean/LC25000
Annotations directory: /Users/agnieszka/Poznan/project-2-imaging/lc-25k-cleaning/LC25000-clean/annotations
Cleaning Mode: clean
Output directory: /Users/agnieszka/Poznan/project-2-imaging/lc-25k-cleaning/LC25000-clean/LC25000_Clean
Cancer types to process: ('colon_aca', 'colon_n', 'lung_aca', 'lung_n', 'lung_scc')


## Step 1: Load Clustering Results

Load the final clustering results for each cancer type. These clusters were created through the semi-automatic annotation process in notebook 2.

In [5]:
def load_final_clusters_csv(cancer_type, extractor_name, img_norm):
    """Load the final clustering results of one cancer type from its CSV file.

    The file is expected at
    ``<ANNOTATIONS_SAVE_DIR>/<cancer_type>/<extractor_name>/<img_norm>/final_clusters.csv``
    and must contain the columns ``img_path`` and ``cluster_label``.

    Args:
        cancer_type: Name of the class sub-directory (e.g. ``"colon_n"``).
        extractor_name: Feature extractor sub-directory name.
        img_norm: Image normalization sub-directory name.

    Returns:
        A dict mapping each ``cluster_label`` value to the list of image paths
        in that cluster, each joined onto ``PROJECT_PATH``. Returns ``None``
        (after printing a warning) when the CSV file does not exist.
    """
    annotations_dir = os.path.join(ANNOTATIONS_SAVE_DIR, cancer_type, extractor_name, img_norm)
    final_clusters_csv_path = os.path.join(annotations_dir, "final_clusters.csv")

    if not os.path.exists(final_clusters_csv_path):
        print(f"Warning: No clustering results found at {final_clusters_csv_path}")
        return None

    # Read the CSV file
    df = pd.read_csv(final_clusters_csv_path)

    # Group by cluster_label to create the clusters dictionary
    clusters = {}
    for cluster_id, group in df.groupby('cluster_label'):
        img_paths = []
        for img_path in group['img_path']:
            # Strip only a *leading* './'. A blanket str.replace('./', '')
            # would also rewrite '../x' to '.x' and 'dir./x' to 'dirx',
            # silently pointing at the wrong (or a non-existent) file.
            if img_path.startswith('./'):
                img_path = img_path[2:]
            img_paths.append(os.path.join(PROJECT_PATH, img_path))
        clusters[cluster_id] = img_paths

    return clusters

# Collect the clusters of every cancer type and keep running totals
all_clusters = {}
total_clusters = 0
total_images = 0

for cancer_type in ALL_CANCER_TYPES:
    clusters = load_final_clusters_csv(cancer_type, EXTRACTOR_NAME, IMG_NORM)

    # Classes without a final_clusters.csv are reported and left out
    if clusters is None:
        print(f"No clusters found for {cancer_type}")
        continue

    all_clusters[cancer_type] = clusters
    total_clusters += len(clusters)
    total_images += sum(map(len, clusters.values()))
    print(f"{cancer_type}: {len(clusters)} clusters, {sum(map(len, clusters.values()))} images")

print(f"\nTotal: {total_clusters} clusters, {total_images} images across all cancer types")

colon_aca: 249 clusters, 5000 images
colon_n: 257 clusters, 5000 images
lung_aca: 243 clusters, 5000 images
lung_n: 249 clusters, 5000 images
lung_scc: 248 clusters, 5000 images

Total: 1246 clusters, 25000 images across all cancer types


## Step 2: Analyze Cluster Sizes

Analyze the distribution of cluster sizes to understand the data better before cleaning.

In [6]:
def analyze_cluster_sizes(all_clusters):
    """Summarize how many images each cluster holds, per cancer type.

    Args:
        all_clusters: Dict mapping cancer type -> {cluster id -> list of images}.

    Returns:
        Dict mapping cancer type -> stats dict with the keys
        ``total_clusters``, ``total_images``, ``min_size``, ``max_size``,
        ``mean_size``, ``median_size`` and ``size_distribution`` (a Counter of
        cluster size -> number of clusters). A cancer type without clusters
        gets 0 for every numeric entry. A summary is printed per cancer type.
    """
    summary_by_type = {}

    for cancer_type, clusters in all_clusters.items():
        sizes = list(map(len, clusters.values()))

        # min()/max() raise on an empty sequence, so an empty class is
        # reported as all zeros instead.
        if sizes:
            smallest, largest = min(sizes), max(sizes)
            mean_size, median_size = np.mean(sizes), np.median(sizes)
        else:
            smallest = largest = 0
            mean_size = median_size = 0

        summary = {
            'total_clusters': len(clusters),
            'total_images': sum(sizes),
            'min_size': smallest,
            'max_size': largest,
            'mean_size': mean_size,
            'median_size': median_size,
            'size_distribution': Counter(sizes),
        }
        summary_by_type[cancer_type] = summary

        print(f"\n{cancer_type.upper()}:")
        print(f"  Total clusters: {summary['total_clusters']}")
        print(f"  Total images: {summary['total_images']}")
        print(f"  Cluster size - Min: {summary['min_size']}, Max: {summary['max_size']}")
        print(f"  Cluster size - Mean: {summary['mean_size']:.2f}, Median: {summary['median_size']:.2f}")

        # Only the ten smallest cluster sizes are shown to keep the output short
        smallest_sizes = dict(sorted(summary['size_distribution'].items())[:10])
        print(f"  Size distribution: {smallest_sizes}...")

    return summary_by_type

# Analyze cluster sizes for every loaded cancer type (prints a per-class summary).
# `cluster_stats` is passed to generate_cleaning_report() in Step 5.
cluster_stats = analyze_cluster_sizes(all_clusters)


COLON_ACA:
  Total clusters: 249
  Total images: 5000
  Cluster size - Min: 6, Max: 34
  Cluster size - Mean: 20.08, Median: 20.00
  Size distribution: {6: 1, 9: 1, 10: 2, 11: 3, 12: 2, 13: 7, 14: 5, 15: 12, 16: 24, 17: 22}...

COLON_N:
  Total clusters: 257
  Total images: 5000
  Cluster size - Min: 1, Max: 53
  Cluster size - Mean: 19.46, Median: 20.00
  Size distribution: {1: 6, 2: 2, 4: 1, 5: 1, 6: 1, 7: 2, 8: 1, 9: 2, 10: 3, 11: 2}...

LUNG_ACA:
  Total clusters: 243
  Total images: 5000
  Cluster size - Min: 7, Max: 45
  Cluster size - Mean: 20.58, Median: 20.00
  Size distribution: {7: 1, 9: 1, 11: 1, 12: 7, 13: 6, 14: 9, 15: 16, 16: 17, 17: 10, 18: 18}...

LUNG_N:
  Total clusters: 249
  Total images: 5000
  Cluster size - Min: 3, Max: 46
  Cluster size - Mean: 20.08, Median: 20.00
  Size distribution: {3: 1, 7: 1, 9: 2, 10: 1, 11: 2, 12: 4, 13: 4, 14: 16, 15: 11, 16: 17}...

LUNG_SCC:
  Total clusters: 248
  Total images: 5000
  Cluster size - Min: 8, Max: 45
  Cluster size -

## Step 3: Define Processing Strategy

Define the strategy for processing the dataset based on the `CLEANING_MODE` flag.
- **clean**: Keep one representative per cluster to de-duplicate the dataset.
- **organize**: Keep all images and annotate filenames with their cluster ID.

In [7]:
def select_representative_images(all_clusters, selection_strategy="first"):
    """
    Select one representative image per cluster based on the final_clusters.csv data.

    Args:
        all_clusters: Dictionary of clusters for each cancer type (from CSV),
            i.e. cancer type -> {cluster id -> list of image paths}
        selection_strategy: Strategy for selecting the representative:
            "first"  - the first image listed in the cluster,
            "random" - an image drawn with np.random.choice (uses NumPy's
                       global random state; call np.random.seed() beforehand
                       for a reproducible selection),
            "median" - the image at the middle index of the cluster list

    Returns:
        Dictionary mapping each cancer type to a list of
        (representative_image, cluster_id) tuples. Empty clusters are skipped.

    Raises:
        ValueError: If selection_strategy is not one of the values above.
    """
    # Validate up front so a typo is reported even when there are no
    # (non-empty) clusters to process.
    if selection_strategy not in ("first", "random", "median"):
        raise ValueError(f"Unknown selection strategy: {selection_strategy}")

    representatives = {}

    for cancer_type, clusters in all_clusters.items():
        selected_images = []

        for cluster_id, cluster_images in clusters.items():
            if len(cluster_images) == 0:
                continue  # nothing to pick from

            if selection_strategy == "first":
                representative = cluster_images[0]
            elif selection_strategy == "random":
                representative = np.random.choice(cluster_images)
            else:  # "median": middle index of the cluster list
                representative = cluster_images[len(cluster_images) // 2]

            # Keep the cluster_id next to the path; it is used for naming later
            selected_images.append((representative, cluster_id))

        representatives[cancer_type] = selected_images
        print(f"{cancer_type}: Selected {len(selected_images)} representative images from {len(clusters)} clusters")

    return representatives


# Representatives are only needed in 'clean' mode
if CLEANING_MODE == 'clean':
    # Select representative images (using first strategy)
    representative_images = select_representative_images(all_clusters, selection_strategy="first")

    # Print summary
    total_representatives = sum(len(images) for images in representative_images.values())
    print(f"\nTotal representative images selected: {total_representatives}")
else:
    print("Skipping representative selection in 'organize' mode.")
    representative_images = None  # Not needed for organize mode

colon_aca: Selected 249 representative images from 249 clusters
colon_n: Selected 257 representative images from 257 clusters
lung_aca: Selected 243 representative images from 243 clusters
lung_n: Selected 249 representative images from 249 clusters
lung_scc: Selected 248 representative images from 248 clusters

Total representative images selected: 1246


## Step 4: Create Processed Dataset

Copy the images to create the new dataset based on the selected mode.

In [8]:
def _copy_with_cluster_prefix(image_path, cluster_id, dest_dir):
    """Copy one image into dest_dir as ``cluster_<id, zero-padded to 3>_<original file name>``.

    Returns True when the file was copied. Returns False, after printing a
    message, when the source file is missing or the copy raised an exception.
    """
    try:
        filename = os.path.basename(image_path)
        dest_path = os.path.join(dest_dir, f"cluster_{cluster_id:03d}_{filename}")

        if not os.path.exists(image_path):
            print(f"Warning: Source file not found: {image_path}")
            return False

        shutil.copy2(image_path, dest_path)
        return True
    except Exception as e:
        # Best effort: one bad file must not abort the whole run; it is
        # reported here and counted as failed by the caller.
        print(f"Error copying {image_path}: {e}")
        return False


def create_dataset(mode, all_clusters, representative_images, output_dir):
    """
    Create a new dataset by copying images based on the selected mode.

    WARNING: an existing ``output_dir`` is deleted recursively before copying.

    Args:
        mode: 'clean' (copy only the representatives) or 'organize' (copy
            every image of every cluster)
        all_clusters: Dictionary cancer type -> {cluster id -> list of image paths}
        representative_images: Dictionary cancer type -> list of
            (image_path, cluster_id) tuples; required in 'clean' mode,
            ignored in 'organize' mode
        output_dir: Output directory for the new dataset

    Returns:
        Dictionary cancer type -> {'copied': int, 'failed': int, 'total': int}.

    Raises:
        ValueError: If mode is unknown, or if mode is 'clean' and
            representative_images is None. Both are checked *before* the
            output directory is cleared, so a bad call never destroys an
            existing dataset.
    """
    if mode not in ('clean', 'organize'):
        raise ValueError("mode must be either 'clean' or 'organize'")
    if mode == 'clean' and representative_images is None:
        raise ValueError("representative_images is required in 'clean' mode")

    # Clear the output directory if it exists
    if os.path.exists(output_dir):
        print(f"Output directory {output_dir} already exists. Clearing it first.")
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    copy_stats = {}

    for cancer_type, clusters in all_clusters.items():
        cancer_type_dir = os.path.join(output_dir, cancer_type)
        os.makedirs(cancer_type_dir, exist_ok=True)

        print(f"\nProcessing {cancer_type}...")

        # Build the flat list of (image_path, cluster_id) pairs to copy
        if mode == 'clean':
            image_data = list(representative_images.get(cancer_type, []))
        else:
            image_data = [
                (image_path, cluster_id)
                for cluster_id, image_paths in clusters.items()
                for image_path in image_paths
            ]

        copied_count = 0
        for image_path, cluster_id in tqdm(image_data, desc=f"Copying {cancer_type}"):
            if _copy_with_cluster_prefix(image_path, cluster_id, cancer_type_dir):
                copied_count += 1

        total_to_copy = len(image_data)
        copy_stats[cancer_type] = {
            'copied': copied_count,
            'failed': total_to_copy - copied_count,
            'total': total_to_copy
        }

        print(f"{cancer_type}: Copied {copied_count}/{total_to_copy} images")

    return copy_stats

# Create the new dataset based on the CLEANING_MODE.
# NOTE: create_dataset() deletes OUTPUT_DIR first if it already exists.
# `copy_statistics` holds the per-class 'copied' / 'failed' / 'total' counts.
print(f"Creating dataset in '{CLEANING_MODE}' mode...")
copy_statistics = create_dataset(CLEANING_MODE, all_clusters, representative_images, OUTPUT_DIR)

Creating dataset in 'clean' mode...

Processing colon_aca...


Copying colon_aca: 100%|██████████| 249/249 [00:00<00:00, 2236.47it/s]



colon_aca: Copied 249/249 images

Processing colon_n...


Copying colon_n: 100%|██████████| 257/257 [00:00<00:00, 2437.94it/s]
Copying colon_n: 100%|██████████| 257/257 [00:00<00:00, 2437.94it/s]


colon_n: Copied 257/257 images

Processing lung_aca...


Copying lung_aca: 100%|██████████| 243/243 [00:00<00:00, 2700.13it/s]
Copying lung_aca: 100%|██████████| 243/243 [00:00<00:00, 2700.13it/s]


lung_aca: Copied 243/243 images

Processing lung_n...


Copying lung_n: 100%|██████████| 249/249 [00:00<00:00, 2361.67it/s]
Copying lung_n: 100%|██████████| 249/249 [00:00<00:00, 2361.67it/s]


lung_n: Copied 249/249 images

Processing lung_scc...


Copying lung_scc: 100%|██████████| 248/248 [00:00<00:00, 2581.85it/s]

lung_scc: Copied 248/248 images





## Step 5: Generate Cleaning Report

Create a comprehensive report of the cleaning process including statistics and metadata.

In [9]:
def generate_cleaning_report(cluster_stats, copy_statistics, output_dir, selection_strategy='first'):
    """Generate a cleaning report, save it as JSON and print a summary.

    Args:
        cluster_stats: Dict cancer type -> stats dict containing at least
            'total_images' (as returned by analyze_cluster_sizes).
        copy_statistics: Dict cancer type -> dict containing at least 'copied'
            (as returned by create_dataset).
        output_dir: Existing directory in which 'cleaning_report.json' is written.
        selection_strategy: Name of the representative-selection strategy to
            record in the report metadata. Defaults to 'first', the value
            this report previously hard-coded.

    Returns:
        The report dictionary that was written to disk.

    Reads the module-level EXTRACTOR_NAME, IMG_NORM, DATA_DIR and
    ALL_CANCER_TYPES for the metadata and the per-class listing.
    """
    # Calculate summary statistics (guarding against an empty source dataset)
    total_original = sum(stats['total_images'] for stats in cluster_stats.values())
    total_cleaned = sum(stats['copied'] for stats in copy_statistics.values())
    if total_original > 0:
        compression_ratio = total_cleaned / total_original
        reduction_ratio = 1 - compression_ratio
    else:
        compression_ratio = 0
        reduction_ratio = 0

    report = {
        'cleaning_metadata': {
            'extractor_used': EXTRACTOR_NAME,
            'normalization_used': IMG_NORM,
            'selection_strategy': selection_strategy,
            'date_cleaned': pd.Timestamp.now().isoformat(),
            'source_directory': DATA_DIR,
            'output_directory': output_dir
        },
        'original_dataset_stats': {},
        'clustering_stats': cluster_stats,
        'cleaned_dataset_stats': copy_statistics,
        'cleaning_summary': {
            'total_original_images': total_original,
            'total_cleaned_images': total_cleaned,
            'images_removed': total_original - total_cleaned,
            'reduction_ratio': reduction_ratio,
            'compression_ratio': compression_ratio
        }
    }

    # Save report as JSON
    report_path = os.path.join(output_dir, 'cleaning_report.json')
    with open(report_path, 'w') as f:
        json.dump(report, f, indent=4)

    # Print summary. The newlines are real "\n" escapes; the earlier "\\n"
    # printed a literal backslash-n into the output.
    print("\n" + "="*60)
    print("CLEANING REPORT SUMMARY")
    print("="*60)
    print(f"Original dataset: {total_original:,} images")
    print(f"Cleaned dataset: {total_cleaned:,} images")
    print(f"Images removed: {total_original - total_cleaned:,} ({reduction_ratio:.2%})")
    compression_text = f"{compression_ratio:.3f}" if total_original > 0 else "N/A"
    print(f"Compression ratio: {compression_text}")

    print("\nPer-class statistics:")
    for cancer_type in ALL_CANCER_TYPES:
        if cancer_type in cluster_stats and cancer_type in copy_statistics:
            original = cluster_stats[cancer_type]['total_images']
            cleaned = copy_statistics[cancer_type]['copied']
            # A class with no source images has no meaningful ratio
            ratio_text = f"{cleaned / original:.3f}" if original > 0 else "N/A"
            print(f"  {cancer_type}: {original:,} → {cleaned:,} ({ratio_text})")

    print(f"\nDetailed report saved to: {report_path}")

    return report

# Generate cleaning report: writes 'cleaning_report.json' into OUTPUT_DIR,
# prints a summary and returns the report dictionary.
cleaning_report = generate_cleaning_report(cluster_stats, copy_statistics, OUTPUT_DIR)

CLEANING REPORT SUMMARY
Original dataset: 25,000 images
Cleaned dataset: 1,246 images
Images removed: 23,754 (95.02%)
Compression ratio: 0.050
\nPer-class statistics:
  colon_aca: 5,000 → 249 (0.050)
  colon_n: 5,000 → 257 (0.051)
  lung_aca: 5,000 → 243 (0.049)
  lung_n: 5,000 → 249 (0.050)
  lung_scc: 5,000 → 248 (0.050)
\nDetailed report saved to: /Users/agnieszka/Poznan/project-2-imaging/lc-25k-cleaning/LC25000-clean/LC25000_Clean/cleaning_report.json


## Step 6: Create Image Mapping

Create a mapping file that tracks which original images were kept and which clusters they came from. This step is adjusted based on the cleaning mode.

In [10]:
def create_image_mapping(mode, all_clusters, representative_images, output_dir):
    """
    Record which images ended up in the output dataset and their cluster of origin.

    In 'clean' mode there is one entry per representative, listing the other
    members of its cluster as duplicates. In 'organize' mode there is one entry
    per image. The entries are written to 'image_mapping.csv' and
    'image_mapping.json' inside output_dir and returned as a DataFrame.
    """
    records = []

    for cancer_type, clusters in all_clusters.items():
        if mode == 'clean':
            # One record per representative image
            for rep_path, cluster_id in representative_images.get(cancer_type, []):
                members = clusters[cluster_id]
                records.append({
                    'cancer_type': cancer_type,
                    'cluster_id': cluster_id,
                    'representative_image': rep_path,
                    'cluster_size': len(members),
                    'duplicate_images': [member for member in members if member != rep_path],
                    'kept_in_cleaned_dataset': True,
                })

        elif mode == 'organize':
            # One record per image; every image is kept
            records.extend(
                {
                    'cancer_type': cancer_type,
                    'cluster_id': cluster_id,
                    'image_path': member,
                    'cluster_size': len(members),
                    'kept_in_organized_dataset': True,
                }
                for cluster_id, members in clusters.items()
                for member in members
            )

    # Tabular view for analysis, persisted as CSV
    mapping_df = pd.DataFrame(records)
    csv_target = os.path.join(output_dir, 'image_mapping.csv')
    mapping_df.to_csv(csv_target, index=False)

    # Full-fidelity copy (including the duplicate lists) as JSON
    json_target = os.path.join(output_dir, 'image_mapping.json')
    with open(json_target, 'w') as handle:
        json.dump(records, handle, indent=4)

    print("Image mapping saved to:")
    print(f"  CSV: {csv_target}")
    print(f"  JSON: {json_target}")

    return mapping_df

# Create image mapping: writes 'image_mapping.csv' and 'image_mapping.json'
# into OUTPUT_DIR and returns the mapping as a DataFrame.
mapping_df = create_image_mapping(CLEANING_MODE, all_clusters, representative_images, OUTPUT_DIR)

Image mapping saved to:
  CSV: /Users/agnieszka/Poznan/project-2-imaging/lc-25k-cleaning/LC25000-clean/LC25000_Clean/image_mapping.csv
  JSON: /Users/agnieszka/Poznan/project-2-imaging/lc-25k-cleaning/LC25000-clean/LC25000_Clean/image_mapping.json


## Step 7: Validation and Quality Checks

Perform validation checks on the cleaned dataset to ensure everything was processed correctly.

In [11]:
def validate_cleaned_dataset(output_dir, expected_stats):
    """
    Validate the cleaned dataset by checking file counts and required metadata files.

    Args:
        output_dir: Root directory of the generated dataset.
        expected_stats: Dict cancer type -> dict with at least 'copied' and
            optionally 'failed' (as returned by create_dataset).

    Returns:
        Dict with the keys 'validation_passed' (bool), 'issues' (list of
        messages), 'file_counts' (cancer type -> number of files found),
        'missing_files' and 'unexpected_files' (both currently left empty).

    Validation fails when the output directory or an expected class directory
    is missing, when a class directory does not hold exactly the number of
    copied files, when any copy was recorded as failed, or when one of the
    metadata files is absent. Hidden files (names starting with '.', e.g.
    '.DS_Store') are ignored when counting.
    """
    validation_results = {
        'validation_passed': True,
        'issues': [],
        'file_counts': {},
        'missing_files': [],
        'unexpected_files': []
    }

    def record_issue(message):
        validation_results['validation_passed'] = False
        validation_results['issues'].append(message)

    def print_outcome():
        if validation_results['validation_passed']:
            print("\nValidation PASSED - Dataset cleaned successfully!")
        else:
            print("\nValidation FAILED - Issues found:")
            for issue in validation_results['issues']:
                print(f"   - {issue}")

    print("Validating cleaned dataset...")

    # Check if output directory exists; nothing else can be checked without it
    if not os.path.exists(output_dir):
        record_issue(f"Output directory does not exist: {output_dir}")
        print_outcome()
        return validation_results

    # Check each cancer type directory
    for cancer_type in ALL_CANCER_TYPES:
        cancer_dir = os.path.join(output_dir, cancer_type)
        expected = expected_stats.get(cancer_type)

        # Comparing against 'copied' alone would hide images that never made
        # it into the dataset, so failed copies are reported explicitly.
        if expected is not None and expected.get('failed', 0) > 0:
            record_issue(f"{cancer_type}: {expected['failed']} image(s) failed to copy")

        if not os.path.exists(cancer_dir):
            # This is not a failure if no images were expected for this class
            if expected is None or expected['copied'] == 0:
                continue
            record_issue(f"Cancer type directory missing: {cancer_dir}")
            continue

        # Count regular files, skipping hidden OS artefacts such as .DS_Store
        files = [
            f for f in os.listdir(cancer_dir)
            if not f.startswith('.') and os.path.isfile(os.path.join(cancer_dir, f))
        ]
        file_count = len(files)
        validation_results['file_counts'][cancer_type] = file_count

        # Check against expected count
        if expected is not None:
            expected_count = expected['copied']
            if file_count != expected_count:
                record_issue(f"{cancer_type}: Expected {expected_count} files, found {file_count}")

        print(f"  {cancer_type}: {file_count} files")

    # Check for required metadata files
    required_files = ['cleaning_report.json', 'image_mapping.csv', 'image_mapping.json']
    for req_file in required_files:
        if not os.path.exists(os.path.join(output_dir, req_file)):
            record_issue(f"Required file missing: {req_file}")

    print_outcome()

    return validation_results

# Run the validation against the copy statistics gathered in Step 4
validation_results = validate_cleaned_dataset(OUTPUT_DIR, copy_statistics)

# Final summary: location of the dataset and number of files found on disk
total_cleaned_images = sum(validation_results['file_counts'].values())
print(f"\nCleaned dataset location: {OUTPUT_DIR}")
print(f"Total images in cleaned dataset: {total_cleaned_images:,}")

# Overall reduction relative to the original image count stored in the report
if 'cleaning_summary' in cleaning_report:
    original_total = cleaning_report['cleaning_summary']['total_original_images']
    reduction = original_total - total_cleaned_images
    if original_total > 0:
        reduction_pct = (reduction / original_total) * 100
    else:
        reduction_pct = 0
    print(f"Total reduction: {reduction:,} images ({reduction_pct:.1f}%)")

Validating cleaned dataset...
  colon_aca: 249 files
  colon_n: 257 files
  lung_aca: 243 files
  lung_n: 249 files
  lung_scc: 248 files

Validation PASSED - Dataset cleaned successfully!

Cleaned dataset location: /Users/agnieszka/Poznan/project-2-imaging/lc-25k-cleaning/LC25000-clean/LC25000_Clean
Total images in cleaned dataset: 1,246
Total reduction: 23,754 images (95.0%)
