# Data Cleaning Pipeline

This notebook cleans the LC25000 dataset using the final clustering results from manual annotation. The cleaning process removes duplicates and contaminated samples based on the clustering analysis performed in the previous notebooks.

In [None]:
# Import required libraries
import os
import json
import shutil
import pandas as pd
from pathlib import Path
from collections import defaultdict, Counter
import numpy as np
from tqdm import tqdm

# Import project constants
from source.constants import (
    PROJECT_PATH, DATA_DIR, ALL_CANCER_TYPES, 
    ANNOTATIONS_SAVE_DIR, NUM_CLASS_PROTOTYPES
)

In [None]:
# Configuration
# EXTRACTOR_NAME and IMG_NORM select which annotation run is cleaned:
# final_clusters.csv is read from
# ANNOTATIONS_SAVE_DIR/<cancer_type>/<EXTRACTOR_NAME>/<IMG_NORM>/.
EXTRACTOR_NAME = "UNI"  # Feature extractor used for clustering
IMG_NORM = "resize_only"  # Image normalization method used
OUTPUT_DIR = os.path.join(PROJECT_PATH, "LC25000_Clean")  # Output directory for cleaned dataset

# Echo the resolved settings so a wrong path is visible before any file is copied.
print(f"Project path: {PROJECT_PATH}")
print(f"Data directory: {DATA_DIR}")
print(f"Annotations directory: {ANNOTATIONS_SAVE_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Cancer types to process: {ALL_CANCER_TYPES}")

## Step 1: Load Clustering Results

Load the final clustering results for each cancer type. These clusters were created through the semi-automatic annotation process in notebook 2.

In [None]:
def load_final_clusters_csv(cancer_type, extractor_name, img_norm):
    """Load the final clustering results for one cancer type.

    Reads ``final_clusters.csv`` from
    ``ANNOTATIONS_SAVE_DIR/<cancer_type>/<extractor_name>/<img_norm>/`` and
    groups its rows by the ``cluster_label`` column.

    Args:
        cancer_type: Cancer type whose annotation sub-directory is read.
        extractor_name: Feature-extractor sub-directory name.
        img_norm: Image-normalization sub-directory name.

    Returns:
        Dict mapping each cluster label to the list of image paths in that
        cluster, each joined onto ``PROJECT_PATH``. Returns ``None`` (after
        printing a warning) when the CSV file does not exist.
    """
    annotations_dir = os.path.join(ANNOTATIONS_SAVE_DIR, cancer_type, extractor_name, img_norm)
    final_clusters_csv_path = os.path.join(annotations_dir, "final_clusters.csv")

    if not os.path.exists(final_clusters_csv_path):
        print(f"Warning: No clustering results found at {final_clusters_csv_path}")
        return None

    def to_absolute(img_path):
        # Strip only the *leading* "./". A blanket str.replace('./', '') also
        # rewrites the inside of the path: "../x" would become ".x" and
        # "dir./x" would become "dirx".
        while img_path.startswith('./'):
            img_path = img_path[2:]
        return os.path.join(PROJECT_PATH, img_path)

    df = pd.read_csv(final_clusters_csv_path)

    # One entry per cluster label; paths keep their order within the group.
    clusters = {}
    for cluster_id, group in df.groupby('cluster_label'):
        clusters[cluster_id] = [to_absolute(img_path) for img_path in group['img_path']]

    return clusters

# Collect the clusters of every cancer type and keep running totals
all_clusters = {}
total_clusters = 0
total_images = 0

for cancer_type in ALL_CANCER_TYPES:
    clusters = load_final_clusters_csv(cancer_type, EXTRACTOR_NAME, IMG_NORM)
    if clusters is None:
        # The loader already printed the missing path; note the skip and move on
        print(f"No clusters found for {cancer_type}")
        continue

    # Count the images once and reuse the figure for the total and the log line
    n_images = sum(map(len, clusters.values()))
    all_clusters[cancer_type] = clusters
    total_clusters += len(clusters)
    total_images += n_images
    print(f"{cancer_type}: {len(clusters)} clusters, {n_images} images")

print(f"\nTotal: {total_clusters} clusters, {total_images} images across all cancer types")

## Step 2: Analyze Cluster Sizes

Analyze the distribution of cluster sizes to understand the data better before cleaning.

In [None]:
def analyze_cluster_sizes(all_clusters):
    """Compute and print cluster-size statistics for every cancer type.

    Args:
        all_clusters: Dict mapping cancer type -> {cluster_id: [image paths]}.

    Returns:
        Dict mapping cancer type -> stats dict with the keys
        'total_clusters', 'total_images', 'min_size', 'max_size',
        'mean_size', 'median_size' and 'size_distribution' (a Counter of
        cluster size -> number of clusters with that size). All numeric
        entries are 0 for a cancer type without clusters.
    """
    cluster_stats = {}

    for cancer_type, clusters in all_clusters.items():
        sizes = list(map(len, clusters.values()))

        # min/max and the NumPy reducers do not accept an empty sequence
        if sizes:
            smallest, largest = min(sizes), max(sizes)
            mean_size, median_size = np.mean(sizes), np.median(sizes)
        else:
            smallest = largest = mean_size = median_size = 0

        stats = dict(
            total_clusters=len(clusters),
            total_images=sum(sizes),
            min_size=smallest,
            max_size=largest,
            mean_size=mean_size,
            median_size=median_size,
            size_distribution=Counter(sizes),
        )
        cluster_stats[cancer_type] = stats

        print(f"\n{cancer_type.upper()}:")
        print(f"  Total clusters: {stats['total_clusters']}")
        print(f"  Total images: {stats['total_images']}")
        print(f"  Cluster size - Min: {stats['min_size']}, Max: {stats['max_size']}")
        print(f"  Cluster size - Mean: {stats['mean_size']:.2f}, Median: {stats['median_size']:.2f}")

        # Only the ten smallest cluster sizes are shown to keep the line short
        size_counts = sorted(stats['size_distribution'].items())
        print(f"  Size distribution: {dict(size_counts[:10])}...")  # Show first 10

    return cluster_stats

# Analyze cluster sizes; the per-type stats are passed to generate_cleaning_report in Step 5
cluster_stats = analyze_cluster_sizes(all_clusters)

## Step 3: Define Cleaning Strategy

Define the strategy for cleaning the dataset:
1. **Keep one representative per cluster**: From each cluster, keep only one image (the first one in the list, which is typically the cluster centroid)
2. **Remove duplicates**: This eliminates duplicate and near-duplicate images
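3. **Maintain class balance**: No explicit rebalancing is performed here — each class keeps one image per cluster — so check the per-class counts in the cleaning report (Step 5) to confirm the classes remain reasonably balanced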

In [None]:
def select_representative_images(all_clusters, selection_strategy="first"):
    """
    Select one representative image from every non-empty cluster.

    Args:
        all_clusters: Dict mapping cancer type -> {cluster_id: [image paths]} (from CSV)
        selection_strategy: How the representative is picked:
            "first"  - the first image listed for the cluster
            "random" - a draw via np.random.choice (uses NumPy's global RNG,
                       so seed it beforehand for reproducible output)
            "median" - the image at index len // 2 of the list, i.e. the
                       positional middle, not a statistical median

    Returns:
        Dict mapping cancer type -> list of (image_path, cluster_id) tuples,
        one per non-empty cluster, in cluster iteration order.

    Raises:
        ValueError: If selection_strategy is not one of the names above. The
            check runs before any cluster is visited, so a misspelled
            strategy is reported even when there is nothing to select.
    """
    # Fail fast: validating inside the loop would let a bad strategy pass
    # silently whenever every cluster is empty.
    if selection_strategy not in ("first", "random", "median"):
        raise ValueError(f"Unknown selection strategy: {selection_strategy}")

    representatives = {}

    for cancer_type, clusters in all_clusters.items():
        selected_images = []

        for cluster_id, cluster_images in clusters.items():
            if len(cluster_images) == 0:
                continue  # An empty cluster has nothing to represent it

            if selection_strategy == "first":
                representative = cluster_images[0]
            elif selection_strategy == "random":
                representative = np.random.choice(cluster_images)
            else:  # "median"
                representative = cluster_images[len(cluster_images) // 2]

            # Keep the cluster id next to the path; it is used later for naming
            selected_images.append((representative, cluster_id))

        representatives[cancer_type] = selected_images
        print(f"{cancer_type}: Selected {len(selected_images)} representative images from {len(clusters)} clusters")

    return representatives

# Pick one representative per cluster with the "first" strategy
representative_images = select_representative_images(all_clusters, selection_strategy="first")

# Overall number of images that will make it into the cleaned dataset
total_representatives = sum(map(len, representative_images.values()))
print(f"\nTotal representative images selected: {total_representatives}")

## Step 4: Create Cleaned Dataset

Copy the selected representative images to create the cleaned dataset.

In [None]:
def _format_cluster_id(cluster_id):
    """Return the filename label for a cluster id.

    Integer ids (including NumPy integers) and whole-number floats such as
    ``3.0`` are zero-padded to three digits; any other value falls back to
    ``str(cluster_id)``.
    """
    if isinstance(cluster_id, (int, np.integer)):
        return f"{cluster_id:03d}"
    if isinstance(cluster_id, (float, np.floating)) and float(cluster_id).is_integer():
        return f"{int(cluster_id):03d}"
    return str(cluster_id)


def create_cleaned_dataset(representative_images, output_dir, include_cluster_id=True):
    """
    Create a cleaned dataset by copying representative images from clusters.

    Each cancer type gets its own sub-directory under ``output_dir``. Copying
    is best-effort per file: a missing source, a destination-name clash or a
    copy error is printed and counted as failed, and processing continues.

    Args:
        representative_images: Dictionary of (image_path, cluster_id) tuples for each cancer type
        output_dir: Output directory for cleaned dataset
        include_cluster_id: Whether to prefix the filename with
            ``cluster_<id>_``. Non-integer ids are supported (see
            ``_format_cluster_id``).

    Returns:
        Dict mapping cancer type -> {'copied': int, 'failed': int, 'total': int}.
    """

    # Create output directory structure
    os.makedirs(output_dir, exist_ok=True)

    copy_stats = {}

    for cancer_type, image_data in representative_images.items():
        cancer_type_dir = os.path.join(output_dir, cancer_type)
        os.makedirs(cancer_type_dir, exist_ok=True)

        copied_count = 0
        failed_count = 0
        # Destination names written during this call. A second image mapping to
        # the same name would overwrite the first while still being counted
        # as copied, so clashes are reported instead.
        used_filenames = set()

        print(f"\nProcessing {cancer_type}...")

        for image_path, cluster_id in tqdm(image_data, desc=f"Copying {cancer_type}"):
            try:
                filename = os.path.basename(image_path)

                if include_cluster_id:
                    new_filename = f"cluster_{_format_cluster_id(cluster_id)}_{filename}"
                else:
                    new_filename = filename

                if not os.path.exists(image_path):
                    print(f"Warning: Source file not found: {image_path}")
                    failed_count += 1
                    continue

                if new_filename in used_filenames:
                    print(f"Warning: Destination name already used, skipping {image_path}: {new_filename}")
                    failed_count += 1
                    continue

                shutil.copy2(image_path, os.path.join(cancer_type_dir, new_filename))
                used_filenames.add(new_filename)
                copied_count += 1

            except (OSError, TypeError, ValueError) as e:
                # OSError covers copy failures (permissions, same file, ...);
                # TypeError/ValueError cover malformed path entries.
                print(f"Error copying {image_path}: {e}")
                failed_count += 1

        copy_stats[cancer_type] = {
            'copied': copied_count,
            'failed': failed_count,
            'total': len(image_data)
        }

        print(f"{cancer_type}: Copied {copied_count}/{len(image_data)} images")

    return copy_stats

# Create the cleaned dataset with cluster IDs in filenames
# (copy_statistics holds the per-type copied/failed/total counts that the
# report and validation steps read later)
print(f"Creating cleaned dataset in: {OUTPUT_DIR}")
copy_statistics = create_cleaned_dataset(representative_images, OUTPUT_DIR, include_cluster_id=True)

## Step 5: Generate Cleaning Report

Create a comprehensive report of the cleaning process including statistics and metadata.

In [None]:
def generate_cleaning_report(cluster_stats, copy_statistics, output_dir, selection_strategy="first"):
    """Generate a cleaning report, save it as JSON and print a summary.

    Args:
        cluster_stats: Dict mapping cancer type -> stats dict containing at
            least 'total_images' (as returned by analyze_cluster_sizes).
        copy_statistics: Dict mapping cancer type -> dict containing at least
            'copied' (as returned by create_cleaned_dataset).
        output_dir: Directory that receives 'cleaning_report.json'; created
            if it does not exist.
        selection_strategy: Name of the strategy used to pick the
            representatives, recorded in the report metadata.

    Returns:
        The report dict that was written to disk.
    """

    # Totals across all cancer types
    total_original = sum(stats['total_images'] for stats in cluster_stats.values())
    total_cleaned = sum(stats['copied'] for stats in copy_statistics.values())
    compression_ratio = total_cleaned / total_original if total_original > 0 else 0
    reduction_ratio = 1 - compression_ratio if total_original > 0 else 0

    report = {
        'cleaning_metadata': {
            'extractor_used': EXTRACTOR_NAME,
            'normalization_used': IMG_NORM,
            'selection_strategy': selection_strategy,
            'date_cleaned': pd.Timestamp.now().isoformat(),
            'source_directory': DATA_DIR,
            'output_directory': output_dir
        },
        'original_dataset_stats': {},
        'clustering_stats': cluster_stats,
        'cleaned_dataset_stats': copy_statistics,
        'cleaning_summary': {
            'total_original_images': total_original,
            'total_cleaned_images': total_cleaned,
            'images_removed': total_original - total_cleaned,
            'reduction_ratio': reduction_ratio,
            'compression_ratio': compression_ratio
        }
    }

    # Save report as JSON; make sure the target directory is there first
    os.makedirs(output_dir, exist_ok=True)
    report_path = os.path.join(output_dir, 'cleaning_report.json')
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=4)

    # Print summary
    print("\n" + "="*60)
    print("CLEANING REPORT SUMMARY")
    print("="*60)
    print(f"Original dataset: {total_original:,} images")
    print(f"Cleaned dataset: {total_cleaned:,} images")
    print(f"Images removed: {total_original - total_cleaned:,} ({reduction_ratio:.2%})")
    if total_original > 0:
        print(f"Compression ratio: {compression_ratio:.3f}")
    else:
        print("Compression ratio: N/A")

    print("\nPer-class statistics:")
    for cancer_type in ALL_CANCER_TYPES:
        if cancer_type in cluster_stats and cancer_type in copy_statistics:
            original = cluster_stats[cancer_type]['total_images']
            cleaned = copy_statistics[cancer_type]['copied']
            # A class without any clustered image must not divide by zero
            kept_ratio = cleaned / original if original > 0 else 0
            print(f"  {cancer_type}: {original:,} → {cleaned:,} ({kept_ratio:.3f})")

    print(f"\nDetailed report saved to: {report_path}")

    return report

# Generate cleaning report (writes cleaning_report.json into OUTPUT_DIR and prints a summary)
cleaning_report = generate_cleaning_report(cluster_stats, copy_statistics, OUTPUT_DIR)

## Step 6: Create Image Mapping

Create a mapping file that tracks which original images were kept in the cleaned dataset.

In [None]:
def create_image_mapping(representative_images, all_clusters, output_dir):
    """
    Create a detailed mapping of which images were kept and which clusters they came from.

    Writes 'image_mapping.csv' and 'image_mapping.json' into ``output_dir``
    (created if missing) and prints summary statistics.

    Args:
        representative_images: Dict mapping cancer type -> list of
            (image_path, cluster_id) tuples.
        all_clusters: Dict mapping cancer type -> {cluster_id: [image paths]};
            must contain every cluster referenced by representative_images.
        output_dir: Directory that receives the two mapping files.

    Returns:
        DataFrame with one row per representative and the columns
        'cancer_type', 'cluster_id', 'representative_image', 'cluster_size',
        'duplicate_images' and 'kept_in_cleaned_dataset'. The columns are
        present even when there are no rows.
    """
    columns = [
        'cancer_type',
        'cluster_id',
        'representative_image',
        'cluster_size',
        'duplicate_images',
        'kept_in_cleaned_dataset',
    ]

    mapping_data = []

    for cancer_type, image_data in representative_images.items():
        clusters = all_clusters[cancer_type]

        for image_path, cluster_id in image_data:
            cluster_images = clusters[cluster_id]

            mapping_data.append({
                'cancer_type': cancer_type,
                # NumPy scalars are not JSON serializable; store the plain
                # Python value instead.
                'cluster_id': cluster_id.item() if isinstance(cluster_id, np.generic) else cluster_id,
                'representative_image': image_path,
                'cluster_size': len(cluster_images),
                'duplicate_images': [img for img in cluster_images if img != image_path],
                'kept_in_cleaned_dataset': True
            })

    # Explicit columns keep the schema stable when mapping_data is empty
    mapping_df = pd.DataFrame(mapping_data, columns=columns)

    os.makedirs(output_dir, exist_ok=True)

    # Save as CSV
    mapping_csv_path = os.path.join(output_dir, 'image_mapping.csv')
    mapping_df.to_csv(mapping_csv_path, index=False)

    # Save detailed mapping as JSON
    mapping_json_path = os.path.join(output_dir, 'image_mapping.json')
    with open(mapping_json_path, 'w', encoding='utf-8') as f:
        json.dump(mapping_data, f, indent=4)

    print("Image mapping saved to:")
    print(f"  CSV: {mapping_csv_path}")
    print(f"  JSON: {mapping_json_path}")

    # Print some statistics
    total_clusters = len(mapping_df)

    print("\nMapping Statistics:")
    print(f"  Total clusters: {total_clusters}")
    if total_clusters:
        # Size statistics are undefined without any cluster
        avg_cluster_size = mapping_df['cluster_size'].mean()
        max_cluster_size = mapping_df['cluster_size'].max()
        min_cluster_size = mapping_df['cluster_size'].min()
        print(f"  Average cluster size: {avg_cluster_size:.2f}")
        print(f"  Cluster size range: {min_cluster_size} - {max_cluster_size}")

    return mapping_df

# Create image mapping (writes image_mapping.csv and image_mapping.json into OUTPUT_DIR)
mapping_df = create_image_mapping(representative_images, all_clusters, OUTPUT_DIR)

## Step 7: Validation and Quality Checks

Perform validation checks on the cleaned dataset to ensure everything was processed correctly.

In [None]:
def validate_cleaned_dataset(output_dir, expected_stats):
    """
    Check the cleaned dataset on disk against the expected copy counts.

    For every entry of ALL_CANCER_TYPES the number of regular files in its
    sub-directory is compared with expected_stats[cancer_type]['copied'];
    the three metadata files are required to exist in output_dir.

    Args:
        output_dir: Root directory of the cleaned dataset.
        expected_stats: Dict mapping cancer type -> dict with a 'copied' count.

    Returns:
        Dict with 'validation_passed' (bool), 'issues' (list of messages),
        'file_counts' (cancer type -> files found) and the lists
        'missing_files' / 'unexpected_files', which this function leaves empty.
    """
    validation_results = {
        'validation_passed': True,
        'issues': [],
        'file_counts': {},
        'missing_files': [],
        'unexpected_files': []
    }

    def record_issue(message):
        # Any recorded issue fails the validation as a whole
        validation_results['validation_passed'] = False
        validation_results['issues'].append(message)

    print("Validating cleaned dataset...")

    # Without the root directory there is nothing else to inspect
    if not os.path.exists(output_dir):
        record_issue(f"Output directory does not exist: {output_dir}")
        return validation_results

    for cancer_type in ALL_CANCER_TYPES:
        cancer_dir = os.path.join(output_dir, cancer_type)
        has_expectation = cancer_type in expected_stats

        if not os.path.exists(cancer_dir):
            # A missing directory only matters when files were expected in it
            if has_expectation and expected_stats[cancer_type]['copied'] != 0:
                record_issue(f"Cancer type directory missing: {cancer_dir}")
            continue

        # Count regular files only; sub-directories are ignored
        file_count = sum(
            1
            for entry in os.listdir(cancer_dir)
            if os.path.isfile(os.path.join(cancer_dir, entry))
        )
        validation_results['file_counts'][cancer_type] = file_count

        if has_expectation:
            expected_count = expected_stats[cancer_type]['copied']
            if file_count != expected_count:
                record_issue(
                    f"{cancer_type}: Expected {expected_count} files, found {file_count}"
                )

        print(f"  {cancer_type}: {file_count} files")

    # Metadata written by the report and mapping steps
    for req_file in ('cleaning_report.json', 'image_mapping.csv', 'image_mapping.json'):
        if not os.path.exists(os.path.join(output_dir, req_file)):
            record_issue(f"Required file missing: {req_file}")

    if not validation_results['validation_passed']:
        print("\nValidation FAILED - Issues found:")
        for issue in validation_results['issues']:
            print(f"   - {issue}")
    else:
        print("\nValidation PASSED - Dataset cleaned successfully!")

    return validation_results

# Run the validation checks against what the copy step reported
validation_results = validate_cleaned_dataset(OUTPUT_DIR, copy_statistics)

# Final summary, based on the files actually found on disk
print(f"\nCleaned dataset location: {OUTPUT_DIR}")
total_cleaned_images = sum(validation_results['file_counts'].values())
print(f"Total images in cleaned dataset: {total_cleaned_images:,}")

# Overall reduction relative to the number of clustered images
if 'cleaning_summary' in cleaning_report:
    original_total = cleaning_report['cleaning_summary']['total_original_images']
    reduction = original_total - total_cleaned_images
    if original_total > 0:
        reduction_pct = (reduction / original_total) * 100
    else:
        reduction_pct = 0
    print(f"Total reduction: {reduction:,} images ({reduction_pct:.1f}%)")