# Blood Sample Analysis - Experimentation

This notebook demonstrates the blood sample analysis pipeline and allows for parameter tuning.

In [None]:
import sys
from pathlib import Path
from typing import Dict, List

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Add project root to path
sys.path.append('..')

# segment_cell_components and analyze_cell_properties are called by later cells but
# were never imported, which raises NameError on a fresh kernel.
# NOTE(review): assumed to live in src.image_processing alongside the other helpers - confirm.
from src.image_processing import (
    load_image,
    preprocess_image,
    detect_cells,
    segment_cell_components,
    analyze_cell_properties,
    calculate_population_statistics,
)
from src.anomaly_detection import AnomalyDetector

%matplotlib inline

## Load and Display Sample Image

First, let's load a sample blood image and display it.

In [None]:
def display_image(img, title='Image'):
    """Show ``img`` in a matplotlib figure with a title and hidden axes.

    Arrays with exactly three dimensions are converted from BGR to RGB with
    OpenCV before being drawn; any other array is drawn with the gray colormap.
    """
    if len(img.shape) != 3:
        plt.imshow(img, cmap='gray')
    else:
        rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.imshow(rgb)

    plt.title(title)
    plt.axis('off')
    plt.show()

# Path of the sample screenshot, relative to the notebook's working directory
sample_path = Path("../data/raw/Capture d'écran 2025-01-25 225434.png")
print(f'Loading image from: {sample_path}')

# Read the image, report its array shape and dtype, then display it
image = load_image(str(sample_path))
print(f'Image shape: {image.shape}', f'Image type: {image.dtype}', sep='\n')
display_image(image, 'Original Blood Sample Image')

## Image Preprocessing

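Let's walk through each stage of the pipeline on the sample image: preprocessing, cell detection, and component segmentation of the first detected cell. Basic properties of the first five cells are printed below the figure.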

In [None]:
def show_preprocessing_steps(image):
    """Plot each pipeline stage for ``image`` side by side, then print cell properties.

    The 1x4 figure shows, left to right: the input image, the output of
    ``preprocess_image``, the contours returned by ``detect_cells`` outlined in
    green, and - only when at least one contour was found - the nucleus (red)
    and cytoplasm (green) masks that ``segment_cell_components`` returns for the
    bounding box of the first contour. Afterwards the area, circularity, mean
    intensity and contrast reported by ``analyze_cell_properties`` are printed
    for up to five contours.
    """
    fig = plt.figure(figsize=(20, 5))

    # Stage 1: the input, converted from BGR to RGB when it has a channel axis
    ax_input = fig.add_subplot(1, 4, 1)
    if len(image.shape) != 3:
        ax_input.imshow(image, cmap='gray')
    else:
        ax_input.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    ax_input.set_title('Original')
    ax_input.axis('off')

    # Stage 2: preprocessing output
    preprocessed = preprocess_image(image)
    ax_pre = fig.add_subplot(1, 4, 2)
    ax_pre.imshow(preprocessed, cmap='gray')
    ax_pre.set_title('Preprocessed')
    ax_pre.axis('off')

    # Stage 3: detected contours drawn on a colour copy of the preprocessed image
    _, contours = detect_cells(preprocessed)
    outlined = cv2.cvtColor(preprocessed, cv2.COLOR_GRAY2BGR)
    cv2.drawContours(outlined, contours, -1, (0, 255, 0), 2)

    ax_cells = fig.add_subplot(1, 4, 3)
    ax_cells.imshow(cv2.cvtColor(outlined, cv2.COLOR_BGR2RGB))
    ax_cells.set_title(f'Detected Cells\n({len(contours)} cells found)')
    ax_cells.axis('off')

    # Stage 4: component masks for the first contour; the panel is omitted when nothing was detected
    if len(contours):
        x, y, w, h = cv2.boundingRect(contours[0])
        cell_image = preprocessed[y:y+h, x:x+w]
        components = segment_cell_components(cell_image)

        ax_parts = fig.add_subplot(1, 4, 4)
        overlay = np.zeros(cell_image.shape + (3,), dtype=np.uint8)
        overlay[components['nucleus'] > 0] = [255, 0, 0]    # nucleus in red
        overlay[components['cytoplasm'] > 0] = [0, 255, 0]  # cytoplasm in green (painted last, so it wins on overlap)
        ax_parts.imshow(overlay)
        ax_parts.set_title('Cell Components\n(First Cell)')
        ax_parts.axis('off')

    fig.tight_layout()
    plt.show()

    # Textual summary, numbered from 1, limited to the first five contours
    print('\nCell Analysis:')
    for number, contour in enumerate(contours[:5], start=1):
        props = analyze_cell_properties(preprocessed, contour)
        print(f'\nCell {number}:')
        print(f'Area: {props["area"]:.2f}')
        print(f'Circularity: {props["circularity"]:.2f}')
        print(f'Mean intensity: {props["mean_intensity"]:.2f}')
        print(f'Contrast: {props["contrast"]:.2f}')

# Only run the walkthrough when an earlier cell has defined `image`
if 'image' in globals():
    show_preprocessing_steps(image)

## Cell Detection

Now let's examine the cell detection process and tune parameters.

In [None]:
def experiment_with_cell_detection(image, min_areas=(50, 100, 200, 400)):
    """Compare how many contours survive different minimum-area thresholds.

    The image is preprocessed, binarised with Otsu thresholding and its external
    contours are extracted once. Each value in ``min_areas`` then gets its own
    panel showing, in green, the contours whose area is strictly greater than
    that value, together with their count.

    Args:
        image: Image accepted by ``preprocess_image``.
        min_areas: Iterable of minimum contour areas to try, one panel per value.

    Raises:
        ValueError: If ``min_areas`` is empty.
    """
    min_areas = list(min_areas)
    if not min_areas:
        raise ValueError('min_areas must contain at least one threshold')

    preprocessed = preprocess_image(image)

    # Thresholding and contour extraction do not depend on min_area, so they
    # are computed once instead of once per panel.
    _, binary = cv2.threshold(
        preprocessed,
        0,
        255,
        cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    contours, _ = cv2.findContours(
        binary,
        cv2.RETR_EXTERNAL,
        cv2.CHAIN_APPROX_SIMPLE
    )
    areas = [cv2.contourArea(cnt) for cnt in contours]

    # squeeze=False keeps `axes` two-dimensional even for a single threshold;
    # otherwise plt.subplots returns a bare Axes that cannot be iterated.
    fig, axes = plt.subplots(1, len(min_areas), figsize=(20, 5), squeeze=False)

    for ax, min_area in zip(axes[0], min_areas):
        # Filter by area
        valid_contours = [cnt for cnt, area in zip(contours, areas) if area > min_area]

        # Draw results on a fresh colour copy so panels do not share drawings
        result = cv2.cvtColor(preprocessed, cv2.COLOR_GRAY2BGR)
        cv2.drawContours(result, valid_contours, -1, (0, 255, 0), 2)

        ax.imshow(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))
        ax.set_title(f'Min Area = {min_area}\nCells Found: {len(valid_contours)}')
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Only run the threshold comparison when an earlier cell has defined `image`
if 'image' in globals():
    experiment_with_cell_detection(image)

## Feature Extraction and Anomaly Detection

Next, let's examine the features extracted from individual cells and visualize potential anomalies.

In [None]:
def analyze_cells(image):
    """Crop every detected cell, run anomaly detection and plot the first five results.

    Each contour returned by ``detect_cells`` is cropped from the preprocessed
    image by its bounding box and passed to ``AnomalyDetector.detect_anomalies``.
    For up to five cells the top row shows the crop, titled 'Normal' or
    'Anomaly', and the bottom row shows a bar chart of the cell's features,
    titled with the reported confidence.

    Each result is read as a mapping with the keys ``'is_anomaly'``,
    ``'features'`` (itself a mapping of feature name to value) and
    ``'confidence'``.

    Args:
        image: Image accepted by ``preprocess_image``.
    """
    preprocessed = preprocess_image(image)
    _, contours = detect_cells(preprocessed)

    # Extract individual cells
    cells = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        cells.append(preprocessed[y:y+h, x:x+w])

    if not cells:
        # plt.subplots cannot build a grid with zero columns
        print('No cells detected; nothing to analyze.')
        return

    # Analyze cells
    detector = AnomalyDetector()
    results = detector.detect_anomalies(cells)

    # squeeze=False keeps `axes` two-dimensional so axes[row, col] also works
    # when only one cell was detected.
    n_shown = min(5, len(cells))
    fig, axes = plt.subplots(2, n_shown, figsize=(15, 6), squeeze=False)

    for i, (cell, result) in enumerate(zip(cells[:n_shown], results[:n_shown])):
        # Original cell
        axes[0, i].imshow(cell, cmap='gray')
        axes[0, i].set_title('Normal' if not result['is_anomaly'] else 'Anomaly')
        axes[0, i].axis('off')

        # Feature visualization
        features = result['features']
        positions = list(range(len(features)))
        axes[1, i].bar(positions, list(features.values()))
        # Pin one tick per bar before labelling; without this the labels are
        # attached to matplotlib's automatic ticks and do not line up with the bars.
        axes[1, i].set_xticks(positions)
        axes[1, i].set_xticklabels(list(features.keys()), rotation=45)
        axes[1, i].set_title(f'Confidence: {result["confidence"]:.2f}')

    plt.tight_layout()
    plt.show()

# Only run the anomaly analysis when an earlier cell has defined `image`
if 'image' in globals():
    analyze_cells(image)

## Comparative Analysis of Multiple Blood Samples

Let's analyze multiple blood samples and compare their cell population statistics to identify any significant differences between samples.

In [None]:
def analyze_multiple_samples(image_paths: List[Path]) -> Dict[str, dict]:
    """
    Analyze multiple blood sample images and return their measurements and statistics

    Every image is loaded, preprocessed and run through ``detect_cells``; each
    contour is measured with ``analyze_cell_properties`` and tagged with the
    image's file name under the ``'sample'`` key.

    Args:
        image_paths: List of paths to blood sample images

    Returns:
        Dictionary with two entries, both keyed by image file name:
            'measurements': one DataFrame per image, one row per detected cell
            'statistics': the value returned by ``calculate_population_statistics``
                for that image's cells
        Both inner dictionaries are empty when ``image_paths`` is empty.
    """
    all_measurements = {}
    sample_stats = {}

    for path in image_paths:
        print(f"\nProcessing {path.name}...")

        # Load and process image
        image = load_image(str(path))
        preprocessed = preprocess_image(image)
        _, contours = detect_cells(preprocessed)

        # Collect measurements for all cells
        cell_measurements = []
        for contour in contours:
            props = analyze_cell_properties(preprocessed, contour)
            props['sample'] = path.name
            cell_measurements.append(props)

        # Convert to DataFrame (column-less when no cells were detected)
        all_measurements[path.name] = pd.DataFrame(cell_measurements)

        # Calculate statistics
        # NOTE(review): each record now also carries the non-numeric 'sample' key -
        # confirm calculate_population_statistics ignores it.
        sample_stats[path.name] = calculate_population_statistics(cell_measurements)

        print(f"Found {len(contours)} cells")

    return {
        'measurements': all_measurements,
        'statistics': sample_stats
    }

def plot_sample_comparisons(measurements: Dict[str, pd.DataFrame]):
    """Plot comparative visualizations of cell measurements

    Draws a 2x2 grid: per-sample box plots of 'area', 'circularity' and
    'mean_intensity', plus an area-vs-circularity scatter with one series per
    sample.

    Args:
        measurements: Mapping of sample name to a DataFrame with one row per
            cell and the columns 'sample', 'area', 'circularity' and
            'mean_intensity'. Empty DataFrames (samples with no detected cells)
            are skipped.

    Returns:
        None. Prints a message and draws nothing when no sample has any rows.
    """
    # A sample without detected cells is a column-less frame; indexing it by
    # 'area' would raise KeyError, so leave such samples out.
    populated = {name: df for name, df in measurements.items() if not df.empty}
    if not populated:
        print('No cell measurements available to plot.')
        return

    # Combine all measurements
    all_data = pd.concat(list(populated.values()), ignore_index=True)

    # Create visualization grid
    fig, axes = plt.subplots(2, 2, figsize=(15, 15))
    fig.suptitle('Comparison of Cell Measurements Across Samples', size=16)

    # One box plot per measurement, grouped by sample
    boxplot_panels = [
        (axes[0, 0], 'area', 'Cell Area Distribution'),
        (axes[0, 1], 'circularity', 'Cell Circularity Distribution'),
        (axes[1, 0], 'mean_intensity', 'Mean Intensity Distribution'),
    ]
    for ax, column, title in boxplot_panels:
        sns.boxplot(x='sample', y=column, data=all_data, ax=ax)
        ax.set_title(title)
        ax.tick_params(axis='x', rotation=45)

    # Scatter plot of area vs circularity, one series per sample
    scatter_ax = axes[1, 1]
    for sample, data in populated.items():
        scatter_ax.scatter(data['area'], data['circularity'], alpha=0.5, label=sample)
    scatter_ax.set_title('Area vs Circularity')
    scatter_ax.set_xlabel('Area')
    scatter_ax.set_ylabel('Circularity')
    scatter_ax.legend()

    plt.tight_layout()
    plt.show()

# Find all PNG files in the raw data directory.
# glob() yields matches in arbitrary, filesystem-dependent order, so sort them to
# keep the plot ordering and the printed summary reproducible across machines.
raw_dir = Path('../data/raw')
image_paths = sorted(raw_dir.glob('*.png'))

if image_paths:
    print(f"Found {len(image_paths)} blood sample images:")
    for path in image_paths:
        print(f"- {path.name}")

    # Analyze all samples; `results` is reused by the clustering cell
    results = analyze_multiple_samples(image_paths)

    # Plot comparisons
    plot_sample_comparisons(results['measurements'])

    # Display statistical summary: one block per sample, one sub-block per measured property
    print("\nStatistical Summary:")
    for sample, stats in results['statistics'].items():
        print(f"\n{sample}:")
        for prop, measures in stats.items():
            print(f"\n{prop.title()}:")
            print(f"  Mean ± Std: {measures['mean']:.2f} ± {measures['std']:.2f}")
            print(f"  Median (Q1-Q3): {measures['median']:.2f} ({measures['q1']:.2f}-{measures['q3']:.2f})")
else:
    print("No blood sample images found in the raw data directory.")

## Cell Population Clustering

Now let's try to identify different cell populations using clustering analysis.

In [None]:
def analyze_cell_populations(measurements: Dict[str, pd.DataFrame]):
    """
    Perform clustering analysis on cell measurements to identify distinct populations

    The 'area', 'circularity', 'mean_intensity' and 'contrast' columns of all
    samples are standardised and clustered with K-Means for every K from 2 up to
    5 (fewer when there are not enough cells). The K with the highest silhouette
    score is kept, and its labels are plotted and summarised.

    Args:
        measurements: Mapping of sample name to a DataFrame with one row per
            cell, containing the four feature columns and a 'sample' column.
            Empty DataFrames are skipped.

    Returns:
        None. Prints a message and returns early when fewer than three cells
        are available, since no valid clustering can then be scored.
    """
    # Select features for clustering
    features = ['area', 'circularity', 'mean_intensity', 'contrast']

    # Combine all measurements, leaving out samples with no detected cells
    frames = [df for df in measurements.values() if not df.empty]
    if not frames:
        print('No cell measurements available for clustering.')
        return
    all_data = pd.concat(frames, ignore_index=True)

    # silhouette_score needs 2 <= n_clusters <= n_samples - 1, so cap K by the
    # number of cells instead of always trying 2..5.
    n_cells = len(all_data)
    max_k = min(5, n_cells - 1)
    if max_k < 2:
        print(f'Need at least 3 cells for clustering, found {n_cells}.')
        return

    # Scale features
    X_scaled = StandardScaler().fit_transform(all_data[features])

    # Find optimal number of clusters, keeping each candidate's labels so the
    # winning model does not have to be fitted a second time.
    K = range(2, max_k + 1)
    silhouette_scores = []
    candidate_labels = []
    for k in K:
        # n_init is pinned because its default differs between scikit-learn versions
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X_scaled)
        silhouette_scores.append(silhouette_score(X_scaled, labels))
        candidate_labels.append(labels)

    # Use optimal number of clusters
    best_index = int(np.argmax(silhouette_scores))
    optimal_k = K[best_index]
    all_data['cluster'] = candidate_labels[best_index]

    # Visualize clusters
    fig, axes = plt.subplots(2, 2, figsize=(15, 15))
    fig.suptitle(f'Cell Population Clusters (K={optimal_k})', size=16)

    # Area vs Circularity
    axes[0, 0].scatter(all_data['area'], all_data['circularity'],
                       c=all_data['cluster'], cmap='viridis')
    axes[0, 0].set_title('Area vs Circularity')
    axes[0, 0].set_xlabel('Area')
    axes[0, 0].set_ylabel('Circularity')

    # Mean Intensity vs Contrast
    axes[0, 1].scatter(all_data['mean_intensity'], all_data['contrast'],
                       c=all_data['cluster'], cmap='viridis')
    axes[0, 1].set_title('Mean Intensity vs Contrast')
    axes[0, 1].set_xlabel('Mean Intensity')
    axes[0, 1].set_ylabel('Contrast')

    # Population distribution per sample
    population_counts = pd.crosstab(all_data['sample'], all_data['cluster'])
    population_counts.plot(kind='bar', stacked=True, ax=axes[1, 0])
    axes[1, 0].set_title('Cell Population Distribution by Sample')
    axes[1, 0].tick_params(axis='x', rotation=45)

    # Cluster characteristics (means are in original, unscaled units)
    cluster_means = all_data.groupby('cluster')[features].mean()
    cluster_means.plot(kind='bar', ax=axes[1, 1])
    axes[1, 1].set_title('Cluster Characteristics')
    axes[1, 1].tick_params(axis='x', rotation=0)

    plt.tight_layout()
    plt.show()

    # Print cluster statistics
    print("\nCluster Statistics:")
    for cluster in range(optimal_k):
        cluster_data = all_data[all_data['cluster'] == cluster]
        print(f"\nCluster {cluster} ({len(cluster_data)} cells):")
        for feature in features:
            mean = cluster_data[feature].mean()
            std = cluster_data[feature].std()
            print(f"{feature}: {mean:.2f} ± {std:.2f}")

# Only cluster when the multi-sample cell above found images and defined `results`
if 'results' in globals():
    analyze_cell_populations(results['measurements'])