In [None]:
# Automatic Dataset Download for Pathology Data Analysis
import os
import requests
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import json

def create_synthetic_pathology_dataset(n_samples: int = 1000, seed=42) -> pd.DataFrame:
    """Create a synthetic breast-pathology dataset for demonstration.

    Every column is drawn independently from a simple distribution, after
    which two correlations are injected: grade-3 tumors are enlarged by 50%
    and ER status is re-drawn with a higher positive rate for patients older
    than 70.

    Args:
        n_samples: Number of patient rows to generate (default 1000).
        seed: Value passed to ``np.random.seed`` before sampling so the
            output is reproducible. Pass ``None`` to leave NumPy's global
            random state untouched.

    Returns:
        A DataFrame with one row per patient and 16 columns
        (``patient_id`` through ``recurrence``).

    Raises:
        ValueError: If ``n_samples`` is negative.

    Note:
        Sampling goes through NumPy's *global* random state, so calling this
        function with a seed also affects later ``np.random`` calls made by
        the caller.
    """
    if n_samples < 0:
        raise ValueError("n_samples must be non-negative")
    if seed is not None:
        np.random.seed(seed)  # For reproducible results

    # Generate synthetic pathology data. The draw order below determines the
    # values produced for a given seed, so keep it stable.
    data = {
        'patient_id': [f'P{i:04d}' for i in range(1, n_samples + 1)],
        'age': np.random.normal(65, 12, n_samples).astype(int),
        'gender': np.random.choice(['M', 'F'], n_samples),
        'tumor_grade': np.random.choice([1, 2, 3], n_samples, p=[0.3, 0.5, 0.2]),
        'tumor_size_mm': np.random.lognormal(2.5, 0.8, n_samples),
        'stage': np.random.choice(['I', 'II', 'III', 'IV'], n_samples, p=[0.4, 0.3, 0.2, 0.1]),
        'lymph_node_positive': np.random.choice([0, 1], n_samples, p=[0.6, 0.4]),
        'her2_status': np.random.choice(['Positive', 'Negative'], n_samples, p=[0.2, 0.8]),
        'er_status': np.random.choice(['Positive', 'Negative'], n_samples, p=[0.7, 0.3]),
        'pr_status': np.random.choice(['Positive', 'Negative'], n_samples, p=[0.6, 0.4]),
        'ki67_percentage': np.random.beta(2, 5, n_samples) * 100,
        'mitotic_count': np.random.poisson(8, n_samples),
        'nuclear_grade': np.random.choice([1, 2, 3], n_samples, p=[0.2, 0.6, 0.2]),
        'histologic_type': np.random.choice(
            ['Invasive Ductal', 'Invasive Lobular', 'Mixed', 'Other'],
            n_samples, p=[0.7, 0.15, 0.1, 0.05]
        ),
        'survival_months': np.random.exponential(36, n_samples),
        'recurrence': np.random.choice([0, 1], n_samples, p=[0.75, 0.25])
    }

    # Add some correlations to make data more realistic (vectorized with
    # boolean masks instead of a per-row Python loop).

    # Higher grade tumors tend to be larger
    high_grade = data['tumor_grade'] == 3
    data['tumor_size_mm'][high_grade] *= 1.5

    # Older patients more likely to have ER positive tumors. One batched draw
    # replaces one draw per older patient, in the same row order.
    older = data['age'] > 70
    data['er_status'][older] = np.random.choice(
        ['Positive', 'Negative'], size=int(older.sum()), p=[0.85, 0.15]
    )

    return pd.DataFrame(data)

def download_pathology_datasets():
    """Download and prepare pathology datasets for analysis.

    Creates ``../data/pathology_datasets`` (relative to the current working
    directory) and fills it with:

    * the TCGA BRCA clinical sample table fetched from the cBioPortal datahub,
      or, if the download or the file write fails, a synthetic stand-in built
      by ``create_synthetic_pathology_dataset``;
    * a survival-analysis CSV (synthetic dataset plus a ``death_event`` column);
    * a biomarker-analysis CSV with 500 synthetic samples;
    * ``datasets_info.json``, mapping each dataset name to its file path.

    Returns:
        tuple: ``(pathology_dir, datasets_info)``, where ``pathology_dir`` is
        the ``Path`` of the output directory and ``datasets_info`` is the
        name-to-path dict that was written to the JSON manifest.

    Note:
        ``create_synthetic_pathology_dataset`` seeds NumPy's global random
        state on each call, so the survival dataset repeats the same base rows
        as the synthetic TCGA fallback.
    """
    data_dir = Path("../data")
    pathology_dir = data_dir / "pathology_datasets"
    pathology_dir.mkdir(parents=True, exist_ok=True)

    # Try to download real datasets, fall back to synthetic
    datasets_info = {}

    # Download TCGA clinical data sample (if available)
    tcga_url = "https://raw.githubusercontent.com/cBioPortal/datahub/master/public/brca_tcga/data_clinical_sample.txt"
    tcga_file = pathology_dir / "tcga_brca_clinical.txt"

    print("🏥 Preparing pathology datasets...")

    try:
        print("📥 Attempting to download TCGA BRCA clinical data...")
        response = requests.get(tcga_url, timeout=30)
        response.raise_for_status()

        # Write with an explicit encoding: the platform default (e.g. cp1252)
        # can fail on non-ASCII characters in the decoded response text.
        tcga_file.write_text(response.text, encoding="utf-8")
    except (requests.RequestException, OSError) as e:
        # Only network/HTTP and filesystem failures trigger the fallback;
        # programming errors are no longer hidden behind it.
        print(f"❌ TCGA download failed: {e}")
        print("🎭 Generating synthetic TCGA-like dataset...")

        # Create synthetic dataset that mimics TCGA structure
        synthetic_tcga = create_synthetic_pathology_dataset()
        tcga_synthetic_file = pathology_dir / "synthetic_tcga_brca.csv"
        synthetic_tcga.to_csv(tcga_synthetic_file, index=False)
        datasets_info['tcga_brca_synthetic'] = str(tcga_synthetic_file)
        print("✅ Created synthetic TCGA-like dataset")
    else:
        print("✅ Downloaded TCGA BRCA clinical data")
        datasets_info['tcga_brca'] = str(tcga_file)

    # Create additional synthetic datasets for different analysis scenarios
    print("🔬 Creating specialized analysis datasets...")

    # Dataset for survival analysis
    survival_data = create_synthetic_pathology_dataset()
    survival_data['death_event'] = np.random.choice([0, 1], len(survival_data), p=[0.7, 0.3])
    survival_file = pathology_dir / "pathology_survival_data.csv"
    survival_data.to_csv(survival_file, index=False)
    datasets_info['survival_analysis'] = str(survival_file)

    # Dataset for biomarker analysis
    n_biomarker_samples = 500
    biomarker_data = pd.DataFrame({
        'sample_id': [f'S{i:04d}' for i in range(n_biomarker_samples)],
        'cd3_positive': np.random.poisson(50, n_biomarker_samples),
        'cd8_positive': np.random.poisson(30, n_biomarker_samples),
        'cd68_positive': np.random.poisson(25, n_biomarker_samples),
        'foxp3_positive': np.random.poisson(15, n_biomarker_samples),
        'tumor_area_mm2': np.random.lognormal(3, 0.5, n_biomarker_samples),
        'stroma_percentage': np.random.beta(3, 2, n_biomarker_samples) * 100,
        'necrosis_percentage': np.random.beta(1, 4, n_biomarker_samples) * 100,
        'response_to_treatment': np.random.choice(
            ['Complete', 'Partial', 'None'], n_biomarker_samples, p=[0.3, 0.4, 0.3]
        )
    })
    biomarker_file = pathology_dir / "biomarker_analysis_data.csv"
    biomarker_data.to_csv(biomarker_file, index=False)
    datasets_info['biomarker_analysis'] = str(biomarker_file)

    # Save dataset information
    info_file = pathology_dir / "datasets_info.json"
    with open(info_file, 'w', encoding="utf-8") as f:
        json.dump(datasets_info, f, indent=2)

    print(f"🎯 Pathology datasets ready! {len(datasets_info)} datasets created")
    print(f"📁 Data location: {pathology_dir.absolute()}")

    return pathology_dir, datasets_info

# Build (or fetch) every dataset once; keep the output directory and the
# name -> path manifest for the rest of the notebook.
pathology_data_dir, available_datasets = download_pathology_datasets()

# List each dataset by its file name only, not its full path.
dataset_lines = [
    f"  • {label}: {Path(location).name}"
    for label, location in available_datasets.items()
]
print("📊 Available datasets:")
for line in dataset_lines:
    print(line)

print("\n🔍 Ready to explore pathology data with pandas!")

# Pandas DataFrames for Pathology Data Analysis

Welcome to the essential data manipulation tutorial for digital pathology! In this notebook, you'll learn how to use pandas DataFrames to organize, analyze, and manipulate pathology data effectively.

## Learning Objectives
1. Load and explore pathology datasets using pandas
2. Perform data cleaning and preprocessing
3. Handle missing values in clinical data
4. Create summary statistics for pathology features
5. Filter and query datasets for specific conditions

## Prerequisites
- Basic Python knowledge
- Understanding of image processing concepts from previous tutorials

## 1. Environment Setup

First, let's install and import the necessary libraries for data manipulation and analysis.

In [None]:
# Install required packages (uncomment the next line and run this cell if they are not already installed)
# !pip install pandas numpy matplotlib seaborn scikit-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence all warnings for the rest of the session to keep tutorial output
# compact (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Show every column and allow wide rows when printing DataFrames.
display_options = {
    'display.max_columns': None,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Report the library versions actually in use.
print("✅ All libraries imported successfully!")
for library_label, library in (("Pandas", pd), ("NumPy", np)):
    print(f"{library_label} version: {library.__version__}")

## 2. Loading Pathology Dataset

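Real pathology data is usually loaded from files (for example with `pd.read_csv`). To keep this notebook self-contained, we instead create a synthetic pathology dataset that mimics what you'd encounter in clinical practice. It includes patient information, slide characteristics, and diagnostic features.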

In [None]:
# Build a reproducible synthetic cohort with one slide per patient.
np.random.seed(42)

n_patients = 500
id_numbers = range(1, n_patients + 1)

# Columns are added in the same order as the random draws are made; changing
# the order would change the values generated for this seed.
cohort = {
    'patient_id': [f"P{number:04d}" for number in id_numbers],
    'slide_id': [f"S{number:04d}" for number in id_numbers],
}

# Patient demographics: ages are truncated to int, then clipped to 25-90.
cohort['age'] = np.clip(np.random.normal(65, 12, n_patients).astype(int), 25, 90)
cohort['gender'] = np.random.choice(['M', 'F'], n_patients, p=[0.45, 0.55])

# Tissue type, then a diagnosis drawn from that tissue's own label set.
cohort['tissue_type'] = np.random.choice(
    ['Breast', 'Lung', 'Colon', 'Prostate', 'Liver'],
    n_patients,
    p=[0.3, 0.25, 0.2, 0.15, 0.1],
)
diagnosis_options = {
    'Breast': (['Benign', 'IDC', 'ILC', 'DCIS'], [0.4, 0.3, 0.2, 0.1]),
    'Lung': (['Normal', 'Adenocarcinoma', 'SCC', 'SCLC'], [0.3, 0.4, 0.2, 0.1]),
}
# Every other tissue shares one generic label set.
fallback_options = (['Normal', 'Adenocarcinoma', 'Other'], [0.5, 0.3, 0.2])
diagnoses = []
for tissue_name in cohort['tissue_type']:
    labels, weights = diagnosis_options.get(tissue_name, fallback_options)
    diagnoses.append(np.random.choice(labels, p=weights))
cohort['diagnosis'] = diagnoses

# Pathology features (extracted from image analysis)
cohort['nuclear_area'] = np.random.normal(150, 30, n_patients)
cohort['nuclear_perimeter'] = np.random.normal(45, 8, n_patients)
cohort['nuclear_compactness'] = np.random.uniform(0.1, 0.9, n_patients)
cohort['mitotic_count'] = np.random.poisson(5, n_patients)
cohort['cell_density'] = np.random.normal(1000, 200, n_patients)

# Staining intensities (H&E analysis)
cohort['hematoxylin_intensity'] = np.random.normal(0.6, 0.15, n_patients)
cohort['eosin_intensity'] = np.random.normal(0.4, 0.12, n_patients)

# Image quality metrics
cohort['blur_metric'] = np.random.uniform(0.1, 0.9, n_patients)
cohort['contrast_metric'] = np.random.uniform(0.2, 1.0, n_patients)

# Assemble the DataFrame; dict insertion order fixes the column order.
pathology_df = pd.DataFrame(cohort)

print("✅ Pathology dataset created successfully!")
print(f"Dataset shape: {pathology_df.shape}")
pathology_df.head()

## 3. Data Exploration and Summary Statistics

Let's explore our dataset to understand the distribution of variables and identify any patterns.

In [None]:
# Dataset overview: size, column types, missingness, and a numeric summary.
n_rows, n_cols = pathology_df.shape
memory_kb = pathology_df.memory_usage(deep=True).sum() / 1024

print("=== DATASET OVERVIEW ===")
print(f"Number of patients: {n_rows}")
print(f"Number of features: {n_cols}")
print(f"Memory usage: {memory_kb:.2f} KB")
print()

# Column dtypes as inferred by pandas
print("=== DATA TYPES ===")
print(pathology_df.dtypes)
print()

# Per-column count of missing entries; list only the affected columns
print("=== MISSING VALUES ===")
missing_values = pathology_df.isnull().sum()
if missing_values.sum() > 0:
    print(missing_values[missing_values > 0])
else:
    print("No missing values found ✅")
print()

# Descriptive statistics for the numeric columns (shown as the cell output)
print("=== NUMERICAL FEATURES SUMMARY ===")
numerical_cols = pathology_df.select_dtypes(include=[np.number]).columns
numeric_summary = pathology_df[numerical_cols].describe()
numeric_summary.round(2)

In [None]:
# Frequency table (count and share of all rows) for each categorical column
print("=== CATEGORICAL FEATURES DISTRIBUTION ===")
categorical_cols = ['gender', 'tissue_type', 'diagnosis']

total_rows = len(pathology_df)
for column_name in categorical_cols:
    print(f"\n{column_name.upper()} Distribution:")
    counts = pathology_df[column_name].value_counts()
    shares = (counts / total_rows * 100).round(1)

    # value_counts is already sorted by frequency; walk it in that order
    for category in counts.index:
        print(f"  {category}: {counts[category]} ({shares[category]}%)")

## 4. Data Filtering and Querying

Learn how to filter datasets based on specific criteria - a crucial skill for pathology research.

In [None]:
# Typical cohort-selection queries, each built from named boolean masks.

print("=== PATHOLOGY DATA QUERIES ===")

total_rows = len(pathology_df)

# 1. Age filter: patients aged 70 or older
is_elderly = pathology_df['age'].ge(70)
elderly_patients = pathology_df.loc[is_elderly]
print(f"1. Elderly patients (≥70 years): {len(elderly_patients)} ({len(elderly_patients)/total_rows*100:.1f}%)")

# 2. Mitotic count above 8 (potentially aggressive tumors)
has_high_mitotic = pathology_df['mitotic_count'].gt(8)
high_mitotic = pathology_df.loc[has_high_mitotic]
print(f"2. High mitotic count (>8): {len(high_mitotic)} ({len(high_mitotic)/total_rows*100:.1f}%)")

# 3. Breast tissue with a cancer diagnosis
is_breast = pathology_df['tissue_type'].eq('Breast')
is_breast_cancer_dx = pathology_df['diagnosis'].isin(['IDC', 'ILC', 'DCIS'])
breast_cancer = pathology_df.loc[is_breast & is_breast_cancer_dx]
print(f"3. Breast cancer cases: {len(breast_cancer)} ({len(breast_cancer)/total_rows*100:.1f}%)")

# 4. Four conditions combined: female, lung tissue, non-normal diagnosis,
#    and nuclear area above the cohort's 75th percentile
large_nucleus_cutoff = pathology_df['nuclear_area'].quantile(0.75)
is_female = pathology_df['gender'].eq('F')
is_lung = pathology_df['tissue_type'].eq('Lung')
is_not_normal = pathology_df['diagnosis'].ne('Normal')
has_large_nuclei = pathology_df['nuclear_area'].gt(large_nucleus_cutoff)
complex_query = pathology_df.loc[is_female & is_lung & is_not_normal & has_large_nuclei]
print(f"4. Female lung cancer patients with large nuclei: {len(complex_query)}")

# 5. Quality control: both image metrics above their thresholds
passes_blur = pathology_df['blur_metric'].gt(0.6)
passes_contrast = pathology_df['contrast_metric'].gt(0.5)
good_quality = pathology_df.loc[passes_blur & passes_contrast]
print(f"5. Good quality slides: {len(good_quality)} ({len(good_quality)/total_rows*100:.1f}%)")

## 5. Data Grouping and Aggregation

Group data by categorical variables to find patterns and generate summary statistics.

In [None]:
# Grouped summaries by diagnosis and by tissue type.

print("=== GROUP ANALYSIS ===")

# Groupby objects are lazy, so build them once and reuse them.
by_diagnosis = pathology_df.groupby('diagnosis')
by_tissue = pathology_df.groupby('tissue_type')

# 1. Distribution of nuclear area within each diagnosis
print("1. NUCLEAR AREA BY DIAGNOSIS:")
nuclear_area_stats = ['count', 'mean', 'std', 'min', 'max']
nuclear_by_diagnosis = by_diagnosis['nuclear_area'].agg(nuclear_area_stats).round(2)
print(nuclear_by_diagnosis)
print()

# 2. Age profile of each tissue type
print("2. AGE STATISTICS BY TISSUE TYPE:")
age_stats = ['count', 'mean', 'median', 'std']
age_by_tissue = by_tissue['age'].agg(age_stats).round(1)
print(age_by_tissue)
print()

# 3. Row-normalized gender split per tissue type, as percentages
#    (margins=True adds the overall 'All' row)
print("3. GENDER DISTRIBUTION BY TISSUE TYPE:")
gender_tissue_crosstab = pd.crosstab(
    index=pathology_df['tissue_type'],
    columns=pathology_df['gender'],
    margins=True,
    normalize='index',
).mul(100)
print(gender_tissue_crosstab.round(1))
print()

# 4. Several metrics at once, with a different set of statistics per column
print("4. COMPREHENSIVE METRICS BY DIAGNOSIS:")
metric_plan = {
    'nuclear_area': ['mean', 'std'],
    'mitotic_count': ['mean', 'median'],
    'cell_density': ['mean', 'std'],
    'age': ['mean', 'count'],
}
comprehensive_stats = by_diagnosis.agg(metric_plan).round(2)
print(comprehensive_stats)

## 6. Data Transformation and Feature Engineering

Create new features and transform existing ones for better analysis.

In [None]:
# Feature engineering for pathology data: derive new columns from the raw
# measurements in pathology_df.
print("=== FEATURE ENGINEERING ===")

# Work on a copy so the engineered columns are added to pathology_enhanced
# and pathology_df itself is left unmodified.
pathology_enhanced = pathology_df.copy()

# 1. Age groups (categorical)
def categorize_age(age):
    """Map an age in years to an age-group label.

    Args:
        age: Age in years, or a missing value (``None``/NaN).

    Returns:
        ``'Young'`` for ages below 40, ``'Middle-aged'`` for 40 to 64,
        ``'Elderly'`` for 65 and above, or ``np.nan`` when ``age`` is missing.
    """
    # NaN fails both comparisons below and would otherwise be labelled
    # 'Elderly'; keep missing ages missing instead.
    if pd.isna(age):
        return np.nan
    if age < 40:
        return 'Young'
    if age < 65:
        return 'Middle-aged'
    return 'Elderly'

# Label every patient with an age group (element-wise mapping).
pathology_enhanced['age_group'] = pathology_enhanced['age'].map(categorize_age)

# 2. Nuclear shape index: 4 * pi * area / perimeter^2
area_term = pathology_enhanced['nuclear_area'].mul(4 * np.pi)
perimeter_squared = pathology_enhanced['nuclear_perimeter'].pow(2)
pathology_enhanced['nuclear_shape_index'] = area_term.div(perimeter_squared)

# 3. Mitotic rate categories
def categorize_mitotic_rate(count):
    """Map a mitotic count to a rate category.

    Args:
        count: Mitotic count, or a missing value (``None``/NaN).

    Returns:
        ``'Low'`` for counts up to 2, ``'Moderate'`` for 3 to 6, ``'High'``
        above 6, or ``np.nan`` when ``count`` is missing.
    """
    # NaN fails both comparisons below and would otherwise be labelled
    # 'High'; keep missing counts missing instead.
    if pd.isna(count):
        return np.nan
    if count <= 2:
        return 'Low'
    if count <= 6:
        return 'Moderate'
    return 'High'

# Label every slide with its mitotic-rate category (element-wise mapping).
pathology_enhanced['mitotic_rate'] = pathology_enhanced['mitotic_count'].map(categorize_mitotic_rate)

# 4. Staining ratio: hematoxylin intensity relative to eosin intensity
hematoxylin = pathology_enhanced['hematoxylin_intensity']
eosin = pathology_enhanced['eosin_intensity']
pathology_enhanced['staining_ratio'] = hematoxylin.div(eosin)

# 5. Composite image quality score: 60% blur metric + 40% contrast metric
blur_part = pathology_enhanced['blur_metric'].mul(0.6)
contrast_part = pathology_enhanced['contrast_metric'].mul(0.4)
pathology_enhanced['quality_score'] = blur_part.add(contrast_part)

# 6. Boolean malignancy flag; any diagnosis not listed here counts as
#    non-malignant
malignant_diagnoses = ['IDC', 'ILC', 'DCIS', 'Adenocarcinoma', 'SCC', 'SCLC']
diagnosis_column = pathology_enhanced['diagnosis']
pathology_enhanced['is_malignant'] = diagnosis_column.isin(malignant_diagnoses)

# Report what was added and preview the new columns (shown as cell output).
new_features = [
    'age_group',
    'nuclear_shape_index',
    'mitotic_rate',
    'staining_ratio',
    'quality_score',
    'is_malignant',
]
print("✅ New features created:")
for feature_name in new_features:
    print(f"  - {feature_name}")

print(f"\nEnhanced dataset shape: {pathology_enhanced.shape}")
feature_preview = pathology_enhanced[new_features]
feature_preview.head()

## 7. Data Export and Saving

Learn how to save your processed data for use in other notebooks or applications.

In [None]:
# Make sure the output directory exists before writing anything into it.
data_dir = Path('../data')
data_dir.mkdir(exist_ok=True)

# Write the enhanced table as CSV, without the index column.
output_file = data_dir / 'processed_pathology_data.csv'
pathology_enhanced.to_csv(output_file, index=False)

csv_size_kb = output_file.stat().st_size / 1024
print(f"✅ Dataset saved to: {output_file}")
print(f"File size: {csv_size_kb:.2f} KB")

# Assemble the plain-text summary report in memory, then write it in one go.
summary_file = data_dir / 'pathology_data_summary.txt'
n_rows = len(pathology_enhanced)
report_lines = [
    "PATHOLOGY DATASET SUMMARY REPORT\n",
    "=" * 40 + "\n\n",
    f"Generated on: {pd.Timestamp.now()}\n",
    f"Dataset shape: {pathology_enhanced.shape}\n",
    f"Number of patients: {n_rows}\n\n",
]

# One frequency section per categorical column, most common value first.
report_sections = (
    ("TISSUE TYPE DISTRIBUTION:\n", 'tissue_type'),
    ("\nDIAGNOSIS DISTRIBUTION:\n", 'diagnosis'),
)
for heading, column_name in report_sections:
    report_lines.append(heading)
    for label, count in pathology_enhanced[column_name].value_counts().items():
        report_lines.append(f"  {label}: {count} ({count/n_rows*100:.1f}%)\n")

with open(summary_file, 'w') as f:
    f.writelines(report_lines)

print(f"✅ Summary report saved to: {summary_file}")

## 8. Practice Exercises

Try these exercises to reinforce your pandas skills with pathology data.

In [None]:
# EXERCISE 1: Select patients that satisfy several criteria at once
print("EXERCISE 1: Complex Patient Selection")
print("Task: Find male patients over 60 with high nuclear area and good image quality")

# Reference solution: one named mask per criterion, combined with &.
# "High nuclear area" means above the cohort's 75th percentile.
large_nucleus_cutoff = pathology_enhanced['nuclear_area'].quantile(0.75)
is_male = pathology_enhanced['gender'].eq('M')
is_over_sixty = pathology_enhanced['age'].gt(60)
has_large_nuclei = pathology_enhanced['nuclear_area'].gt(large_nucleus_cutoff)
has_good_quality = pathology_enhanced['quality_score'].gt(0.7)

exercise_1_result = pathology_enhanced.loc[
    is_male & is_over_sixty & has_large_nuclei & has_good_quality
]

print(f"✅ Found {len(exercise_1_result)} patients matching criteria")
print("Sample results:")
preview_columns = ['patient_id', 'age', 'nuclear_area', 'quality_score']
print(exercise_1_result[preview_columns].head())

In [None]:
# EXERCISE 2: Malignancy rate per tissue type
print("\nEXERCISE 2: Diagnostic Performance Analysis")
print("Task: Calculate malignancy rates by tissue type")

# Reference solution: is_malignant is boolean, so per group its count is the
# number of cases, its sum the number of malignant cases, and its mean the
# malignancy rate.
malignant_flag_by_tissue = pathology_enhanced.groupby('tissue_type')['is_malignant']
malignancy_by_tissue = malignant_flag_by_tissue.agg(
    total_cases='count',
    malignant_cases='sum',
    malignancy_rate='mean',
).round(3)

print("✅ Malignancy rates by tissue type:")
print(malignancy_by_tissue)

## 9. Auto-Validation Tests

Run these tests to verify that the dataset and the engineered features from the previous sections were built correctly.

In [None]:
# Sanity checks on the enhanced dataset. Each check stops the cell with an
# AssertionError on failure, so a later check runs only if the earlier ones
# passed.
print("=== AUTO-VALIDATION TESTS ===")

# Test 1: the enhanced table still has one row per generated patient
row_count = len(pathology_enhanced)
assert row_count == n_patients, "❌ Dataset size mismatch"
print("✅ Test 1 passed: Dataset size is correct")

# Test 2: identifier and clinical columns contain no missing entries
key_columns = ['patient_id', 'age', 'gender', 'tissue_type', 'diagnosis']
missing_in_key_columns = pathology_enhanced[key_columns].isnull().sum().sum()
assert missing_in_key_columns == 0, "❌ Missing values in key columns"
print("✅ Test 2 passed: No missing values in key columns")

# Test 3: ages lie between 25 and 90
age_column = pathology_enhanced['age']
youngest = age_column.min()
oldest = age_column.max()
assert youngest >= 25, "❌ Age minimum out of range"
assert oldest <= 90, "❌ Age maximum out of range"
print("✅ Test 3 passed: Age range is valid")

# Test 4: the shape index exists and is strictly positive
assert 'nuclear_shape_index' in pathology_enhanced, "❌ Nuclear shape index not created"
smallest_shape_index = pathology_enhanced['nuclear_shape_index'].min()
assert smallest_shape_index > 0, "❌ Invalid nuclear shape index values"
print("✅ Test 4 passed: Feature engineering completed correctly")

# Test 5: the malignancy flag is stored as a boolean column
malignant_dtype = pathology_enhanced['is_malignant'].dtype
assert malignant_dtype == bool, "❌ Malignant flag should be boolean"
print("✅ Test 5 passed: Data types are correct")

print("\n🎉 All validation tests passed! You've successfully mastered pandas for pathology data!")

## 10. Next Steps

Congratulations! You've learned the fundamentals of using pandas for pathology data analysis. In the next notebook, you'll learn about:

- Statistical analysis and plotting
- Correlation analysis
- Hypothesis testing with pathology data
- Advanced data visualization techniques

**Key Skills Acquired:**
✅ Loading and exploring pathology datasets  
✅ Data filtering and querying  
✅ Grouping and aggregation  
✅ Feature engineering  
✅ Data validation and quality control  

**For Further Practice:**
- Try loading real pathology datasets from TCGA or other public repositories
- Practice with different tissue types and staining techniques
- Explore temporal analysis with longitudinal patient data