# In [None]:
# -*- coding: utf-8 -*-
"""
Training and Test Datasets: Practical Examples
===============================================

This notebook demonstrates how to properly split datasets for training and testing
AI models, with real-world examples from healthcare and finance domains.

Author: AI Education Series
Date: 2024
"""

# ========================================
# SECTION 1: Setup and Imports
# ========================================

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, TimeSeriesSplit, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import make_classification
import warnings
# Silence every warning so the tutorial output stays uncluttered.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the
# libraries used below are hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator for reproducibility.
# The create_medical_dataset / create_credit_dataset / create_stock_data
# helpers defined later in this file reseed the global generator themselves.
np.random.seed(42)

# Configure plotting: seaborn-like style sheet and a 10x6 default figure size.
# NOTE(review): the 'seaborn-v0_8' style name is presumably only available in
# newer Matplotlib releases — confirm against the target environment.
plt.style.use('seaborn-v0_8')
plt.rcParams['figure.figsize'] = (10, 6)

print("✅ All libraries imported successfully!")
print("📚 This notebook will teach you how to properly split datasets for AI training and testing.")

# ========================================
# SECTION 2: Understanding the Basics
# ========================================

# Section banner: a blank line, then the title framed by 60 '=' characters.
print("\n" + "="*60)
print("SECTION 2: Understanding Dataset Splitting Basics")
print("="*60)

# Create a simple synthetic dataset to demonstrate concepts
def create_sample_dataset(n_samples=1000, random_state=42):
    """
    Create a synthetic classification dataset for demonstration purposes.

    Wraps ``make_classification`` with five informative features, no
    redundant features and one cluster per class, and returns the result as
    a labelled DataFrame.

    Args:
        n_samples: Number of rows to generate.
        random_state: Seed forwarded to ``make_classification``. The default
            (42) reproduces the dataset this function always produced before
            the seed became configurable.

    Returns:
        pandas.DataFrame with the feature columns ``Feature_1`` ...
        ``Feature_5`` followed by the label column ``Target``.
    """
    # Single source of truth for the feature count so the generator call and
    # the column names cannot drift apart.
    n_features = 5

    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_redundant=0,
        n_informative=n_features,
        n_clusters_per_class=1,
        random_state=random_state
    )

    # Create a DataFrame with meaningful column names
    feature_names = [f'Feature_{i}' for i in range(1, n_features + 1)]
    df = pd.DataFrame(X, columns=feature_names)
    df['Target'] = y

    return df

# Build the demo dataset and print a quick structural summary of it.
sample_data = create_sample_dataset(n_samples=1000)
print("📊 Sample Dataset Created:")
print(f"Shape: {sample_data.shape}")
print(f"Columns: {sample_data.columns.tolist()}")
print("\nFirst 5 rows:")
print(sample_data.head(5))

# Report the label balance: raw value counts first, then the count and the
# share of each class on its own line.
print("\nTarget Distribution:")
print(sample_data['Target'].value_counts())
for class_label in (0, 1):
    is_class = sample_data['Target'] == class_label
    print(f"Class {class_label}: {is_class.sum()} samples ({is_class.mean():.1%})")

# ========================================
# SECTION 3: Simple Train-Test Split
# ========================================

print("\n" + "=" * 60)
print("SECTION 3: Simple Train-Test Split")
print("=" * 60)

# The label is the 'Target' column; every other column is a feature.
y = sample_data['Target']
X = sample_data.drop(columns='Target')

# 80/20 hold-out split. Stratifying on the label keeps the class ratio the
# same in both parts; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

n_total = len(X)
print("📈 Dataset Split Results:")
print(f"Original dataset size: {n_total} samples")
print(f"Training set size: {len(X_train)} samples ({len(X_train)/n_total:.1%})")
print(f"Test set size: {len(X_test)} samples ({len(X_test)/n_total:.1%})")

# Confirm that the class shares match across the full data and both splits.
# The split name is left-padded to 8 characters so the columns line up.
print("\nClass Distribution Check:")
for split_name, labels in (("Original", y), ("Training", y_train), ("Test", y_test)):
    print(f"{split_name:<8} - Class 0: {(labels == 0).mean():.1%}, Class 1: {(labels == 1).mean():.1%}")

# Draw one class-count bar chart per split, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

split_panels = (
    ('Original Dataset', y),
    ('Training Set', y_train),
    ('Test Set', y_test),
)
for ax, (panel_title, labels) in zip(axes, split_panels):
    class_counts = [sum(labels == 0), sum(labels == 1)]
    ax.bar(['Class 0', 'Class 1'], class_counts, color=['lightblue', 'lightcoral'])
    ax.set_title(panel_title)
    ax.set_ylabel('Number of Samples')

plt.tight_layout()
plt.show()

# ========================================
# SECTION 4: Healthcare Example - Medical Diagnosis
# ========================================

print("\n" + "=" * 60)
print("SECTION 4: Healthcare Example - Medical Diagnosis")
print("=" * 60)

# Create a synthetic medical dataset
def create_medical_dataset(n_patients=2000, random_state=42):
    """
    Create a synthetic medical dataset for disease diagnosis.

    Each row is a simulated patient with five health indicators and a binary
    disease label derived from a noisy linear risk score.

    Args:
        n_patients: Number of patient rows to generate.
        random_state: Seed passed to ``np.random.seed`` before sampling. The
            default (42) reproduces the data this function always produced
            before the seed became configurable.

    Returns:
        pandas.DataFrame with columns ``Age``, ``Blood_Pressure``,
        ``Cholesterol``, ``Blood_Sugar``, ``Heart_Rate`` and ``Disease``
        (1 = disease, 0 = no disease).

    Note:
        This reseeds NumPy's *global* random generator as a side effect, so
        random draws made by the caller afterwards continue from that state.
    """
    np.random.seed(random_state)

    # Patient features. The draw order (age, blood pressure, cholesterol,
    # blood sugar, heart rate, noise) fixes the generated values for a given
    # seed, so it must not be rearranged.
    age = np.clip(np.random.normal(55, 15, n_patients), 18, 90)  # Reasonable age range
    blood_pressure = np.random.normal(120, 20, n_patients)
    cholesterol = np.random.normal(200, 40, n_patients)
    blood_sugar = np.random.normal(100, 20, n_patients)
    heart_rate = np.random.normal(72, 12, n_patients)

    # Risk score: age contributes linearly over the 18-90 range; blood
    # pressure, cholesterol and blood sugar only contribute when they exceed
    # 120 / 200 / 100 respectively. Heart rate does not enter the score.
    disease_risk = (
        (age - 18) / 72 * 0.3
        + np.maximum(blood_pressure - 120, 0) / 60 * 0.25
        + np.maximum(cholesterol - 200, 0) / 100 * 0.25
        + np.maximum(blood_sugar - 100, 0) / 50 * 0.2
    )

    # Gaussian noise so the label is not a deterministic function of the features.
    disease_risk += np.random.normal(0, 0.1, n_patients)

    # Binary label: disease when the noisy risk score exceeds 0.5.
    disease = (disease_risk > 0.5).astype(int)

    return pd.DataFrame({
        'Age': age,
        'Blood_Pressure': blood_pressure,
        'Cholesterol': cholesterol,
        'Blood_Sugar': blood_sugar,
        'Heart_Rate': heart_rate,
        'Disease': disease
    })

# Generate the synthetic patient cohort and print a short summary.
medical_data = create_medical_dataset(2000)
print("🏥 Medical Dataset Created:")
print(f"Shape: {medical_data.shape}")
print(f"Disease prevalence: {medical_data['Disease'].mean():.1%}")
print("\nFirst 5 patients:")
print(medical_data.head())

# 2x3 grid, flattened: the first five panels hold per-feature histograms
# (one overlay per disease status), the sixth holds the label counts.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

features = ['Age', 'Blood_Pressure', 'Cholesterol', 'Blood_Sugar', 'Heart_Rate']
disease_col = medical_data['Disease']
for ax, feature in zip(axes, features):
    for disease_status in (0, 1):
        values = medical_data.loc[disease_col == disease_status, feature]
        ax.hist(values, alpha=0.7, label=f'Disease: {disease_status}', bins=30)
    ax.set_title(f'{feature} Distribution')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()

# Last panel: how many patients fall in each class.
summary_ax = axes[5]
disease_counts = [sum(medical_data['Disease'] == 0), sum(medical_data['Disease'] == 1)]
summary_ax.bar(['No Disease', 'Disease'], disease_counts, color=['lightgreen', 'lightcoral'])
summary_ax.set_title('Disease Distribution')
summary_ax.set_ylabel('Number of Patients')

plt.tight_layout()
plt.show()

# Separate the predictors from the diagnosis label.
y_medical = medical_data['Disease']
X_medical = medical_data.drop(columns='Disease')

# Stratified 80/20 split so both parts keep the same disease prevalence.
X_train_med, X_test_med, y_train_med, y_test_med = train_test_split(
    X_medical,
    y_medical,
    test_size=0.2,
    stratify=y_medical,  # Critical for medical data!
    random_state=42,
)

print("\n🏥 Medical Data Split Results:")
print(f"Training set: {len(X_train_med)} patients")
print(f"Test set: {len(X_test_med)} patients")
print(f"Disease prevalence in training: {y_train_med.mean():.1%}")
print(f"Disease prevalence in test: {y_test_med.mean():.1%}")

# Standardize the features. The scaler learns its statistics from the
# training rows only and is then applied unchanged to both parts.
scaler_med = StandardScaler().fit(X_train_med)
X_train_med_scaled = scaler_med.transform(X_train_med)
X_test_med_scaled = scaler_med.transform(X_test_med)

# Fit a 100-tree random forest and predict the held-out patients.
rf_medical = RandomForestClassifier(n_estimators=100, random_state=42)
rf_medical.fit(X_train_med_scaled, y_train_med)
y_pred_med = rf_medical.predict(X_test_med_scaled)

# Report hold-out accuracy and the per-class metrics.
accuracy_med = accuracy_score(y_test_med, y_pred_med)
print("\n📊 Medical Model Performance:")
print(f"Accuracy: {accuracy_med:.3f}")
print("Classification Report:")
print(classification_report(y_test_med, y_pred_med, target_names=['No Disease', 'Disease']))

# ========================================
# SECTION 5: Finance Example - Credit Risk Assessment
# ========================================

print("\n" + "=" * 60)
print("SECTION 5: Finance Example - Credit Risk Assessment")
print("=" * 60)

# Create a synthetic financial dataset
def create_credit_dataset(n_customers=5000, random_state=42):
    """
    Create a synthetic credit risk dataset.

    Each row is a simulated loan customer with five attributes and a binary
    default label derived from a noisy linear risk score.

    Args:
        n_customers: Number of customer rows to generate.
        random_state: Seed passed to ``np.random.seed`` before sampling. The
            default (42) reproduces the data this function always produced
            before the seed became configurable.

    Returns:
        pandas.DataFrame with columns ``Age``, ``Income``, ``Credit_Score``,
        ``Employment_Years``, ``Debt_to_Income_Ratio`` and ``Default``
        (1 = default, 0 = no default).

    Note:
        This reseeds NumPy's *global* random generator as a side effect, so
        random draws made by the caller afterwards continue from that state.
    """
    np.random.seed(random_state)

    # Customer features. The draw order fixes the generated values for a
    # given seed, so it must not be rearranged.
    age = np.clip(np.random.normal(35, 12, n_customers), 18, 80)
    # Log-normal for a right-skewed income distribution, bounded to 20k-200k.
    income = np.clip(np.random.lognormal(10.5, 0.5, n_customers), 20000, 200000)
    credit_score = np.clip(np.random.normal(650, 100, n_customers), 300, 850)
    employment_years = np.clip(np.random.exponential(5, n_customers), 0, 40)
    debt_to_income = np.random.beta(2, 5, n_customers)  # Beta distribution keeps the ratio in [0, 1]

    # Risk score. Income is generated as a feature but does not enter it.
    # The age and employment terms become negative (risk-reducing) above
    # 40 years of age and 10 years of employment respectively.
    default_risk = (
        (850 - credit_score) / 550 * 0.4  # Lower credit score = higher risk
        + debt_to_income * 0.3  # Higher debt-to-income = higher risk
        + (40 - age) / 22 * 0.2  # Younger age = slightly higher risk
        + (10 - employment_years) / 10 * 0.1  # Less employment = higher risk
    )

    # Gaussian noise, then clamp the score to the [0, 1] range.
    default_risk += np.random.normal(0, 0.1, n_customers)
    default_risk = np.clip(default_risk, 0, 1)

    # Binary label with a high threshold (0.7) so defaults stay relatively rare.
    default = (default_risk > 0.7).astype(int)

    return pd.DataFrame({
        'Age': age,
        'Income': income,
        'Credit_Score': credit_score,
        'Employment_Years': employment_years,
        'Debt_to_Income_Ratio': debt_to_income,
        'Default': default
    })

# Generate the synthetic customer base and print a short summary.
credit_data = create_credit_dataset(5000)
print("💳 Credit Dataset Created:")
print(f"Shape: {credit_data.shape}")
print(f"Default rate: {credit_data['Default'].mean():.1%}")
print("\nFirst 5 customers:")
print(credit_data.head())

# 2x3 grid, flattened: the first five panels hold per-feature histograms
# (one overlay per default status), the sixth holds the label counts.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

features = ['Age', 'Income', 'Credit_Score', 'Employment_Years', 'Debt_to_Income_Ratio']
default_col = credit_data['Default']
for ax, feature in zip(axes, features):
    for default_status in (0, 1):
        values = credit_data.loc[default_col == default_status, feature]
        ax.hist(values, alpha=0.7, label=f'Default: {default_status}', bins=30)
    ax.set_title(f'{feature} Distribution')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()

# Last panel: how many customers fall in each class.
summary_ax = axes[5]
default_counts = [sum(credit_data['Default'] == 0), sum(credit_data['Default'] == 1)]
summary_ax.bar(['No Default', 'Default'], default_counts, color=['lightgreen', 'lightcoral'])
summary_ax.set_title('Default Distribution')
summary_ax.set_ylabel('Number of Customers')

plt.tight_layout()
plt.show()

# Separate the predictors from the default label.
y_credit = credit_data['Default']
X_credit = credit_data.drop(columns='Default')

# Stratified 80/20 split so both parts keep the same default rate.
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(
    X_credit,
    y_credit,
    test_size=0.2,
    stratify=y_credit,  # Critical for imbalanced financial data!
    random_state=42,
)

print("\n💳 Credit Data Split Results:")
print(f"Training set: {len(X_train_credit)} customers")
print(f"Test set: {len(X_test_credit)} customers")
print(f"Default rate in training: {y_train_credit.mean():.1%}")
print(f"Default rate in test: {y_test_credit.mean():.1%}")

# Standardize the features. The scaler learns its statistics from the
# training rows only and is then applied unchanged to both parts.
scaler_credit = StandardScaler().fit(X_train_credit)
X_train_credit_scaled = scaler_credit.transform(X_train_credit)
X_test_credit_scaled = scaler_credit.transform(X_test_credit)

# Fit a 100-tree random forest and predict the held-out customers.
rf_credit = RandomForestClassifier(n_estimators=100, random_state=42)
rf_credit.fit(X_train_credit_scaled, y_train_credit)
y_pred_credit = rf_credit.predict(X_test_credit_scaled)

# Report hold-out accuracy and the per-class metrics.
accuracy_credit = accuracy_score(y_test_credit, y_pred_credit)
print("\n📊 Credit Model Performance:")
print(f"Accuracy: {accuracy_credit:.3f}")
print("Classification Report:")
print(classification_report(y_test_credit, y_pred_credit, target_names=['No Default', 'Default']))

# ========================================
# SECTION 6: Time Series Splitting (Financial Data)
# ========================================

print("\n" + "=" * 60)
print("SECTION 6: Time Series Splitting for Financial Data")
print("=" * 60)

# Create a time series dataset (stock prices)
def create_stock_data(n_days=1000, random_state=42):
    """
    Create synthetic stock price data with time series structure.

    Prices follow a geometric random walk starting on 2020-01-01 with one
    row per calendar day. The label of a row tells whether the NEXT day's
    return is positive.

    Args:
        n_days: Number of daily observations to simulate. The returned frame
            is shorter: the last day is dropped (it has no next-day label)
            and so are the first 19 days (no 20-day moving average yet).
        random_state: Seed passed to ``np.random.seed`` before sampling. The
            default (42) reproduces the data this function always produced
            before the seed became configurable.

    Returns:
        pandas.DataFrame with columns ``Date``, ``Price``, ``Volume``,
        ``MA_5``, ``MA_20`` and ``Target`` (1 = next day up, 0 = otherwise),
        in chronological order and without NaN values.

    Note:
        This reseeds NumPy's *global* random generator as a side effect, so
        random draws made by the caller afterwards continue from that state.
    """
    np.random.seed(random_state)

    dates = pd.date_range('2020-01-01', periods=n_days, freq='D')

    # Geometric random walk: cumulative log-returns, exponentiated, scaled
    # to start near 100. Returns are drawn before volume; the draw order
    # fixes the generated values for a given seed.
    returns = np.random.normal(0.001, 0.02, n_days)  # Daily returns
    prices = 100 * np.exp(np.cumsum(returns))  # Stock prices

    # Technical indicators: rolling means are NaN until their window is full.
    volume = np.random.lognormal(15, 0.5, n_days)
    price_series = pd.Series(prices)
    moving_avg_5 = price_series.rolling(5).mean()
    moving_avg_20 = price_series.rolling(20).mean()

    # Shift returns one step back so row t carries day t+1's return. The
    # roll wraps day 0's return onto the last row, which is removed below.
    next_day_return = np.roll(returns, -1)
    target = (next_day_return > 0).astype(int)

    stock_data = pd.DataFrame({
        'Date': dates,
        'Price': prices,
        'Volume': volume,
        'MA_5': moving_avg_5,
        'MA_20': moving_avg_20,
        'Target': target
    })

    # Drop the last row (its label came from the wrap-around, not the future).
    stock_data = stock_data.iloc[:-1]

    # Drop the warm-up rows whose moving averages are still NaN.
    stock_data = stock_data.dropna()

    return stock_data

# Simulate the price history and print a short summary.
stock_data = create_stock_data(1000)
print("📈 Stock Data Created:")
print(f"Shape: {stock_data.shape}")
print(f"Date range: {stock_data['Date'].min()} to {stock_data['Date'].max()}")
print(f"Up days: {stock_data['Target'].mean():.1%}")

# 2x2 overview: price, volume, price with moving averages, label balance.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
price_ax, volume_ax = axes[0]
ma_ax, target_ax = axes[1]

# Top-left: stock price over time.
price_ax.plot(stock_data['Date'], stock_data['Price'])
price_ax.set_title('Stock Price Over Time')
price_ax.set_xlabel('Date')
price_ax.set_ylabel('Price ($)')
price_ax.tick_params(axis='x', rotation=45)

# Top-right: trading volume over time.
volume_ax.plot(stock_data['Date'], stock_data['Volume'])
volume_ax.set_title('Trading Volume Over Time')
volume_ax.set_xlabel('Date')
volume_ax.set_ylabel('Volume')
volume_ax.tick_params(axis='x', rotation=45)

# Bottom-left: price overlaid with its 5-day and 20-day moving averages.
ma_ax.plot(stock_data['Date'], stock_data['Price'], label='Price', alpha=0.7)
ma_ax.plot(stock_data['Date'], stock_data['MA_5'], label='5-day MA', alpha=0.8)
ma_ax.plot(stock_data['Date'], stock_data['MA_20'], label='20-day MA', alpha=0.8)
ma_ax.set_title('Price and Moving Averages')
ma_ax.set_xlabel('Date')
ma_ax.set_ylabel('Price ($)')
ma_ax.legend()
ma_ax.tick_params(axis='x', rotation=45)

# Bottom-right: number of down days versus up days.
movement_counts = [sum(stock_data['Target'] == 0), sum(stock_data['Target'] == 1)]
target_ax.bar(['Down Days', 'Up Days'], movement_counts, color=['lightcoral', 'lightgreen'])
target_ax.set_title('Daily Price Movement Distribution')
target_ax.set_ylabel('Number of Days')

plt.tight_layout()
plt.show()

# Time series splitting (CRITICAL for financial data!)
# Never use random splitting for time series data: the model must be trained
# on the past and evaluated on the future.

# Features and label for the time series model.
X_stock = stock_data[['Price', 'Volume', 'MA_5', 'MA_20']]
y_stock = stock_data['Target']

# Chronological cut: the first 80% of rows train, the remaining 20% test.
# All slices below are positional, written explicitly with .iloc.
split_point = int(0.8 * len(stock_data))

X_train_stock, X_test_stock = X_stock.iloc[:split_point], X_stock.iloc[split_point:]
y_train_stock, y_test_stock = y_stock.iloc[:split_point], y_stock.iloc[split_point:]

stock_dates = stock_data['Date']
stock_prices = stock_data['Price']

print("\n📈 Time Series Split Results:")
print(f"Training period: {stock_dates.iloc[0]} to {stock_dates.iloc[split_point-1]}")
print(f"Testing period: {stock_dates.iloc[split_point]} to {stock_dates.iloc[-1]}")
print(f"Training samples: {len(X_train_stock)}")
print(f"Test samples: {len(X_test_stock)}")

# Plot both segments on one axis with a dashed marker at the first test day.
plt.figure(figsize=(15, 6))
plt.plot(stock_dates.iloc[:split_point], stock_prices.iloc[:split_point],
         label='Training Data', color='blue', alpha=0.7)
plt.plot(stock_dates.iloc[split_point:], stock_prices.iloc[split_point:],
         label='Test Data', color='red', alpha=0.7)
plt.axvline(x=stock_dates.iloc[split_point], color='black', linestyle='--',
            label='Split Point')
plt.title('Time Series Split: Training vs Test Data')
plt.xlabel('Date')
plt.ylabel('Stock Price ($)')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Standardize the features. The scaler learns its statistics from the
# training period only and is then applied unchanged to both periods.
scaler_stock = StandardScaler().fit(X_train_stock)
X_train_stock_scaled = scaler_stock.transform(X_train_stock)
X_test_stock_scaled = scaler_stock.transform(X_test_stock)

# Fit a 100-tree random forest on the past and predict the future period.
rf_stock = RandomForestClassifier(n_estimators=100, random_state=42)
rf_stock.fit(X_train_stock_scaled, y_train_stock)
y_pred_stock = rf_stock.predict(X_test_stock_scaled)

# Report hold-out accuracy and the per-class metrics.
accuracy_stock = accuracy_score(y_test_stock, y_pred_stock)
print("\n📊 Stock Prediction Model Performance:")
print(f"Accuracy: {accuracy_stock:.3f}")
print("Classification Report:")
print(classification_report(y_test_stock, y_pred_stock, target_names=['Down Day', 'Up Day']))

# ========================================
# SECTION 7: Cross-Validation Example
# ========================================

print("\n" + "=" * 60)
print("SECTION 7: Cross-Validation for Robust Evaluation")
print("=" * 60)

from sklearn.model_selection import cross_val_score, StratifiedKFold

# Use our medical dataset for the cross-validation example.
print("🏥 Performing Cross-Validation on Medical Data:")

# Stratified k-fold keeps the disease prevalence similar in every fold.
# The fold count is named once so the splitter and the plot cannot disagree.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Cross-validate on the training portion only; the test set stays untouched.
# NOTE(review): X_train_med_scaled was standardized with a scaler fitted on
# the whole training set, so each fold's validation rows already influenced
# the scaling. Wrapping the scaler and the model in a Pipeline would keep the
# folds fully separated — consider it if this demo is extended.
cv_scores = cross_val_score(rf_medical, X_train_med_scaled, y_train_med,
                            cv=skf, scoring='accuracy')

print("Cross-Validation Results:")
print(f"Individual fold scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# Compare with the single hold-out estimate from the fitted model.
simple_score = rf_medical.score(X_test_med_scaled, y_test_med)
print(f"Simple train-test accuracy: {simple_score:.3f}")

# One bar per fold, plus reference lines for the CV mean and the hold-out
# score. Fold numbers come from the score array itself rather than a
# hard-coded range, so the chart stays valid when n_splits changes.
fold_numbers = range(1, len(cv_scores) + 1)
plt.figure(figsize=(10, 6))
plt.bar(fold_numbers, cv_scores, alpha=0.7, color='skyblue')
plt.axhline(y=cv_scores.mean(), color='red', linestyle='--',
            label=f'Mean: {cv_scores.mean():.3f}')
plt.axhline(y=simple_score, color='green', linestyle='--',
            label=f'Simple Split: {simple_score:.3f}')
plt.title('Cross-Validation Scores vs Simple Train-Test Split')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.legend()
plt.ylim(0, 1)
plt.show()

# ========================================
# SECTION 8: Common Pitfalls and How to Avoid Them
# ========================================

print("\n" + "=" * 60)
print("SECTION 8: Common Pitfalls and How to Avoid Them")
print("=" * 60)

# Pitfall 1: Data Leakage Example
print("⚠️ PITFALL 1: Data Leakage")
print("Example: Scaling before splitting (WRONG way)")

# Shared inputs for both variants of the demo.
y_wrong = sample_data['Target']
X_wrong = sample_data.drop(columns='Target')

# WRONG: the scaler sees every row, test rows included, before the split,
# so test-set statistics leak into the training features.
scaler_wrong = StandardScaler()
X_scaled_wrong = scaler_wrong.fit_transform(X_wrong)  # Uses ALL data for scaling

X_train_wrong, X_test_wrong, y_train_wrong, y_test_wrong = train_test_split(
    X_scaled_wrong,
    y_wrong,
    test_size=0.2,
    random_state=42,
)

print("❌ WRONG: Scaled entire dataset before splitting")
print(f"Training set mean: {X_train_wrong.mean():.4f}")
print(f"Test set mean: {X_test_wrong.mean():.4f}")

# CORRECT: split first, then fit the scaler on the training rows only and
# reuse those statistics to transform the test rows.
print("\n✅ CORRECT: Scale only training data")
X_train_correct, X_test_correct, y_train_correct, y_test_correct = train_test_split(
    X_wrong,
    y_wrong,
    test_size=0.2,
    random_state=42,
)

scaler_correct = StandardScaler().fit(X_train_correct)  # Fit only on training
X_train_scaled_correct = scaler_correct.transform(X_train_correct)
X_test_scaled_correct = scaler_correct.transform(X_test_correct)  # Transform test using training stats

print(f"Training set mean: {X_train_scaled_correct.mean():.4f}")
print(f"Test set mean: {X_test_scaled_correct.mean():.4f}")

# Pitfall 2: Ignoring Class Imbalance
print("\n⚠️ PITFALL 2: Ignoring Class Imbalance")

# Build a cohort in which the disease is very rare (like in real life).
#
# The positives are thinned to a fixed COUNT instead of row by row with
# probability 0.05. Row-wise thinning of an already small positive class can
# leave zero or one positive, and a stratified split raises ValueError when a
# class has fewer than two members. A fixed count with a floor keeps the
# label rare while guaranteeing both splits can contain positives.
imbalanced_data = create_medical_dataset(1000)
n_rows = len(imbalanced_data)  # derived, so the cohort size is stated only once
positive_rows = np.flatnonzero(imbalanced_data['Disease'].to_numpy() == 1)

# Keep about 5% of the original positives, but never fewer than the floor
# (and never more than actually exist).
min_rare_positives = 10
n_rare = min(len(positive_rows), max(min_rare_positives, int(round(0.05 * len(positive_rows)))))
kept_rows = np.random.choice(positive_rows, size=n_rare, replace=False)

# Rebuild the label as 0/1 integers, matching the other datasets in this
# notebook (a boolean mask would silently change the column dtype).
rare_disease = np.zeros(n_rows, dtype=int)
rare_disease[kept_rows] = 1
imbalanced_data['Disease'] = rare_disease

print(f"Imbalanced dataset - Disease prevalence: {imbalanced_data['Disease'].mean():.1%}")

X_imb = imbalanced_data.drop('Disease', axis=1)
y_imb = imbalanced_data['Disease']

# WRONG: Random split without stratification — the few positives land in
# train or test by chance, so the two disease rates can differ noticeably.
X_train_imb_wrong, X_test_imb_wrong, y_train_imb_wrong, y_test_imb_wrong = train_test_split(
    X_imb, y_imb, test_size=0.2, random_state=42
)

print(f"❌ WRONG (no stratification):")
print(f"Training disease rate: {y_train_imb_wrong.mean():.1%}")
print(f"Test disease rate: {y_test_imb_wrong.mean():.1%}")

# CORRECT: Use stratified split so both parts keep the same disease rate.
X_train_imb_correct, X_test_imb_correct, y_train_imb_correct, y_test_imb_correct = train_test_split(
    X_imb, y_imb, test_size=0.2, random_state=42, stratify=y_imb
)

print(f"✅ CORRECT (with stratification):")
print(f"Training disease rate: {y_train_imb_correct.mean():.1%}")
print(f"Test disease rate: {y_test_imb_correct.mean():.1%}")

# Pitfall 3: Wrong splitting for time series
print("\n⚠️ PITFALL 3: Random Splitting for Time Series")

# A 100-day random walk around 100 serves as a minimal time series.
dates = pd.date_range('2020-01-01', periods=100, freq='D')
random_walk = np.cumsum(np.random.randn(100)) + 100  # Random walk
time_series_data = pd.DataFrame({'Date': dates, 'Value': random_walk})
series_dates = time_series_data['Date']

# WRONG: shuffle the row positions, so train and test cover overlapping
# date ranges and the model effectively sees the future.
indices = np.arange(len(time_series_data))
train_idx_wrong, test_idx_wrong = train_test_split(indices, test_size=0.2, random_state=42)
train_dates_wrong = series_dates.iloc[train_idx_wrong]
test_dates_wrong = series_dates.iloc[test_idx_wrong]

print("❌ WRONG: Random split for time series")
print(f"Training dates range: {train_dates_wrong.min()} to {train_dates_wrong.max()}")
print(f"Test dates range: {test_dates_wrong.min()} to {test_dates_wrong.max()}")

# CORRECT: the first 80% of rows train, the last 20% test, so every test
# date lies after every training date.
split_point = int(0.8 * len(time_series_data))
train_idx_correct = np.arange(split_point)
test_idx_correct = np.arange(split_point, len(time_series_data))
train_dates_correct = series_dates.iloc[train_idx_correct]
test_dates_correct = series_dates.iloc[test_idx_correct]

print("✅ CORRECT: Temporal split")
print(f"Training dates range: {train_dates_correct.min()} to {train_dates_correct.max()}")
print(f"Test dates range: {test_dates_correct.min()} to {test_dates_correct.max()}")

# ========================================
# SECTION 9: Best Practices Summary
# ========================================

print("\n" + "=" * 60)
print("SECTION 9: Best Practices Summary")
print("=" * 60)

# Each checklist is kept as a tuple of ready-to-print lines and emitted with
# one newline-joined print, which produces the same output as one print per line.
best_practices = (
    "✅ BEST PRACTICES CHECKLIST:",
    "1. ✓ Always maintain strict separation between train and test sets",
    "2. ✓ Use stratified splitting for imbalanced datasets",
    "3. ✓ Use temporal splitting for time series data",
    "4. ✓ Scale/normalize data AFTER splitting, not before",
    "5. ✓ Ensure test set represents real-world conditions",
    "6. ✓ Use cross-validation for robust performance estimates",
    "7. ✓ Check for data leakage and temporal dependencies",
    "8. ✓ Document your splitting strategy and rationale",
    "9. ✓ Monitor for distribution shifts between train and test",
    "10. ✓ Reserve test set for final evaluation only",
)
print("\n".join(best_practices))

common_mistakes = (
    "\n❌ COMMON MISTAKES TO AVOID:",
    "1. ✗ Using test set for model selection or hyperparameter tuning",
    "2. ✗ Preprocessing entire dataset before splitting",
    "3. ✗ Random splitting for time series or sequential data",
    "4. ✗ Ignoring class imbalance in splitting",
    "5. ✗ Including duplicate or highly similar samples in both sets",
    "6. ✗ Not checking if test set is representative",
    "7. ✗ Using too small test sets (less than 15-20%)",
    "8. ✗ Not documenting the splitting process",
)
print("\n".join(common_mistakes))

healthcare_notes = (
    "\n🎯 DOMAIN-SPECIFIC CONSIDERATIONS:",
    "Healthcare:",
    "- Ensure patient data doesn't leak between sets",
    "- Consider temporal aspects of medical data",
    "- Account for rare diseases in stratification",
    "- Validate across different hospitals/demographics",
)
print("\n".join(healthcare_notes))

finance_notes = (
    "\nFinance:",
    "- Always use temporal splitting for market data",
    "- Account for economic regime changes",
    "- Consider transaction-level vs customer-level splitting",
    "- Be aware of survivorship bias in historical data",
)
print("\n".join(finance_notes))

# Closing banner.
closing_banner = (
    "\n" + "=" * 60,
    "🎉 CONGRATULATIONS!",
    "You've completed the comprehensive guide to training and test datasets!",
    "Remember: Proper dataset splitting is the foundation of trustworthy AI.",
    "=" * 60,
)
print("\n".join(closing_banner))

# Final demonstration: Quick model comparison
print("\n📊 FINAL DEMONSTRATION: Model Comparison")

# Hold-out accuracy of the three models trained earlier, keyed by domain.
models_performance = {
    'Medical Diagnosis': accuracy_med,
    'Credit Risk': accuracy_credit,
    'Stock Prediction': accuracy_stock,
}
domains = list(models_performance)
accuracies = list(models_performance.values())

# One bar per domain on a fixed 0-1 accuracy axis.
plt.figure(figsize=(10, 6))
bars = plt.bar(domains, accuracies, color=['lightblue', 'lightgreen', 'lightcoral'])
plt.title('Model Performance Across Different Domains')
plt.ylabel('Accuracy')
plt.ylim(0, 1)

# Write each accuracy just above its bar, horizontally centered.
for bar, acc in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height() + 0.01
    plt.text(label_x, label_y, f'{acc:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

print("\nModel Performance Summary:")
for domain, accuracy in models_performance.items():
    print(f"{domain}: {accuracy:.3f}")

next_steps = (
    "\n🚀 Next Steps:",
    "1. Practice with your own datasets",
    "2. Experiment with different splitting strategies",
    "3. Try cross-validation on various problems",
    "4. Always validate your splits make sense for your domain",
    "5. Remember: Good data splitting is the key to reliable AI!",
)
print("\n".join(next_steps))