In [None]:
# Feature Selection in AI - Practical Examples
# Google Colab Notebook
# This notebook demonstrates feature selection techniques with real-world examples

# First, let's install and import the necessary libraries
!pip install pandas numpy scikit-learn matplotlib seaborn plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif, RFE, SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

print("✅ All libraries imported successfully!")
print("📚 This notebook will guide you through feature selection step-by-step")

# ==============================================================================
# SECTION 1: Understanding Features with a Simple Example
# ==============================================================================

section_rule = "=" * 60
print(f"\n{section_rule}")
print("SECTION 1: Understanding What Features Are")
print(section_rule)

# Toy scenario: predict whether a customer buys a product.
# The global NumPy seed is fixed, and the draws below happen in a fixed order,
# so the generated dataset is the same on every run.
np.random.seed(42)
n_customers = 1000

# Candidate features, one value per customer
age = np.random.normal(loc=35, scale=12, size=n_customers)
income = np.random.normal(loc=50000, scale=15000, size=n_customers)
website_visits = np.random.poisson(lam=5, size=n_customers)
previous_purchases = np.random.poisson(lam=2, size=n_customers)
random_noise = np.random.normal(loc=0, scale=1, size=n_customers)  # not used in the target below

# Purchase score: a weighted sum of age, income and purchase history plus a
# little Gaussian jitter. website_visits and random_noise do not enter it.
age_term = 0.3 * (age - 20) / 20                    # older -> higher score
income_term = 0.4 * (income - 30000) / 30000        # richer -> higher score
loyalty_term = 0.3 * previous_purchases / 5         # repeat buyers -> higher score
jitter_term = 0.1 * np.random.normal(loc=0, scale=1, size=n_customers)
purchase_probability = age_term + income_term + loyalty_term + jitter_term

# Threshold the score at 0.5 to obtain the binary target
will_purchase = (purchase_probability > 0.5).astype(int)

# Assemble features and target into a single table
customer_columns = {
    'age': age,
    'income': income,
    'website_visits': website_visits,
    'previous_purchases': previous_purchases,
    'random_noise': random_noise,
    'will_purchase': will_purchase,
}
customer_data = pd.DataFrame(customer_columns)

# Quick look at the result
purchase_rate = customer_data['will_purchase'].mean()
print("📊 Sample Customer Data:")
print(customer_data.head())
print("\n📈 Dataset shape: {}".format(customer_data.shape))
print("📈 Purchase rate: {:.2%}".format(purchase_rate))

# ==============================================================================
# SECTION 2: Visualizing Feature Relationships
# ==============================================================================

print("\n" + "="*60)
print("SECTION 2: Visualizing How Features Relate to Our Target")
print("="*60)

# One box plot per feature, split by the purchase outcome, laid out on a 2x3 grid
features = ['age', 'income', 'website_visits', 'previous_purchases', 'random_noise']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
flat_axes = axes.ravel()

for ax, feature in zip(flat_axes, features):
    # Distribution of this feature for buyers vs. non-buyers
    customer_data.boxplot(column=feature, by='will_purchase', ax=ax)
    ax.set_title(f'{feature.title()} vs Purchase Decision')
    ax.set_xlabel('Will Purchase (0=No, 1=Yes)')

# Remove every grid cell that did not receive a feature
for unused_ax in flat_axes[len(features):]:
    fig.delaxes(unused_ax)

# DataFrame.boxplot(by=...) overwrites the figure-level title with its own
# "Boxplot grouped by ..." text on each call, so our title must be set AFTER
# the plotting loop or it is lost.
fig.suptitle('Feature Relationships with Purchase Decision', fontsize=16)

plt.tight_layout()
plt.show()

# Pearson correlation of every column with the target, strongest positive first
correlations = customer_data.corr()['will_purchase'].sort_values(ascending=False)
print("\n📊 Feature Correlations with Purchase Decision:")
# The target's correlation with itself (1.0) is not informative, so skip it
for feature, corr in correlations.drop('will_purchase').items():
    print(f"{feature:20s}: {corr:6.3f}")

# ==============================================================================
# SECTION 3: Manual Feature Selection Based on Domain Knowledge
# ==============================================================================

section_rule = "=" * 60
print(f"\n{section_rule}")
print("SECTION 3: Manual Feature Selection (Domain Expert Approach)")
print(section_rule)

# Play the role of a domain expert and pick features by business reasoning:
#   - age:                older customers may have more disposable income
#   - income:             more income means more purchasing power
#   - previous_purchases: a proxy for loyalty / satisfaction
#   - website_visits:     a proxy for engagement (might matter)
#   - random_noise:       deliberately left out
manually_selected_features = ['age', 'income', 'previous_purchases', 'website_visits']
print("🧠 Domain Expert Selection:")
print(f"✅ Selected features: {manually_selected_features}")
print("❌ Excluded features: random_noise (obviously irrelevant)")

# Feature matrix restricted to the hand-picked columns, plus the target
X_manual = customer_data.loc[:, manually_selected_features]
y = customer_data['will_purchase']

# 70/30 train/test split with a fixed seed
manual_split = train_test_split(X_manual, y, test_size=0.3, random_state=42)
X_train_manual, X_test_manual, y_train, y_test = manual_split

# Fit a random forest on the hand-picked features (fit() returns the estimator)
manual_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_manual, y_train
)

# Score the model on the held-out rows
manual_predictions = manual_model.predict(X_test_manual)
manual_accuracy = accuracy_score(y_true=y_test, y_pred=manual_predictions)

print("\n📊 Manual Selection Results:")
print("Accuracy: {:.3f}".format(manual_accuracy))
print("Features used: {}".format(len(manually_selected_features)))

# ==============================================================================
# SECTION 4: Statistical Feature Selection Methods
# ==============================================================================

print("\n" + "="*60)
print("SECTION 4: Statistical Feature Selection Methods")
print("="*60)

# Prepare all features for statistical selection
all_features = ['age', 'income', 'website_visits', 'previous_purchases', 'random_noise']
X_all = customer_data[all_features]

# Fit every selector on the TRAINING rows only (the rows of the Section 3
# split). Fitting on all rows would let the held-out rows that Section 5
# scores on influence which features are chosen, i.e. data leakage.
X_all_train = X_all.loc[y_train.index]

# Method 1: SelectKBest (Univariate Selection)
print("🔍 Method 1: SelectKBest (Univariate Statistical Tests)")
print("This method selects features based on statistical tests")

selector_kbest = SelectKBest(score_func=f_classif, k=3)
selector_kbest.fit(X_all_train, y_train)
# transform() only picks columns, so applying it to every row leaks nothing
X_selected_kbest = selector_kbest.transform(X_all)

# Get selected feature names
selected_features_kbest = [all_features[i] for i in selector_kbest.get_support(indices=True)]
feature_scores = selector_kbest.scores_

print("\n📊 Feature Scores (higher is better):")
for feature, score in zip(all_features, feature_scores):
    selected = "✅" if feature in selected_features_kbest else "❌"
    print(f"{selected} {feature:20s}: {score:8.2f}")

# Method 2: Recursive Feature Elimination (RFE)
print("\n🔍 Method 2: Recursive Feature Elimination (RFE)")
print("This method recursively removes features and builds models")

# RFE drops the feature with the smallest |coefficient| at each step.
# Coefficient size depends on the feature's units (income is ~50,000 while
# random_noise is ~1), so the features are standardized first; otherwise the
# ranking would reflect scale rather than relevance.
rfe_scaler = StandardScaler().fit(X_all_train)
X_all_train_scaled = pd.DataFrame(
    rfe_scaler.transform(X_all_train),
    columns=all_features,
    index=X_all_train.index,
)

estimator = LogisticRegression(random_state=42)
selector_rfe = RFE(estimator, n_features_to_select=3)
selector_rfe.fit(X_all_train_scaled, y_train)
# Column selection on the original (unscaled) table, all rows
X_selected_rfe = selector_rfe.transform(X_all)

selected_features_rfe = [all_features[i] for i in selector_rfe.get_support(indices=True)]
feature_rankings = selector_rfe.ranking_

print("\n📊 Feature Rankings (1 is best):")
for feature, rank in zip(all_features, feature_rankings):
    selected = "✅" if feature in selected_features_rfe else "❌"
    print(f"{selected} {feature:20s}: Rank {rank}")

# Method 3: Model-Based Selection (Feature Importance)
print("\n🔍 Method 3: Model-Based Selection (Feature Importance)")
print("This method uses a model to determine feature importance")

selector_model = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
selector_model.fit(X_all_train, y_train)
X_selected_model = selector_model.transform(X_all)

selected_features_model = [all_features[i] for i in selector_model.get_support(indices=True)]
# estimator_ is the fitted random forest held by the selector
feature_importances = selector_model.estimator_.feature_importances_

print("\n📊 Feature Importance (higher is better):")
for feature, importance in zip(all_features, feature_importances):
    selected = "✅" if feature in selected_features_model else "❌"
    print(f"{selected} {feature:20s}: {importance:.4f}")

# ==============================================================================
# SECTION 5: Comparing Feature Selection Methods
# ==============================================================================

print("\n" + "="*60)
print("SECTION 5: Comparing Different Feature Selection Methods")
print("="*60)

# Feature lists produced by each approach, keyed by a display name
methods = {
    'Manual Selection': manually_selected_features,
    'SelectKBest': selected_features_kbest,
    'RFE': selected_features_rfe,
    'Model-Based': selected_features_model
}

print("🔍 Comparison of Feature Selection Methods:")
for method_name, selected_features in methods.items():
    print(f"\n{method_name}:")
    print(f"  Selected: {selected_features}")
    print(f"  Count: {len(selected_features)}")

# Train and score one random forest per feature list.
# results maps method name -> {'accuracy', 'num_features', 'features'}.
results = {}

for method_name, selected_features in methods.items():
    # Split features AND labels in the same call so the two are guaranteed to
    # be aligned, instead of relying on labels left over from an earlier split.
    X_method = customer_data[selected_features]
    X_train_method, X_test_method, y_train_method, y_test_method = train_test_split(
        X_method, y, test_size=0.3, random_state=42
    )

    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_method, y_train_method)

    # Evaluate on the held-out rows
    predictions = model.predict(X_test_method)
    accuracy = accuracy_score(y_test_method, predictions)

    results[method_name] = {
        'accuracy': accuracy,
        'num_features': len(selected_features),
        'features': selected_features
    }

# Fixed-width summary table
print("\n📊 Performance Comparison:")
print(f"{'Method':<20} {'Accuracy':<10} {'# Features':<12} {'Features'}")
print("-" * 70)
for method_name, result in results.items():
    print(f"{method_name:<20} {result['accuracy']:.3f}     {result['num_features']:<12} {result['features']}")

# ==============================================================================
# SECTION 6: Real-World Healthcare Example
# ==============================================================================

print("\n" + "="*60)
print("SECTION 6: Real-World Example - Healthcare (Breast Cancer Detection)")
print("="*60)

# Load scikit-learn's bundled breast cancer dataset
cancer_data = load_breast_cancer()
X_cancer = pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names)
y_cancer = cancer_data.target

print("📊 Breast Cancer Dataset:")
print(f"Samples: {X_cancer.shape[0]}")
print(f"Features: {X_cancer.shape[1]}")
print(f"Target: {cancer_data.target_names}")

# Show first few feature names
print("\n🔍 First 10 features:")
for i, feature in enumerate(X_cancer.columns[:10]):
    print(f"  {i+1:2d}. {feature}")

# The dataset has many features, so feature selection is used to keep only
# the highest-scoring ones.
n_medical_features = 10

# Split BEFORE selecting features. The selector is then fit on the training
# rows only, so the test rows cannot influence which features are kept
# (fitting it on the full dataset would leak test information).
X_train_all, X_test_all, y_train_cancer, y_test_cancer = train_test_split(
    X_cancer, y_cancer, test_size=0.3, random_state=42
)

print("\n🏥 Applying Feature Selection for Medical Diagnosis:")

# Univariate ANOVA F-test, keeping the top n_medical_features columns
selector_medical = SelectKBest(score_func=f_classif, k=n_medical_features)
X_train_selected = selector_medical.fit_transform(X_train_all, y_train_cancer)
X_test_selected = selector_medical.transform(X_test_all)
# Same column subset applied to the full table (column selection only)
X_cancer_selected = selector_medical.transform(X_cancer)

# Get selected feature names and scores
selected_indices = selector_medical.get_support(indices=True)
selected_features_medical = [X_cancer.columns[i] for i in selected_indices]
medical_scores = selector_medical.scores_

print(f"\n📊 Top {n_medical_features} Most Important Features for Cancer Detection:")
feature_score_pairs = [(X_cancer.columns[i], medical_scores[i]) for i in selected_indices]
feature_score_pairs.sort(key=lambda x: x[1], reverse=True)

for i, (feature, score) in enumerate(feature_score_pairs):
    print(f"  {i+1:2d}. {feature:<25} Score: {score:8.2f}")

# Compare model performance with all features vs selected features
model_all_features = RandomForestClassifier(n_estimators=100, random_state=42)
model_selected_features = RandomForestClassifier(n_estimators=100, random_state=42)

model_all_features.fit(X_train_all, y_train_cancer)
model_selected_features.fit(X_train_selected, y_train_cancer)

# Evaluate both models on the same held-out rows
pred_all = model_all_features.predict(X_test_all)
pred_selected = model_selected_features.predict(X_test_selected)

acc_all = accuracy_score(y_test_cancer, pred_all)
acc_selected = accuracy_score(y_test_cancer, pred_selected)

n_total_features = X_cancer.shape[1]
n_kept_features = len(selected_features_medical)

print("\n📊 Medical Diagnosis Results:")
print(f"All features ({n_total_features} features):      Accuracy = {acc_all:.3f}")
print(f"Selected features ({n_kept_features} features):    Accuracy = {acc_selected:.3f}")
print(f"Feature reduction: {n_total_features} → {n_kept_features} features ({n_total_features - n_kept_features} fewer)")

# ==============================================================================
# SECTION 7: Real-World Finance Example
# ==============================================================================

print("\n" + "="*60)
print("SECTION 7: Real-World Example - Finance (Credit Risk Assessment)")
print("="*60)

# Build a synthetic credit risk dataset
print("💳 Creating Credit Risk Assessment Dataset...")

# Fixed seed and fixed draw order keep the dataset identical across runs
np.random.seed(42)
n_applicants = 2000

# Financial features
credit_score = np.random.normal(650, 100, n_applicants)
annual_income = np.random.lognormal(np.log(50000), 0.5, n_applicants)
debt_to_income = np.random.beta(2, 5, n_applicants)  # right-skewed: most ratios are low
employment_years = np.random.exponential(5, n_applicants)
num_credit_cards = np.random.poisson(3, n_applicants)
# About 60% of ONE MONTH's income (annual income / 12) with multiplicative
# noise. Without the division by 12 the column would hold annual spending
# under a "monthly" name.
monthly_expenses = (annual_income / 12) * 0.6 * np.random.normal(1, 0.2, n_applicants)
savings_balance = np.random.exponential(10000, n_applicants)

# Irrelevant features: drawn independently and not used in the default score
favorite_color = np.random.choice(['red', 'blue', 'green', 'yellow'], n_applicants)
birth_month = np.random.randint(1, 13, n_applicants)
lucky_number = np.random.randint(1, 101, n_applicants)

# Default score: weighted sum of clipped risk terms plus a random component.
# np.maximum(0, ...) means each term only contributes on its risky side.
default_probability = (
    0.3 * np.maximum(0, (700 - credit_score) / 200) +  # Lower credit score = higher risk
    0.2 * np.maximum(0, (debt_to_income - 0.4) / 0.4) +  # High debt ratio = higher risk
    0.2 * np.maximum(0, (40000 - annual_income) / 40000) +  # Low income = higher risk
    0.1 * np.maximum(0, (2 - employment_years) / 2) +  # Short employment = higher risk
    0.2 * np.random.beta(2, 8, n_applicants)  # Random component
)

# Threshold the score at 0.4 to obtain the binary target
will_default = (default_probability > 0.4).astype(int)

# Assemble features and target; favorite_color is stored as integer category codes
credit_data = pd.DataFrame({
    'credit_score': credit_score,
    'annual_income': annual_income,
    'debt_to_income_ratio': debt_to_income,
    'employment_years': employment_years,
    'num_credit_cards': num_credit_cards,
    'monthly_expenses': monthly_expenses,
    'savings_balance': savings_balance,
    'favorite_color_encoded': pd.Categorical(favorite_color).codes,
    'birth_month': birth_month,
    'lucky_number': lucky_number,
    'will_default': will_default
})

print("📊 Credit Risk Dataset:")
print(f"Applicants: {credit_data.shape[0]}")
print(f"Features: {credit_data.shape[1] - 1}")  # every column except the target
print(f"Default rate: {credit_data['will_default'].mean():.2%}")

# Apply feature selection for credit risk
finance_features = [col for col in credit_data.columns if col != 'will_default']
X_finance = credit_data[finance_features]
y_finance = credit_data['will_default']

# Split FIRST. Both the correlation ranking and the SelectKBest scores below
# are computed on the training rows only, so the test rows cannot influence
# which features are chosen (no selection leakage).
X_train_finance_all, X_test_finance_all, y_train_finance, y_test_finance = train_test_split(
    X_finance, y_finance, test_size=0.3, random_state=42
)

# Use multiple feature selection methods
print("\n💰 Applying Feature Selection for Credit Risk:")

n_top_finance = 5

# Method 1: Correlation-based selection
# corrwith() correlates each feature column with the target directly, so the
# target never appears in the ranking and does not have to be removed again.
correlations_finance = (
    X_train_finance_all.corrwith(y_train_finance).abs().sort_values(ascending=False)
)
top_correlated = correlations_finance.head(n_top_finance).index.tolist()

print(f"📊 Top {n_top_finance} Features by Correlation with Default:")
for i, feature in enumerate(top_correlated, 1):
    corr = correlations_finance[feature]
    print(f"  {i}. {feature:<25} Correlation: {corr:.3f}")

# Method 2: Statistical significance (univariate ANOVA F-test)
selector_finance = SelectKBest(score_func=f_classif, k=n_top_finance)
selector_finance.fit(X_train_finance_all, y_train_finance)
# Same column subset applied to the full table (column selection only)
X_finance_selected = selector_finance.transform(X_finance)

selected_indices_finance = selector_finance.get_support(indices=True)
selected_features_finance = [finance_features[i] for i in selected_indices_finance]
finance_scores = selector_finance.scores_

print(f"\n📊 Top {n_top_finance} Features by Statistical Significance:")
feature_score_pairs_finance = [(finance_features[i], finance_scores[i]) for i in selected_indices_finance]
feature_score_pairs_finance.sort(key=lambda x: x[1], reverse=True)

for i, (feature, score) in enumerate(feature_score_pairs_finance, 1):
    print(f"  {i}. {feature:<25} Score: {score:8.2f}")

# Train and evaluate a model on the selected columns of the existing split
X_train_finance = X_train_finance_all[selected_features_finance]
X_test_finance = X_test_finance_all[selected_features_finance]

finance_model = RandomForestClassifier(n_estimators=100, random_state=42)
finance_model.fit(X_train_finance, y_train_finance)

finance_predictions = finance_model.predict(X_test_finance)
finance_accuracy = accuracy_score(y_test_finance, finance_predictions)

print("\n📊 Credit Risk Model Results:")
print(f"Selected features: {selected_features_finance}")
print(f"Accuracy: {finance_accuracy:.3f}")

# Show feature importance (one value per selected feature, in the same order)
feature_importance_finance = finance_model.feature_importances_
print("\n📊 Feature Importance in Final Model:")
for feature, importance in zip(selected_features_finance, feature_importance_finance):
    print(f"  {feature:<25} Importance: {importance:.3f}")

# ==============================================================================
# SECTION 8: Practical Tips and Best Practices
# ==============================================================================

# This section only prints text. Each tuple entry is one output line and an
# empty string produces a blank line.
section_rule = "=" * 60
print(f"\n{section_rule}")
print("SECTION 8: Practical Tips and Best Practices")
print(section_rule)

takeaway_lines = (
    "💡 KEY TAKEAWAYS FROM OUR EXAMPLES:",
    "",
    "1. 🎯 DOMAIN KNOWLEDGE IS CRUCIAL",
    "   - Our manual selection often performed as well as statistical methods",
    "   - Subject matter experts can identify relevant features that statistics might miss",
    "   - Always validate statistical results with domain expertise",
    "",
    "2. 📊 DIFFERENT METHODS COMPLEMENT EACH OTHER",
    "   - Statistical methods (SelectKBest, RFE) find data patterns",
    "   - Model-based methods find features that work well together",
    "   - Correlation analysis reveals direct relationships",
    "",
    "3. 🔍 QUALITY OVER QUANTITY",
    "   - 10 good features often beat 100 mediocre ones",
    "   - Our cancer detection worked well with just 10 out of 30 features",
    "   - Fewer features = faster training, easier interpretation",
    "",
    "4. 🏥 REAL-WORLD CONSTRAINTS MATTER",
    "   - Healthcare: Features must be clinically meaningful",
    "   - Finance: Features must be legally compliant and ethically sound",
    "   - Consider data collection costs and availability",
    "",
    "5. 🔄 ITERATION IS KEY",
    "   - Feature selection is not a one-time process",
    "   - Test different combinations and methods",
    "   - Monitor performance over time and adjust as needed",
)
print(*takeaway_lines, sep="\n")

checklist_lines = (
    "\n🛠️ PRACTICAL IMPLEMENTATION CHECKLIST:",
    "",
    "Before Starting:",
    "✅ Understand your problem domain thoroughly",
    "✅ Consult with subject matter experts",
    "✅ Understand data collection constraints",
    "",
    "During Feature Selection:",
    "✅ Start with domain knowledge",
    "✅ Apply multiple statistical methods",
    "✅ Check for data leakage and unrealistic features",
    "✅ Consider feature interactions, not just individual features",
    "",
    "After Feature Selection:",
    "✅ Validate results with domain experts",
    "✅ Test on completely new data",
    "✅ Monitor performance over time",
    "✅ Document your decisions and rationale",
)
print(*checklist_lines, sep="\n")

# ==============================================================================
# SECTION 9: Interactive Exercise
# ==============================================================================

print("\n" + "="*60)
print("SECTION 9: Try It Yourself!")
print("="*60)

print("🎯 EXERCISE: Create Your Own Feature Selection Pipeline")
print()
print("Below is a template you can modify to try feature selection on your own data:")
print()

# Template code for users to modify. It is kept as a string and printed, not
# executed here. The pipeline exposes `k` as a real parameter so that the
# "adjust the k parameter" instruction printed below matches the code, and it
# uses X.corrwith(y) so the correlation step does not depend on y having a name.
template_code = '''
# TEMPLATE: Feature Selection Pipeline
# Modify this code to work with your own dataset

def feature_selection_pipeline(X, y, target_name="target", k=10):
    """
    Complete feature selection pipeline

    Parameters:
    X: DataFrame with features
    y: Target variable (Series aligned with the rows of X)
    target_name: Name of what you're predicting
    k: Maximum number of features to keep (capped at the number of columns)

    Returns:
    X_selected: array containing only the selected feature columns
    selected_features: names of the selected columns
    """

    print(f"🔍 Feature Selection for {target_name}")
    print(f"Dataset shape: {X.shape}")

    # Step 1: Basic statistics
    print("\\n📊 Basic Feature Statistics:")
    print(X.describe())

    # Step 2: Correlation analysis
    if len(X.columns) < 20:  # Only show if not too many features
        correlations = X.corrwith(y).sort_values(ascending=False)
        print(f"\\n📈 Correlations with {target_name}:")
        for feature, corr in correlations.items():
            print(f"  {feature:<20}: {corr:6.3f}")

    # Step 3: Statistical feature selection
    k = min(k, X.shape[1])  # Never ask for more features than exist
    selector = SelectKBest(score_func=f_classif, k=k)
    X_selected = selector.fit_transform(X, y)

    selected_features = [X.columns[i] for i in selector.get_support(indices=True)]

    print(f"\\n✅ Selected {len(selected_features)} features:")
    for feature in selected_features:
        print(f"  - {feature}")

    return X_selected, selected_features

# Example usage with our customer data:
X_example = customer_data[['age', 'income', 'website_visits', 'previous_purchases', 'random_noise']]
y_example = customer_data['will_purchase']

X_selected_example, selected_features_example = feature_selection_pipeline(
    X_example, y_example, "customer purchase", k=3
)
'''

print("📝 Here's the template code:")
print(template_code)

print("\n🎯 TO USE THIS TEMPLATE:")
print("1. Replace X_example and y_example with your own data")
print("2. Modify the target_name parameter")
print("3. Adjust the number of features to select (k parameter)")
print("4. Run the pipeline and analyze results")

print("\n🏁 FINAL THOUGHTS:")
print("Feature selection is both an art and a science. The best approach combines:")
print("- Domain expertise and business understanding")
print("- Statistical analysis and data-driven insights")
print("- Practical constraints and real-world considerations")
print("- Iterative testing and continuous improvement")
print()
print("Remember: The goal is not just to build accurate models, but to build")
print("models that are interpretable, maintainable, and useful in practice!")

print("\n✨ Congratulations! You've completed the feature selection tutorial!")
print("You now have the knowledge and tools to select features effectively for your AI projects.")