In [None]:
# Enhanced Random Forest Predictor with Log Transformation
class LogTransformRandomForestPredictor(RandomForestLossPredictor):
    """
    Random Forest predictor with an optional log transformation of the target.

    With ``use_log_transform=True``:
    - ``prepare_data`` applies np.log1p() to the target before splitting
    - ``predict`` and ``evaluate_model`` apply np.expm1() to the model output
      so that results are reported on the original scale

    ``self.pipeline`` and ``self.random_state`` are read but never assigned in
    this class; presumably they are set by the parent class / ``train_model()``
    -- confirm against RandomForestLossPredictor.
    """

    def __init__(self, n_estimators=100, max_depth=10, random_state=42, use_log_transform=True):
        """
        Parameters:
        n_estimators, max_depth, random_state: forwarded positionally to the parent constructor
        use_log_transform: bool, train on log1p(target) and invert predictions with expm1
        """
        super().__init__(n_estimators, max_depth, random_state)
        self.use_log_transform = use_log_transform

    def prepare_data(self, df, target_col, test_size=0.2):
        """
        Prepare data with optional log transformation of target

        Parameters:
        df: pandas DataFrame containing features and the target column
        target_col: string, name of target column
        test_size: float, fraction of rows held out for testing

        Returns:
        (X_train, X_test, y_train, y_test); the y values are log1p-transformed
        when ``use_log_transform`` is True. The splits, the (unfitted)
        preprocessor and the detected column lists are also stored on ``self``.

        Raises:
        ValueError: if log transformation is enabled and the target contains
        values <= -1, for which log1p is not finite.
        """
        print("üîÑ Preparing data with log transformation...")

        # Separate features and target
        X = df.drop(columns=[target_col])
        y = df[target_col].copy()

        # Apply log transformation to target if enabled
        if self.use_log_transform:
            # log1p(y) = log(1 + y): zeros map to 0 without adding any constant,
            # but values <= -1 give -inf / NaN. Fail here with a clear message
            # instead of handing non-finite targets to the model later.
            if (y <= -1).any():
                raise ValueError(
                    f"Target column '{target_col}' contains values <= -1; "
                    "log1p is not defined for them. Clean the target or use "
                    "use_log_transform=False."
                )
            print("   ‚úÖ Applying log1p transformation to target variable")
            y = np.log1p(y)
            print(f"   Target range after log1p: [{y.min():.4f}, {y.max():.4f}]")

        # Identify column types
        numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

        print(f"Numerical features ({len(numerical_cols)}): {numerical_cols}")
        print(f"Categorical features ({len(categorical_cols)}): {categorical_cols}")

        # Create preprocessing pipeline (not fitted here)
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_cols),
                ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
            ])

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=self.random_state
        )

        print(f"Training set: {X_train.shape[0]} samples")
        print(f"Test set: {X_test.shape[0]} samples")

        # Store for later use
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.preprocessor = preprocessor
        self.numerical_cols = numerical_cols
        self.categorical_cols = categorical_cols

        return X_train, X_test, y_train, y_test

    def predict(self, new_data):
        """
        Make predictions on new data with inverse log transformation

        Returns predictions on the original target scale (expm1 of the model
        output when ``use_log_transform`` is True, the raw output otherwise).

        Raises:
        ValueError: if ``self.pipeline`` is None (model not trained).
        """
        if self.pipeline is None:
            raise ValueError("Model not trained yet! Call train_model() first.")

        # Get predictions from the model (in log space when the transform is on)
        log_predictions = self.pipeline.predict(new_data)

        # Apply inverse transformation if log transform was used
        if self.use_log_transform:
            # Apply expm1 to reverse the log1p transformation
            predictions = np.expm1(log_predictions)
            print("   ‚úÖ Applied expm1 inverse transformation to predictions")
        else:
            predictions = log_predictions

        return predictions

    def evaluate_model(self):
        """
        Evaluate model with proper inverse transformation for MAPE calculation

        Returns:
        dict with 'mape' (in percent), 'r2', 'rmse', 'mae', 'predictions' and
        'y_test_original', all computed on the original target scale.

        Raises:
        ValueError: if ``self.pipeline`` is None (model not trained).
        """
        # Same guard as predict(): report an untrained model explicitly rather
        # than failing with an AttributeError on None.
        if self.pipeline is None:
            raise ValueError("Model not trained yet! Call train_model() first.")

        print("üìä Evaluating model performance with inverse transformation...")

        # Get predictions in log space
        log_predictions = self.pipeline.predict(self.X_test)

        # Apply inverse transformation to get predictions in original scale
        if self.use_log_transform:
            y_pred = np.expm1(log_predictions)
            y_test_original = np.expm1(self.y_test)  # Convert test set back to original scale
            print("   ‚úÖ Applied expm1 inverse transformation for evaluation")
        else:
            y_pred = log_predictions
            y_test_original = self.y_test

        # Calculate MAPE on original scale
        mape = mean_absolute_percentage_error(y_test_original, y_pred) * 100

        # Calculate other metrics on original scale
        r2 = r2_score(y_test_original, y_pred)
        mse = mean_squared_error(y_test_original, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test_original, y_pred)

        print("Test Set Performance (Original Scale):")
        print(f"MAPE: {mape:.4f}% üéØ (Primary Metric)")
        print(f"R¬≤ Score: {r2:.4f}")
        print(f"RMSE: {rmse:.2f}")
        print(f"MAE: {mae:.2f}")

        # MAPE interpretation
        if mape < 10:
            print("üìà Excellent prediction accuracy (MAPE < 10%)")
        elif mape < 20:
            print("üìä Good prediction accuracy (MAPE < 20%)")
        elif mape < 50:
            print("‚ö†Ô∏è Reasonable prediction accuracy (MAPE < 50%)")
        else:
            print("‚ùå Poor prediction accuracy (MAPE > 50%)")

        # Also evaluate in log space for comparison
        if self.use_log_transform:
            log_r2 = r2_score(self.y_test, log_predictions)
            log_mse = mean_squared_error(self.y_test, log_predictions)
            print("\nLog Space Performance (for reference):")
            print(f"Log R¬≤ Score: {log_r2:.4f}")
            print(f"Log MSE: {log_mse:.4f}")

        return {
            'mape': mape,
            'r2': r2,
            'rmse': rmse,
            'mae': mae,
            'predictions': y_pred,
            'y_test_original': y_test_original
        }

# Confirmation banner printed once the class definition cell has run;
# one message per output line.
print(
    "‚úÖ LogTransformRandomForestPredictor class created!",
    "üìù Key features:",
    "  - Automatically applies np.log1p() to target during training",
    "  - Automatically applies np.expm1() to predictions",
    "  - Evaluates MAPE on original scale for meaningful results",
    "  - Handles zero values gracefully with log1p/expm1",
    sep="\n",
)

In [None]:
# Demonstration: Using Log Transform Predictor
# Runs the full flow in order: prepare data -> train -> evaluate ->
# feature importance -> sample predictions. Each step depends on the previous one.
# NOTE: `df_ml` is not defined in this part of the notebook; it must already
# exist and contain an 'estimated_loss' column.
print("üéØ TESTING LOG TRANSFORM RANDOM FOREST PREDICTOR")
print("=" * 60)

# Create log-transform predictor
log_rf_predictor = LogTransformRandomForestPredictor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    use_log_transform=True  # Enable log transformation
)

print("\n1Ô∏è‚É£ Data Preparation with Log Transform")
# With use_log_transform=True the returned y splits are log1p-transformed
X_train_log, X_test_log, y_train_log, y_test_log = log_rf_predictor.prepare_data(
    df_ml, 'estimated_loss', test_size=0.2
)

print("\n2Ô∏è‚É£ Model Training (on log-transformed target)")
# train_model() is not defined on LogTransformRandomForestPredictor itself;
# presumably inherited from RandomForestLossPredictor.
cv_scores = log_rf_predictor.train_model(cv_folds=5)

print("\n3Ô∏è‚É£ Model Evaluation (with automatic inverse transformation)")
results = log_rf_predictor.evaluate_model()

print("\n4Ô∏è‚É£ Feature Importance Analysis")
log_rf_predictor.plot_feature_importance(top_n=12)

print("\n5Ô∏è‚É£ Test Predictions with Inverse Transform")
# Test with some sample data
# NOTE(review): these are simply the first five rows of df_ml; because the
# train/test split is random, some of them may have been used for training.
test_sample = df_ml.drop(columns=['estimated_loss']).head(5)
predictions = log_rf_predictor.predict(test_sample)

print("Sample predictions (automatically inverse-transformed):")
original_values = df_ml['estimated_loss'].head(5).values
for i, (pred, actual) in enumerate(zip(predictions, original_values)):
    # Percentage error is reported as 0 when the actual value is not positive,
    # which avoids dividing by zero.
    error_pct = abs(pred - actual) / actual * 100 if actual > 0 else 0
    print(f"  Sample {i+1}: Predicted=${pred:.2f}, Actual=${actual:.2f}, Error={error_pct:.1f}%")

In [None]:
# Comparison: Log Transform vs Regular Predictor
# Trains the same predictor class twice on df_ml (with and without the log
# transform, identical hyper-parameters and seed) and prints a metric table.
print("‚öñÔ∏è COMPARING LOG TRANSFORM vs REGULAR PREDICTOR")
print("=" * 60)

# Test both approaches
print("\nüîÑ Testing Regular Predictor (no log transform)...")
regular_predictor = LogTransformRandomForestPredictor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    use_log_transform=False  # Disable log transformation
)

# Train regular predictor
regular_predictor.prepare_data(df_ml, 'estimated_loss', test_size=0.2)
regular_predictor.train_model(cv_folds=3)
regular_results = regular_predictor.evaluate_model()

print("\nüîÑ Testing Log Transform Predictor...")
log_predictor = LogTransformRandomForestPredictor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    use_log_transform=True  # Enable log transformation
)

# Train log predictor
log_predictor.prepare_data(df_ml, 'estimated_loss', test_size=0.2)
log_predictor.train_model(cv_folds=3)
log_results = log_predictor.evaluate_model()

print("\nüìä COMPARISON RESULTS")
print("=" * 40)
print(f"{'Metric':<15} {'Regular':<12} {'Log Transform':<15} {'Improvement'}")
print("-" * 55)

metrics = ['mape', 'r2', 'rmse', 'mae']
# Error metrics improve when they go down; R2 improves when it goes up.
lower_is_better = {'mape', 'rmse', 'mae'}
for metric in metrics:
    regular_val = regular_results[metric]
    log_val = log_results[metric]

    # Signed gain of the log-transform model over the regular one
    if metric in lower_is_better:
        gain = regular_val - log_val
    else:
        gain = log_val - regular_val

    # Relative improvement in percent. Dividing by abs(baseline) keeps the sign
    # meaningful when the baseline R2 is negative; a zero baseline is reported
    # as 0 for every metric instead of raising ZeroDivisionError.
    improvement = (gain / abs(regular_val)) * 100 if regular_val != 0 else 0
    improvement_str = f"{improvement:+.1f}%" if improvement != 0 else "0.0%"

    print(f"{metric.upper():<15} {regular_val:<12.3f} {log_val:<15.3f} {improvement_str}")

print("\nüí° LOG TRANSFORMATION BENEFITS:")
print("‚úÖ Reduces impact of outliers")
print("‚úÖ Handles skewed target distributions better")
print("‚úÖ Often improves prediction of small values")
print("‚úÖ Can reduce heteroscedasticity (non-constant variance)")
print("‚úÖ Makes multiplicative relationships additive")

In [None]:
# Quick Guide: Applying Log Transform to Existing Models
# This cell only prints instructional text; the code shown inside the
# triple-quoted string is displayed to the reader, not executed.
print("üìù QUICK GUIDE: ADDING LOG TRANSFORM TO EXISTING MODELS")
print("=" * 60)

print("""
üîÑ STEP-BY-STEP PROCESS:

1Ô∏è‚É£ PREPARE TARGET WITH LOG TRANSFORM:
   # During data preparation
   y_original = df['estimated_loss']
   y_log = np.log1p(y_original)  # Use log1p to handle zeros
   
2Ô∏è‚É£ TRAIN MODEL ON LOG-TRANSFORMED TARGET:
   # Train your model using y_log instead of y_original
   model.fit(X_train, y_log_train)
   
3Ô∏è‚É£ INVERSE TRANSFORM PREDICTIONS:
   # After getting predictions
   log_predictions = model.predict(X_test)
   final_predictions = np.expm1(log_predictions)  # Inverse transform
   
4Ô∏è‚É£ EVALUATE ON ORIGINAL SCALE:
   # Calculate MAPE on original scale
   y_test_original = np.expm1(y_log_test)  # Convert test back to original
   mape = mean_absolute_percentage_error(y_test_original, final_predictions)

üí° KEY POINTS:
‚úÖ Always use log1p/expm1 pair (handles zeros gracefully)
‚úÖ Train on log-transformed target
‚úÖ Apply expm1 to ALL predictions before evaluation
‚úÖ Evaluate metrics on original scale for meaningful interpretation
‚úÖ Log transformation often helps with skewed financial data
""")

# Heading for the helper functions defined next in this cell
print("\nüõ†Ô∏è UTILITY FUNCTIONS:")

def apply_log_transform_to_target(y):
    """Return log(1 + y) of the target, computed element-wise via np.log1p."""
    transformed = np.log1p(y)
    return transformed

def inverse_log_transform_predictions(log_predictions):
    """Map log1p-space values back to the original scale via np.expm1 (exp(x) - 1)."""
    restored = np.expm1(log_predictions)
    return restored

def evaluate_with_log_inverse(model, X_test, y_test_log, y_test_original=None):
    """
    Score a model trained on a log1p-transformed target, on the original scale.

    Parameters:
    model: object exposing predict(X_test), returning log-space predictions
    X_test: features passed to model.predict
    y_test_log: log-space test target; only used when y_test_original is None
    y_test_original: optional test target already on the original scale

    Returns:
    dict with 'mape' (in percent), 'r2', 'rmse', 'predictions_original'
    and 'predictions_log'.
    """
    # Predictions as produced by the model (log space) and mapped back
    predictions_log = model.predict(X_test)
    predictions_original = inverse_log_transform_predictions(predictions_log)

    # Recover the original-scale target from the log target when not supplied
    if y_test_original is None:
        actual = inverse_log_transform_predictions(y_test_log)
    else:
        actual = y_test_original

    # All metrics are computed on the original scale
    summary = {
        'mape': mean_absolute_percentage_error(actual, predictions_original) * 100,
        'r2': r2_score(actual, predictions_original),
        'rmse': np.sqrt(mean_squared_error(actual, predictions_original)),
    }
    summary['predictions_original'] = predictions_original
    summary['predictions_log'] = predictions_log
    return summary

# Confirmation listing of the helper functions defined in this cell
print(
    "‚úÖ Utility functions defined!",
    "   - apply_log_transform_to_target()",
    "   - inverse_log_transform_predictions()",
    "   - evaluate_with_log_inverse()",
    sep="\n",
)

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
import warnings
# Suppress all warnings for cleaner notebook output.
# NOTE(review): this is process-wide and also hides deprecation and numerical
# warnings (e.g. from log1p on invalid values) in every later cell.
warnings.filterwarnings('ignore')

# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
# Fill NaN values with 0.
# NOTE: fillna(0) is applied to the whole DataFrame, so missing values in
# non-numerical columns are replaced with 0 as well, not only numerical ones.
# NOTE(review): `df` must already exist when this cell runs; in this part of
# the notebook it is only created by a later cell.
df = df.fillna(0)
print("NaN values filled with 0")
print(f"Dataset shape after filling: {df.shape}")
print(f"Remaining NaN values: {df.isnull().sum().sum()}")

In [None]:
# Create sample dataset for demonstration
# The seed makes the draws reproducible; the generated values depend on the
# order of the np.random calls below, so do not reorder them.
np.random.seed(42)
n_samples = 1000

# Generate five independent features drawn from different distributions
data = {
    'feature_1': np.random.normal(0, 1, n_samples),
    'feature_2': np.random.normal(0, 1, n_samples),
    'feature_3': np.random.normal(0, 1, n_samples),
    'feature_4': np.random.exponential(2, n_samples),
    'feature_5': np.random.uniform(-5, 5, n_samples)
}

# Create target variable as a linear combination of the features plus Gaussian
# noise; the coefficients set how strongly each feature relates to the target
target = (2 * data['feature_1'] + 
         0.5 * data['feature_2'] + 
         -1.5 * data['feature_3'] + 
         0.1 * data['feature_4'] + 
         0.05 * data['feature_5'] + 
         np.random.normal(0, 0.5, n_samples))

data['target'] = target
# Rebinds the global `df` used by the analysis cells below
df = pd.DataFrame(data)

print("Sample dataset created:")
print(df.head())
print(f"\nDataset shape: {df.shape}")
print(f"\nData types:\n{df.dtypes}")

In [None]:
def analyze_numerical_relationships(df, target_col, exclude_cols=None):
    """
    Select the numerical feature columns to analyse against the target.

    Despite the name, no statistics are computed here: the function only
    picks the numeric columns, prints them and returns their names.

    Parameters:
    df: pandas DataFrame
    target_col: string, name of target column (never returned as a feature)
    exclude_cols: list, columns to exclude from analysis

    Returns:
    list of numerical column names, in DataFrame order
    """
    skipped = [] if exclude_cols is None else exclude_cols

    # Candidate columns: every column with a numeric dtype
    numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()

    # Keep everything except the target itself and explicitly skipped columns
    feature_cols = []
    for name in numeric_names:
        if name == target_col or name in skipped:
            continue
        feature_cols.append(name)

    print(f"Analyzing relationships between {len(feature_cols)} numerical features and target '{target_col}'")
    print(f"Features: {feature_cols}")

    return feature_cols

# Example usage
# `target_column` and `feature_columns` are module-level names reused by the
# plotting / analysis cells that follow.
target_column = 'target'
feature_columns = analyze_numerical_relationships(df, target_column)

In [None]:
# 1. CORRELATION ANALYSIS
def correlation_analysis(df, target_col, feature_cols):
    """
    Calculate and display correlation coefficients

    Parameters:
    df: pandas DataFrame
    target_col: string, name of target column
    feature_cols: list, numerical feature columns to correlate with the target

    Returns:
    DataFrame with one row per feature (Pearson / Spearman coefficients,
    their p-values and the absolute Pearson value), sorted by absolute
    Pearson correlation, strongest first. An empty ``feature_cols`` yields an
    empty DataFrame with the same columns.
    """
    # Fixed column layout so that an empty feature list still produces a
    # frame that can be sorted and printed (no KeyError on 'Abs_Pearson').
    result_columns = [
        'Feature',
        'Pearson_Correlation',
        'Pearson_p_value',
        'Spearman_Correlation',
        'Spearman_p_value',
        'Abs_Pearson',
    ]

    target_values = df[target_col]
    correlations = []

    for col in feature_cols:
        # Pearson correlation (linear relationships)
        pearson_corr, pearson_p = pearsonr(df[col], target_values)

        # Spearman correlation (monotonic relationships)
        spearman_corr, spearman_p = spearmanr(df[col], target_values)

        correlations.append({
            'Feature': col,
            'Pearson_Correlation': pearson_corr,
            'Pearson_p_value': pearson_p,
            'Spearman_Correlation': spearman_corr,
            'Spearman_p_value': spearman_p,
            'Abs_Pearson': abs(pearson_corr)
        })

    corr_df = pd.DataFrame(correlations, columns=result_columns)
    corr_df = corr_df.sort_values('Abs_Pearson', ascending=False)

    print("CORRELATION ANALYSIS:")
    print("=" * 50)
    print(corr_df.round(4))

    return corr_df

# Run correlation analysis
# `correlation_results` is reused later by plot_feature_importance()
correlation_results = correlation_analysis(df, target_column, feature_columns)

In [None]:
# 2. CORRELATION HEATMAP
def plot_correlation_heatmap(df, target_col, feature_cols):
    """
    Draw an annotated heatmap of the pairwise correlations between the
    features and the target.

    Only the cells below the diagonal are drawn; the diagonal and the upper
    triangle are masked.
    """
    # Pairwise correlations of the selected columns (target last)
    selected = feature_cols + [target_col]
    corr = df[selected].corr()

    # True marks cells to hide: the diagonal and everything above it
    hidden_cells = np.triu(np.ones_like(corr, dtype=bool))

    plt.figure(figsize=(10, 8))

    heatmap_options = {
        'mask': hidden_cells,
        'annot': True,
        'cmap': 'RdBu_r',
        'center': 0,
        'square': True,
        'fmt': '.3f',
        'cbar_kws': {"shrink": .8},
    }
    sns.heatmap(corr, **heatmap_options)

    plt.title(f'Correlation Matrix: Features vs {target_col}', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

# Example: heatmap for the sample dataset and the feature list selected above
plot_correlation_heatmap(df, target_column, feature_columns)

In [None]:
# 3. SCATTER PLOTS WITH REGRESSION LINES
def _zscore_inlier_mask(values, threshold):
    """Return a boolean mask of entries whose absolute z-score is below ``threshold``."""
    spread = values.std()
    if not spread > 0:
        # Zero (constant column) or undefined (single row) spread: z-scores
        # would all be NaN and every point would be discarded. Nothing can be
        # an outlier here, so keep all non-missing values.
        return values.notna()
    return ((values - values.mean()) / spread).abs() < threshold


def plot_scatter_relationships(df, target_col, feature_cols, cols_per_row=3, remove_outliers=True, outlier_threshold=3):
    """
    Create scatter plots (with a fitted regression line) for each feature vs target

    Parameters:
    df: pandas DataFrame
    target_col: string, name of target column
    feature_cols: list, feature columns to plot
    cols_per_row: int, number of subplots per row
    remove_outliers: bool, drop points whose z-score in the feature or in the
        target is at or above ``outlier_threshold`` before plotting
    outlier_threshold: z-score cut-off used for outlier removal
    """
    n_features = len(feature_cols)
    if n_features == 0:
        # plt.subplots cannot build a grid with zero rows
        print("No features to plot.")
        return

    n_rows = (n_features + cols_per_row - 1) // cols_per_row

    # squeeze=False always returns a 2-D array of axes, so one flattening step
    # covers every grid shape (single row, single column, single plot).
    _fig, axes = plt.subplots(n_rows, cols_per_row, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, feature_cols):
        # Work on a copy so the caller's data is never filtered
        df_plot = df[[col, target_col]].copy()
        outliers_removed = 0

        if remove_outliers:
            # Keep points where both feature and target are within threshold
            keep = (
                _zscore_inlier_mask(df_plot[col], outlier_threshold)
                & _zscore_inlier_mask(df_plot[target_col], outlier_threshold)
            )
            df_plot = df_plot[keep]

            outliers_removed = len(df) - len(df_plot)
            if outliers_removed > 0:
                print(f"Removed {outliers_removed} outliers from {col} vs {target_col} plot")

        # Scatter plot
        ax.scatter(df_plot[col], df_plot[target_col], alpha=0.6, s=20)

        # Add regression line (points are already drawn above)
        sns.regplot(data=df_plot, x=col, y=target_col, ax=ax, scatter=False, color='red')

        # Correlation shown in the title is computed on the plotted points
        corr = df_plot[col].corr(df_plot[target_col])

        ax.set_xlabel(col, fontsize=10)
        ax.set_ylabel(target_col, fontsize=10)
        title = f'{col} vs {target_col}\nCorrelation: {corr:.3f}'
        if outliers_removed > 0:
            title += f'\n({outliers_removed} outliers removed)'
        ax.set_title(title, fontsize=11)
        ax.grid(True, alpha=0.3)

    # Hide unused subplots
    for unused_ax in axes[n_features:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Example: scatter plots with the default settings (3 per row, z-score outlier removal on)
plot_scatter_relationships(df, target_column, feature_columns)

In [None]:
# 4. PAIRPLOT FOR COMPREHENSIVE VIEW
def plot_pairplot(df, target_col, feature_cols, sample_size=None):
    """
    Draw a seaborn pairplot of the features together with the target.

    Parameters:
    df: pandas DataFrame
    target_col: string, name of target column
    feature_cols: list, feature columns to include
    sample_size: optional int; when given and smaller than len(df), a random
        sample of that many rows (fixed seed 42) is plotted instead of all rows
    """
    # Down-sample large frames to keep the plot responsive
    df_sample = df
    if sample_size and len(df) > sample_size:
        df_sample = df.sample(sample_size, random_state=42)
        print(f"Sampling {sample_size} rows for pairplot visualization")

    # Features first, target last
    selected = feature_cols + [target_col]

    grid = sns.pairplot(
        df_sample[selected],
        diag_kind='hist',
        plot_kws={'alpha': 0.6, 's': 20},
        diag_kws={'alpha': 0.7},
    )

    # Highlight target variable: bold its label on the bottom row ...
    for ax in grid.axes[-1, :]:
        x_label = ax.get_xlabel()
        weight = 'bold' if x_label == target_col else 'normal'
        ax.set_xlabel(x_label, fontweight=weight)

    # ... and on the right-most column
    for ax in grid.axes[:, -1]:
        y_label = ax.get_ylabel()
        weight = 'bold' if y_label == target_col else 'normal'
        ax.set_ylabel(y_label, fontweight=weight)

    plt.suptitle(f'Pairplot: Feature Relationships with {target_col}', y=1.02, fontsize=14, fontweight='bold')
    plt.show()

# Example: pairplot on a 500-row sample of the 1000-row demo dataset
plot_pairplot(df, target_column, feature_columns, sample_size=500)

In [None]:
# 5. DISTRIBUTION PLOTS
def plot_distributions(df, target_col, feature_cols):
    """
    Plot distributions of features and target variable

    Draws one column of subplots per variable (features first, target last):
    a histogram on the top row and a box plot on the bottom row. The target's
    subplots get a light red background.

    Parameters:
    df: pandas DataFrame
    target_col: string, name of target column
    feature_cols: list, feature columns to plot (may be empty)
    """
    columns = feature_cols + [target_col]
    n_cols = len(columns)  # features + target
    n_rows = 2  # One for histograms, one for box plots

    # squeeze=False keeps `axes` 2-D even when only the target is plotted
    # (n_cols == 1), so axes[row, col] indexing is always valid.
    _fig, axes = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 8), squeeze=False)

    for i, col in enumerate(columns):
        # Missing values are dropped: a NaN makes the histogram range
        # undefined and yields an empty box plot.
        values = df[col].dropna()
        is_target = col == target_col

        # Top row: histogram
        hist_ax = axes[0, i]
        hist_ax.hist(values, bins=30, alpha=0.7, edgecolor='black')
        hist_ax.set_title(f'Distribution of {col}', fontsize=10)
        hist_ax.set_xlabel(col)
        hist_ax.set_ylabel('Frequency')
        hist_ax.grid(True, alpha=0.3)

        # Bottom row: box plot
        box_ax = axes[1, i]
        box_ax.boxplot(values)
        box_ax.set_title(f'Box Plot of {col}', fontsize=10)
        box_ax.set_ylabel(col)
        box_ax.grid(True, alpha=0.3)

        # Highlight target column
        if is_target:
            hist_ax.set_facecolor('#ffe6e6')
            box_ax.set_facecolor('#ffe6e6')

    plt.tight_layout()
    plt.show()

# Example: histograms and box plots for every feature plus the target
plot_distributions(df, target_column, feature_columns)

In [None]:
# 6. FEATURE IMPORTANCE VISUALIZATION
def plot_feature_importance(correlation_results, top_n=None):
    """
    Draw a horizontal bar chart of each feature's Pearson correlation with the target.

    Parameters:
    correlation_results: DataFrame with 'Feature' and 'Pearson_Correlation'
        columns (as returned by correlation_analysis)
    top_n: optional int; when given, only the first top_n rows are plotted
    """
    plot_data = correlation_results.head(top_n) if top_n else correlation_results

    coefficients = plot_data['Pearson_Correlation']
    positions = range(len(plot_data))

    plt.figure(figsize=(10, 6))

    # Negative correlations in red, all others in blue
    bar_colors = ['red' if value < 0 else 'blue' for value in coefficients]

    plt.barh(positions, coefficients, color=bar_colors, alpha=0.7)
    plt.yticks(positions, plot_data['Feature'])
    plt.xlabel('Correlation with Target')
    plt.title('Feature Importance (Pearson Correlation)', fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3, axis='x')

    # Write each value just beyond the end of its bar, on the side it points to
    for position, value in enumerate(coefficients):
        if value > 0:
            offset, anchor = 0.01, 'left'
        else:
            offset, anchor = -0.01, 'right'
        plt.text(value + offset, position, f'{value:.3f}', va='center', ha=anchor)

    plt.tight_layout()
    plt.show()

# Example: bar chart built from the correlation table computed earlier
plot_feature_importance(correlation_results)

In [None]:
# 7. BINNED ANALYSIS FOR NON-LINEAR RELATIONSHIPS
def plot_binned_analysis(df, target_col, feature_cols, n_bins=10):
    """
    Analyze relationships using binned approach

    For each feature, the values are cut into ``n_bins`` equal-width bins and
    the mean target value per bin is plotted at the bin centre, with the
    per-bin standard deviation as error bars.

    Parameters:
    df: pandas DataFrame (left unmodified)
    target_col: string, name of target column
    feature_cols: list, numerical feature columns to bin
    n_bins: int, number of bins per feature
    """
    n_features = len(feature_cols)
    if n_features == 0:
        # plt.subplots cannot build a grid with zero rows
        print("No features to plot.")
        return

    n_cols = 2
    n_rows = (n_features + n_cols - 1) // n_cols

    # squeeze=False always returns a 2-D array of axes, so one flattening step
    # covers every grid shape.
    _fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4*n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, feature_cols):
        # Bin into a standalone Series instead of a temporary 'bins' column on
        # df: the caller's frame is never touched, an existing 'bins' column
        # survives, and nothing is left behind if plotting fails.
        bins = pd.cut(df[col], bins=n_bins, precision=2).rename('bins')

        # Calculate target statistics for each bin; observed=False keeps empty
        # bins in the table (their NaN mean is simply not drawn).
        binned_stats = (
            df.groupby(bins, observed=False)[target_col]
            .agg(['mean', 'std', 'count'])
            .reset_index()
        )
        binned_stats['bin_center'] = [interval.mid for interval in binned_stats['bins']]

        # Plot mean target values
        ax.errorbar(binned_stats['bin_center'], binned_stats['mean'],
                    yerr=binned_stats['std'], fmt='o-', capsize=5, capthick=2)

        ax.set_xlabel(f'{col} (binned)')
        ax.set_ylabel(f'Mean {target_col}')
        ax.set_title(f'Binned Analysis: {col} vs {target_col}')
        ax.grid(True, alpha=0.3)
        ax.tick_params(axis='x', rotation=45)

    # Hide unused subplots
    for unused_ax in axes[n_features:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Example: binned analysis with the default 10 bins per feature
plot_binned_analysis(df, target_column, feature_columns)

In [None]:
# 8. COMPREHENSIVE ANALYSIS FUNCTION
def comprehensive_numerical_analysis(df, target_col, exclude_cols=None,
                                     show_distributions=True, show_pairplot=True,
                                     show_binned=True, sample_size_pairplot=500):
    """
    Run every numerical-relationship analysis step against the target, in order.

    Correlation table, heatmap, scatter plots and feature importance always
    run; distributions, pairplot and binned analysis are optional.

    Parameters:
    df: pandas DataFrame
    target_col: string, name of target column
    exclude_cols: list, columns to exclude from analysis
    show_distributions: bool, whether to show distribution plots
    show_pairplot: bool, whether to show pairplot (skipped above 10 features)
    show_binned: bool, whether to show binned analysis
    sample_size_pairplot: int, sample size for pairplot

    Returns:
    The correlation DataFrame from correlation_analysis, or None when no
    numerical feature is available.
    """
    print("üîç COMPREHENSIVE NUMERICAL RELATIONSHIP ANALYSIS")
    print("=" * 60)

    # Get feature columns
    feature_cols = analyze_numerical_relationships(df, target_col, exclude_cols)
    n_features = len(feature_cols)

    # Nothing to analyse: stop before any plotting
    if n_features == 0:
        print("No numerical features found for analysis!")
        return None

    print(f"\nüìä Analyzing {n_features} features against target '{target_col}'")

    # 1. Correlation table (also feeds the importance chart in step 4)
    print("\n1Ô∏è‚É£ CORRELATION ANALYSIS")
    correlation_results = correlation_analysis(df, target_col, feature_cols)

    # 2. Heatmap
    print("\n2Ô∏è‚É£ CORRELATION HEATMAP")
    plot_correlation_heatmap(df, target_col, feature_cols)

    # 3. Scatter plots
    print("\n3Ô∏è‚É£ SCATTER PLOTS WITH REGRESSION LINES")
    plot_scatter_relationships(df, target_col, feature_cols)

    # 4. Importance chart
    print("\n4Ô∏è‚É£ FEATURE IMPORTANCE")
    plot_feature_importance(correlation_results)

    # 5. Optional distributions
    if show_distributions:
        print("\n5Ô∏è‚É£ DISTRIBUTION ANALYSIS")
        plot_distributions(df, target_col, feature_cols)

    # 6. Optional pairplot, limited to 10 features for readability
    if show_pairplot:
        if n_features <= 10:
            print("\n6Ô∏è‚É£ PAIRPLOT ANALYSIS")
            plot_pairplot(df, target_col, feature_cols, sample_size_pairplot)
        else:
            print(f"\n‚ö†Ô∏è Skipping pairplot: too many features ({n_features}). Limit is 10.")

    # 7. Optional binned analysis
    if show_binned:
        print("\n7Ô∏è‚É£ BINNED ANALYSIS")
        plot_binned_analysis(df, target_col, feature_cols)

    print("\n‚úÖ Analysis complete!")

    return correlation_results

# Example usage with all options
# NOTE: this rebinds the module-level name `results`, which an earlier cell
# uses for the output of evaluate_model(); here it holds the correlation table.
results = comprehensive_numerical_analysis(
    df=df, 
    target_col='target',
    show_distributions=True,
    show_pairplot=True,
    show_binned=True
)

## 🚀 Quick Start Guide

### For Your Own Dataset:

```python
# Load your data
df = pd.read_csv('your_data.csv')  # or pd.read_excel(), etc.

# Quick analysis
results = comprehensive_numerical_analysis(
    df=df, 
    target_col='your_target_column_name',
    exclude_cols=['id', 'date'],  # columns to exclude
    show_distributions=True,
    show_pairplot=True,
    show_binned=True
)
```

### Individual Visualization Functions:

1. **`correlation_analysis(df, target_col, feature_cols)`** - Calculate correlations
2. **`plot_correlation_heatmap(df, target_col, feature_cols)`** - Correlation heatmap
3. **`plot_scatter_relationships(df, target_col, feature_cols)`** - Scatter plots with regression
4. **`plot_feature_importance(correlation_results)`** - Feature importance bars
5. **`plot_distributions(df, target_col, feature_cols)`** - Distribution analysis
6. **`plot_pairplot(df, target_col, feature_cols)`** - Comprehensive pairplot
7. **`plot_binned_analysis(df, target_col, feature_cols)`** - Binned relationship analysis

### Key Features:
- ✅ **Correlation Analysis**: Pearson & Spearman correlations with p-values
- ✅ **Multiple Visualizations**: Scatter plots, heatmaps, distributions, pairplots
- ✅ **Feature Importance**: Ranked by correlation strength
- ✅ **Non-linear Detection**: Binned analysis for complex relationships
- ✅ **Statistical Significance**: P-values for correlation tests
- ✅ **Flexible & Modular**: Use individual functions or comprehensive analysis

# 📊 CATEGORICAL VARIABLES vs NUMERICAL TARGET

## Functions for analyzing relationships between categorical features and numerical target variables

In [None]:
# Additional imports for categorical analysis
from scipy import stats
import pandas as pd
import numpy as np

# Create sample dataset with categorical variables for demonstration
# The seed makes the draws reproducible; the generated values depend on the
# order of the np.random calls below, so do not reorder them.
np.random.seed(42)
n_samples = 1000

# Create categorical variables with different relationships to target
# (`p` gives the sampling probability of each category)
categories_a = np.random.choice(['Category_A', 'Category_B', 'Category_C', 'Category_D'], n_samples, p=[0.3, 0.25, 0.25, 0.2])
categories_b = np.random.choice(['Type_1', 'Type_2', 'Type_3'], n_samples, p=[0.4, 0.35, 0.25])
categories_c = np.random.choice(['Low', 'Medium', 'High'], n_samples, p=[0.33, 0.34, 0.33])
categories_d = np.random.choice(['Yes', 'No'], n_samples, p=[0.6, 0.4])

# Create target variable with different effects from categorical variables
# (starts as standard-normal noise; the category effects are added below)
target_cat = np.random.normal(0, 1, n_samples)

# Add categorical effects to target
# Additive shift applied to the target for each category value
category_effects = {
    'Category_A': 2.0, 'Category_B': 0.5, 'Category_C': -1.0, 'Category_D': -0.5,
    'Type_1': 1.5, 'Type_2': 0.0, 'Type_3': -1.2,
    'Low': -1.8, 'Medium': 0.2, 'High': 1.6,
    'Yes': 0.8, 'No': -0.8
}

# Each sample's target = noise + the effect of each of its four categories
for i in range(n_samples):
    target_cat[i] += category_effects[categories_a[i]]
    target_cat[i] += category_effects[categories_b[i]]
    target_cat[i] += category_effects[categories_c[i]]
    target_cat[i] += category_effects[categories_d[i]]

# Create DataFrame with categorical variables
df_cat = pd.DataFrame({
    'feature_group': categories_a,
    'product_type': categories_b,
    'priority_level': categories_c,
    'has_feature': categories_d,
    'target_value': target_cat
})

print("Sample dataset with categorical variables created:")
print(df_cat.head(10))
print(f"\nDataset shape: {df_cat.shape}")
print(f"\nData types:\n{df_cat.dtypes}")
print(f"\nCategorical value counts:")
# Only object-dtype columns are listed here
for col in df_cat.select_dtypes(include=['object']).columns:
    print(f"\n{col}:")
    print(df_cat[col].value_counts())

In [None]:
def analyze_categorical_relationships(df, target_col, exclude_cols=None):
    """
    Select the categorical feature columns to analyse against a numerical target.

    Parameters:
    df: pandas DataFrame
    target_col: string, name of target column
    exclude_cols: list, columns to exclude from analysis

    Returns:
    list of object/category column names, in DataFrame order

    Raises:
    ValueError: if the target column is not numerical (raised after the
    selected features have been printed)
    """
    skipped = [] if exclude_cols is None else exclude_cols

    # Candidate columns: object and category dtypes
    categorical_names = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Keep everything except the target itself and explicitly skipped columns
    feature_cols = []
    for name in categorical_names:
        if name == target_col or name in skipped:
            continue
        feature_cols.append(name)

    print(f"Analyzing relationships between {len(feature_cols)} categorical features and target '{target_col}'")
    print(f"Categorical features: {feature_cols}")

    # The analyses built on this selection need a numerical target
    target_is_numeric = pd.api.types.is_numeric_dtype(df[target_col])
    if not target_is_numeric:
        raise ValueError(f"Target column '{target_col}' must be numerical")

    return feature_cols

# Example usage
# Selects the categorical feature columns of the demo frame `df_cat`
target_col_cat = 'target_value'
categorical_features = analyze_categorical_relationships(df_cat, target_col_cat)

In [None]:
# 1. STATISTICAL ANALYSIS FOR CATEGORICAL VARIABLES
def categorical_statistical_analysis(df, target_col, categorical_cols):
    """
    Perform statistical tests for categorical variables vs numerical target

    For every column the target is split by category and three figures are
    computed: one-way ANOVA, the Kruskal-Wallis H test and eta-squared
    (between-group sum of squares / total sum of squares). Columns with
    fewer than two non-empty groups are skipped. A failing test is
    recorded as NaN instead of aborting the whole analysis.

    Parameters:
    df: pandas DataFrame
    target_col: string, name of numerical target column
    categorical_cols: list, categorical columns to test

    Returns:
    pandas DataFrame with one row per analysed column, sorted by
    'Effect_Size_EtaSquared' (descending); empty if nothing was analysed
    """

    results = []

    for col in categorical_cols:
        # One pass over the data per column. sort=False keeps categories in
        # order of first appearance; observed=True never yields empty groups.
        # Rows whose category is missing are left out of the groups.
        grouped = df.groupby(col, sort=False, observed=True)[target_col]
        groups = [values.to_numpy() for _, values in grouped]

        if len(groups) < 2:
            continue

        # Perform ANOVA (Analysis of Variance)
        try:
            f_stat, p_value_anova = stats.f_oneway(*groups)
        except Exception:
            f_stat, p_value_anova = np.nan, np.nan

        # Perform Kruskal-Wallis test (non-parametric alternative to ANOVA)
        try:
            h_stat, p_value_kw = stats.kruskal(*groups)
        except Exception:
            h_stat, p_value_kw = np.nan, np.nan

        # Calculate effect size (eta-squared for ANOVA)
        try:
            # Total sum of squares
            grand_mean = df[target_col].mean()
            ss_total = ((df[target_col] - grand_mean) ** 2).sum()

            # Between-group sum of squares: n_g * (mean_g - grand_mean)^2.
            # skipna=False lets an undefined group mean surface as NaN.
            sizes = grouped.size()
            means = grouped.mean()
            ss_between = (sizes * (means - grand_mean) ** 2).sum(skipna=False)

            eta_squared = ss_between / ss_total if ss_total > 0 else 0
        except Exception:
            eta_squared = np.nan

        # Calculate descriptive statistics
        group_stats = df.groupby(col)[target_col].agg(['count', 'mean', 'std']).round(3)

        results.append({
            'Feature': col,
            'Unique_Categories': df[col].nunique(),
            'F_Statistic': f_stat,
            'ANOVA_p_value': p_value_anova,
            'Kruskal_Wallis_H': h_stat,
            'KW_p_value': p_value_kw,
            'Effect_Size_EtaSquared': eta_squared,
            'Group_Stats': group_stats
        })

    results_df = pd.DataFrame(results)

    if len(results_df) > 0:
        results_df = results_df.sort_values('Effect_Size_EtaSquared', ascending=False)

        print("CATEGORICAL STATISTICAL ANALYSIS:")
        print("=" * 60)
        print("ANOVA: Tests if means differ significantly between categories")
        print("Kruskal-Wallis: Non-parametric alternative to ANOVA")
        print("Effect Size (Œ∑¬≤): 0.01=small, 0.06=medium, 0.14=large effect")
        print("=" * 60)

        display_cols = ['Feature', 'Unique_Categories', 'F_Statistic', 'ANOVA_p_value',
                       'Kruskal_Wallis_H', 'KW_p_value', 'Effect_Size_EtaSquared']
        print(results_df[display_cols].round(4))

        # Show group statistics for top features
        print(f"\nüìä GROUP STATISTICS FOR TOP FEATURES:")
        for _, row in results_df.head(3).iterrows():
            print(f"\n{row['Feature']}:")
            print(row['Group_Stats'])

    return results_df

# Run statistical analysis on the example dataset; stats_results is reused
# by the effect-size plot further down.
stats_results = categorical_statistical_analysis(df_cat, target_col_cat, categorical_features)

In [None]:
# 2. BOX PLOTS FOR CATEGORICAL VARIABLES
def plot_categorical_boxplots(df, target_col, categorical_cols, cols_per_row=2):
    """
    Create box plots for each categorical variable vs target

    Parameters:
    df: pandas DataFrame
    target_col: string, name of numerical target column
    categorical_cols: list, categorical columns to plot (one subplot each)
    cols_per_row: int, number of subplots per grid row

    Returns:
    None; the figure is shown with plt.show(). Nothing is drawn when
    categorical_cols is empty.
    """

    n_features = len(categorical_cols)
    if n_features == 0:
        # A grid with zero rows cannot be created, so bail out early
        print("No categorical features to plot")
        return

    n_rows = (n_features + cols_per_row - 1) // cols_per_row

    # squeeze=False always returns a 2-D array, so one code path covers the
    # single-subplot, single-row and multi-row layouts
    fig, axes = plt.subplots(n_rows, cols_per_row, figsize=(6*cols_per_row, 5*n_rows),
                             squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, categorical_cols):
        # Create box plot
        df.boxplot(column=target_col, by=col, ax=ax)

        # Customize plot
        ax.set_title(f'{col} vs {target_col}')
        ax.set_xlabel(col)
        ax.set_ylabel(target_col)
        ax.grid(True, alpha=0.3)

        # Rotate x-axis labels if needed
        labels = ax.get_xticklabels()
        if any(len(label.get_text()) > 8 for label in labels):
            ax.tick_params(axis='x', rotation=45)

    # Hide unused subplots
    for ax in axes[n_features:]:
        ax.set_visible(False)

    fig.suptitle('')  # Remove default suptitle
    plt.tight_layout()
    plt.show()

# Draw the box plots for the example dataset
plot_categorical_boxplots(df_cat, target_col_cat, categorical_features)

In [None]:
# 3. VIOLIN PLOTS FOR DISTRIBUTION VISUALIZATION
def plot_categorical_violins(df, target_col, categorical_cols, cols_per_row=2):
    """
    Create violin plots showing distribution shape for each category

    A red diamond marks the mean of every category. The violins are drawn
    in the same (sorted) category order as the group means so that each
    marker sits on the violin it belongs to.

    Parameters:
    df: pandas DataFrame
    target_col: string, name of numerical target column
    categorical_cols: list, categorical columns to plot (one subplot each)
    cols_per_row: int, number of subplots per grid row

    Returns:
    None; the figure is shown with plt.show(). Nothing is drawn when
    categorical_cols is empty.
    """

    n_features = len(categorical_cols)
    if n_features == 0:
        # A grid with zero rows cannot be created, so bail out early
        print("No categorical features to plot")
        return

    n_rows = (n_features + cols_per_row - 1) // cols_per_row

    # squeeze=False always returns a 2-D array of axes
    fig, axes = plt.subplots(n_rows, cols_per_row, figsize=(8*cols_per_row, 5*n_rows),
                             squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, categorical_cols):
        # Group means define both the marker values and the x-axis order
        means = df.groupby(col)[target_col].mean()
        category_order = list(means.index)

        # Create violin plot with an explicit order; without it seaborn's
        # default ordering need not match the groupby order used below
        sns.violinplot(data=df, x=col, y=target_col, order=category_order, ax=ax)

        # Customize plot
        ax.set_title(f'Distribution of {target_col} by {col}', fontsize=12, fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel(target_col)
        ax.grid(True, alpha=0.3)

        # Rotate x-axis labels if needed
        labels = ax.get_xticklabels()
        if any(len(label.get_text()) > 8 for label in labels):
            ax.tick_params(axis='x', rotation=45)

        # Add mean markers (position j corresponds to category_order[j])
        for position, mean_val in enumerate(means):
            ax.plot(position, mean_val, marker='D', color='red', markersize=8,
                    markeredgecolor='white', markeredgewidth=1)

    # Hide unused subplots
    for ax in axes[n_features:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Draw the violin plots for the example dataset
plot_categorical_violins(df_cat, target_col_cat, categorical_features)

In [None]:
# 4. BAR PLOTS WITH CONFIDENCE INTERVALS
def plot_categorical_means(df, target_col, categorical_cols, cols_per_row=2):
    """
    Create bar plots showing mean target values by category with confidence intervals

    The error bars are 1.96 * standard error of the mean (normal
    approximation of a 95% confidence interval).

    Parameters:
    df: pandas DataFrame
    target_col: string, name of numerical target column
    categorical_cols: list, categorical columns to plot (one subplot each)
    cols_per_row: int, number of subplots per grid row

    Returns:
    None; the figure is shown with plt.show(). Nothing is drawn when
    categorical_cols is empty.
    """

    n_features = len(categorical_cols)
    if n_features == 0:
        # A grid with zero rows cannot be created, so bail out early
        print("No categorical features to plot")
        return

    n_rows = (n_features + cols_per_row - 1) // cols_per_row

    # squeeze=False always returns a 2-D array of axes
    fig, axes = plt.subplots(n_rows, cols_per_row, figsize=(8*cols_per_row, 5*n_rows),
                             squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, categorical_cols):
        # Calculate statistics
        stats_data = df.groupby(col)[target_col].agg(['mean', 'std', 'count', 'sem']).reset_index()
        stats_data['ci'] = stats_data['sem'] * 1.96  # 95% confidence interval

        # Create bar plot
        bars = ax.bar(stats_data[col], stats_data['mean'],
                     yerr=stats_data['ci'], capsize=5, alpha=0.7,
                     color=plt.cm.Set3(np.linspace(0, 1, len(stats_data))))

        # Customize plot
        ax.set_title(f'Mean {target_col} by {col}', fontsize=12, fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel(f'Mean {target_col}')
        ax.grid(True, alpha=0.3, axis='y')

        # Add value labels on bars
        for bar, mean_val, count in zip(bars, stats_data['mean'], stats_data['count']):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{mean_val:.2f}\n(n={count})',
                   ha='center', va='bottom', fontsize=9)

        # Rotate x-axis labels if needed. The check uses the category values
        # themselves because tick-label text is not guaranteed to be
        # populated before the figure has been drawn.
        if any(len(str(category)) > 8 for category in stats_data[col]):
            ax.tick_params(axis='x', rotation=45)

    # Hide unused subplots
    for ax in axes[n_features:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Draw the mean-comparison bar plots for the example dataset
plot_categorical_means(df_cat, target_col_cat, categorical_features)

In [None]:
# 5. SWARM/STRIP PLOTS FOR INDIVIDUAL DATA POINTS
def plot_categorical_swarm(df, target_col, categorical_cols, cols_per_row=2, sample_size=1000):
    """
    Create swarm plots showing individual data points for each category

    A translucent box plot is overlaid on the points. If the swarm plot
    raises, a jittered strip plot is drawn instead.

    Parameters:
    df: pandas DataFrame
    target_col: string, name of numerical target column
    categorical_cols: list, categorical columns to plot (one subplot each)
    cols_per_row: int, number of subplots per grid row
    sample_size: int, maximum number of rows plotted; larger frames are
                 randomly sampled (random_state=42)

    Returns:
    None; the figure is shown with plt.show(). Nothing is drawn when
    categorical_cols is empty.
    """

    n_features = len(categorical_cols)
    if n_features == 0:
        # A grid with zero rows cannot be created, so bail out early
        print("No categorical features to plot")
        return

    # Sample data if too large
    if len(df) > sample_size:
        df_sample = df.sample(sample_size, random_state=42)
        print(f"Sampling {sample_size} rows for swarm plot visualization")
    else:
        df_sample = df

    n_rows = (n_features + cols_per_row - 1) // cols_per_row

    # squeeze=False always returns a 2-D array of axes
    fig, axes = plt.subplots(n_rows, cols_per_row, figsize=(8*cols_per_row, 5*n_rows),
                             squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, categorical_cols):
        # Try swarm plot first, fallback to strip plot if it fails.
        # Exception (not a bare except) so Ctrl-C still interrupts the cell.
        try:
            sns.swarmplot(data=df_sample, x=col, y=target_col, ax=ax, size=4, alpha=0.7)
        except Exception:
            sns.stripplot(data=df_sample, x=col, y=target_col, ax=ax, size=4, alpha=0.7, jitter=True)

        # Overlay box plot
        sns.boxplot(data=df_sample, x=col, y=target_col, ax=ax,
                   boxprops=dict(alpha=0.3), showfliers=False)

        # Customize plot
        ax.set_title(f'{col} vs {target_col} (Individual Points)', fontsize=12, fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel(target_col)
        ax.grid(True, alpha=0.3)

        # Rotate x-axis labels if needed
        labels = ax.get_xticklabels()
        if any(len(label.get_text()) > 8 for label in labels):
            ax.tick_params(axis='x', rotation=45)

    # Hide unused subplots
    for ax in axes[n_features:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Draw the swarm/strip plots for the example dataset
plot_categorical_swarm(df_cat, target_col_cat, categorical_features)

In [None]:
# 6. EFFECT SIZE VISUALIZATION
def plot_effect_sizes(stats_results):
    """
    Visualize effect sizes for categorical variables

    Left panel: eta-squared per feature, coloured by magnitude
    (>= 0.14 green, >= 0.06 orange, otherwise red, undefined gray).
    Right panel: -log10 of the ANOVA p-value, coloured by significance.
    Undefined (NaN) values are labelled as such instead of being shown
    as small effects or highly significant p-values.

    Parameters:
    stats_results: pandas DataFrame with the columns 'Feature',
                   'Effect_Size_EtaSquared' and 'ANOVA_p_value'

    Returns:
    None; the figure is shown with plt.show()
    """

    if len(stats_results) == 0:
        print("No statistical results to plot")
        return

    effect_sizes = stats_results['Effect_Size_EtaSquared']
    positions = range(len(stats_results))

    # Create effect size plot
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Effect size bar plot
    colors = ['gray' if pd.isna(x) else
              'green' if x >= 0.14 else 'orange' if x >= 0.06 else 'red'
              for x in effect_sizes]

    bars = ax1.barh(positions, effect_sizes, color=colors, alpha=0.7)
    ax1.set_yticks(positions)
    ax1.set_yticklabels(stats_results['Feature'])
    ax1.set_xlabel('Effect Size (Œ∑¬≤)')
    ax1.set_title('Effect Sizes for Categorical Variables', fontweight='bold')
    ax1.grid(True, alpha=0.3, axis='x')

    # Add effect size interpretation lines
    ax1.axvline(x=0.01, color='red', linestyle='--', alpha=0.7, label='Small (0.01)')
    ax1.axvline(x=0.06, color='orange', linestyle='--', alpha=0.7, label='Medium (0.06)')
    ax1.axvline(x=0.14, color='green', linestyle='--', alpha=0.7, label='Large (0.14)')
    ax1.legend()

    # Add values on bars (a NaN bar has no width, so anchor its label at 0)
    for bar, value in zip(bars, effect_sizes):
        width = bar.get_width()
        anchor = width if np.isfinite(width) else 0
        ax1.text(anchor + 0.001, bar.get_y() + bar.get_height()/2,
                f'{value:.3f}', ha='left', va='center', fontsize=10)

    # P-value significance plot
    p_values = stats_results['ANOVA_p_value']
    colors_p = ['green' if x < 0.001 else 'orange' if x < 0.01 else 'red' if x < 0.05 else 'gray'
                for x in p_values]

    # The 1e-16 offset keeps -log10 finite when a p-value underflows to 0
    bars2 = ax2.barh(positions, -np.log10(p_values + 1e-16), color=colors_p, alpha=0.7)
    ax2.set_yticks(positions)
    ax2.set_yticklabels(stats_results['Feature'])
    ax2.set_xlabel('-log10(p-value)')
    ax2.set_title('Statistical Significance (ANOVA)', fontweight='bold')
    ax2.grid(True, alpha=0.3, axis='x')

    # Add significance lines
    ax2.axvline(x=-np.log10(0.05), color='red', linestyle='--', alpha=0.7, label='p=0.05')
    ax2.axvline(x=-np.log10(0.01), color='orange', linestyle='--', alpha=0.7, label='p=0.01')
    ax2.axvline(x=-np.log10(0.001), color='green', linestyle='--', alpha=0.7, label='p=0.001')
    ax2.legend()

    # Add p-values on bars
    for bar, p_val in zip(bars2, p_values):
        if pd.isna(p_val):
            # NaN fails every comparison, so it must be handled before the
            # threshold checks or it would be reported as 'p<0.001'
            label = 'p=n/a'
        elif p_val >= 0.001:
            label = f'p={p_val:.3f}'
        else:
            label = 'p<0.001'

        width = bar.get_width()
        anchor = width if np.isfinite(width) else 0
        ax2.text(anchor + 0.1, bar.get_y() + bar.get_height()/2,
                label, ha='left', va='center', fontsize=9)

    plt.tight_layout()
    plt.show()

# Plot effect sizes and ANOVA significance computed by the statistical analysis cell
plot_effect_sizes(stats_results)

In [None]:
# 7. SUMMARY STATISTICS TABLE
def create_summary_table(df, target_col, categorical_cols):
    """
    Create comprehensive summary statistics table

    For every category of every column the table holds count, mean, median,
    std, min and max of the target (rounded to 3 decimals) plus a
    standardized effect size: (category mean - overall mean) / overall std.
    The effect size is computed from the unrounded mean and is NaN when the
    overall standard deviation is zero or undefined.

    Parameters:
    df: pandas DataFrame
    target_col: string, name of numerical target column
    categorical_cols: list, categorical columns to summarise

    Returns:
    pandas DataFrame with one row per (feature, category) pair
    """

    columns = ['Feature', 'Category', 'Count', 'Mean', 'Median', 'Std',
               'Min', 'Max', 'Cohens_d', 'Effect_Magnitude']

    # Overall statistics for comparison (independent of the grouping column)
    overall_mean = df[target_col].mean()
    overall_std = df[target_col].std()

    summary_data = []

    for col in categorical_cols:
        # Group statistics: full precision for calculations, rounded for display
        raw_stats = df.groupby(col)[target_col].agg([
            'count', 'mean', 'median', 'std', 'min', 'max'
        ])
        group_stats = raw_stats.round(3)

        # Calculate standardized effect sizes (Cohen's d) relative to overall mean
        for category in group_stats.index:
            if overall_std > 0:
                cohens_d = (raw_stats.loc[category, 'mean'] - overall_mean) / overall_std
            else:
                # Constant or single-value target: the effect size is undefined
                cohens_d = np.nan

            magnitude = abs(cohens_d)
            if magnitude >= 0.8:
                effect_label = 'Large'
            elif magnitude >= 0.5:
                effect_label = 'Medium'
            elif magnitude >= 0.2:
                effect_label = 'Small'
            else:
                effect_label = 'Negligible'

            summary_data.append({
                'Feature': col,
                'Category': category,
                'Count': group_stats.loc[category, 'count'],
                'Mean': group_stats.loc[category, 'mean'],
                'Median': group_stats.loc[category, 'median'],
                'Std': group_stats.loc[category, 'std'],
                'Min': group_stats.loc[category, 'min'],
                'Max': group_stats.loc[category, 'max'],
                'Cohens_d': cohens_d,
                'Effect_Magnitude': effect_label
            })

    # Explicit columns keep the frame well-formed even when no rows were collected
    summary_df = pd.DataFrame(summary_data, columns=columns)

    print("üìã DETAILED SUMMARY STATISTICS BY CATEGORY")
    print("=" * 80)
    print("Cohen's d interpretation: 0.2=small, 0.5=medium, 0.8=large effect")
    print("=" * 80)

    # Display by feature
    for feature in categorical_cols:
        feature_data = summary_df[summary_df['Feature'] == feature]
        print(f"\nüîç {feature.upper()}:")
        display_cols = ['Category', 'Count', 'Mean', 'Median', 'Std', 'Cohens_d', 'Effect_Magnitude']
        print(feature_data[display_cols].to_string(index=False))

    return summary_df

# Build and print the per-category summary table for the example dataset
summary_table = create_summary_table(df_cat, target_col_cat, categorical_features)

In [None]:
# 8. COMPREHENSIVE CATEGORICAL ANALYSIS FUNCTION
def comprehensive_categorical_analysis(df, target_col, exclude_cols=None, 
                                     show_boxplots=True, show_violins=True, 
                                     show_means=True, show_swarm=True,
                                     show_effect_sizes=True, show_summary=True):
    """
    One-call driver: statistical tests plus every optional plot/table for
    categorical features against a numerical target

    Parameters:
    df: pandas DataFrame
    target_col: string, name of numerical target column
    exclude_cols: list, columns to exclude from analysis
    show_boxplots: bool, whether to show box plots
    show_violins: bool, whether to show violin plots
    show_means: bool, whether to show mean comparison bar plots
    show_swarm: bool, whether to show swarm/strip plots
    show_effect_sizes: bool, whether to show effect size visualization
    show_summary: bool, whether to show detailed summary statistics

    Returns:
    (stats_results, summary_table); summary_table is None when
    show_summary is False, and (None, None) is returned when the frame
    has no categorical feature columns
    """

    print("üîç COMPREHENSIVE CATEGORICAL vs NUMERICAL TARGET ANALYSIS")
    print("=" * 70)

    # Feature discovery (also validates that the target is numerical)
    categorical_cols = analyze_categorical_relationships(df, target_col, exclude_cols)
    if not categorical_cols:
        print("No categorical features found for analysis!")
        return None, None

    print(f"\nüìä Analyzing {len(categorical_cols)} categorical features against numerical target '{target_col}'")

    # Step 1 always runs: the effect-size plot depends on its output
    print("\n1Ô∏è‚É£ STATISTICAL ANALYSIS")
    stats_results = categorical_statistical_analysis(df, target_col, categorical_cols)

    # Steps 2-4: per-feature plots, each behind its own switch
    if show_boxplots:
        print("\n2Ô∏è‚É£ BOX PLOTS")
        plot_categorical_boxplots(df, target_col, categorical_cols)

    if show_violins:
        print("\n3Ô∏è‚É£ VIOLIN PLOTS (Distribution Shapes)")
        plot_categorical_violins(df, target_col, categorical_cols)

    if show_means:
        print("\n4Ô∏è‚É£ MEAN COMPARISON WITH CONFIDENCE INTERVALS")
        plot_categorical_means(df, target_col, categorical_cols)

    # Step 5 needs at least one analysed feature
    have_stats = len(stats_results) > 0
    if show_effect_sizes and have_stats:
        print("\n5Ô∏è‚É£ EFFECT SIZE & SIGNIFICANCE VISUALIZATION")
        plot_effect_sizes(stats_results)

    if show_swarm:
        print("\n6Ô∏è‚É£ INDIVIDUAL DATA POINTS (SWARM PLOTS)")
        plot_categorical_swarm(df, target_col, categorical_cols)

    # Step 7: the table is only built on request
    summary_table = None
    if show_summary:
        print("\n7Ô∏è‚É£ DETAILED SUMMARY STATISTICS")
        summary_table = create_summary_table(df, target_col, categorical_cols)

    print("\n‚úÖ Categorical analysis complete!")

    return stats_results, summary_table

# Example usage with all options switched on (every flag shown here is
# also the function's default); returns the statistics and summary tables.
cat_stats_results, cat_summary = comprehensive_categorical_analysis(
    df=df_cat, 
    target_col='target_value',
    show_boxplots=True,
    show_violins=True,
    show_means=True,
    show_swarm=True,
    show_effect_sizes=True,
    show_summary=True
)

## 🚀 Quick Start Guide for Categorical Analysis

### For Your Own Dataset:

```python
# Load your data
df = pd.read_csv('your_data.csv')

# Quick categorical analysis
cat_stats, cat_summary = comprehensive_categorical_analysis(
    df=df, 
    target_col='your_numerical_target_column',
    exclude_cols=['id', 'timestamp'],  # columns to exclude
    show_boxplots=True,
    show_violins=True,
    show_means=True,
    show_swarm=True,
    show_effect_sizes=True,
    show_summary=True
)
```

### Individual Categorical Visualization Functions:

1. **`categorical_statistical_analysis(df, target_col, categorical_cols)`** - ANOVA & Kruskal-Wallis tests
2. **`plot_categorical_boxplots(df, target_col, categorical_cols)`** - Box plots for each category
3. **`plot_categorical_violins(df, target_col, categorical_cols)`** - Distribution shapes by category
4. **`plot_categorical_means(df, target_col, categorical_cols)`** - Mean comparison with confidence intervals
5. **`plot_categorical_swarm(df, target_col, categorical_cols)`** - Individual data points visualization
6. **`plot_effect_sizes(stats_results)`** - Effect size and significance visualization
7. **`create_summary_table(df, target_col, categorical_cols)`** - Detailed statistics by category

### Key Features for Categorical Analysis:
- ✅ **Statistical Tests**: ANOVA and Kruskal-Wallis for group differences
- ✅ **Effect Sizes**: Eta-squared and Cohen's d for practical significance
- ✅ **Multiple Visualizations**: Box plots, violin plots, bar charts, swarm plots
- ✅ **Confidence Intervals**: Statistical uncertainty visualization
- ✅ **Distribution Comparison**: Shape analysis across categories
- ✅ **Individual Points**: See actual data distribution within categories
- ✅ **Comprehensive Summary**: Detailed statistics for each category

# 🤖 MACHINE LEARNING MODEL FOR NUMERICAL PREDICTION

## Comprehensive ML pipeline for predicting estimated_loss with feature importance analysis

In [None]:
# Additional imports for machine learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Machine learning libraries imported successfully!")

In [None]:
# Create realistic dataset for estimated_loss prediction
np.random.seed(42)
n_samples = 5000

# Create mixed feature dataset
data_ml = {
    # Numerical features
    'transaction_amount': np.random.lognormal(3, 1.5, n_samples),  # Transaction amounts
    'account_age_days': np.random.exponential(365, n_samples),     # Account age
    'previous_claims': np.random.poisson(2, n_samples),           # Previous claims count
    'risk_score': np.random.beta(2, 5, n_samples) * 100,         # Risk score 0-100
    'merchant_rating': np.random.normal(4.2, 0.8, n_samples),    # Merchant rating ~N(4.2, 0.8); not clipped to 1-5
    
    # Categorical features
    'transaction_type': np.random.choice(['purchase', 'transfer', 'withdrawal', 'refund'], 
                                       n_samples, p=[0.6, 0.2, 0.15, 0.05]),
    'merchant_category': np.random.choice(['retail', 'food', 'travel', 'entertainment', 'other'], 
                                        n_samples, p=[0.3, 0.25, 0.2, 0.15, 0.1]),
    'user_tier': np.random.choice(['bronze', 'silver', 'gold', 'platinum'], 
                                n_samples, p=[0.4, 0.3, 0.2, 0.1]),
    'payment_method': np.random.choice(['credit_card', 'debit_card', 'bank_transfer', 'digital_wallet'], 
                                     n_samples, p=[0.4, 0.3, 0.2, 0.1]),
    'country': np.random.choice(['US', 'UK', 'CA', 'AU', 'DE'], 
                              n_samples, p=[0.5, 0.2, 0.15, 0.1, 0.05])
}

# Additive loss contribution of every categorical level. Defined once here
# instead of being rebuilt for each sample.
# NOTE: category_effects reuses a name that the categorical-analysis cell
# above also assigns; the name is kept for compatibility with this cell's
# previous behaviour.
transaction_effects = {'purchase': 0, 'transfer': 15, 'withdrawal': 25, 'refund': -10}
category_effects = {'retail': 0, 'food': 5, 'travel': 20, 'entertainment': 10, 'other': 15}
tier_effects = {'bronze': 20, 'silver': 10, 'gold': 5, 'platinum': 0}
payment_effects = {'credit_card': 0, 'debit_card': 5, 'bank_transfer': 10, 'digital_wallet': 8}
country_effects = {'US': 0, 'UK': 5, 'CA': 3, 'AU': 7, 'DE': 4}

categorical_effects = {
    'transaction_type': transaction_effects,
    'merchant_category': category_effects,
    'user_tier': tier_effects,
    'payment_method': payment_effects,
    'country': country_effects,
}

# Create target variable (estimated_loss) with realistic relationships.
# All terms are computed for the whole sample at once with NumPy.
base_loss = (
    data_ml['transaction_amount'] * 0.02                              # higher amounts = higher potential loss
    + data_ml['risk_score'] * 0.5                                     # risk score effect
    + data_ml['previous_claims'] * 8                                  # previous claims effect
    + np.maximum(0, 100 - data_ml['account_age_days'] / 10) * 0.3     # newer accounts riskier
    + (5 - data_ml['merchant_rating']) * 10                           # lower rating = higher risk
)

# Categorical effects: look up the contribution of each sample's level
for column, effects in categorical_effects.items():
    base_loss = base_loss + np.array([effects[level] for level in data_ml[column]], dtype=float)

# Add some noise (one draw per sample from the seeded global generator)
noise = np.random.normal(0, 15, n_samples)

# Ensure non-negative
estimated_loss = np.maximum(0, base_loss + noise)

data_ml['estimated_loss'] = estimated_loss

# Create DataFrame
df_ml = pd.DataFrame(data_ml)

print("Machine Learning Dataset Created:")
print(f"Shape: {df_ml.shape}")
print(f"\nFirst 5 rows:")
print(df_ml.head())
print(f"\nTarget variable statistics:")
print(df_ml['estimated_loss'].describe())
print(f"\nData types:")
print(df_ml.dtypes)
print(f"\nMissing values:")
print(df_ml.isnull().sum())

In [None]:
# Data preprocessing function
def prepare_ml_data(df, target_col, test_size=0.2, random_state=42):
    """
    Split a frame into train/test sets and build an (unfitted) preprocessor

    Numerical columns are routed to a StandardScaler and object/category
    columns to a OneHotEncoder(drop='first'). Columns of any other dtype
    are not listed in either group.

    Parameters:
    df: pandas DataFrame
    target_col: string, name of target column
    test_size: float, proportion of data for testing
    random_state: int, for reproducibility

    Returns:
    X_train, X_test, y_train, y_test, preprocessor, numerical_cols,
    categorical_cols
    """

    # Features are everything except the target column
    features = df.drop(columns=[target_col])
    target = df[target_col]

    # Column groups by dtype
    numerical_cols = features.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = features.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"Numerical features ({len(numerical_cols)}): {numerical_cols}")
    print(f"Categorical features ({len(categorical_cols)}): {categorical_cols}")

    # One transformer per column group; fitting happens later in a Pipeline
    transformer_steps = [
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols),
    ]
    preprocessor = ColumnTransformer(transformers=transformer_steps)

    # Hold out a test set
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    print(f"\nData split:")
    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Test set: {X_test.shape[0]} samples")

    return X_train, X_test, y_train, y_test, preprocessor, numerical_cols, categorical_cols

# Prepare the data: default 80/20 split of df_ml with 'estimated_loss' as
# the target; the returned preprocessor is not fitted yet.
X_train, X_test, y_train, y_test, preprocessor, num_cols, cat_cols = prepare_ml_data(
    df_ml, 'estimated_loss'
)

In [None]:
# Model training and evaluation class
class EstimatedLossPredictor:
    """
    Comprehensive machine learning model for predicting estimated_loss

    Trains several regressors behind a common preprocessing step, compares
    them by cross-validated R², evaluates them on a test set and reports the
    feature importance of the best one.

    Attributes:
    preprocessor: unfitted transformer used as the first pipeline step
    models: dict, model name -> unfitted regressor
    results: dict, model name -> fitted pipeline and CV scores
    best_model: string, name of the model with the highest mean CV R²
    best_score: float, that model's mean CV R²
    """
    
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor
        self.models = {}
        self.results = {}
        self.best_model = None
        self.best_score = float('-inf')
        
    def define_models(self):
        """Define different models to compare"""
        self.models = {
            'Linear Regression': LinearRegression(),
            'Ridge Regression': Ridge(alpha=1.0),
            'Lasso Regression': Lasso(alpha=1.0),
            'Decision Tree': DecisionTreeRegressor(random_state=42, max_depth=10),
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10),
            'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42, max_depth=6),
            'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42, max_depth=6)
        }
        
    def train_models(self, X_train, y_train, cv_folds=5):
        """
        Train all models and perform cross-validation

        Parameters:
        X_train: training features
        y_train: training target
        cv_folds: int, number of cross-validation folds

        Fills self.results and updates self.best_model / self.best_score.
        """
        # Local import: clone is the only name this class needs beyond the
        # notebook's existing imports
        from sklearn.base import clone
        
        print("üîÑ Training models...")
        print("=" * 50)
        
        for name, model in self.models.items():
            print(f"\nTraining {name}...")
            
            # Create pipeline. Each pipeline gets its own copy of the
            # preprocessor so stored pipelines do not share (and refit)
            # one transformer instance.
            pipeline = Pipeline([
                ('preprocessor', clone(self.preprocessor)),
                ('regressor', model)
            ])
            
            # Cross-validation
            cv_scores = cross_val_score(pipeline, X_train, y_train, 
                                      cv=cv_folds, scoring='r2')
            
            # Fit the model
            pipeline.fit(X_train, y_train)
            
            # Store results
            self.results[name] = {
                'pipeline': pipeline,
                'cv_mean': cv_scores.mean(),
                'cv_std': cv_scores.std(),
                'cv_scores': cv_scores
            }
            
            print(f"CV R¬≤ Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
            
            # Track best model
            if cv_scores.mean() > self.best_score:
                self.best_score = cv_scores.mean()
                self.best_model = name
        
        print(f"\nüèÜ Best model: {self.best_model} (R¬≤ = {self.best_score:.4f})")
    
    def evaluate_models(self, X_test, y_test):
        """
        Evaluate all models on test set

        Parameters:
        X_test: test features
        y_test: test target

        Returns:
        pandas DataFrame with R², RMSE, MAE and the CV statistics of every
        trained model, sorted by test R² (descending)
        """
        
        print("\nüìä Model Evaluation on Test Set:")
        print("=" * 60)
        
        evaluation_results = []
        
        for name, result in self.results.items():
            pipeline = result['pipeline']
            
            # Predictions
            y_pred = pipeline.predict(X_test)
            
            # Metrics
            r2 = r2_score(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            
            evaluation_results.append({
                'Model': name,
                'R¬≤': r2,
                'RMSE': rmse,
                'MAE': mae,
                'CV_R¬≤_Mean': result['cv_mean'],
                'CV_R¬≤_Std': result['cv_std']
            })
        
        # Create results DataFrame
        eval_df = pd.DataFrame(evaluation_results)
        eval_df = eval_df.sort_values('R¬≤', ascending=False)
        
        print(eval_df.round(4))
        
        return eval_df
    
    def get_feature_importance(self, feature_names=None):
        """
        Extract feature importance from the best model

        Tree-based models report feature_importances_; linear models report
        the absolute value of coef_.

        Parameters:
        feature_names: list, names matching the preprocessed feature order;
                       when None they are taken from the fitted preprocessor,
                       falling back to feature_0, feature_1, ...

        Returns:
        pandas DataFrame with columns 'feature' and 'importance', sorted by
        importance (descending); None if no model has been trained or the
        best model exposes neither attribute
        """
        
        if self.best_model is None:
            print("No models trained yet!")
            return None
            
        best_pipeline = self.results[self.best_model]['pipeline']
        regressor = best_pipeline.named_steps['regressor']
        
        # Extract importance based on model type. This happens before the
        # name lookup so the fallback names can be sized from the result,
        # whichever kind of model won.
        if hasattr(regressor, 'feature_importances_'):
            # Tree-based models
            importances = regressor.feature_importances_
        elif hasattr(regressor, 'coef_'):
            # Linear models
            importances = np.abs(regressor.coef_)
        else:
            print(f"Feature importance not available for {self.best_model}")
            return None
        
        # Get feature names after preprocessing
        if feature_names is None:
            try:
                # Get feature names from preprocessor
                feature_names = best_pipeline.named_steps['preprocessor'].get_feature_names_out()
            except Exception:
                feature_names = [f'feature_{i}' for i in range(len(importances))]
        
        # Create importance DataFrame
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)
        
        return importance_df

# Initialize predictor with the unfitted preprocessor from prepare_ml_data
# and register the candidate regressors (nothing is trained yet).
predictor = EstimatedLossPredictor(preprocessor)
predictor.define_models()

print("‚úÖ EstimatedLossPredictor initialized!")

In [None]:
# Train all models: 5-fold cross-validation on the training set, then a
# final fit of every pipeline on the full training set.
predictor.train_models(X_train, y_train, cv_folds=5)

In [None]:
# Evaluate models on test set; the returned table is sorted by test R².
evaluation_results = predictor.evaluate_models(X_test, y_test)

In [None]:
# Feature importance analysis and visualization
def plot_feature_importance(importance_df, top_n=20, figsize=(12, 8), model_name=None):
    """
    Plot feature importance as a horizontal bar chart

    Parameters:
    importance_df: DataFrame with 'feature' and 'importance' columns; the
        first `top_n` rows are drawn, so it is expected to be sorted with
        the most important feature first
    top_n: maximum number of features to draw
    figsize: matplotlib figure size in inches
    model_name: model label for the title; when None the notebook-level
        `predictor.best_model` is used, which keeps existing calls unchanged
    """

    # Select top N features
    plot_data = importance_df.head(top_n)
    if plot_data.empty:
        # Nothing to draw; max() below would fail on an empty column
        print("No feature importances to plot")
        return None

    if model_name is None:
        model_name = predictor.best_model

    plt.figure(figsize=figsize)

    # Create horizontal bar plot
    positions = range(len(plot_data))
    bars = plt.barh(positions, plot_data['importance'],
                    color=plt.cm.viridis(np.linspace(0, 1, len(plot_data))))

    # Customize plot
    plt.yticks(positions, plot_data['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Top {top_n} Feature Importances - {model_name}',
              fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3, axis='x')

    # Add importance values on bars, nudged right by 1% of the longest bar
    label_offset = max(plot_data['importance']) * 0.01
    for bar, importance in zip(bars, plot_data['importance']):
        plt.text(bar.get_width() + label_offset,
                 bar.get_y() + bar.get_height()/2,
                 f'{importance:.4f}', ha='left', va='center', fontsize=9)

    plt.tight_layout()
    plt.show()

# Get and plot feature importance
importance_df = predictor.get_feature_importance()

# get_feature_importance() returns None when there is nothing to report,
# so only print and plot when a DataFrame came back.
if importance_df is not None:
    print("\nüîç FEATURE IMPORTANCE ANALYSIS:")
    print("=" * 50)
    print(f"Top 15 most important features for {predictor.best_model}:")
    print(importance_df.head(15).round(4))
    
    plot_feature_importance(importance_df, top_n=15)

In [None]:
# Model performance visualization
def plot_model_comparison(evaluation_results):
    """
    Render a 2x2 grid of bar charts comparing the evaluated models

    Parameters:
    evaluation_results: DataFrame with one row per model and the columns
        'Model', 'R¬≤', 'RMSE', 'MAE' and 'CV_R¬≤_Mean'
    """

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    (ax_r2, ax_rmse), (ax_cv, ax_mae) = axes

    def _metric_bars(ax, column, palette, title, ylabel):
        # One bar per model for a single metric column, with shared styling
        drawn = ax.bar(evaluation_results['Model'], evaluation_results[column],
                       color=palette(np.linspace(0, 1, len(evaluation_results))))
        ax.set_title(title, fontweight='bold')
        ax.set_ylabel(ylabel)
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True, alpha=0.3, axis='y')
        return drawn

    def _value_labels(ax, drawn, values, fmt, pad=None):
        # Write each value just above its bar; pad=None lifts the text by
        # 1% of the largest value in the series
        for rect, val in zip(drawn, values):
            lift = max(values) * 0.01 if pad is None else pad
            ax.text(rect.get_x() + rect.get_width()/2., rect.get_height() + lift,
                    format(val, fmt), ha='center', va='bottom', fontsize=9)

    # Top-left: R² per model (fixed label offset)
    r2_bars = _metric_bars(ax_r2, 'R¬≤', plt.cm.Set3,
                           'R¬≤ Score Comparison', 'R¬≤ Score')
    _value_labels(ax_r2, r2_bars, evaluation_results['R¬≤'], '.3f', pad=0.01)

    # Top-right: RMSE per model
    rmse_bars = _metric_bars(ax_rmse, 'RMSE', plt.cm.Set2,
                             'RMSE Comparison (Lower is Better)', 'RMSE')
    _value_labels(ax_rmse, rmse_bars, evaluation_results['RMSE'], '.1f')

    # Bottom-left: cross-validation vs test R², side by side
    positions = np.arange(len(evaluation_results['Model']))
    bar_width = 0.35
    ax_cv.bar(positions - bar_width/2, evaluation_results['CV_R¬≤_Mean'], bar_width,
              label='CV R¬≤ Mean', alpha=0.7, color='skyblue')
    ax_cv.bar(positions + bar_width/2, evaluation_results['R¬≤'], bar_width,
              label='Test R¬≤', alpha=0.7, color='lightcoral')
    ax_cv.set_title('Cross-Validation vs Test Performance', fontweight='bold')
    ax_cv.set_ylabel('R¬≤ Score')
    ax_cv.set_xticks(positions)
    ax_cv.set_xticklabels(evaluation_results['Model'], rotation=45)
    ax_cv.legend()
    ax_cv.grid(True, alpha=0.3, axis='y')

    # Bottom-right: MAE per model
    mae_bars = _metric_bars(ax_mae, 'MAE', plt.cm.Pastel1,
                            'MAE Comparison (Lower is Better)', 'Mean Absolute Error')
    _value_labels(ax_mae, mae_bars, evaluation_results['MAE'], '.1f')

    plt.tight_layout()
    plt.show()

# Plot model comparison
plot_model_comparison(evaluation_results)

In [None]:
# Prediction analysis and residuals
def analyze_predictions(predictor, X_test, y_test, sample_size=1000):
    """
    Analyze predictions from the best model

    Draws four diagnostic panels (actual vs predicted, residuals vs
    predicted, residual histogram with a normal overlay, Q-Q plot) and
    prints summary metrics.

    Parameters:
    predictor: object exposing `best_model` and `results[name]['pipeline']`
    X_test: test features passed to the pipeline's predict()
    y_test: test target; indexed with .iloc when sampling, so it is
        expected to be a pandas Series
    sample_size: maximum number of points used for the plots

    Returns:
    (y_pred, residuals): predictions for the whole test set, and the
    residuals of the plotted points (a random subset when the test set is
    larger than `sample_size`)
    """

    best_pipeline = predictor.results[predictor.best_model]['pipeline']
    y_pred = best_pipeline.predict(X_test)

    # Sample for visualization if dataset is large
    if len(y_test) > sample_size:
        indices = np.random.choice(len(y_test), sample_size, replace=False)
        y_test_sample = y_test.iloc[indices]
        y_pred_sample = y_pred[indices]
    else:
        y_test_sample = y_test
        y_pred_sample = y_pred

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Actual vs Predicted
    ax1 = axes[0, 0]
    ax1.scatter(y_test_sample, y_pred_sample, alpha=0.6, s=20)

    # Perfect prediction line
    min_val = min(y_test_sample.min(), y_pred_sample.min())
    max_val = max(y_test_sample.max(), y_pred_sample.max())
    ax1.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2, label='Perfect Prediction')

    ax1.set_xlabel('Actual Values')
    ax1.set_ylabel('Predicted Values')
    ax1.set_title(f'Actual vs Predicted - {predictor.best_model}', fontweight='bold')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Add R² score (of the plotted sample)
    r2 = r2_score(y_test_sample, y_pred_sample)
    ax1.text(0.05, 0.95, f'R¬≤ = {r2:.3f}', transform=ax1.transAxes,
             bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    # 2. Residuals plot
    ax2 = axes[0, 1]
    residuals = y_test_sample - y_pred_sample
    ax2.scatter(y_pred_sample, residuals, alpha=0.6, s=20)
    ax2.axhline(y=0, color='r', linestyle='--', lw=2)
    ax2.set_xlabel('Predicted Values')
    ax2.set_ylabel('Residuals')
    ax2.set_title('Residuals Plot', fontweight='bold')
    ax2.grid(True, alpha=0.3)

    # 3. Residuals histogram
    ax3 = axes[1, 0]
    ax3.hist(residuals, bins=30, alpha=0.7, edgecolor='black')
    ax3.set_xlabel('Residuals')
    ax3.set_ylabel('Frequency')
    ax3.set_title('Distribution of Residuals', fontweight='bold')
    ax3.grid(True, alpha=0.3)

    # Add normal distribution overlay (drawn on a twin axis, so only its
    # shape, not its height, is comparable with the histogram)
    mu, sigma = residuals.mean(), residuals.std()
    x = np.linspace(residuals.min(), residuals.max(), 100)
    y = ((1 / (np.sqrt(2 * np.pi) * sigma)) *
         np.exp(-0.5 * (1 / sigma * (x - mu)) ** 2))
    ax3_twin = ax3.twinx()
    ax3_twin.plot(x, y * len(residuals) * (x[1] - x[0]), 'r-', lw=2, label='Normal Distribution')
    ax3_twin.set_ylabel('Density (scaled)')
    ax3_twin.legend()

    # 4. Q-Q plot for residuals normality
    ax4 = axes[1, 1]
    from scipy import stats
    stats.probplot(residuals, dist="norm", plot=ax4)
    ax4.set_title('Q-Q Plot (Residuals Normality)', fontweight='bold')
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print summary statistics (metrics use the full test set; the residual
    # statistics use the plotted sample)
    print(f"\nüìà PREDICTION ANALYSIS SUMMARY:")
    print(f"Model: {predictor.best_model}")
    print(f"R¬≤ Score: {r2_score(y_test, y_pred):.4f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"Mean Residual: {residuals.mean():.4f}")
    print(f"Std Residual: {residuals.std():.2f}")

    return y_pred, residuals

# Analyze predictions
y_pred, residuals = analyze_predictions(predictor, X_test, y_test)

In [None]:
# Hyperparameter tuning for the best model
def hyperparameter_tuning(predictor, X_train, y_train, model_name=None):
    """
    Perform hyperparameter tuning for the best model

    Parameters:
    predictor: object exposing `best_model` and `results[name]` entries with
        'pipeline' and 'cv_mean' keys
    X_train, y_train: training data passed to GridSearchCV.fit()
    model_name: model to tune; defaults to `predictor.best_model`

    Returns:
    The best estimator found by the grid search, or None when no parameter
    grid is defined for `model_name`
    """

    if model_name is None:
        model_name = predictor.best_model

    print(f"üîß Hyperparameter tuning for {model_name}...")

    # Define parameter grids for different models
    # (keys are prefixed with the pipeline step name 'regressor')
    param_grids = {
        'Random Forest': {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__max_depth': [5, 10, 15, None],
            'regressor__min_samples_split': [2, 5, 10]
        },
        'Gradient Boosting': {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__max_depth': [3, 5, 7],
            'regressor__learning_rate': [0.01, 0.1, 0.2]
        },
        'XGBoost': {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__max_depth': [3, 5, 7],
            'regressor__learning_rate': [0.01, 0.1, 0.2]
        }
    }

    if model_name not in param_grids:
        print(f"No hyperparameter grid defined for {model_name}")
        return None

    # Get the base pipeline
    base_pipeline = predictor.results[model_name]['pipeline']

    # Perform grid search
    grid_search = GridSearchCV(
        base_pipeline,
        param_grids[model_name],
        cv=3,  # Reduced for speed
        scoring='r2',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)

    print(f"\n‚úÖ Best parameters for {model_name}:")
    for param, value in grid_search.best_params_.items():
        print(f"  {param}: {value}")

    print(f"\nBest CV score: {grid_search.best_score_:.4f}")
    print(f"Improvement over default: {grid_search.best_score_ - predictor.results[model_name]['cv_mean']:.4f}")

    return grid_search.best_estimator_

# Perform hyperparameter tuning for the best model (if it's a tree-based model)
if predictor.best_model in ['Random Forest', 'Gradient Boosting', 'XGBoost']:
    tuned_model = hyperparameter_tuning(predictor, X_train, y_train)
else:
    print(f"Hyperparameter tuning not implemented for {predictor.best_model}")
    tuned_model = None

In [None]:
# Model deployment functions
def save_model(model, filepath='best_estimated_loss_model.pkl'):
    """
    Save the trained model

    Parameters:
    model: object to serialize (here, a fitted pipeline)
    filepath: destination path of the joblib file
    """
    import joblib
    joblib.dump(model, filepath)
    print(f"‚úÖ Model saved to {filepath}")

def load_model(filepath='best_estimated_loss_model.pkl'):
    """
    Load a saved model

    NOTE: joblib files are pickle-based and can execute arbitrary code when
    loaded; only load files from a trusted source.
    """
    import joblib
    return joblib.load(filepath)

def predict_new_data(model, new_data):
    """
    Make predictions on new data

    Parameters:
    model: trained sklearn pipeline
    new_data: pandas DataFrame with same features as training data

    Returns:
    predictions: numpy array of predictions
    """
    return model.predict(new_data)

# Example of saving the best model
best_model_pipeline = predictor.results[predictor.best_model]['pipeline']
save_model(best_model_pipeline, 'estimated_loss_predictor.pkl')

# Example of making predictions on new data
print("\nüîÆ EXAMPLE: Making predictions on new data")
print("=" * 50)

# Create sample new data
new_sample = pd.DataFrame({
    'transaction_amount': [150.0, 2500.0, 75.0],
    'account_age_days': [30, 365, 1200],
    'previous_claims': [0, 3, 1],
    'risk_score': [25.5, 75.2, 15.8],
    'merchant_rating': [4.5, 2.1, 4.8],
    'transaction_type': ['purchase', 'transfer', 'purchase'],
    'merchant_category': ['retail', 'travel', 'food'],
    'user_tier': ['bronze', 'gold', 'silver'],
    'payment_method': ['credit_card', 'bank_transfer', 'debit_card'],
    'country': ['US', 'UK', 'CA']
})

print("New data sample:")
print(new_sample)

# Make predictions
predictions = predict_new_data(best_model_pipeline, new_sample)

print(f"\nPredicted estimated_loss values:")
for i, pred in enumerate(predictions):
    print(f"Sample {i+1}: ${pred:.2f}")

## 🚀 Quick Start Guide for ML Model

### For Your Own Dataset:

```python
# 1. Load your data
df = pd.read_csv('your_data.csv')

# 2. Prepare data
X_train, X_test, y_train, y_test, preprocessor, num_cols, cat_cols = prepare_ml_data(
    df, target_col='estimated_loss'  # or your target column name
)

# 3. Initialize and train models
predictor = EstimatedLossPredictor(preprocessor)
predictor.define_models()
predictor.train_models(X_train, y_train)

# 4. Evaluate models
evaluation_results = predictor.evaluate_models(X_test, y_test)

# 5. Get feature importance
importance_df = predictor.get_feature_importance()

# 6. Save best model
best_model = predictor.results[predictor.best_model]['pipeline']
save_model(best_model, 'my_model.pkl')

# 7. Make predictions on new data
predictions = predict_new_data(best_model, new_data_df)
```

### Key Features of the ML Pipeline:

- ✅ **Multiple Models**: Linear, Ridge, Lasso, Decision Tree, Random Forest, Gradient Boosting, XGBoost
- ✅ **Automatic Preprocessing**: StandardScaler for numerical, OneHotEncoder for categorical
- ✅ **Cross-Validation**: 5-fold CV for robust model selection
- ✅ **Feature Importance**: Extract and visualize most important features
- ✅ **Model Comparison**: Comprehensive evaluation with multiple metrics
- ✅ **Hyperparameter Tuning**: Grid search for tree-based models
- ✅ **Prediction Analysis**: Residuals, Q-Q plots, actual vs predicted
- ✅ **Model Persistence**: Save and load trained models
- ✅ **Production Ready**: Functions for making predictions on new data

### Evaluation Metrics:
- **R² Score**: Proportion of variance explained (higher = better)
- **RMSE**: Root Mean Square Error (lower = better)
- **MAE**: Mean Absolute Error (lower = better)
- **Cross-Validation**: Robust performance estimation

# 🌲 SIMPLIFIED RANDOM FOREST MODEL

## Focused implementation using only RandomForestRegressor for estimated_loss prediction

In [None]:
# Simplified imports for Random Forest only
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_absolute_percentage_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Create MAPE scorer for sklearn using built-in function.
# greater_is_better=False makes the scorer return the negated MAPE (as a
# decimal fraction), so code that reports it flips the sign and scales by 100.
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

print("‚úÖ Random Forest ML libraries imported successfully!")
print("‚úÖ Using sklearn's built-in mean_absolute_percentage_error function!")

In [None]:
# Simplified Random Forest Predictor Class
class RandomForestLossPredictor:
    """
    Simplified predictor using only Random Forest for estimated_loss prediction

    Intended call order: prepare_data() -> train_model() -> evaluate_model(),
    after which get_feature_importance(), plot_feature_importance() and
    predict() can be used.
    """

    def __init__(self, n_estimators=100, max_depth=10, random_state=42):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.model = None
        self.pipeline = None
        self.feature_names = None
        self.feature_importance_df = None
        # Filled in by prepare_data(); declared here so that "data not
        # prepared yet" can be detected explicitly.
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.preprocessor = None
        self.numerical_cols = None
        self.categorical_cols = None

    def prepare_data(self, df, target_col, test_size=0.2):
        """
        Prepare data for Random Forest training

        Splits `df` into train/test sets and builds (but does not fit) the
        preprocessing transformer; everything is also stored on the instance.

        Parameters:
        df: DataFrame holding the features and the target column
        target_col: name of the target column
        test_size: fraction of rows held out for testing

        Returns:
        X_train, X_test, y_train, y_test
        """
        print("üîÑ Preparing data...")

        # Separate features and target
        X = df.drop(columns=[target_col])
        y = df[target_col]

        # Identify column types
        numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

        print(f"Numerical features ({len(numerical_cols)}): {numerical_cols}")
        print(f"Categorical features ({len(categorical_cols)}): {categorical_cols}")

        # Create preprocessing pipeline.
        # handle_unknown='ignore' encodes a category that was not seen during
        # fitting as all zeros instead of raising, so a rare level that lands
        # only in a CV fold, the test split or new data does not abort the run.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_cols),
                ('cat', OneHotEncoder(drop='first', sparse_output=False,
                                      handle_unknown='ignore'), categorical_cols)
            ])

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=self.random_state
        )

        print(f"Training set: {X_train.shape[0]} samples")
        print(f"Test set: {X_test.shape[0]} samples")

        # Store for later use
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.preprocessor = preprocessor
        self.numerical_cols = numerical_cols
        self.categorical_cols = categorical_cols

        return X_train, X_test, y_train, y_test

    def train_model(self, cv_folds=5):
        """
        Train Random Forest model with cross-validation using MAPE

        Parameters:
        cv_folds: number of cross-validation folds

        Returns:
        Array of per-fold cross-validation MAPE values, in percent

        Raises:
        RuntimeError: if prepare_data() has not been called
        """
        if self.preprocessor is None or self.X_train is None:
            raise RuntimeError("Call prepare_data() before train_model()")

        print("üå≤ Training Random Forest model...")

        # Create Random Forest model
        rf_model = RandomForestRegressor(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            random_state=self.random_state,
            n_jobs=-1
        )

        # Create pipeline
        self.pipeline = Pipeline([
            ('preprocessor', self.preprocessor),
            ('regressor', rf_model)
        ])

        # Cross-validation with MAPE (note: sklearn returns negative MAPE as decimal)
        cv_scores = cross_val_score(self.pipeline, self.X_train, self.y_train,
                                    cv=cv_folds, scoring=mape_scorer)

        # Convert back to positive MAPE values as percentages
        cv_mape_scores = -cv_scores * 100

        print(f"Cross-validation MAPE scores: {cv_mape_scores}")
        print(f"Mean CV MAPE: {cv_mape_scores.mean():.4f}% (+/- {cv_mape_scores.std() * 2:.4f}%)")

        # Fit the model
        self.pipeline.fit(self.X_train, self.y_train)
        self.model = self.pipeline.named_steps['regressor']

        # Get feature names after preprocessing
        try:
            self.feature_names = self.pipeline.named_steps['preprocessor'].get_feature_names_out()
        except (AttributeError, ValueError):
            # Fallback if feature names not available
            n_features = len(self.model.feature_importances_)
            self.feature_names = [f'feature_{i}' for i in range(n_features)]

        print("‚úÖ Model training completed!")

        return cv_mape_scores

    def evaluate_model(self):
        """
        Evaluate Random Forest model on test set with MAPE as primary metric

        Returns:
        Dict with 'mape' (percent), 'r2', 'rmse', 'mae' and 'predictions'

        Raises:
        RuntimeError: if train_model() has not been called
        """
        if self.pipeline is None:
            raise RuntimeError("Call train_model() before evaluate_model()")

        print("üìä Evaluating model performance...")

        # Make predictions
        y_pred = self.pipeline.predict(self.X_test)

        # Calculate MAPE (sklearn returns as decimal, convert to percentage)
        mape = mean_absolute_percentage_error(self.y_test, y_pred) * 100

        # Calculate other metrics
        r2 = r2_score(self.y_test, y_pred)
        mse = mean_squared_error(self.y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(self.y_test, y_pred)

        print("Test Set Performance:")
        print(f"MAPE: {mape:.4f}% üéØ (Primary Metric)")
        print(f"R¬≤ Score: {r2:.4f}")
        print(f"RMSE: {rmse:.2f}")
        print(f"MAE: {mae:.2f}")

        # MAPE interpretation
        if mape < 10:
            print("üìà Excellent prediction accuracy (MAPE < 10%)")
        elif mape < 20:
            print("üìä Good prediction accuracy (MAPE < 20%)")
        elif mape < 50:
            print("‚ö†Ô∏è Reasonable prediction accuracy (MAPE < 50%)")
        else:
            print("‚ùå Poor prediction accuracy (MAPE > 50%)")

        return {
            'mape': mape,
            'r2': r2,
            'rmse': rmse,
            'mae': mae,
            'predictions': y_pred
        }

    def get_feature_importance(self, top_n=15):
        """
        Extract and return feature importance

        Parameters:
        top_n: number of rows printed (the full table is returned)

        Returns:
        DataFrame with 'feature' and 'importance' columns sorted by
        descending importance, or None if the model is not trained
        """
        if self.model is None:
            print("Model not trained yet!")
            return None

        # Get feature importances
        importances = self.model.feature_importances_

        # Create DataFrame (cached on the instance for plot_feature_importance)
        self.feature_importance_df = pd.DataFrame({
            'feature': self.feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)

        print(f"Top {top_n} Feature Importances:")
        print(self.feature_importance_df.head(top_n).round(4))

        return self.feature_importance_df

    def plot_feature_importance(self, top_n=15, figsize=(12, 8)):
        """
        Plot feature importance

        Computes the importances first if they are not cached yet; returns
        without plotting when the model is not trained.
        """
        if self.feature_importance_df is None and self.get_feature_importance(top_n) is None:
            # get_feature_importance() already reported the untrained model
            return None

        plot_data = self.feature_importance_df.head(top_n)

        plt.figure(figsize=figsize)

        bars = plt.barh(range(len(plot_data)), plot_data['importance'],
                        color=plt.cm.viridis(np.linspace(0, 1, len(plot_data))))

        plt.yticks(range(len(plot_data)), plot_data['feature'])
        plt.xlabel('Feature Importance')
        plt.title(f'Top {top_n} Feature Importances - Random Forest',
                  fontsize=14, fontweight='bold')
        plt.grid(True, alpha=0.3, axis='x')

        # Add values on bars, nudged right by 1% of the longest bar
        label_offset = max(plot_data['importance']) * 0.01
        for bar, importance in zip(bars, plot_data['importance']):
            plt.text(bar.get_width() + label_offset,
                     bar.get_y() + bar.get_height()/2,
                     f'{importance:.4f}', ha='left', va='center', fontsize=9)

        plt.tight_layout()
        plt.show()

    def predict(self, new_data):
        """
        Make predictions on new data

        Returns the pipeline's predictions, or None if the model is not
        trained yet.
        """
        if self.pipeline is None:
            print("Model not trained yet!")
            return None

        return self.pipeline.predict(new_data)

print("‚úÖ RandomForestLossPredictor class defined!")

In [None]:
# Initialize and train the Random Forest model
print("üöÄ RANDOM FOREST MODEL TRAINING")
print("=" * 50)

# Initialize predictor
rf_predictor = RandomForestLossPredictor(
    n_estimators=100,
    max_depth=10, 
    random_state=42
)

# Prepare data (reusing the dataset from earlier)
# NOTE: this rebinds the notebook-level X_train/X_test/y_train/y_test names
# that the earlier multi-model cells also use.
# NOTE(review): `df_ml` is not defined in this cell; it must come from an
# earlier one.
X_train, X_test, y_train, y_test = rf_predictor.prepare_data(df_ml, 'estimated_loss')

# Train model (returns the per-fold cross-validation MAPE values in percent)
cv_scores = rf_predictor.train_model(cv_folds=5)

In [None]:
# Evaluate the model
# NOTE: this rebinds the notebook-level `evaluation_results` (assigned from
# predictor.evaluate_models() in an earlier cell) to the metrics dict
# returned by evaluate_model().
evaluation_results = rf_predictor.evaluate_model()

In [None]:
# Get and plot feature importance
# get_feature_importance() stores the sorted DataFrame on the predictor, so
# plot_feature_importance() reuses it instead of recomputing.
feature_importance = rf_predictor.get_feature_importance(top_n=15)
rf_predictor.plot_feature_importance(top_n=15)

In [None]:
# Prediction visualization for Random Forest with MAPE focus
def plot_rf_predictions(rf_predictor, sample_size=1000):
    """
    Draw three diagnostic panels for the Random Forest test predictions:
    actual vs predicted, residuals vs predicted, and the distribution of
    absolute percentage errors.

    Parameters:
    rf_predictor: trained predictor exposing `pipeline`, `X_test`, `y_test`
    sample_size: maximum number of points plotted; larger test sets are
        randomly subsampled without replacement

    Returns:
    (y_pred, residuals): predictions for the whole test set, and the
    residuals of the plotted points
    """
    actual_all = rf_predictor.y_test
    y_pred = rf_predictor.pipeline.predict(rf_predictor.X_test)

    # Plot everything unless the test set exceeds the sampling cap
    actual, predicted = actual_all, y_pred
    if len(actual_all) > sample_size:
        picked = np.random.choice(len(actual_all), sample_size, replace=False)
        actual, predicted = actual_all.iloc[picked], y_pred[picked]

    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    ax_scatter, ax_resid, ax_pct = axes[0], axes[1], axes[2]

    # Panel 1: actual vs predicted, with the identity line as reference
    ax_scatter.scatter(actual, predicted, alpha=0.6, s=30, color='forestgreen')
    lo = min(actual.min(), predicted.min())
    hi = max(actual.max(), predicted.max())
    ax_scatter.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction')
    ax_scatter.set_xlabel('Actual Estimated Loss')
    ax_scatter.set_ylabel('Predicted Estimated Loss')
    ax_scatter.set_title('Random Forest: Actual vs Predicted', fontsize=14, fontweight='bold')
    ax_scatter.legend()
    ax_scatter.grid(True, alpha=0.3)

    # MAPE of the plotted points, shown in the corner of panel 1
    mape = mean_absolute_percentage_error(actual, predicted) * 100
    ax_scatter.text(0.05, 0.95, f'MAPE = {mape:.2f}%', transform=ax_scatter.transAxes,
                    bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.8))

    # Panel 2: residuals against predictions
    residuals = actual - predicted
    ax_resid.scatter(predicted, residuals, alpha=0.6, s=30, color='forestgreen')
    ax_resid.axhline(y=0, color='r', linestyle='--', lw=2)
    ax_resid.set_xlabel('Predicted Estimated Loss')
    ax_resid.set_ylabel('Residuals')
    ax_resid.set_title('Random Forest: Residuals Plot', fontsize=14, fontweight='bold')
    ax_resid.grid(True, alpha=0.3)

    # Panel 3: histogram of absolute percentage errors; non-finite values
    # (e.g. from a zero actual) are dropped before plotting
    pct_err = np.abs(residuals / actual) * 100
    pct_err = pct_err[np.isfinite(pct_err)]
    ax_pct.hist(pct_err, bins=30, alpha=0.7, color='forestgreen', edgecolor='black')
    ax_pct.axvline(mape, color='red', linestyle='--', linewidth=2, label=f'Mean APE = {mape:.2f}%')
    ax_pct.set_xlabel('Absolute Percentage Error (%)')
    ax_pct.set_ylabel('Frequency')
    ax_pct.set_title('Distribution of Absolute Percentage Errors', fontsize=14, fontweight='bold')
    ax_pct.legend()
    ax_pct.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return y_pred, residuals

# Plot predictions with MAPE focus
y_pred_rf, residuals_rf = plot_rf_predictions(rf_predictor)

In [None]:
# Hyperparameter tuning for Random Forest using MAPE
def tune_random_forest(rf_predictor, cv_folds=3):
    """
    Perform hyperparameter tuning for Random Forest using MAPE

    Runs a grid search over the predictor's pipeline, then compares the
    tuned model's test-set MAPE with that of the predictor's current
    pipeline. All MAPE values are reported in percent.

    Parameters:
    rf_predictor: trained predictor exposing `pipeline`, `X_train`,
        `y_train`, `X_test` and `y_test`
    cv_folds: number of cross-validation folds for the grid search

    Returns:
    The best estimator found by the grid search
    """
    print("üîß Tuning Random Forest hyperparameters using MAPE...")

    # Define parameter grid
    param_grid = {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [5, 10, 15, None],
        'regressor__min_samples_split': [2, 5, 10],
        'regressor__min_samples_leaf': [1, 2, 4]
    }

    # Perform grid search with MAPE
    grid_search = GridSearchCV(
        rf_predictor.pipeline,
        param_grid,
        cv=cv_folds,
        scoring=mape_scorer,  # Using MAPE scorer
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(rf_predictor.X_train, rf_predictor.y_train)

    print("\n‚úÖ Best parameters:")
    for param, value in grid_search.best_params_.items():
        print(f"  {param}: {value}")

    # The scorer yields the negated MAPE as a decimal fraction: flip the
    # sign and scale by 100 so the printed "%" is actually a percentage
    best_cv_mape = -grid_search.best_score_ * 100
    print(f"\nBest CV MAPE score: {best_cv_mape:.4f}%")

    # Evaluate tuned model on test set (in percent, like evaluate_model())
    y_pred_tuned = grid_search.best_estimator_.predict(rf_predictor.X_test)
    mape_tuned = mean_absolute_percentage_error(rf_predictor.y_test, y_pred_tuned) * 100

    # Baseline comes from the predictor's own fitted pipeline rather than a
    # notebook-level variable, so both numbers share one unit and one test set
    y_pred_original = rf_predictor.pipeline.predict(rf_predictor.X_test)
    mape_original = mean_absolute_percentage_error(rf_predictor.y_test, y_pred_original) * 100

    print(f"Tuned model test MAPE: {mape_tuned:.4f}%")
    print(f"Original model test MAPE: {mape_original:.4f}%")
    improvement = mape_original - mape_tuned
    print(f"MAPE improvement: {improvement:.4f}% {'‚úÖ' if improvement > 0 else '‚ùå'}")

    return grid_search.best_estimator_

# Perform hyperparameter tuning
tuned_rf_model = tune_random_forest(rf_predictor)

In [None]:
# Save Random Forest model and make predictions
import joblib

def save_rf_model(model, filepath='random_forest_estimated_loss.pkl'):
    """Save the Random Forest model to `filepath` with joblib"""
    joblib.dump(model, filepath)
    print(f"‚úÖ Random Forest model saved to {filepath}")

def load_rf_model(filepath='random_forest_estimated_loss.pkl'):
    """Load a saved Random Forest model

    NOTE: joblib files are pickle-based and can execute arbitrary code when
    loaded; only load files from a trusted source.
    """
    return joblib.load(filepath)

# Save the model
save_rf_model(rf_predictor.pipeline, 'random_forest_estimated_loss.pkl')

# Example predictions on new data
print("\nüîÆ RANDOM FOREST PREDICTIONS ON NEW DATA")
print("=" * 60)

# Create sample new data
new_transactions = pd.DataFrame({
    'transaction_amount': [200.0, 1500.0, 50.0, 3000.0],
    'account_age_days': [45, 500, 10, 1000],
    'previous_claims': [0, 2, 0, 5],
    'risk_score': [20.0, 65.0, 15.0, 85.0],
    'merchant_rating': [4.8, 3.2, 4.5, 2.0],
    'transaction_type': ['purchase', 'transfer', 'purchase', 'withdrawal'],
    'merchant_category': ['retail', 'travel', 'food', 'other'],
    'user_tier': ['bronze', 'gold', 'bronze', 'platinum'],
    'payment_method': ['credit_card', 'bank_transfer', 'debit_card', 'digital_wallet'],
    'country': ['US', 'UK', 'CA', 'DE']
})

print("New transaction data:")
print(new_transactions)

# Make predictions
rf_predictions = rf_predictor.predict(new_transactions)

print("\nRandom Forest Predicted estimated_loss values:")
# Pair each input row with its prediction by position
for i, (_, row) in enumerate(new_transactions.iterrows()):
    print(f"Transaction {i+1}: ${rf_predictions[i]:.2f}")
    print(f"  - Amount: ${row['transaction_amount']:.2f}, Risk: {row['risk_score']:.1f}, Type: {row['transaction_type']}")

print(f"\nAverage predicted loss: ${rf_predictions.mean():.2f}")
print(f"Max predicted loss: ${rf_predictions.max():.2f}")
print(f"Min predicted loss: ${rf_predictions.min():.2f}")

## 🌲 Random Forest Quick Start Guide - MAPE Focused

### Simple Usage with MAPE as Primary Metric:

```python
# 1. Initialize Random Forest predictor
rf_predictor = RandomForestLossPredictor(
    n_estimators=100,
    max_depth=10,
    random_state=42
)

# 2. Prepare your data
X_train, X_test, y_train, y_test = rf_predictor.prepare_data(
    df, 'estimated_loss'  # your target column
)

# 3. Train the model with MAPE cross-validation
cv_mape_scores = rf_predictor.train_model(cv_folds=5)

# 4. Evaluate performance (MAPE is primary metric)
results = rf_predictor.evaluate_model()

# 5. Detailed MAPE analysis
mape_analysis = analyze_mape_performance(rf_predictor)

# 6. Get feature importance
importance = rf_predictor.get_feature_importance()
rf_predictor.plot_feature_importance()

# 7. Make predictions
predictions = rf_predictor.predict(new_data)
```

### Why MAPE is Perfect for estimated_loss:

- ✅ **Percentage-Based**: Easy to interpret (e.g., "20% error")
- ✅ **Scale-Independent**: Works for small and large loss amounts
- ✅ **Business-Friendly**: Management understands percentages
- ✅ **Relative Error**: More meaningful than absolute errors
- ✅ **Benchmark Standard**: Industry standard for forecasting accuracy

### MAPE Interpretation Guidelines:
- **< 10%**: Excellent prediction accuracy 📈
- **10-20%**: Good prediction accuracy 📊  
- **20-50%**: Fair prediction accuracy ⚠️
- **> 50%**: Poor prediction accuracy ❌

### MAPE Advantages for Financial Predictions:
- **Risk Assessment**: Understand prediction uncertainty as %
- **Portfolio Planning**: Compare accuracy across different loss ranges
- **Resource Allocation**: Budget based on prediction confidence
- **Model Comparison**: Easy to compare different models
- **Stakeholder Communication**: Simple percentage format

In [None]:
# MAPE Analysis and Interpretation
def analyze_mape_performance(rf_predictor, threshold_percentages=(10, 20, 30)):
    """
    Analyze MAPE performance of a fitted predictor on its stored test split.

    Prints the overall MAPE with a qualitative rating, the share of predictions
    falling within each error threshold, percentiles of the per-sample
    percentage error, and an aggregate (total actual vs. total predicted) summary.

    Parameters:
    - rf_predictor: object exposing `y_test`, `X_test` and a `pipeline`
      with a `predict` method
    - threshold_percentages: iterable of percentage-error cut-offs to report
      (default 10, 20 and 30)

    Returns:
    - dict with keys 'overall_mape', 'percentage_errors' (finite per-sample
      percentage errors) and 'within_thresholds' (count of predictions per cut-off)
    """
    # Materialize once so one-shot iterables can be traversed twice below
    thresholds = tuple(threshold_percentages)

    y_test = rf_predictor.y_test
    y_pred = rf_predictor.pipeline.predict(rf_predictor.X_test)

    # Calculate individual percentage errors
    percentage_errors = np.abs((y_test - y_pred) / y_test) * 100
    # Zero actuals yield inf/nan; drop them so they cannot distort the statistics
    percentage_errors = percentage_errors[np.isfinite(percentage_errors)]
    n_errors = len(percentage_errors)

    print("üéØ DETAILED MAPE ANALYSIS")
    print("=" * 50)

    # Overall MAPE
    # NOTE(review): the rating thresholds below assume this value is expressed in
    # percent (e.g. 12.5). If mean_absolute_percentage_error is scikit-learn's
    # implementation it returns a fraction (0.125) — confirm and scale if needed.
    overall_mape = mean_absolute_percentage_error(y_test, y_pred)
    print(f"Overall MAPE: {overall_mape:.2f}%")

    # MAPE interpretation
    if overall_mape < 10:
        print("üìà EXCELLENT prediction accuracy!")
    elif overall_mape < 20:
        print("üìä GOOD prediction accuracy")
    elif overall_mape < 30:
        print("‚ö†Ô∏è FAIR prediction accuracy")
    else:
        print("‚ùå POOR prediction accuracy - consider model improvements")

    print(f"\nüìä Prediction Accuracy Distribution:")

    # Count predictions within each error threshold once; reused for the return value
    within_counts = {t: (percentage_errors <= t).sum() for t in thresholds}
    for threshold, within_threshold in within_counts.items():
        # Guard against an empty error set (e.g. every actual value was zero)
        percentage_within = (within_threshold / n_errors) * 100 if n_errors else 0.0
        print(f"Within {threshold}% error: {within_threshold}/{n_errors} predictions ({percentage_within:.1f}%)")

    # Quartile analysis
    print(f"\nüìà Error Distribution Quartiles:")
    if n_errors:
        print(f"25th percentile: {np.percentile(percentage_errors, 25):.2f}%")
        print(f"50th percentile (median): {np.percentile(percentage_errors, 50):.2f}%")
        print(f"75th percentile: {np.percentile(percentage_errors, 75):.2f}%")
        print(f"95th percentile: {np.percentile(percentage_errors, 95):.2f}%")
    else:
        print("No finite percentage errors available (all actual values are zero or missing)")

    # Business impact analysis
    print(f"\nüí∞ Business Impact Analysis:")
    total_actual_loss = y_test.sum()
    total_predicted_loss = y_pred.sum()
    total_error = abs(total_actual_loss - total_predicted_loss)
    # A zero total makes the relative error undefined; report NaN rather than inf
    if total_actual_loss != 0:
        total_error_percentage = (total_error / total_actual_loss) * 100
    else:
        total_error_percentage = float('nan')

    print(f"Total Actual Loss: ${total_actual_loss:,.2f}")
    print(f"Total Predicted Loss: ${total_predicted_loss:,.2f}")
    print(f"Total Absolute Error: ${total_error:,.2f}")
    print(f"Total Error Percentage: {total_error_percentage:.2f}%")

    return {
        'overall_mape': overall_mape,
        'percentage_errors': percentage_errors,
        'within_thresholds': {f'{t}%': count for t, count in within_counts.items()}
    }

# Run detailed MAPE analysis on rf_predictor and keep the returned summary dict
mape_analysis = analyze_mape_performance(rf_predictor)

## 🔍 MAPE TROUBLESHOOTING & PIPELINE ANALYSIS

### Common Causes of High MAPE Scores and Solutions

In [None]:
# PIPELINE ANALYSIS: Identifying High MAPE Causes
def diagnose_high_mape_issues(df, target_col='estimated_loss'):
    """
    Diagnose potential causes of high MAPE scores in the pipeline.

    Prints a report covering zero/small target values, target distribution
    (spread, coefficient of variation, skewness), strength of the linear
    relationship between numerical features and the target, a rough
    signal-to-noise estimate, and the sample-to-feature ratio.

    Parameters:
    - df: DataFrame containing the target column and the feature columns
    - target_col: name of the numerical target column

    Returns:
    - dict with keys 'zero_values', 'small_values', 'cv', 'skewness',
      'weak_correlations', 'snr' (NaN when the columns needed for the
      estimate are absent) and 'sample_feature_ratio'
    """
    from scipy.stats import skew

    target = df[target_col]
    n_rows = len(df)

    print("üîç DIAGNOSING POTENTIAL HIGH MAPE CAUSES")
    print("=" * 60)

    # Issue 1: Zero or very small target values
    print("1Ô∏è‚É£ ZERO/SMALL TARGET VALUE ANALYSIS:")
    zero_count = (target == 0).sum()
    small_values = (target < 1).sum()
    very_small_values = (target < 0.1).sum()

    print(f"   Zero values: {zero_count}/{n_rows} ({zero_count/n_rows*100:.2f}%)")
    print(f"   Values < 1: {small_values}/{n_rows} ({small_values/n_rows*100:.2f}%)")
    print(f"   Values < 0.1: {very_small_values}/{n_rows} ({very_small_values/n_rows*100:.2f}%)")

    if zero_count > 0:
        print("   ‚ö†Ô∏è  CRITICAL ISSUE: Zero values will cause infinite MAPE!")
    if small_values > n_rows * 0.1:
        print("   ‚ö†Ô∏è  WARNING: Many small values will inflate MAPE significantly!")

    # Issue 2: Target distribution analysis
    print(f"\n2Ô∏è‚É£ TARGET DISTRIBUTION ANALYSIS:")
    print(f"   Min: {target.min():.4f}")
    print(f"   Max: {target.max():.4f}")
    print(f"   Mean: {target.mean():.4f}")
    print(f"   Median: {target.median():.4f}")
    print(f"   Std: {target.std():.4f}")

    # Check for high variance
    cv = target.std() / target.mean()
    print(f"   Coefficient of Variation: {cv:.4f}")
    if cv > 1:
        print("   ‚ö†Ô∏è  WARNING: High variance (CV > 1) makes prediction difficult!")

    # Issue 3: Skewness analysis
    skewness = skew(target)
    print(f"   Skewness: {skewness:.4f}")
    if abs(skewness) > 2:
        print("   ‚ö†Ô∏è  WARNING: Highly skewed data may need transformation!")

    # Issue 4: Feature-target relationship strength
    print(f"\n3Ô∏è‚É£ FEATURE-TARGET RELATIONSHIP ANALYSIS:")
    # Filter instead of list.remove() so a target absent from the numeric
    # columns does not raise ValueError
    numerical_cols = [
        col for col in df.select_dtypes(include=[np.number]).columns
        if col != target_col
    ]

    # A NaN correlation (e.g. constant feature) compares False and is not counted
    weak_correlations = sum(
        1 for col in numerical_cols if abs(df[col].corr(target)) < 0.1
    )

    print(f"   Features with weak correlation (|r| < 0.1): {weak_correlations}/{len(numerical_cols)}")
    if weak_correlations > len(numerical_cols) * 0.5:
        print("   ‚ö†Ô∏è  WARNING: Many features have weak relationships with target!")

    # Issue 5: Data generation noise analysis
    print(f"\n4Ô∏è‚É£ DATA GENERATION ISSUES:")
    # Check if data was generated with high noise
    print("   Checking synthetic data generation patterns...")

    # Signal-to-noise estimate: the coefficients and noise level below are
    # hard-coded assumptions about how the synthetic data was generated
    missing_signal_cols = [
        col for col in ('risk_score', 'previous_claims') if col not in df.columns
    ]
    if missing_signal_cols:
        # Without these columns the estimate cannot be formed; report NaN
        # instead of failing with KeyError
        snr = float('nan')
        print(f"   Skipping Signal-to-Noise estimate (missing columns: {missing_signal_cols})")
    else:
        feature_effects_sum = (
            df['risk_score'].mean() * 0.5 +  # Risk score effect
            df['previous_claims'].mean() * 8 +  # Claims effect
            20  # Average categorical effects
        )
        noise_std = 15  # From data generation
        snr = feature_effects_sum / noise_std
        print(f"   Estimated Signal-to-Noise Ratio: {snr:.2f}")
        if snr < 3:
            print("   ‚ö†Ô∏è  WARNING: High noise relative to signal!")

    # Issue 6: Model complexity vs data size
    print(f"\n5Ô∏è‚É£ MODEL COMPLEXITY ANALYSIS:")
    n_features_est = len(numerical_cols) + 20  # Estimate with one-hot encoding
    ratio = n_rows / n_features_est
    print(f"   Sample-to-feature ratio: {ratio:.1f}")
    if ratio < 10:
        print("   ‚ö†Ô∏è  WARNING: Low sample-to-feature ratio may cause overfitting!")

    return {
        'zero_values': zero_count,
        'small_values': small_values,
        'cv': cv,
        'skewness': skewness,
        'weak_correlations': weak_correlations,
        'snr': snr,
        'sample_feature_ratio': ratio
    }

# Run diagnostic analysis on df_ml (default target column 'estimated_loss')
# and keep the returned metrics dict
diagnostic_results = diagnose_high_mape_issues(df_ml)

In [None]:
# SOLUTIONS for High MAPE Issues
def implement_mape_improvements(df, target_col='estimated_loss'):
    """
    Implement solutions for common high MAPE causes.

    Works on a copy of `df` and adds derived columns:
    - '<target>_adjusted': target floored at 0.01 (no zeros)
    - '<target>_log': log1p-transformed target
    - '<target>_capped': target clipped to the IQR fence [max(Q1-1.5*IQR, 0), Q3+1.5*IQR]
    - 'amount_risk_interaction', 'claims_age_interaction': interaction features
    - 'amount_bins', 'risk_bins': quantile / fixed-width binned features

    Parameters:
    - df: DataFrame with the target column plus 'transaction_amount',
      'risk_score', 'previous_claims' and 'account_age_days'
    - target_col: name of the numerical target column

    Returns:
    - df_improved: copy of `df` with the additional columns
    """
    from scipy.stats import skew

    print("üîß IMPLEMENTING MAPE IMPROVEMENT SOLUTIONS")
    print("=" * 60)

    df_improved = df.copy()
    target = df_improved[target_col]

    # Solution 1: Handle zero/small values
    print("1Ô∏è‚É£ HANDLING ZERO/SMALL VALUES:")
    original_zeros = (target == 0).sum()

    # Floor the target at a small constant to avoid division by zero in MAPE
    min_threshold = 0.01
    df_improved[f'{target_col}_adjusted'] = target.clip(lower=min_threshold)

    adjusted_zeros = (df_improved[f'{target_col}_adjusted'] == 0).sum()
    print(f"   Fixed zero values: {original_zeros} ‚Üí {adjusted_zeros}")
    print(f"   Minimum value set to: {min_threshold}")

    # Solution 2: Log transformation for skewed data
    print(f"\n2Ô∏è‚É£ LOG TRANSFORMATION FOR SKEWED DATA:")
    original_skew = skew(target)

    # Apply log1p transformation (log(1+x) to handle zeros)
    df_improved[f'{target_col}_log'] = np.log1p(target)
    log_skew = skew(df_improved[f'{target_col}_log'])

    print(f"   Original skewness: {original_skew:.4f}")
    print(f"   Log-transformed skewness: {log_skew:.4f}")
    print(f"   Improvement: {abs(original_skew) - abs(log_skew):.4f}")

    # Solution 3: Outlier handling
    print(f"\n3Ô∏è‚É£ OUTLIER HANDLING:")
    Q1 = target.quantile(0.25)
    Q3 = target.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = ((target < lower_bound) | (target > upper_bound)).sum()

    # Cap outliers instead of removing them; losses cannot be negative,
    # so the lower cap never goes below zero
    capped_lower = max(lower_bound, 0)
    df_improved[f'{target_col}_capped'] = target.clip(
        lower=capped_lower, upper=upper_bound
    )

    print(f"   Outliers detected: {outliers}")
    print(f"   Capping range: [{capped_lower:.2f}, {upper_bound:.2f}]")

    # Solution 4: Feature engineering for better signal
    print(f"\n4Ô∏è‚É£ FEATURE ENGINEERING FOR BETTER SIGNAL:")

    # Create interaction features
    df_improved['amount_risk_interaction'] = (
        df_improved['transaction_amount'] * df_improved['risk_score']
    )
    # +1 keeps the reciprocal finite for accounts with age 0
    df_improved['claims_age_interaction'] = (
        df_improved['previous_claims'] * (1 / (df_improved['account_age_days'] + 1))
    )

    # Create binned features for non-linear patterns
    bin_labels = ['very_low', 'low', 'medium', 'high', 'very_high']
    df_improved['amount_bins'] = pd.qcut(df_improved['transaction_amount'],
                                         q=5, labels=bin_labels)
    # include_lowest=True so a risk score of exactly 0 lands in 'very_low'
    # instead of falling outside the first (0, 20] interval and becoming NaN
    df_improved['risk_bins'] = pd.cut(df_improved['risk_score'],
                                      bins=[0, 20, 40, 60, 80, 100],
                                      labels=bin_labels,
                                      include_lowest=True)

    print(f"   Added interaction features: 2")
    print(f"   Added binned features: 2")

    # Solution 5: Robust scaling instead of standard scaling
    # (recommendation only; no scaler is applied here, so nothing is imported)
    print(f"\n5Ô∏è‚É£ ROBUST PREPROCESSING:")

    print("   Recommendation: Use RobustScaler instead of StandardScaler")
    print("   Reason: Less sensitive to outliers")

    return df_improved

# Apply improvements to df_ml; the function works on a copy and returns the augmented DataFrame
df_improved = implement_mape_improvements(df_ml)

In [None]:
# IMPROVED RandomForestLossPredictor with MAPE optimizations
class ImprovedRandomForestPredictor(RandomForestLossPredictor):
    """
    Random Forest loss predictor whose data preparation is tuned for MAPE.

    Adds two optional tweaks on top of RandomForestLossPredictor:
    - flooring the target at `min_threshold` (handle_zeros)
    - scaling numerical features with RobustScaler instead of StandardScaler
      (use_robust_scaling)
    """

    def __init__(self, n_estimators=100, max_depth=10, random_state=42,
                 handle_zeros=True, use_robust_scaling=True, min_threshold=0.01):
        super().__init__(n_estimators, max_depth, random_state)
        self.min_threshold = min_threshold
        self.use_robust_scaling = use_robust_scaling
        self.handle_zeros = handle_zeros

    def prepare_data(self, df, target_col, test_size=0.2):
        """
        Split `df` into train/test sets and build the (unfitted) preprocessor.

        Parameters:
        - df: DataFrame holding the features and the target column
        - target_col: name of the target column
        - test_size: fraction of rows placed in the test split

        Returns:
        - X_train, X_test, y_train, y_test (also stored on the instance together
          with the preprocessor and the numerical/categorical column lists)
        """
        print("üîÑ Preparing data with MAPE optimizations...")

        # Work on a copy so the caller's frame is never modified
        working = df.copy()

        if self.handle_zeros:
            # Count exact zeros before flooring, purely for the log message
            zero_total = (working[target_col] == 0).sum()
            # Every target value below the threshold is raised to it
            working[target_col] = working[target_col].apply(
                lambda value: max(value, self.min_threshold)
            )
            if zero_total > 0:
                print(f"   ‚úÖ Fixed {zero_total} zero values (set minimum to {self.min_threshold})")

        # Target vector and feature matrix
        y = working[target_col]
        X = working.drop(columns=[target_col])

        # Column groups drive which transformer is applied to which features
        categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
        numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

        print(f"Numerical features ({len(numerical_cols)}): {numerical_cols}")
        print(f"Categorical features ({len(categorical_cols)}): {categorical_cols}")

        # Pick the scaler for the numerical columns
        if not self.use_robust_scaling:
            numerical_transformer = StandardScaler()
            print("   Using StandardScaler")
        else:
            from sklearn.preprocessing import RobustScaler
            numerical_transformer = RobustScaler()
            print("   ‚úÖ Using RobustScaler (outlier-resistant)")

        # Assemble the preprocessing step: scaling + one-hot encoding
        transformer_specs = [
            ('num', numerical_transformer, numerical_cols),
            ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols),
        ]
        preprocessor = ColumnTransformer(transformers=transformer_specs)

        # Hold out the test split
        split = train_test_split(
            X, y, test_size=test_size, random_state=self.random_state
        )
        X_train, X_test, y_train, y_test = split

        print(f"Training set: {X_train.shape[0]} samples")
        print(f"Test set: {X_test.shape[0]} samples")

        # Keep everything on the instance for later use
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test
        self.preprocessor = preprocessor
        self.numerical_cols = numerical_cols
        self.categorical_cols = categorical_cols

        return X_train, X_test, y_train, y_test

# Compare original vs improved approach
print("üéØ COMPARING ORIGINAL vs IMPROVED APPROACH")
print("=" * 60)

# Instantiate the improved predictor with all MAPE-specific options enabled
improved_predictor = ImprovedRandomForestPredictor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    handle_zeros=True,
    use_robust_scaling=True,
    min_threshold=0.01
)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
print("\nTesting with improved predictor configuration...")
print("Improvements enabled:")
print("  ‚úÖ Zero value handling")
print("  ‚úÖ Robust scaling")
print("  ‚úÖ Minimum threshold protection")

## 🚨 KEY ISSUES IDENTIFIED IN CURRENT PIPELINE

Based on the analysis, here are the **primary causes of high MAPE scores** in the current pipeline:

### 1. **Zero Values Problem** 🔴
- **Issue**: The data generation can create `estimated_loss = 0` when `max(0, base_loss)` results in zero
- **Impact**: Division by zero in MAPE calculation causes infinite errors
- **Solution**: Set minimum threshold (e.g., 0.01) to avoid zeros

### 2. **High Noise-to-Signal Ratio** 🟡
- **Issue**: Random noise of ±15 added to relatively small base effects
- **Impact**: Makes predictions inherently difficult
- **Solution**: Reduce noise or increase signal strength in data generation

### 3. **Skewed Target Distribution** 🟡
- **Issue**: Log-normal transaction amounts and exponential account ages create skewed targets
- **Impact**: MAPE is more sensitive to errors on small values
- **Solution**: Consider log transformation or robust scaling

### 4. **Small Target Values** 🟠
- **Issue**: Many target values are small (< 10), making percentage errors large
- **Impact**: Small absolute errors become large percentage errors
- **Solution**: Target value adjustment or different metric consideration

### 5. **Feature Scaling Issues** 🟡
- **Issue**: StandardScaler is sensitive to outliers in transaction amounts
- **Impact**: Poor feature representation affects model performance
- **Solution**: Use RobustScaler instead

### 6. **Model Hyperparameters** 🟡
- **Issue**: Default max_depth=10 might be too shallow for complex interactions
- **Impact**: Underfitting leads to poor predictions
- **Solution**: Tune hyperparameters specifically for MAPE

In [None]:
# Section banner: applying a trained model to a test set without the target column
print("üéØ APPLYING TRAINED MODEL TO NEW TEST SET", "=" * 60, sep="\n")

def apply_model_to_test_set(trained_model, test_df, model_type="standard"):
    """
    Apply a trained model to a test set that doesn't have the target column

    Parameters:
    - trained_model: trained model object exposing `predict(df)`; when it also
      exposes `numerical_cols` and `categorical_cols`, the test set is
      validated against (and reduced to) those feature columns
    - test_df: DataFrame with same schema as training data but missing target column
    - model_type: "standard" or "log_transform"; only affects the status message

    Returns:
    - predictions: NumPy array of predictions (None if prediction failed)
    - prediction_df: DataFrame with the feature columns used + a
      'predicted_estimated_loss' column (None if prediction failed)
    """

    print(f"üìä Processing test set with {len(test_df)} samples...")
    print(f"Model type: {model_type}")

    # Feature columns the model was trained on, in training order (empty if unknown)
    expected_features = []
    if hasattr(trained_model, 'numerical_cols') and hasattr(trained_model, 'categorical_cols'):
        expected_features = list(trained_model.numerical_cols) + list(trained_model.categorical_cols)

    if expected_features:
        expected_set = set(expected_features)
        test_features = set(test_df.columns)
        missing_features = expected_set - test_features
        extra_features = test_features - expected_set

        if missing_features:
            print(f"‚ö†Ô∏è  WARNING: Missing features in test set: {missing_features}")
        if extra_features:
            print(f"‚ÑπÔ∏è  Extra features in test set (will be ignored): {extra_features}")
            # Keep only the expected features that are actually present, in
            # training order: selecting via a set gave an arbitrary column order
            # and raised KeyError when features were also missing
            test_df = test_df[[col for col in expected_features if col in test_features]]

    print(f"‚úÖ Test set shape after validation: {test_df.shape}")

    # Make predictions. Both model types go through the same predict() call;
    # NOTE(review): log-transform models are presumed to invert the
    # transformation inside predict() — confirm against that class.
    try:
        predictions = np.asarray(trained_model.predict(test_df))
    except Exception as e:
        # Boundary handler: report the failure and signal it via (None, None)
        print(f"‚ùå Error making predictions: {e}")
        return None, None

    if model_type == "log_transform" and hasattr(trained_model, 'use_log_transform'):
        print("‚úÖ Applied log-transform model with automatic inverse transformation")
    else:
        print("‚úÖ Applied standard model")

    # Create results DataFrame
    result_df = test_df.copy()
    result_df['predicted_estimated_loss'] = predictions

    if predictions.size == 0:
        # Summary statistics are undefined for an empty prediction set
        print("No predictions produced (empty test set); skipping summary statistics")
        return predictions, result_df

    # Add prediction statistics
    print(f"\nüìà PREDICTION SUMMARY:")
    print(f"Number of predictions: {len(predictions)}")
    print(f"Mean predicted loss: ${predictions.mean():.2f}")
    print(f"Median predicted loss: ${np.median(predictions):.2f}")
    print(f"Min predicted loss: ${predictions.min():.2f}")
    print(f"Max predicted loss: ${predictions.max():.2f}")
    print(f"Std deviation: ${predictions.std():.2f}")

    # Prediction distribution
    print(f"\nüìä PREDICTION DISTRIBUTION:")
    percentiles = [10, 25, 50, 75, 90, 95, 99]
    for p in percentiles:
        value = np.percentile(predictions, p)
        print(f"{p}th percentile: ${value:.2f}")

    return predictions, result_df

# Example usage function
def demo_test_set_application():
    """
    Demonstrate how to apply models to test sets

    Prints step-by-step instructions (as commented example code) and builds a
    small hard-coded sample test DataFrame without the target column.

    Returns:
    - sample_test_data: DataFrame with 5 rows and 10 feature columns
    """
    print("\nüéØ DEMONSTRATION: Creating sample test set and applying model")
    print("=" * 60)

    # First, let's assume you have a trained model (you'll replace this with your actual trained model)
    print("üìù Step 1: Load or reference your trained model")
    print("   # Replace 'your_trained_model' with your actual model variable")
    print("   # Examples:")
    print("   # - rf_predictor (if you used RandomForestLossPredictor)")
    print("   # - log_rf_predictor (if you used LogTransformRandomForestPredictor)")
    print("   # - improved_predictor (if you used ImprovedRandomForestPredictor)")

    # Create sample test data (you'll replace this with your actual test data)
    # NOTE: "\n" rather than "\\n" so the headings are preceded by a blank
    # line instead of a literal backslash-n
    print("\nüìù Step 2: Load your test set")
    print("   # Replace this with loading your actual test data:")
    print("   # test_data = pd.read_csv('your_test_file.csv')")
    print("   # OR")
    print("   # test_data = your_existing_test_dataframe")

    # For demonstration, create sample test data with same structure as training data
    sample_test_data = pd.DataFrame({
        'transaction_amount': [150.0, 2500.0, 75.0, 1200.0, 500.0],
        'account_age_days': [30, 365, 1200, 180, 90],
        'previous_claims': [0, 3, 1, 2, 0],
        'risk_score': [25.5, 75.2, 15.8, 45.0, 30.0],
        'merchant_rating': [4.5, 2.1, 4.8, 3.5, 4.0],
        'transaction_type': ['purchase', 'transfer', 'purchase', 'withdrawal', 'purchase'],
        'merchant_category': ['retail', 'travel', 'food', 'other', 'retail'],
        'user_tier': ['bronze', 'gold', 'silver', 'silver', 'bronze'],
        'payment_method': ['credit_card', 'bank_transfer', 'debit_card', 'digital_wallet', 'credit_card'],
        'country': ['US', 'UK', 'CA', 'DE', 'US']
    })

    print(f"\nüìä Sample test data:")
    print(sample_test_data)

    print(f"\nüìù Step 3: Apply model to test set")
    print("   # Use the apply_model_to_test_set function:")
    print("   # predictions, results_df = apply_model_to_test_set(")
    print("   #     trained_model=your_trained_model,")
    print("   #     test_df=test_data,")
    print("   #     model_type='standard'  # or 'log_transform' if using log model")
    print("   # )")

    return sample_test_data

# Run demonstration and keep the sample test DataFrame it returns
sample_test_data = demo_test_set_application()

In [None]:
# PRACTICAL EXAMPLE: Apply Your Trained Model to Real Test Set
# Section banner followed by STEP 1 (a trained model is a prerequisite)
print(
    "üöÄ PRACTICAL EXAMPLE: APPLYING YOUR TRAINED MODEL",
    "=" * 60,
    "üìã STEP 1: Train a model first (if not already done)",
    "You need to run one of these training approaches first:",
    "  Option A: Standard RandomForestLossPredictor",
    "  Option B: LogTransformRandomForestPredictor",
    "  Option C: ImprovedRandomForestPredictor",
    sep="\n",
)

# Example of how to train quickly if needed:
def quick_train_model_for_testing():
    """
    Return training data for the demonstration.

    Despite the name, no model is trained here: the function returns the
    module-level `df_ml` when it exists, otherwise it generates a synthetic
    1000-row training set (seeded for reproducibility).

    Returns:
    - DataFrame with 10 feature columns plus 'estimated_loss'
    """
    # Prefer the real training data when the notebook has already built it
    if 'df_ml' in globals():
        return df_ml

    print("‚ö†Ô∏è  No training data found. Generating sample data...")

    # Create sample training data
    # NOTE: seeds NumPy's global random state
    np.random.seed(42)
    n_samples = 1000

    df_sample = pd.DataFrame({
        'transaction_amount': np.random.exponential(100, n_samples),
        'account_age_days': np.random.randint(1, 2000, n_samples),
        'previous_claims': np.random.poisson(1.5, n_samples),
        # Clipped to the 0-100 score range, matching the other sample-data
        # generator in this notebook
        'risk_score': np.random.normal(50, 20, n_samples).clip(0, 100),
        'merchant_rating': np.random.uniform(1, 5, n_samples),
        'transaction_type': np.random.choice(['purchase', 'transfer', 'withdrawal'], n_samples),
        'merchant_category': np.random.choice(['retail', 'travel', 'food', 'other'], n_samples),
        'user_tier': np.random.choice(['bronze', 'silver', 'gold', 'platinum'], n_samples),
        'payment_method': np.random.choice(['credit_card', 'debit_card', 'bank_transfer', 'digital_wallet'], n_samples),
        'country': np.random.choice(['US', 'UK', 'CA', 'DE', 'FR'], n_samples)
    })

    # Generate target variable
    base_loss = (df_sample['transaction_amount'] * 0.02 +
                 df_sample['risk_score'] * 0.5 +
                 df_sample['previous_claims'] * 5 +
                 np.random.normal(0, 15, n_samples))
    # Floor at 0.01 rather than 0: zero targets make MAPE undefined (division by zero)
    df_sample['estimated_loss'] = np.maximum(0.01, base_loss)

    return df_sample

# STEP 2: Load your actual test data
# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
print("\nüìã STEP 2: Load your test data")
print("Replace this section with your actual test data loading:")

def load_test_data():
    """
    Load your test data here

    Placeholder implementation: builds and returns a hard-coded 6-row sample
    DataFrame (10 feature columns, no target column) and prints a preview.

    Returns:
    - test_data: DataFrame of test samples
    """
    # REPLACE THIS WITH YOUR ACTUAL TEST DATA LOADING:
    # return pd.read_csv('your_test_file.csv')
    # OR
    # return your_test_dataframe

    # For demonstration, create sample test data:
    print("üìù Creating sample test data (replace with your actual data loading)")

    test_data = pd.DataFrame({
        'transaction_amount': [200.0, 1500.0, 50.0, 3000.0, 800.0, 120.0],
        'account_age_days': [45, 500, 10, 1000, 200, 60],
        'previous_claims': [0, 2, 0, 5, 1, 0],
        'risk_score': [20.0, 65.0, 15.0, 85.0, 40.0, 25.0],
        'merchant_rating': [4.8, 3.2, 4.5, 2.0, 3.8, 4.2],
        'transaction_type': ['purchase', 'transfer', 'purchase', 'withdrawal', 'purchase', 'purchase'],
        'merchant_category': ['retail', 'travel', 'food', 'other', 'retail', 'food'],
        'user_tier': ['bronze', 'gold', 'bronze', 'platinum', 'silver', 'bronze'],
        'payment_method': ['credit_card', 'bank_transfer', 'debit_card', 'digital_wallet', 'credit_card', 'debit_card'],
        'country': ['US', 'UK', 'CA', 'DE', 'US', 'CA']
    })

    n_rows, n_cols = test_data.shape
    print(f"‚úÖ Test data loaded: {n_rows} samples, {n_cols} features")
    # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
    print("\nTest data preview:")
    print(test_data.head())

    return test_data

# Execute the steps
# ("\n" instead of "\\n" throughout so blank lines are printed rather than
# a literal backslash-n)
print("\nüîÑ EXECUTING STEPS...")

# Obtain the training data (existing df_ml or generated sample data)
training_data = quick_train_model_for_testing()
print(f"Training data: {training_data.shape}")

# Load test data
test_data = load_test_data()

print("\n‚úÖ Ready to apply model to test set!")
print("\nNext steps:")
print("1. Train your preferred model on the training data")
print("2. Use apply_model_to_test_set() function to get predictions")
print("3. Save or export the results")

In [None]:
# COMPLETE WORKFLOW: Train Model & Apply to Test Set
# Section banner for the end-to-end workflow cell
print("üéØ COMPLETE WORKFLOW: TRAIN MODEL & APPLY TO TEST SET", "=" * 70, sep="\n")

def complete_model_application_workflow():
    """
    Complete end-to-end workflow for training and applying model to test set
    """
    
    # Step 1: Prepare training data
    print("1Ô∏è‚É£ PREPARING TRAINING DATA")
    print("-" * 30)
    
    # Use existing data or create sample data
    if 'df_ml' in globals() and len(df_ml) > 0:
        training_df = df_ml.copy()
        print(f"‚úÖ Using existing training data: {training_df.shape}")
    else:
        print("üìù Creating sample training data...")
        np.random.seed(42)
        n_samples = 1000
        
        training_df = pd.DataFrame({
            'transaction_amount': np.random.exponential(100, n_samples),
            'account_age_days': np.random.randint(1, 2000, n_samples),
            'previous_claims': np.random.poisson(1.5, n_samples),
            'risk_score': np.random.normal(50, 20, n_samples).clip(0, 100),
            'merchant_rating': np.random.uniform(1, 5, n_samples),
            'transaction_type': np.random.choice(['purchase', 'transfer', 'withdrawal'], n_samples),
            'merchant_category': np.random.choice(['retail', 'travel', 'food', 'other'], n_samples),
            'user_tier': np.random.choice(['bronze', 'silver', 'gold', 'platinum'], n_samples),
            'payment_method': np.random.choice(['credit_card', 'debit_card', 'bank_transfer', 'digital_wallet'], n_samples),
            'country': np.random.choice(['US', 'UK', 'CA', 'DE', 'FR'], n_samples)
        })
        
        # Generate realistic target variable
        base_loss = (training_df['transaction_amount'] * 0.02 + 
                    training_df['risk_score'] * 0.5 + 
                    training_df['previous_claims'] * 10 + 
                    np.random.normal(0, 20, n_samples))
        training_df['estimated_loss'] = np.maximum(0.01, base_loss)  # Avoid zeros
        
        print(f"‚úÖ Created training data: {training_df.shape}")
    
    # Step 2: Train the model
    print("\\n2Ô∏è‚É£ TRAINING MODEL")
    print("-" * 30)
    
    # Train a log-transform model for better performance
    model = LogTransformRandomForestPredictor(
        n_estimators=100,
        max_depth=15,
        random_state=42,
        use_log_transform=True
    )
    
    # Prepare and train
    model.prepare_data(training_df, 'estimated_loss', test_size=0.2)
    cv_scores = model.train_model(cv_folds=3)
    results = model.evaluate_model()
    
    print(f"‚úÖ Model trained! MAPE: {results['mape']:.2f}%")
    
    # Step 3: Create test data (replace this with your actual test data loading)
    print("\\n3Ô∏è‚É£ LOADING TEST DATA")
    print("-" * 30)
    
    # REPLACE THIS SECTION WITH YOUR ACTUAL TEST DATA:
    # test_df = pd.read_csv('your_test_file.csv')
    
    # For demonstration, create sample test data
    test_df = pd.DataFrame({
        'transaction_amount': [250.0, 1800.0, 95.0, 3500.0, 600.0, 150.0, 2200.0],
        'account_age_days': [60, 400, 15, 900, 180, 45, 600],
        'previous_claims': [1, 3, 0, 4, 2, 0, 1],
        'risk_score': [30.0, 70.0, 20.0, 80.0, 45.0, 25.0, 60.0],
        'merchant_rating': [4.2, 2.8, 4.6, 1.8, 3.5, 4.4, 3.0],
        'transaction_type': ['purchase', 'transfer', 'purchase', 'withdrawal', 'purchase', 'purchase', 'transfer'],
        'merchant_category': ['retail', 'travel', 'food', 'other', 'retail', 'food', 'travel'],
        'user_tier': ['silver', 'gold', 'bronze', 'platinum', 'silver', 'bronze', 'gold'],
        'payment_method': ['credit_card', 'bank_transfer', 'debit_card', 'digital_wallet', 'credit_card', 'debit_card', 'bank_transfer'],
        'country': ['US', 'UK', 'CA', 'DE', 'US', 'CA', 'UK']
    })
    
    print(f"‚úÖ Test data loaded: {test_df.shape}")
    print("\\nTest data preview:")
    print(test_df.head(3))
    
    # Step 4: Apply model to test set
    print("\\n4Ô∏è‚É£ APPLYING MODEL TO TEST SET")
    print("-" * 30)
    
    predictions, results_df = apply_model_to_test_set(
        trained_model=model,
        test_df=test_df,
        model_type="log_transform"
    )
    
    # Step 5: Display and save results
    print("\\n5Ô∏è‚É£ RESULTS")
    print("-" * 30)
    
    print("\\nüìä FINAL RESULTS:")
    print(results_df[['transaction_amount', 'risk_score', 'transaction_type', 'predicted_estimated_loss']].round(2))
    
    # Risk categorization
    print("\\nüéØ RISK CATEGORIZATION:")
    results_df['risk_category'] = pd.cut(
        results_df['predicted_estimated_loss'], 
        bins=[0, 10, 50, 100, float('inf')], 
        labels=['Low', 'Medium', 'High', 'Very High']
    )
    
    risk_summary = results_df['risk_category'].value_counts()
    print(risk_summary)
    
    # Save results (optional)
    print("\\nüíæ SAVING RESULTS:")
    output_filename = f"predicted_losses_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv"
    results_df.to_csv(output_filename, index=False)
    print(f"‚úÖ Results saved to: {output_filename}")
    
    return model, test_df, predictions, results_df

# Execute the complete workflow
print("\nüöÄ EXECUTING COMPLETE WORKFLOW...")
print("=" * 70)

try:
    # Only the workflow call itself is guarded; the success report lives in
    # the `else` branch so a failure there is not misreported as a workflow error.
    trained_model, test_data, predictions, final_results = complete_model_application_workflow()
except Exception as e:
    # Notebook-level boundary: report and continue instead of aborting the
    # cell. The exception type is included so that, e.g., a missing name can
    # be told apart from a data problem.
    print(f"‚ùå Error in workflow: {type(e).__name__}: {e}")
    print("Please check your data and try again.")
else:
    print("\nüéâ WORKFLOW COMPLETED SUCCESSFULLY!")
    print("   - Trained model: ‚úÖ")
    print(f"   - Test predictions: ‚úÖ ({len(predictions)} predictions)")
    print("   - Results saved: ‚úÖ")

In [None]:
# SIMPLE SOLUTION: Apply Your Trained Model to Test Set
print("üéØ SIMPLE SOLUTION: APPLY YOUR TRAINED MODEL TO TEST SET")
print("=" * 60)

# STEP 1: Load your test data
print("1Ô∏è‚É£ LOAD YOUR TEST DATA:")
print("Replace the next line with your actual test data loading:")
# test_df = pd.read_csv('your_test_file.csv')  # REPLACE WITH YOUR FILE

# For demonstration - replace this with your actual test data:
your_test_data = pd.DataFrame({
    'transaction_amount': [100.0, 500.0, 1200.0, 75.0],
    'account_age_days': [30, 200, 800, 15],
    'previous_claims': [0, 1, 3, 0],
    'risk_score': [25.0, 45.0, 75.0, 20.0],
    'merchant_rating': [4.5, 3.8, 2.2, 4.8],
    'transaction_type': ['purchase', 'transfer', 'withdrawal', 'purchase'],
    'merchant_category': ['retail', 'travel', 'other', 'food'],
    'user_tier': ['bronze', 'silver', 'gold', 'bronze'],
    'payment_method': ['credit_card', 'bank_transfer', 'digital_wallet', 'debit_card'],
    'country': ['US', 'UK', 'DE', 'CA']
})

print(f"‚úÖ Test data shape: {your_test_data.shape}")
print("Test data preview:")
print(your_test_data)

# STEP 2: Apply your trained model
print("\n2Ô∏è‚É£ APPLY YOUR TRAINED MODEL:")
print("Replace 'your_trained_model' with your actual trained model variable name")

# Trained-model variables to look for, in priority order, paired with the
# message printed when that model is the one used:
#   A) standard RandomForestLossPredictor
#   B) LogTransformRandomForestPredictor
#   C) ImprovedRandomForestPredictor
model_candidates = [
    ('rf_predictor', "‚úÖ Used rf_predictor model"),
    ('log_rf_predictor', "‚úÖ Used log_rf_predictor model (with automatic inverse transform)"),
    ('improved_predictor', "‚úÖ Used improved_predictor model"),
]

# At cell (module) level the trained models live in globals().
for model_name, used_message in model_candidates:
    if model_name in globals():
        predictions = globals()[model_name].predict(your_test_data)
        print(used_message)
        break
else:
    print("‚ö†Ô∏è  No trained model found in memory.")
    print("Please run one of the training cells first, or replace this with:")
    print("   predictions = your_trained_model.predict(your_test_data)")

    # Create dummy predictions for demonstration. np.resize repeats/truncates
    # the four sample values so the array always matches the number of test
    # rows, even after the demo frame above is swapped for real data.
    predictions = np.resize([15.5, 45.2, 125.8, 8.3], len(your_test_data))
    print("üîß Using dummy predictions for demonstration")

# Normalise to an ndarray so the statistics below behave the same regardless
# of what predict() returned (e.g. pandas' .std() defaults to ddof=1, numpy's
# to ddof=0).
predictions = np.asarray(predictions)

# STEP 3: Create results DataFrame
print("\n3Ô∏è‚É£ CREATE RESULTS:")
results = your_test_data.copy()
results['predicted_estimated_loss'] = predictions

print("\nüìä FINAL RESULTS:")
print(results[['transaction_amount', 'risk_score', 'predicted_estimated_loss']].round(2))

# STEP 4: Save results
print("\n4Ô∏è‚É£ SAVE RESULTS:")
output_file = 'test_set_predictions.csv'
results.to_csv(output_file, index=False)
print(f"‚úÖ Results saved to: {output_file}")

# STEP 5: Quick analysis
print("\n5Ô∏è‚É£ QUICK ANALYSIS:")
mean_loss = predictions.mean()
loss_spread = predictions.std()
print(f"Total predictions: {len(predictions)}")
print(f"Average predicted loss: ${mean_loss:.2f}")
print(f"Highest risk transaction: ${predictions.max():.2f}")
print(f"Lowest risk transaction: ${predictions.min():.2f}")

# Risk levels: more than one standard deviation above / below the mean.
high_risk = (predictions > mean_loss + loss_spread).sum()
low_risk = (predictions < mean_loss - loss_spread).sum()
print(f"High risk transactions: {high_risk}")
print(f"Low risk transactions: {low_risk}")

print("\n‚úÖ COMPLETE! Your test set predictions are ready.")