# Investment Success Prediction Model Comparison

This notebook compares multiple machine learning algorithms to predict investment success probability.

## Models to Compare:
- Linear Regression
- Ridge Regression
- Lasso Regression
- Decision Tree Regressor
- Gradient Boosting Regressor
- XGBoost Regressor
- Random Forest Regressor
- Support Vector Regressor
- Neural Network (MLPRegressor)

## Evaluation Metrics:
- Mean Absolute Error (MAE)
- Mean Squared Error (MSE)
- Root Mean Squared Error (RMSE)
- R² Score
- Mean Absolute Percentage Error (MAPE)

In [None]:
# Import required libraries
import pandas as pd
import os
from google.colab import files
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import joblib
from datetime import datetime
from scipy import stats

# Plot appearance: apply matplotlib's 'default' style, then seaborn's 'husl'
# colour palette.
# NOTE(review): the palette is presumably set second so the style call does
# not overwrite it - keep this order unless confirmed otherwise.
plt.style.use('default')
sns.set_palette('husl')

# Banner with a start timestamp (local time, as returned by datetime.now())
print('Libraries imported successfully!')
print(f'Analysis started at: {datetime.now()}')

## 1. Data Loading and Exploration

In [None]:
# Load the dataset
print('Loading dataset...')

# Expected location of the training CSV. When it is missing (for example on a
# fresh Colab runtime) the user is asked to upload it instead.
DATA_PATH = "/investa/datasets/v1.csv"
if not os.path.exists(DATA_PATH):
    import shutil  # only needed on the upload path

    print("File not found. Please upload 'datasets/v1.csv'")
    uploaded = files.upload()
    if not uploaded:
        raise FileNotFoundError(
            f"No file was uploaded; expected the dataset at {DATA_PATH}"
        )

    # Create the directory that DATA_PATH actually lives in, so the read
    # below can find the uploaded file.
    dataset_dir = os.path.dirname(DATA_PATH)
    os.makedirs(dataset_dir, exist_ok=True)
    for filename in uploaded:
        # Uploaded files are addressed by bare name (relative to the working
        # directory). A single upload is taken to be the dataset whatever it
        # is called; with several uploads only the one whose name matches is
        # stored as the dataset, the rest keep their own names next to it.
        if len(uploaded) == 1 or filename == os.path.basename(DATA_PATH):
            uploaded_path = DATA_PATH
        else:
            uploaded_path = os.path.join(dataset_dir, filename)
        # shutil.move also works when source and target are on different
        # file systems, where os.rename would fail.
        shutil.move(filename, uploaded_path)
        print(f"Saved file to {uploaded_path}")

# Load data
df = pd.read_csv(DATA_PATH)

# Basic information about the dataset
print(f'Dataset shape: {df.shape}')
print(f'\nDataset info:')
# df.info() prints its report itself and returns None, so it is not wrapped
# in print() (that would emit a stray "None" line).
df.info()

# Display first few rows
print('\nFirst 5 rows:')
df.head()

In [None]:
# Missing-value report: list only the columns that contain at least one null
print('Missing values per column:')
missing_values = df.isna().sum()
print(missing_values.loc[missing_values > 0])

# Descriptive statistics of the numeric columns
print('\nStatistical Summary:')
display(df.describe())

# Distinct values of each expected categorical column (skipping any that
# this dataset does not contain)
categorical_cols = ['industry', 'target_market', 'business_model', 'traction']
print('\nCategorical columns unique values:')
for col in categorical_cols:
    if col not in df.columns:
        continue
    distinct_values = df[col].unique()
    print(f'{col}: {distinct_values}')
    print(f'  Count: {len(distinct_values)}')

In [None]:
# Three side-by-side views of the target: histogram, boxplot and normal Q-Q plot
target = df['success_probability']
plt.figure(figsize=(15, 5))

# Histogram
plt.subplot(1, 3, 1)
plt.hist(target, bins=50, alpha=0.7, color='skyblue')
plt.title('Distribution of Success Probability')
plt.xlabel('Success Probability')
plt.ylabel('Frequency')

# Boxplot
plt.subplot(1, 3, 2)
plt.boxplot(target)
plt.title('Boxplot of Success Probability')
plt.ylabel('Success Probability')

# Q-Q plot against a normal distribution
plt.subplot(1, 3, 3)
stats.probplot(target, dist='norm', plot=plt)
plt.title('Q-Q Plot of Success Probability')

plt.tight_layout()
plt.show()

# Headline statistics of the target, four decimals each
print(f'Target variable statistics:')
for label, value in (
    ('Mean', target.mean()),
    ('Std', target.std()),
    ('Min', target.min()),
    ('Max', target.max()),
):
    print(f'{label}: {value:.4f}')

## 2. Data Preprocessing

In [None]:
# Expected categorical features, restricted to those present in this dataset
categorical_cols = [
    col
    for col in ['industry', 'target_market', 'business_model', 'traction']
    if col in df.columns
]
# Every remaining column except the target is treated as numerical
numerical_cols = [
    col
    for col in df.columns
    if col != 'success_probability' and col not in categorical_cols
]

print(f'Categorical columns: {categorical_cols}')
print(f'Numerical columns: {numerical_cols}')

# Work on a copy so the raw frame stays available for later cells
df_processed = df.copy()

# One fitted LabelEncoder per categorical column, kept for inference time
label_encoders = {}

# Integer-encode each categorical column on the full frame (before the split)
for col in categorical_cols:
    print(f'\nProcessing {col}...')
    print(f'Unique values: {df_processed[col].unique()}')

    le = LabelEncoder()
    # Values are cast to str first so mixed types encode consistently
    as_text = df_processed[col].astype(str)
    df_processed[col] = le.fit_transform(as_text)
    label_encoders[col] = le

    print(f'Encoded {col}: {list(le.classes_)}')
    print(f'Encoded values: {df_processed[col].unique()}')

print('\nCategorical encoding completed!')
print(f'Processed dataset shape: {df_processed.shape}')

In [None]:
# Prepare features and target
X = df_processed.drop('success_probability', axis=1)
y = df_processed['success_probability']

print(f'Feature matrix shape: {X.shape}')
print(f'Target vector shape: {y.shape}')
print(f'Features: {list(X.columns)}')

# Check for any remaining non-numeric data
print('\nData types check:')
print(X.dtypes)

# Split the data with stratification based on success probability bins:
# the continuous target is cut into 5 equal-width bins so train and test
# cover the target range in similar proportions.
y_binned = pd.cut(y, bins=5, labels=['very_low', 'low', 'medium', 'high', 'very_high'])

# Stratification needs at least 2 rows in every populated bin and no missing
# bin labels; otherwise train_test_split raises. Fall back to a plain random
# split in that case instead of failing.
bin_sizes = y_binned.value_counts()
can_stratify = bool(
    y_binned.notna().all() and (bin_sizes[bin_sizes > 0] >= 2).all()
)
if not can_stratify:
    print('Warning: target bins too sparse for stratification; using a plain random split.')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
    stratify=y_binned if can_stratify else None
)

print(f'\nTrain set: {X_train.shape[0]} samples')
print(f'Test set: {X_test.shape[0]} samples')

# Scale the features: the scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('\nFeature scaling completed!')
print(f'Scaled train set shape: {X_train_scaled.shape}')
print(f'Scaled test set shape: {X_test_scaled.shape}')

## 3. Model Definition and Training Functions

In [None]:
# Define evaluation metrics function
def calculate_metrics(y_true, y_pred, model_name):
    '''
    Calculate and return various regression metrics

    Parameters:
    y_true: array-like of observed target values
    y_pred: array-like of predicted values, same length as y_true
    model_name: label stored under the 'Model' key of the result

    Returns:
    dict with the keys 'Model', 'MAE', 'MSE', 'RMSE', 'R²' and 'MAPE'
    (MAPE is expressed in percent)
    '''
    # Flatten both inputs to plain 1-D float arrays so that every metric pairs
    # values by position. Without this, two pandas Series with different
    # indexes would be aligned by label in the MAPE arithmetic, and an (n, 1)
    # prediction array would broadcast against y_true to an (n, n) matrix.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    # Mean Absolute Percentage Error (MAPE) with protection against division by zero.
    # Exact zeros in the target are replaced by a tiny value; such rows then
    # contribute a very large percentage error, so MAPE is only meaningful
    # when the target is not (near) zero.
    y_true_safe = np.where(y_true == 0, 1e-10, y_true)
    mape = np.mean(np.abs((y_true - y_pred) / y_true_safe)) * 100

    return {
        'Model': model_name,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R²': r2,
        'MAPE': mape
    }

# Cross-validation helper
def perform_cv(model, X, y, cv=5):
    '''
    Score `model` with k-fold cross-validation.

    Parameters:
    model: estimator passed to cross_val_score
    X: feature matrix
    y: target values
    cv: number of folds (default 5)

    Returns:
    the array of per-fold R² scores produced by cross_val_score
    '''
    return cross_val_score(model, X, y, cv=cv, scoring='r2')

print('Utility functions defined!')

In [None]:
# Candidate regressors keyed by display name. Stochastic estimators are
# seeded with 42; tree depth and iteration counts are kept moderate.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0, random_state=42),
    'Lasso Regression': Lasso(alpha=0.01, random_state=42, max_iter=2000),
    'Decision Tree': DecisionTreeRegressor(
        random_state=42, max_depth=10, min_samples_split=5
    ),
    'Random Forest': RandomForestRegressor(
        n_estimators=100, random_state=42, max_depth=10, min_samples_split=5
    ),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=100, random_state=42, max_depth=6, learning_rate=0.1
    ),
    'XGBoost': xgb.XGBRegressor(
        n_estimators=100, random_state=42, max_depth=6, learning_rate=0.1
    ),
    'SVR': SVR(kernel='rbf', C=1.0, gamma='scale'),
    'Neural Network': MLPRegressor(
        hidden_layer_sizes=(100, 50), random_state=42, max_iter=1000, early_stopping=True
    ),
}

# List what is about to be compared
print(f'Initialized {len(models)} models for comparison')
for name in models:
    print(f'- {name}')

## 4. Model Training and Evaluation

In [None]:
# Accumulators filled by the loop below
results = []          # one metrics dict per (model, split)
trained_models = {}   # fitted estimator per model name
cv_results = {}       # array of cross-validation R² scores per model name

print('Training and evaluating models...\n')

for name, model in models.items():
    print(f'Training {name}...')

    # Linear models, SVR and the neural network get the standardised
    # features; the tree-based models get the raw values as numpy arrays.
    wants_scaled_input = name in (
        'Linear Regression', 'Ridge Regression', 'Lasso Regression', 'SVR', 'Neural Network'
    )

    try:
        if wants_scaled_input:
            X_train_use, X_test_use = X_train_scaled, X_test_scaled
        else:
            X_train_use, X_test_use = X_train.values, X_test.values

        # Fit, then predict on both splits
        model.fit(X_train_use, y_train)
        y_train_pred = model.predict(X_train_use)
        y_test_pred = model.predict(X_test_use)

        # Metrics per split; the split is tagged in the model label
        train_metrics = calculate_metrics(y_train, y_train_pred, f'{name} (Train)')
        test_metrics = calculate_metrics(y_test, y_test_pred, f'{name} (Test)')

        # Cross-validated R² on the training split
        cv_scores = perform_cv(model, X_train_use, y_train)
        cv_results[name] = cv_scores

        # Record everything only once all steps above have succeeded
        results.extend([train_metrics, test_metrics])
        trained_models[name] = model

        print(f'✓ {name} completed - Test R²: {test_metrics["R²"]:.4f}')
        print(f'  CV R² Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})\n')

    except Exception as e:
        # A failing model is reported and skipped so the others still run
        print(f'❌ Error training {name}: {e}\n')

print('Model training completed!')

## 5. Results Analysis and Visualization

In [None]:
# Convert results to DataFrame
results_df = pd.DataFrame(results)
if results_df.empty:
    # Every model failed in the training loop; fail here with a clear message
    # instead of a KeyError on the missing 'Model' column.
    raise RuntimeError('No model finished training; there are no results to summarise.')

# Separate train and test results by the suffix the training loop appended
train_results = results_df[results_df['Model'].str.endswith(' (Train)')].copy()
test_results = results_df[results_df['Model'].str.endswith(' (Test)')].copy()

# Clean model names. regex=False is essential: as a regular expression the
# parentheses would form a group and the pattern would not match the literal
# "(Train)"/"(Test)" text. Older pandas releases default to regex matching,
# which left the suffix in place and broke later lookups by model name.
train_results['Model'] = train_results['Model'].str.replace(' (Train)', '', regex=False)
test_results['Model'] = test_results['Model'].str.replace(' (Test)', '', regex=False)

print('Model Performance Summary (Test Set):')
print('=' * 60)
# Best model first
test_results_sorted = test_results.sort_values('R²', ascending=False)
for _, row in test_results_sorted.iterrows():
    print(f'{row["Model"]:<20} | R²: {row["R²"]:.4f} | RMSE: {row["RMSE"]:.4f} | MAE: {row["MAE"]:.4f}')

# Display detailed results table
print('\nDetailed Results:')
display(test_results_sorted.round(4))

In [None]:
# 2x2 grid of horizontal bar charts, one test-set metric per panel
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# (metric column, bar colour, panel title, x-axis label), listed row by row
comparison_panels = [
    ('R²', 'skyblue', 'R² Score Comparison (Test Set)', 'R² Score'),
    ('RMSE', 'lightcoral', 'RMSE Comparison (Test Set)', 'RMSE'),
    ('MAE', 'lightgreen', 'MAE Comparison (Test Set)', 'MAE'),
    ('MAPE', 'orange', 'MAPE Comparison (Test Set)', 'MAPE (%)'),
]

for ax, (metric, bar_color, panel_title, x_label) in zip(axes.flat, comparison_panels):
    ax.barh(test_results['Model'], test_results[metric], color=bar_color)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Best Model Analysis

In [None]:
# The winner is the model with the highest R² on the test set
best_row_label = test_results['R²'].idxmax()
best_model_name = test_results.loc[best_row_label, 'Model']
best_model = trained_models[best_model_name]
best_scores = test_results[test_results['Model'] == best_model_name].iloc[0]
best_cv_scores = cv_results[best_model_name]

print(f'🏆 BEST MODEL: {best_model_name}')
print('=' * 50)
print(f'R² Score: {best_scores["R²"]:.4f}')
print(f'RMSE: {best_scores["RMSE"]:.4f}')
print(f'MAE: {best_scores["MAE"]:.4f}')
print(f'MAPE: {best_scores["MAPE"]:.4f}%')
print(f'CV Mean R²: {best_cv_scores.mean():.4f}')
print(f'CV Std R²: {best_cv_scores.std():.4f}')

# Test-set predictions of the winner, fed the same kind of input it was
# trained on (standardised features or raw values)
best_uses_scaled_input = best_model_name in [
    'Linear Regression', 'Ridge Regression', 'Lasso Regression', 'SVR', 'Neural Network'
]
y_pred_best = best_model.predict(X_test_scaled if best_uses_scaled_input else X_test.values)

In [None]:
# Diagnostic plots for the best model on the test set
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_fit, ax_resid), (ax_hist, ax_qq) = axes

# Residual = actual minus predicted
residuals = y_test - y_pred_best

# Predicted vs actual, with the identity line as the ideal fit
actual_range = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, y_pred_best, alpha=0.6, color='blue')
ax_fit.plot(actual_range, actual_range, 'r--', lw=2)
ax_fit.set_xlabel('Actual Success Probability')
ax_fit.set_ylabel('Predicted Success Probability')
ax_fit.set_title(f'{best_model_name}: Predicted vs Actual')
ax_fit.grid(True, alpha=0.3)

# Residuals against predictions, with a zero reference line
ax_resid.scatter(y_pred_best, residuals, alpha=0.6, color='green')
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted Success Probability')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title(f'{best_model_name}: Residuals Plot')
ax_resid.grid(True, alpha=0.3)

# Histogram of the residuals
ax_hist.hist(residuals, bins=30, alpha=0.7, color='purple')
ax_hist.set_xlabel('Residuals')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title(f'{best_model_name}: Residuals Distribution')
ax_hist.grid(True, alpha=0.3)

# Normal Q-Q plot of the residuals
stats.probplot(residuals, dist='norm', plot=ax_qq)
ax_qq.set_title(f'{best_model_name}: Q-Q Plot of Residuals')
ax_qq.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Feature Importance Analysis

In [None]:
# Explain the best model: importances when the estimator exposes them,
# otherwise coefficients, otherwise a note that neither is available
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    bar_positions = range(len(feature_importance))

    plt.figure(figsize=(10, 8))
    plt.barh(bar_positions, feature_importance['importance'])
    plt.yticks(bar_positions, feature_importance['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'{best_model_name}: Feature Importance')
    plt.gca().invert_yaxis()  # largest importance at the top
    plt.tight_layout()
    plt.show()

    print('Top 10 Most Important Features:')
    display(feature_importance.head(10))

elif hasattr(best_model, 'coef_'):
    # Rank features by absolute coefficient, but plot the signed values
    feature_coef = pd.DataFrame({
        'feature': X.columns,
        'coefficient': best_model.coef_,
        'abs_coefficient': np.abs(best_model.coef_)
    })
    feature_coef = feature_coef.sort_values('abs_coefficient', ascending=False)
    bar_positions = range(len(feature_coef))

    # Negative coefficients in red, the rest in blue
    colors = ['blue' if x >= 0 else 'red' for x in feature_coef['coefficient']]

    plt.figure(figsize=(10, 8))
    plt.barh(bar_positions, feature_coef['coefficient'], color=colors)
    plt.yticks(bar_positions, feature_coef['feature'])
    plt.xlabel('Coefficient Value')
    plt.title(f'{best_model_name}: Feature Coefficients')
    plt.gca().invert_yaxis()  # largest magnitude at the top
    plt.tight_layout()
    plt.show()

    print('Top 10 Most Important Features (by coefficient magnitude):')
    display(feature_coef.head(10))
else:
    print(f'Feature importance not available for {best_model_name}')

## 8. Model Saving and Export

In [None]:
# Persist the best model together with everything needed to reuse it
import os

# All artifacts go into one directory, created on demand
os.makedirs('saved_models', exist_ok=True)

# Target paths; the model file name embeds the model name in snake_case
model_filename = f'saved_models/best_model_{best_model_name.lower().replace(" ", "_")}.pkl'
scaler_filename = 'saved_models/feature_scaler.pkl'
encoders_filename = 'saved_models/label_encoders.pkl'
features_filename = 'saved_models/feature_names.pkl'
categorical_cols_filename = 'saved_models/categorical_columns.pkl'
metrics_filename = 'saved_models/model_metrics.pkl'

# Summary of the comparison, stored next to the model
model_info = {
    'best_model_name': best_model_name,
    'test_metrics': best_scores.to_dict(),
    'cv_scores': cv_results[best_model_name],
    'all_results': results_df,
    'categorical_columns': categorical_cols
}

# Write every artifact: model, scaler, encoders, feature order,
# categorical column list and the metrics summary
artifacts = [
    (best_model, model_filename),
    (scaler, scaler_filename),
    (label_encoders, encoders_filename),
    (list(X.columns), features_filename),
    (categorical_cols, categorical_cols_filename),
    (model_info, metrics_filename),
]
for artifact, target_path in artifacts:
    joblib.dump(artifact, target_path)

print(f'✅ Best model saved: {model_filename}')
print(f'✅ Scaler saved: {scaler_filename}')
print(f'✅ Label encoders saved: {encoders_filename}')
print(f'✅ Feature names saved: {features_filename}')
print(f'✅ Categorical columns saved: {categorical_cols_filename}')
print(f'✅ Model metrics saved: {metrics_filename}')

print(f'\n📁 All files saved in "saved_models" directory')

## 9. Robust Model Loading and Prediction Function

In [None]:
# Create a robust prediction function for new data
def load_model_and_predict(new_data, model_dir='saved_models'):
    '''
    Load the saved model and make predictions on new data

    Everything is read from the artifact files in `model_dir`, so the function
    also works in a fresh session in which the training cells were not run.

    Parameters:
    new_data: pandas DataFrame with the same columns as training data (except success_probability)
    model_dir: directory holding the saved artifacts (default 'saved_models')

    Returns:
    predictions: numpy array of predicted success probabilities
    '''
    import os
    import joblib

    def artifact_path(filename):
        return os.path.join(model_dir, filename)

    # The metrics file records which model won; the model file name is
    # derived from that name exactly as it was when the model was saved.
    # NOTE(review): joblib files are pickles - only load artifacts from a
    # trusted location.
    model_info = joblib.load(artifact_path('model_metrics.pkl'))
    saved_model_name = model_info['best_model_name']
    model = joblib.load(
        artifact_path(f'best_model_{saved_model_name.lower().replace(" ", "_")}.pkl')
    )
    scaler = joblib.load(artifact_path('feature_scaler.pkl'))
    encoders = joblib.load(artifact_path('label_encoders.pkl'))
    feature_names = joblib.load(artifact_path('feature_names.pkl'))
    categorical_columns = joblib.load(artifact_path('categorical_columns.pkl'))

    # Preprocess a copy so the caller's frame is left untouched
    new_data_processed = new_data.copy()

    # Encode categorical variables with robust handling for unseen categories
    for col in categorical_columns:
        if col not in new_data_processed.columns:
            continue
        encoder = encoders[col]

        # Convert to string to ensure consistent data type (as in training)
        values = new_data_processed[col].astype(str)

        # A fitted LabelEncoder encodes each class as its position in
        # classes_, so one dictionary lookup per value replaces a separate
        # encoder.transform call per row.
        code_by_class = {cls: code for code, cls in enumerate(encoder.classes_)}

        # Unseen categories fall back to the first entry of classes_. This is
        # simply the first class in the encoder's sorted order, not
        # necessarily the most frequent one.
        fallback_class = encoder.classes_[0]
        unseen_values = values[~values.isin(encoder.classes_)].unique()
        for value in unseen_values:
            print(f'Warning: Unseen category "{value}" in column "{col}". Using "{fallback_class}".')

        new_data_processed[col] = (
            values.map(code_by_class).fillna(code_by_class[fallback_class]).astype(int)
        )

    # Ensure correct feature order and fill missing features with 0
    for feature in feature_names:
        if feature not in new_data_processed.columns:
            new_data_processed[feature] = 0
            print(f'Warning: Missing feature "{feature}". Filled with 0.')

    # Select only the required features in correct order
    new_data_processed = new_data_processed[feature_names]

    # Scale features if needed: these models were trained on standardised
    # features, all others on the raw values
    if saved_model_name in ['Linear Regression', 'Ridge Regression', 'Lasso Regression', 'SVR', 'Neural Network']:
        new_data_processed = scaler.transform(new_data_processed)
    else:
        new_data_processed = new_data_processed.values

    # Make predictions
    predictions = model.predict(new_data_processed)

    return predictions

print('Robust prediction function defined!')

In [None]:
# Smoke test: predict three rows sampled (reproducibly) from the original data
sample_indices = df.sample(3, random_state=42).index
sampled_rows = df.loc[sample_indices]
sample_data_original = sampled_rows.drop('success_probability', axis=1)
sample_actual = sampled_rows['success_probability'].values

print('Testing prediction function with sample data:')
print('Sample data:')
display(sample_data_original)

# Run the samples through the saved pipeline
sample_predictions = load_model_and_predict(sample_data_original)

# Compare each prediction with the recorded value
print('\nPrediction Results:')
print('=' * 50)
for i, (predicted, actual) in enumerate(zip(sample_predictions, sample_actual)):
    print(f'Sample {i+1}:')
    print(f'  Predicted: {predicted:.4f}')
    print(f'  Actual:    {actual:.4f}')
    print(f'  Difference: {abs(predicted - actual):.4f}')
    print()

print('✅ Prediction function working correctly!')

## 10. Final Summary and Recommendations

In [None]:
# Final summary
# Input columns are every column except the target, selected by name so the
# listing stays correct wherever the target column sits in the file.
feature_columns = [col for col in df.columns if col != 'success_probability']

print('🎯 INVESTMENT PREDICTION MODEL ANALYSIS SUMMARY')
print('=' * 60)
print(f'Dataset: {df.shape[0]} samples, {len(feature_columns)} features')
print(f'Target: Success Probability (Range: {df["success_probability"].min():.4f} - {df["success_probability"].max():.4f})')
print(f'\n🏆 BEST MODEL: {best_model_name}')
print(f'   R² Score: {best_scores["R²"]:.4f} (Explains {best_scores["R²"]*100:.1f}% of variance)')
print(f'   RMSE: {best_scores["RMSE"]:.4f}')
print(f'   MAE: {best_scores["MAE"]:.4f}')
print(f'   Cross-Validation R²: {cv_results[best_model_name].mean():.4f} ± {cv_results[best_model_name].std():.4f}')

# Ranking by test-set R² (the table is already sorted best-first)
print('\n📊 TOP 3 MODELS BY PERFORMANCE:')
top_3 = test_results_sorted.head(3)
for i, (_, row) in enumerate(top_3.iterrows(), 1):
    print(f'   {i}. {row["Model"]:<20} | R²: {row["R²"]:.4f} | RMSE: {row["RMSE"]:.4f}')

print('\n💾 SAVED ARTIFACTS:')
print(f'   • Best model: {model_filename}')
print(f'   • Feature scaler: {scaler_filename}')
print(f'   • Label encoders: {encoders_filename}')
print(f'   • Feature names: {features_filename}')
print(f'   • Categorical columns: {categorical_cols_filename}')
print(f'   • Model metrics: {metrics_filename}')

print('\n🔮 TO USE THE MODEL FOR NEW PREDICTIONS:')
print('   1. Prepare data with columns: {}'.format(', '.join(feature_columns)))
print('   2. Call load_model_and_predict(new_data_df)')
print('   3. The function handles encoding and scaling automatically')
print('   4. Returns success probability predictions')

print('\n📋 DATA REQUIREMENTS:')
print(f'   • Categorical columns: {categorical_cols}')
print(f'   • Numerical columns: {[col for col in feature_columns if col not in categorical_cols]}')

print(f'\n✅ Analysis completed at: {datetime.now()}')
print('\n🎉 Ready for production use!')
print('\n🛡️ The model includes robust error handling for:')
print('   • Unseen categorical values')
print('   • Missing features')
print('   • Data type inconsistencies')

## 11. Demonstration: Using the Model for New Predictions

In [None]:
# Walk-through: score five hand-written investment scenarios with the saved model
print('🔮 DEMONSTRATION: Model Loading and Prediction')
print('=' * 60)

# Five example scenarios, one list entry per scenario in every column
example_data = pd.DataFrame({
    'industry': ['Technology', 'Health and Fitness', 'Fashion', 'Food and Beverage', 'Beauty'],
    'funding_egp': [500000, 1200000, 300000, 800000, 450000],
    'equity_percentage': [25.0, 15.0, 30.0, 20.0, 35.0],
    'duration_months': [12, 18, 6, 24, 9],
    'target_market': ['Global', 'Regional', 'Local', 'Global', 'Regional'],
    'business_model': ['B2B', 'Marketplace', 'Subscription', 'B2B', 'Marketplace'],
    'founder_experience_years': [8, 12, 3, 15, 6],
    'team_size': [5, 8, 3, 12, 4],
    'traction': ['High', 'Medium', 'Low', 'High', 'Medium'],
    'market_size_usd': [10000000, 7000000, 5000000, 8000000, 6000000],
    'funding_usd': [16666.67, 40000.0, 10000.0, 26666.67, 15000.0],
    'profit': [120000, 85000, -15000, 180000, 45000],
    'repeat_purchase_rate': [0.7, 0.6, 0.3, 0.8, 0.5],
    'branches_count': [2, 3, 0, 5, 1],
    'revenue': [450000, 320000, 85000, 720000, 180000],
    'customers': [1500, 2200, 450, 3500, 800],
    'revenue_growth': [0.25, 0.18, 0.05, 0.32, 0.15],
    'profit_margin': [0.27, 0.27, -0.18, 0.25, 0.25],
    'customer_growth': [0.20, 0.15, 0.08, 0.28, 0.12],
    'churn_rate': [0.15, 0.20, 0.35, 0.12, 0.25],
    'operating_costs': [330000, 235000, 100000, 540000, 135000],
    'debt_to_profit_ratio': [0.8, 1.2, -2.5, 0.6, 1.5]
})

print('📊 Example Investment Scenarios:')
print('-' * 60)
display(example_data)

print('\n🎯 Making Predictions...')
print('-' * 60)

# Everything that depends on the saved model sits inside the try block
try:
    predictions = load_model_and_predict(example_data)

    # Risk bands: below 0.3 is high risk, below 0.7 medium risk, otherwise low risk
    risk_levels = []
    for pred in predictions:
        if pred < 0.3:
            risk_levels.append('High Risk')
        elif pred < 0.7:
            risk_levels.append('Medium Risk')
        else:
            risk_levels.append('Low Risk')

    # One row per scenario, combining key inputs with the prediction
    results_demo = pd.DataFrame({
        'Scenario': [f'Investment {i+1}' for i in range(len(predictions))],
        'Industry': example_data['industry'],
        'Funding (EGP)': example_data['funding_egp'],
        'Business Model': example_data['business_model'],
        'Traction': example_data['traction'],
        'Predicted Success Probability': [f'{pred:.4f}' for pred in predictions],
        'Success Percentage': [f'{pred*100:.2f}%' for pred in predictions],
        'Risk Level': risk_levels
    })

    print('📈 PREDICTION RESULTS:')
    print('=' * 80)
    display(results_demo)

    # Per-scenario breakdown
    print('\n📋 DETAILED ANALYSIS:')
    print('=' * 80)
    for pred, (_, row) in zip(predictions, results_demo.iterrows()):
        if pred < 0.3:
            risk_emoji = '🔴'
        elif pred < 0.7:
            risk_emoji = '🟡'
        else:
            risk_emoji = '🟢'
        print(f'{risk_emoji} {row["Scenario"]} - {row["Industry"]} ({row["Business Model"]})')
        print(f'   Success Probability: {pred:.4f} ({pred*100:.2f}%)')
        print(f'   Funding: {row["Funding (EGP)"]:,} EGP | Traction: {row["Traction"]} | Risk: {row["Risk Level"]}')
        print()

    # Recommendations grouped by risk band
    print('\n💡 INVESTMENT RECOMMENDATIONS:')
    print('=' * 80)

    high_potential = results_demo[results_demo['Risk Level'] == 'Low Risk']
    medium_potential = results_demo[results_demo['Risk Level'] == 'Medium Risk']
    low_potential = results_demo[results_demo['Risk Level'] == 'High Risk']

    recommendation_groups = (
        (high_potential, '🟢 RECOMMENDED INVESTMENTS (Low Risk):'),
        (medium_potential, '🟡 CONSIDER WITH CAUTION (Medium Risk):'),
        (low_potential, '🔴 NOT RECOMMENDED (High Risk):'),
    )
    for group, heading in recommendation_groups:
        if len(group) == 0:
            continue  # nothing to list for this band
        print(heading)
        for _, row in group.iterrows():
            print(f'   • {row["Scenario"]}: {row["Industry"]} - {row["Success Percentage"]} success rate')
        print()

    # Aggregate view over all scenarios
    print('📊 SUMMARY STATISTICS:')
    print('=' * 80)
    print(f'Average Success Probability: {predictions.mean():.4f} ({predictions.mean()*100:.2f}%)')
    print(f'Highest Success Probability: {predictions.max():.4f} ({predictions.max()*100:.2f}%)')
    print(f'Lowest Success Probability: {predictions.min():.4f} ({predictions.min()*100:.2f}%)')
    print(f'Standard Deviation: {predictions.std():.4f}')
    print(f'\nRisk Distribution:')
    print(f'  🟢 Low Risk (>70%): {len(high_potential)} investments')
    print(f'  🟡 Medium Risk (30-70%): {len(medium_potential)} investments')
    print(f'  🔴 High Risk (<30%): {len(low_potential)} investments')

except Exception as e:
    # Any failure is reported rather than raised, with a hint at the usual cause
    print(f'❌ Error making predictions: {e}')
    print('Make sure the model has been trained and saved first!')

print('\n✅ Demonstration completed!')
print('\n💡 TIP: You can modify the example_data DataFrame above to test different scenarios')

## 12. How to Use This Model in Production

### Step-by-Step Guide:

1. **Prepare your data** in the same format as the training data
2. **Load the model** using the `load_model_and_predict()` function
3. **Get predictions** for success probability
4. **Make investment decisions** based on the results

### Data Requirements:
- All numerical features must be present
- Categorical features: `industry`, `target_market`, `business_model`, `traction`
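- Unknown category values are mapped to a fallback category and missing feature columns are filled with 0 (a warning is printed in both cases), so supply complete data for reliable predictions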

### Example Usage:
```python
# Your new investment data
new_investment = pd.DataFrame({
    'industry': ['Technology'],
    'funding_egp': [750000],
    'equity_percentage': [20.0],
    # ... include all other required columns
})

# Get prediction
success_prob = load_model_and_predict(new_investment)
print(f'Success probability: {success_prob[0]:.4f}')
```

### Interpretation Guide:
- **≥ 0.70**: Low risk, high chance of success
- **0.30 to < 0.70**: Medium risk, requires careful evaluation
- **< 0.30**: High risk, not recommended without major improvements