In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read from this Colab runtime.
drive.mount("/content/drive")

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Path of the Excel workbook on the mounted Drive.
file_path = "/content/drive/MyDrive/Applied research project/Data_X/Final-V14-news_updated.xlsx"
# Read the workbook into the working DataFrame used by the rest of the notebook.
df = pd.read_excel(io=file_path)

In [None]:
### Feature engineering

# Parse 'ADJ_FOUNDED_ON' as datetimes (unparseable entries become NaT) and keep only the year.
founded_on = pd.to_datetime(df['ADJ_FOUNDED_ON'], errors='coerce')
df['founded_year'] = founded_on.dt.year

def extract_exited_year(value):
    """Return the exit year encoded in ``value`` as an int, or 0 when there is none.

    Parameters
    ----------
    value : scalar
        A single cell of the exit-date column: a date-like value, the marker
        string "not exited yet" (any case, surrounding whitespace ignored),
        or a missing/unparseable value.

    Returns
    -------
    int
        The calendar year of the parsed date, or 0 when the startup has not
        exited or the value cannot be parsed as a date.
    """
    # Explicit marker for startups that have not exited.
    if isinstance(value, str) and value.strip().lower() == "not exited yet":
        return 0

    try:
        # errors='coerce' turns unparseable input into NaT instead of raising.
        year = pd.to_datetime(value, errors='coerce').year
        # NaT.year is NaN (no exception is raised), so it must be mapped to 0
        # explicitly; otherwise NaN would leak out of this function.
        if pd.isna(year):
            return 0
        return int(year)
    except Exception:
        # e.g. to_datetime(None) returns None, which has no `.year`.
        return 0

# Derive the 'exit_year' column from the raw exit-date column.
df['exit_year'] = df['ADJ_EXITED_ON'].apply(extract_exited_year)


def _to_ordinal_or_keep(x):
    """Map a non-null timestamp to its proleptic Gregorian ordinal; pass nulls through."""
    return x.toordinal() if pd.notnull(x) else x


# Turn every datetime64 column into a numeric (ordinal) column before X and y are built.
datetime_cols = df.select_dtypes(include=['datetime64']).columns
for col in datetime_cols:
    df[col] = df[col].apply(_to_ordinal_or_keep)

# log(1 + x) transform to reduce skew on the money columns.
# ('Toatl_Money_Raised' is spelled that way in the source column name.)
df['Average_Money_Raised_log'] = np.log(1 + df['Average_Money_Raised'])
df['Total_Money_Raised_log'] = np.log(1 + df['Toatl_Money_Raised'])
# Ratio of the two log features; the small epsilon avoids division by zero.
df['Money_Raised_Ratio'] = df['Average_Money_Raised_log'].div(df['Total_Money_Raised_log'] + 1e-8)

# --- Diagnostics: missing values -------------------------------------------
null_mask = df.isnull()
total_nans = null_mask.sum().sum()
print("Total NaNs in DataFrame:", total_nans)

# Names of all columns that contain at least one NaN.
nan_columns = df.columns[null_mask.any()].tolist()
print("Columns with NaN values:", nan_columns)

# Treat empty / whitespace-only strings as missing and repeat the check on a copy.
df_cleaned = df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)
nan_columns_after_replace = df_cleaned.columns[df_cleaned.isnull().any()].tolist()
print("Columns with NaN values after replacing empty strings:", nan_columns_after_replace)

# --- Diagnostics: infinite values in numeric columns -----------------------
numeric_cols = df.select_dtypes(include=[np.number])
inf_counts = numeric_cols.isin([np.inf, -np.inf]).sum()
inf_columns = inf_counts[inf_counts > 0].index.tolist()
print("Columns with infinite values:", inf_columns)



In [None]:
# Replace every remaining NaN with 0. This applies to all columns of the
# DataFrame, not only the investment columns.
df = df.fillna(0)

In [None]:
# New feature: average number of news items per year of existence, i.e.
# Total_News / (current year - founding year + 1).
# NOTE(review): the current year is read from the system clock, so this value
# changes when the notebook is re-run in a later year. Rows whose founding year
# was filled with 0 upstream get a span of current_year + 1 years; confirm that
# this is intended.
current_year = pd.Timestamp.now().year

years_active = current_year - df['founded_year'] + 1
df['Yearly_Exposure'] = df['Total_News'] / years_active

In [None]:
# Binary target: 1 when (Positive + Very Positive) news counts strictly exceed
# (Negative + Very Negative) news counts, else 0.
positive_news = df['SUM_Sentiment_Category_Positive'] + df['SUM_Sentiment_Category_Very Positive']
negative_news = df['SUM_Sentiment_Category_Negative'] + df['SUM_Sentiment_Category_Very Negative']
df['High_Positive_News_Rule2'] = positive_news.gt(negative_news).astype(int)

# Class distribution of the new target (count and share of all rows).
class_counts = df['High_Positive_News_Rule2'].value_counts()
new_dist = class_counts.rename_axis('Class').reset_index(name='Count')
new_dist['Proportion'] = new_dist['Count'].div(len(df)).round(3)
new_dist

In [None]:
# Keep only companies with at least one news item, then recompute the class distribution.
has_news = df['Total_News'].ne(0)
df = df[has_news]

class_counts = df['High_Positive_News_Rule2'].value_counts()
new_dist = class_counts.rename_axis('Class').reset_index(name='Count')
new_dist['Proportion'] = new_dist['Count'].div(len(df)).round(3)
new_dist

In [None]:
# Drop rows whose 'Total_News' is 0 (a no-op if the previous cell already ran).
df = df[df['Total_News'] != 0]

# Columns excluded from the feature matrix: the target itself, the raw and
# log money columns, raw date columns, the company identifier, the news total,
# and the acquisition-type columns.
excluded_columns = [
    'High_Positive_News_Rule2',
    'Average_Money_Raised', 'Toatl_Money_Raised',
    'Total_Money_Raised_log', 'Average_Money_Raised_log',
    'ADJ_FOUNDED_ON', 'ADJ_EXITED_ON', 'ADJ_CLOSED_ON',
    'COMPANY_ID', 'Total_News',
    'Acquihire', 'Acquisition', 'Leveraged Buyout', 'Merger', 'No Acquisition Type',
    'Acquihire_dummy', 'Acquisition_dummy', 'Leveraged Buyout_dummy',
    'Merger_dummy', 'NoAcquisitionType_dummy',
]
X = df.drop(columns=excluded_columns)

# The target is derived from the sentiment counts, so every sentiment column
# is removed from the features as well.
sentiment_cols = [col for col in X.columns if 'Sentiment_Category' in col]
X = X.drop(columns=sentiment_cols)

# Target vector.
y = df['High_Positive_News_Rule2']

In [None]:
# Print the name of every feature column that remains in X, one per line.
for col in X.columns.tolist():
    print(col)

In [None]:
import numpy as np
import pprint
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb
from imblearn.over_sampling import SMOTE

# -----------------------------------------------------------------------------------
# Helper function: compute metrics for given true labels, predictions, and probability estimates.
def get_metrics(y_true, y_pred, y_proba):
    """Compute binary-classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_proba : array-like or None
        Scores for the positive class, used for ROC AUC. When None, the AUC
        is reported as NaN.

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall', 'f1' and 'auc'. Precision,
        recall and F1 use ``zero_division=0``.
    """
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'auc': roc_auc_score(y_true, y_proba) if y_proba is not None else np.nan,
    }

# -----------------------------------------------------------------------------------
# Core function: evaluate a model using cross validation with a given balancing strategy.
def _uses_builtin_class_weighting(model):
    """Return True if the estimator's own hyperparameters already reweight classes."""
    try:
        params = model.get_params()
    except Exception:
        # Not a scikit-learn style estimator; assume no built-in weighting.
        return False
    for name, value in params.items():
        # Nested estimators expose parameters as 'step__param'; look at the leaf name.
        leaf = name.split('__')[-1]
        if leaf == 'class_weight' and value is not None:
            return True
        if leaf == 'scale_pos_weight' and value is not None and value != 1:
            return True
    return False


def _positive_class_scores(model, X):
    """Return positive-class scores via predict_proba, else decision_function, else None."""
    try:
        return model.predict_proba(X)[:, 1]
    except Exception:
        pass
    try:
        return model.decision_function(X)
    except Exception:
        return None


def evaluate_model(model, X, y, balancing, cv_splits=5, random_state=42):
    """Cross-validate ``model`` with a class-balancing strategy applied to the training folds.

    Parameters
    ----------
    model : estimator
        Object with ``fit`` / ``predict``; it is refitted on every fold.
    X, y : pandas objects or numpy arrays
        Feature matrix and binary target.
    balancing : str
        One of 'none', 'smote', or 'class_weight'.
        * 'smote' oversamples the training fold only; the test fold is untouched.
          The in-sample metrics are therefore computed on the resampled fold.
        * 'class_weight' passes balanced per-sample weights to ``fit`` unless the
          estimator is an MLPClassifier (treated as not accepting them) or already
          carries its own class weighting (``class_weight`` / ``scale_pos_weight``).
          scikit-learn multiplies ``class_weight`` with ``sample_weight``, so
          supplying both would apply the correction twice.
    cv_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 42
        Seed for the fold shuffling and for SMOTE.

    Returns
    -------
    (dict, dict)
        Fold-averaged in-sample and out-of-sample metrics, each with the keys
        produced by ``get_metrics``.
    """
    skf = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=random_state)
    train_metrics_list = []
    test_metrics_list = []

    # Decided once: explicit sample weights are only needed when the estimator
    # does not already balance the classes itself.
    use_sample_weights = (
        balancing == 'class_weight'
        and not isinstance(model, MLPClassifier)
        and not _uses_builtin_class_weighting(model)
    )

    for train_index, test_index in skf.split(X, y):
        # pandas objects are sliced positionally with .iloc; numpy arrays directly.
        try:
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        except AttributeError:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

        # Oversample the training fold only, so no synthetic rows reach the test fold.
        if balancing == 'smote':
            sm = SMOTE(random_state=random_state)
            X_train, y_train = sm.fit_resample(X_train, y_train)

        if use_sample_weights:
            classes = np.unique(y_train)
            weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
            class_weight_dict = dict(zip(classes, weights))
            sample_weights = np.array([class_weight_dict[label] for label in y_train])
            model.fit(X_train, y_train, sample_weight=sample_weights)
        else:
            model.fit(X_train, y_train)

        # In-sample (training) metrics.
        train_metrics_list.append(
            get_metrics(y_train, model.predict(X_train), _positive_class_scores(model, X_train))
        )
        # Out-of-sample (held-out fold) metrics.
        test_metrics_list.append(
            get_metrics(y_test, model.predict(X_test), _positive_class_scores(model, X_test))
        )

    # Average each metric across folds.
    avg_train_metrics = {k: np.mean([m[k] for m in train_metrics_list]) for k in train_metrics_list[0]}
    avg_test_metrics = {k: np.mean([m[k] for m in test_metrics_list]) for k in test_metrics_list[0]}
    return avg_train_metrics, avg_test_metrics

# -----------------------------------------------------------------------------------
# Main function: define all models and run evaluations for each balancing strategy.
def run_all_models(X, y):
    """Evaluate every model under every balancing strategy with cross-validation.

    Parameters
    ----------
    X, y
        Feature matrix and binary target, passed unchanged to ``evaluate_model``.

    Returns
    -------
    dict
        ``results[model_name][balancing] = {'in_sample': ..., 'out_sample': ...}``
        for balancing in 'none', 'smote', 'class_weight'.

    Notes
    -----
    * The class-weighted variant of each model differs from its base variant
      only in the weighting option, so the comparison between strategies is
      like for like (in particular, both LogisticRegression variants use
      ``max_iter=2000``).
    * 'NeuralNetwork' has no class-weighted variant; under 'class_weight' the
      base MLP is evaluated.
    """
    # XGBoost has no class_weight option; it balances through the
    # negative-to-positive ratio instead (computed on the full target).
    pos = np.sum(y == 1)
    neg = np.sum(y == 0)
    scale_pos_weight = neg / pos if pos > 0 else 1

    models = {
        'RandomForest': {
            'base': RandomForestClassifier(random_state=42),
            'class_weight': RandomForestClassifier(class_weight='balanced', random_state=42)
        },
        'LogisticRegression': {
            'base': LogisticRegression(max_iter=2000, random_state=42),
            'class_weight': LogisticRegression(class_weight='balanced', max_iter=2000, random_state=42)
        },
        'SVM': {
            'base': SVC(probability=True, random_state=42),
            'class_weight': SVC(probability=True, class_weight='balanced', random_state=42)
        },
        'XGBoost': {
            'base': xgb.XGBClassifier(eval_metric='logloss', random_state=42),
            'class_weight': xgb.XGBClassifier(eval_metric='logloss',
                                              random_state=42,
                                              scale_pos_weight=scale_pos_weight)
        },
        'NeuralNetwork': {
            'base': MLPClassifier(max_iter=500, random_state=42)
        }
    }

    balancing_methods = ['none', 'smote', 'class_weight']
    results = {}

    for model_name, variants in models.items():
        results[model_name] = {}
        for balancing in balancing_methods:
            # Use the class-weighted variant when one exists, else the base model.
            if balancing == 'class_weight':
                template = variants.get('class_weight', variants['base'])
            else:
                template = variants['base']

            # A fresh clone per run keeps fitted state from leaking between strategies.
            model_instance = clone(template)
            train_metrics, test_metrics = evaluate_model(model_instance, X, y, balancing, cv_splits=5)
            results[model_name][balancing] = {
                'in_sample': train_metrics,
                'out_sample': test_metrics
            }

    return results

In [None]:
# -----------------------------------------------------------------------------------
# Main execution:

# Run the full model / balancing comparison and pretty-print the nested result dict.
results = run_all_models(X, y)
pprint.PrettyPrinter().pprint(results)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Flatten the nested results dict into one row per (model, balancing, sample).
records = []
for model, model_data in results.items():
    for strategy, strat_data in model_data.items():
        for sample in ['in_sample', 'out_sample']:
            record = {
                'Model': model,
                'Balancing': strategy,
                'Sample': sample
            }
            # Store every metric as a plain float.
            record.update({metric: float(val) for metric, val in strat_data[sample].items()})
            records.append(record)

# Kept under its own name so that the dataset DataFrame `df` used by the
# earlier cells is not overwritten and those cells stay re-runnable.
results_df = pd.DataFrame(records)

# Combined label for the x-axis.
results_df['Model_Balancing'] = results_df['Model'] + ' - ' + results_df['Balancing']

# Metrics to visualize.
metrics = ['accuracy', 'auc', 'f1', 'precision', 'recall']

sns.set(style="whitegrid")


def _plot_metric_by_model(plot_func, metric, **plot_kwargs):
    """Draw one figure for `metric` with `plot_func`, split into in-/out-of-sample."""
    plt.figure(figsize=(12, 6))
    plot_func(data=results_df, x='Model_Balancing', y=metric, hue='Sample', **plot_kwargs)
    plt.title(f"{metric.capitalize()} by Model and Balancing Strategy")
    plt.xlabel("Model and Balancing Strategy")
    plt.ylabel(metric.capitalize())
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# One bar plot per metric.
for metric in metrics:
    _plot_metric_by_model(sns.barplot, metric)

# One line plot per metric.
for metric in metrics:
    _plot_metric_by_model(sns.lineplot, metric, marker='o')


In [None]:
# Feature selection and importance analysis
# Split into training and test sets.
from sklearn.model_selection import train_test_split
# Stratified 80/20 hold-out split with a fixed seed.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ------------------------------
# Tree-Based Feature Importance with RandomForest
# ------------------------------
from sklearn.ensemble import RandomForestClassifier

# Fit a default RandomForest on the training split and rank features by
# their impurity-based importance.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Importance table, most important feature first.
importances = rf.feature_importances_
feature_names = X_train.columns
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("Top 20 features by RandomForest:")
print(importance_df.head(20))
print("\nLast 20 features by RandomForest:")
print(importance_df.tail(20))


# Plot the top 10 features.
plt.figure(figsize=(12, 6))
plt.barh(importance_df['Feature'].head(10), importance_df['Importance'].head(10), color='skyblue')
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Top 10 Feature Importances using RandomForest")
plt.gca().invert_yaxis()  # Most important feature at the top
plt.show()

# Plot the bottom 10 features.
plt.figure(figsize=(12, 6))
plt.barh(importance_df['Feature'].tail(10), importance_df['Importance'].tail(10), color='skyblue')
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Bottom 10 Feature Importances using RandomForest")
plt.gca().invert_yaxis()
plt.show()

# Plot the top 10 features next to the summed importance of the bottom 50% of
# features ("Others"). The bottom half is sized from the number of features,
# so the bar matches the "Bottom 50%" title.
n_bottom_half = len(importance_df) // 2
plt.figure(figsize=(12, 6))
plt.barh(importance_df['Feature'].head(10), importance_df['Importance'].head(10), color='skyblue')
others_importance = importance_df['Importance'].tail(n_bottom_half).sum()
plt.barh('Others', others_importance, color='lightgray')
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Top 10 and Bottom 50% Feature Importances using RandomForest")
plt.gca().invert_yaxis()
plt.show()

# Plot the cumulative importance of the top 10, top 30 and top 100 features,
# plus the sum of everything beyond the top 100 ("Others").
plt.figure(figsize=(12, 6))
top_10 = importance_df['Importance'].head(10).sum()
top_30 = importance_df['Importance'].head(30).sum()
top_100 = importance_df['Importance'].head(100).sum()
others_importance = importance_df['Importance'].sum() - top_100
plt.barh('Top 10', top_10, color='darkred')
plt.barh('Top 30', top_30, color='darkred')
plt.barh('Top 100', top_100, color='darkred')
plt.barh('Others', others_importance, color='lightgray')
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Aggregated Feature Importances using RandomForest")
plt.gca().invert_yaxis()
plt.show()

# ------------------------------
# 3. Recursive Feature Elimination (RFE) with Logistic Regression
# ------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Use Logistic Regression as the estimator; increase max_iter if needed.
# Recursive feature elimination with Logistic Regression as the estimator;
# increase max_iter if the solver does not converge.
lr = LogisticRegression(max_iter=1000, random_state=42)
# Keep the 30 best features (adjust n_features_to_select as needed).
rfe = RFE(estimator=lr, n_features_to_select=30)
rfe.fit(X_train, y_train)

# rfe.support_ marks the selected features; rfe.ranking_ is 1 for all of them
# and increases for features eliminated earlier.
selected_features = X_train.columns[rfe.support_]
ranking_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Ranking': rfe.ranking_
}).sort_values(by='Ranking')

print("Top 30 features selected by RFE (lower ranking is better):")
print(ranking_df.head(30))

# Plot the rankings of the top 10 features (the title matches the 10 rows drawn).
plt.figure(figsize=(12, 6))
plt.barh(ranking_df['Feature'].head(10), ranking_df['Ranking'].head(10), color='lightgreen')
plt.xlabel("Feature Ranking (Lower is better)")
plt.ylabel("Feature")
plt.title("Top 10 Feature Rankings using RFE with Logistic Regression")
plt.gca().invert_yaxis()
plt.show()

# ------------------------------
# 4. Statistical Feature Selection using SelectKBest (ANOVA F-test)
# ------------------------------
from sklearn.feature_selection import SelectKBest, f_classif

# Score every feature with the ANOVA F-test (k="all" keeps all of them, so this
# ranks rather than filters).
selector = SelectKBest(score_func=f_classif, k="all").fit(X_train, y_train)
scores, pvalues = selector.scores_, selector.pvalues_

# One row per feature, highest F-score first.
select_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Score': scores,
    'p-value': pvalues
})
select_df = select_df.sort_values(by='Score', ascending=False)

print("Top 30 features by SelectKBest (ANOVA F-test):")
print(select_df.head(30))

# Horizontal bar chart of the 20 highest-scoring features.
top_20_scores = select_df.head(20)
plt.figure(figsize=(12, 6))
plt.barh(top_20_scores['Feature'], top_20_scores['Score'], color='salmon')
plt.xlabel("F-score")
plt.ylabel("Feature")
plt.title("Top 20 Features by ANOVA F-test using SelectKBest")
plt.gca().invert_yaxis()
plt.show()

In [None]:
# =============================================================================
# 1. Tuning Random Forest
# =============================================================================
import numpy as np
import itertools
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# Define a small grid for RandomForest hyperparameters.
# Small hyperparameter grid for RandomForest.
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

balancing_methods = ['none', 'smote', 'class_weight']

rf_results = {}

# For each balancing strategy, exhaustively search the grid and keep the
# combination with the highest cross-validated out-of-sample AUC.
for balancing in balancing_methods:
    best_auc, best_params = -np.inf, None
    best_in_sample = best_out_sample = None

    for params in itertools.product(*param_grid_rf.values()):
        param_dict = {name: value for name, value in zip(param_grid_rf, params)}
        # Class-weight balancing is switched on through the estimator itself.
        if balancing == 'class_weight':
            param_dict.update(class_weight='balanced')
        rf_model = RandomForestClassifier(**param_dict, random_state=42)

        # evaluate_model applies the balancing strategy to the training folds only.
        in_sample_metrics, out_sample_metrics = evaluate_model(clone(rf_model), X, y, balancing, cv_splits=5)
        current_auc = out_sample_metrics['auc']

        # Only a strict improvement replaces the incumbent (NaN never does).
        if not (current_auc > best_auc):
            continue
        best_auc, best_params = current_auc, param_dict
        best_in_sample, best_out_sample = in_sample_metrics, out_sample_metrics

    rf_results[balancing] = dict(
        best_params=best_params,
        in_sample=best_in_sample,
        out_sample=best_out_sample,
    )

print("RandomForest tuning results:")
print(rf_results)


In [None]:
# =============================================================================
# 2. Tuning XGB
# =============================================================================
import numpy as np
import itertools
from sklearn.base import clone
import xgboost as xgb

# Define a small grid for XGBoost hyperparameters.
# Small hyperparameter grid for XGBoost.
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.1, 0.01]
}

balancing_methods = ['none', 'smote', 'class_weight']
xgb_results = {}

# XGBoost balances classes through scale_pos_weight = negatives / positives.
# The ratio depends only on y, so it is computed once instead of once per
# grid point.
# NOTE(review): the ratio is taken over the full target, not per training fold.
pos = np.sum(y == 1)
neg = np.sum(y == 0)
scale_pos_weight = neg / pos if pos > 0 else 1

# For each balancing strategy, keep the combination with the highest
# cross-validated out-of-sample AUC.
for balancing in balancing_methods:
    best_auc = -np.inf
    best_params = None
    best_in_sample = None
    best_out_sample = None
    for params in itertools.product(*param_grid_xgb.values()):
        param_dict = dict(zip(param_grid_xgb.keys(), params))
        if balancing == 'class_weight':
            param_dict['scale_pos_weight'] = scale_pos_weight
        xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42, **param_dict)
        in_sample_metrics, out_sample_metrics = evaluate_model(clone(xgb_model), X, y, balancing, cv_splits=5)
        current_auc = out_sample_metrics['auc']
        # Only a strict improvement replaces the incumbent.
        if current_auc > best_auc:
            best_auc = current_auc
            best_params = param_dict
            best_in_sample = in_sample_metrics
            best_out_sample = out_sample_metrics
    xgb_results[balancing] = {
        'best_params': best_params,
        'in_sample': best_in_sample,
        'out_sample': best_out_sample
    }

print("XGBoost tuning results:")
print(xgb_results)


In [None]:
# =============================================================================
# 3. Tuning Logistic Regression
# =============================================================================
import numpy as np
import itertools
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression

# Define a small grid for Logistic Regression hyperparameters.
# Small hyperparameter grid for Logistic Regression.
param_grid_lr = {
    'C': [0.1, 1, 10],
    'max_iter': [2000]  # Raised to help the solver converge.
}

balancing_methods = ['none', 'smote', 'class_weight']
lr_results = {}

# For each balancing strategy, keep the combination with the highest
# cross-validated out-of-sample AUC.
for balancing in balancing_methods:
    best_auc, best_params = -np.inf, None
    best_in_sample = best_out_sample = None

    for params in itertools.product(*param_grid_lr.values()):
        param_dict = {name: value for name, value in zip(param_grid_lr, params)}
        if balancing == 'class_weight':
            param_dict.update(class_weight='balanced')
        lr_model = LogisticRegression(**param_dict, random_state=42)

        in_sample_metrics, out_sample_metrics = evaluate_model(clone(lr_model), X, y, balancing, cv_splits=5)
        current_auc = out_sample_metrics['auc']

        # Only a strict improvement replaces the incumbent (NaN never does).
        if not (current_auc > best_auc):
            continue
        best_auc, best_params = current_auc, param_dict
        best_in_sample, best_out_sample = in_sample_metrics, out_sample_metrics

    lr_results[balancing] = dict(
        best_params=best_params,
        in_sample=best_in_sample,
        out_sample=best_out_sample,
    )

print("Logistic Regression tuning results:")
print(lr_results)


In [None]:
# =============================================================================
# 4. Next-round tuning
# =============================================================================
import numpy as np
import itertools
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone

# Improved grid for RandomForest to reduce overfitting:
# Second-round RandomForest grid: shallower trees, larger split sizes and a
# max_features choice, aimed at reducing overfitting.
param_grid_rf_improved = {
    'n_estimators': [100, 200],
    'max_depth': [5, 8, 10],
    'min_samples_split': [2, 5, 10],
    'max_features': [None, 'sqrt']
}

balancing_methods = ['none', 'smote', 'class_weight']
rf_improved_results = {}

for balancing in balancing_methods:
    print(f"\nTuning RandomForest with balancing strategy: '{balancing}'")
    grid = list(itertools.product(*param_grid_rf_improved.values()))
    total_fits = len(grid)
    print(f"Total parameter combinations to evaluate: {total_fits}")

    best_auc, best_params = -np.inf, None
    best_in_sample = best_out_sample = None

    for params in grid:
        param_dict = {name: value for name, value in zip(param_grid_rf_improved, params)}
        if balancing == 'class_weight':
            param_dict.update(class_weight='balanced')

        rf_model = RandomForestClassifier(**param_dict, random_state=42)
        in_sample_metrics, out_sample_metrics = evaluate_model(clone(rf_model), X, y, balancing, cv_splits=5)
        current_auc = out_sample_metrics['auc']

        # Only a strict improvement in out-of-sample AUC replaces the incumbent.
        if not (current_auc > best_auc):
            continue
        best_auc, best_params = current_auc, param_dict
        best_in_sample, best_out_sample = in_sample_metrics, out_sample_metrics

    rf_improved_results[balancing] = dict(
        best_params=best_params,
        in_sample=best_in_sample,
        out_sample=best_out_sample,
    )

    # Report the winner for this balancing strategy.
    print(f"Best results for RandomForest with balancing '{balancing}':")
    print(f"  Best parameters: {best_params}")
    for heading, scores in (("  In-sample metrics:", best_in_sample),
                            ("  Out-of-sample metrics:", best_out_sample)):
        print(heading)
        for metric, value in scores.items():
            print(f"    {metric}: {value:.4f}")

print("\nFinal Improved RandomForest Tuning Results:")
print(rf_improved_results)

import numpy as np
import itertools
import xgboost as xgb
from sklearn.base import clone

# Improved grid for XGBoost with additional regularization parameters.
# Second-round XGBoost grid with additional regularization parameters.
param_grid_xgb_improved = {
    'n_estimators': [50, 100],
    'max_depth': [2, 3, 4],
    'learning_rate': [0.01, 0.1],
    'reg_lambda': [0, 1],  # L2 regularization parameter.
    'reg_alpha': [0, 0.1]   # L1 regularization parameter.
}

balancing_methods = ['none', 'smote', 'class_weight']
xgb_improved_results = {}

# Both the expanded grid and the class ratio are the same for every balancing
# strategy and grid point, so they are computed once up front.
# NOTE(review): the ratio is taken over the full target, not per training fold.
grid = list(itertools.product(*param_grid_xgb_improved.values()))
total_fits = len(grid)
pos = np.sum(y == 1)
neg = np.sum(y == 0)
scale_pos_weight = neg / pos if pos > 0 else 1

for balancing in balancing_methods:
    print(f"\nTuning XGBoost with balancing strategy: '{balancing}'")
    print(f"Total parameter combinations to evaluate: {total_fits}")

    best_auc = -np.inf
    best_params = None
    best_in_sample = None
    best_out_sample = None

    for params in grid:
        param_dict = dict(zip(param_grid_xgb_improved.keys(), params))
        if balancing == 'class_weight':
            param_dict['scale_pos_weight'] = scale_pos_weight

        xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42, **param_dict)
        in_sample_metrics, out_sample_metrics = evaluate_model(clone(xgb_model), X, y, balancing, cv_splits=5)
        current_auc = out_sample_metrics['auc']

        # Only a strict improvement in out-of-sample AUC replaces the incumbent.
        if current_auc > best_auc:
            best_auc = current_auc
            best_params = param_dict
            best_in_sample = in_sample_metrics
            best_out_sample = out_sample_metrics

    xgb_improved_results[balancing] = {
        'best_params': best_params,
        'in_sample': best_in_sample,
        'out_sample': best_out_sample
    }

    # Report the winner for this balancing strategy.
    print(f"Best results for XGBoost with balancing '{balancing}':")
    print(f"  Best parameters: {best_params}")
    print("  In-sample metrics:")
    for metric, value in best_in_sample.items():
        print(f"    {metric}: {value:.4f}")
    print("  Out-of-sample metrics:")
    for metric, value in best_out_sample.items():
        print(f"    {metric}: {value:.4f}")

print("\nFinal Improved XGBoost Tuning Results:")
print(xgb_improved_results)


In [None]:
# =============================================================================
# 5. Faster tuning round (reduced grids)
# =============================================================================
import numpy as np
import itertools
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone

# Reduced grid for RandomForest
# Reduced RandomForest grid for a faster search: shallow trees only.
param_grid_rf_fast = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [3, 5],
    'min_samples_split': [5, 10],
    'max_features': [None, 'sqrt']
}

balancing_methods = ['none', 'smote', 'class_weight']
rf_fast_results = {}

for balancing in balancing_methods:
    print(f"\nTuning RandomForest with balancing strategy: '{balancing}'")
    grid = list(itertools.product(*param_grid_rf_fast.values()))
    total_fits = len(grid)
    print(f"Total parameter combinations to evaluate: {total_fits}")

    best_auc, best_params = -np.inf, None
    best_in_sample = best_out_sample = None

    for params in grid:
        param_dict = {name: value for name, value in zip(param_grid_rf_fast, params)}
        if balancing == 'class_weight':
            param_dict.update(class_weight='balanced')

        rf_model = RandomForestClassifier(**param_dict, random_state=42)
        in_sample_metrics, out_sample_metrics = evaluate_model(clone(rf_model), X, y, balancing, cv_splits=5)
        current_auc = out_sample_metrics['auc']

        # Only a strict improvement in out-of-sample AUC replaces the incumbent.
        if not (current_auc > best_auc):
            continue
        best_auc, best_params = current_auc, param_dict
        best_in_sample, best_out_sample = in_sample_metrics, out_sample_metrics

    rf_fast_results[balancing] = dict(
        best_params=best_params,
        in_sample=best_in_sample,
        out_sample=best_out_sample,
    )

    # Report the winner for this balancing strategy.
    print(f"\nBest results for RandomForest with balancing '{balancing}':")
    print(f"  Best parameters: {best_params}")
    for heading, scores in (("  In-sample metrics:", best_in_sample),
                            ("  Out-of-sample metrics:", best_out_sample)):
        print(heading)
        for metric, value in scores.items():
            print(f"    {metric}: {value:.4f}")

print("\nFinal Faster RandomForest Tuning Results:")
print(rf_fast_results)

import numpy as np
import itertools
import xgboost as xgb
from sklearn.base import clone

# Reduced grid for XGBoost (16 combinations, ignoring reg_alpha for speed)
# Reduced XGBoost grid (2 * 2 * 2 * 2 = 16 combinations; reg_alpha is left
# out for speed).
param_grid_xgb_fast = {
    'n_estimators': [50, 100],
    'max_depth': [2, 3],
    'learning_rate': [0.01, 0.1],
    'reg_lambda': [0, 1]
}

balancing_methods = ['none', 'smote', 'class_weight']
xgb_fast_results = {}

# Both the expanded grid and the class ratio are the same for every balancing
# strategy and grid point, so they are computed once up front.
# NOTE(review): the ratio is taken over the full target, not per training fold.
grid = list(itertools.product(*param_grid_xgb_fast.values()))
total_fits = len(grid)
pos = np.sum(y == 1)
neg = np.sum(y == 0)
scale_pos_weight = neg / pos if pos > 0 else 1

for balancing in balancing_methods:
    print(f"\nTuning XGBoost with balancing strategy: '{balancing}'")
    print(f"Total parameter combinations to evaluate: {total_fits}")

    best_auc = -np.inf
    best_params = None
    best_in_sample = None
    best_out_sample = None

    for params in grid:
        param_dict = dict(zip(param_grid_xgb_fast.keys(), params))
        if balancing == 'class_weight':
            param_dict['scale_pos_weight'] = scale_pos_weight

        xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42, **param_dict)
        in_sample_metrics, out_sample_metrics = evaluate_model(clone(xgb_model), X, y, balancing, cv_splits=5)
        current_auc = out_sample_metrics['auc']

        # Only a strict improvement in out-of-sample AUC replaces the incumbent.
        if current_auc > best_auc:
            best_auc = current_auc
            best_params = param_dict
            best_in_sample = in_sample_metrics
            best_out_sample = out_sample_metrics

    xgb_fast_results[balancing] = {
        'best_params': best_params,
        'in_sample': best_in_sample,
        'out_sample': best_out_sample
    }

    # Report the winner for this balancing strategy.
    print(f"\nBest results for XGBoost with balancing '{balancing}':")
    print(f"  Best parameters: {best_params}")
    print("  In-sample metrics:")
    for metric, value in best_in_sample.items():
        print(f"    {metric}: {value:.4f}")
    print("  Out-of-sample metrics:")
    for metric, value in best_out_sample.items():
        print(f"    {metric}: {value:.4f}")

print("\nFinal Faster XGBoost Tuning Results:")
print(xgb_fast_results)


In [None]:
import numpy as np
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import Pipeline

# Custom evaluation function for Logistic Regression pipelines.
def _lr_take_rows(data, row_idx):
    """Return the rows of `data` at the integer positions in `row_idx`.

    Objects exposing `.iloc` (pandas) are indexed positionally through it;
    anything else is indexed directly (NumPy-style). X and y are handled
    independently, so a DataFrame X combined with an array y works.
    """
    if hasattr(data, 'iloc'):
        return data.iloc[row_idx]
    return data[row_idx]


def _lr_split_metrics(model, X_part, y_part):
    """Score an already-fitted model on one split and return a metrics dict."""
    y_pred = model.predict(X_part)
    try:
        y_proba = model.predict_proba(X_part)[:, 1]
    except Exception:
        # Best effort (unchanged behaviour): if probabilities cannot be
        # produced, the AUC for this split is reported as NaN.
        y_proba = None
    return {
        'accuracy': accuracy_score(y_part, y_pred),
        'precision': precision_score(y_part, y_pred, zero_division=0),
        'recall': recall_score(y_part, y_pred, zero_division=0),
        'f1': f1_score(y_part, y_pred, zero_division=0),
        'auc': roc_auc_score(y_part, y_proba) if y_proba is not None else np.nan
    }


def evaluate_model_lr(model, X, y, balancing, cv_splits=5, random_state=42):
    """Cross-validate `model` with stratified k-fold and average the metrics.

    Parameters
    ----------
    model : estimator or pipeline with fit / predict (and ideally
        predict_proba). It is fitted in place on every fold.
    X, y : features and labels; each may be a pandas object or an array.
    balancing : str
        'smote' oversamples each training fold with SMOTE before fitting.
        Any other value (including 'class_weight') leaves the training fold
        untouched: class weighting is expected to be configured on the
        estimator itself, since no sample_weight is passed to fit.
    cv_splits : int
        Number of stratified folds.
    random_state : int
        Seed for the fold shuffling and for SMOTE.

    Returns
    -------
    (avg_train_metrics, avg_test_metrics) : tuple of dict
        Mean accuracy, precision, recall, f1 and auc across folds.
    """
    skf = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=random_state)
    train_metrics_list = []
    test_metrics_list = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = _lr_take_rows(X, train_index), _lr_take_rows(X, test_index)
        y_train, y_test = _lr_take_rows(y, train_index), _lr_take_rows(y, test_index)

        if balancing == 'smote':
            # Resample the training fold only; the test fold stays untouched.
            sm = SMOTE(random_state=random_state)
            X_train, y_train = sm.fit_resample(X_train, y_train)

        model.fit(X_train, y_train)

        # NOTE(review): with 'smote' the in-sample metrics are computed on the
        # resampled training fold (synthetic rows included), as before.
        train_metrics_list.append(_lr_split_metrics(model, X_train, y_train))
        test_metrics_list.append(_lr_split_metrics(model, X_test, y_test))

    avg_train_metrics = {k: np.mean([m[k] for m in train_metrics_list]) for k in train_metrics_list[0]}
    avg_test_metrics = {k: np.mean([m[k] for m in test_metrics_list]) for k in test_metrics_list[0]}
    return avg_train_metrics, avg_test_metrics

# Reduced grid for Logistic Regression: 4 x 3 x 2 = 24 combinations.
param_grid_lr_fast = dict(
    C=[0.001, 0.1, 1, 10],
    max_iter=[3000, 4000, 5000],
    solver=['lbfgs', 'saga'],
)

balancing_methods = ['none', 'smote', 'class_weight']
lr_fast_results = {}

for balancing in balancing_methods:
    print(f"\nTuning Logistic Regression with balancing strategy: '{balancing}'")
    grid = list(itertools.product(*param_grid_lr_fast.values()))
    total_fits = len(grid)
    print(f"Total parameter combinations to evaluate: {total_fits}")

    # Best candidate seen so far, ranked by mean out-of-sample AUC.
    best_auc, best_params = -np.inf, None
    best_in_sample = best_out_sample = None

    for params in grid:
        param_dict = {key: value for key, value in zip(param_grid_lr_fast, params)}

        # Class weighting is switched on at the estimator level; it is not
        # part of param_dict, so it does not show up in best_params.
        weighting = {'class_weight': 'balanced'} if balancing == 'class_weight' else {}
        lr_model = LogisticRegression(random_state=42, **weighting, **param_dict)

        # Standardise the features before the linear model.
        pipe = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('lr', lr_model),
        ])

        in_sample_metrics, out_sample_metrics = evaluate_model_lr(
            clone(pipe), X, y, balancing, cv_splits=5
        )
        current_auc = out_sample_metrics['auc']

        # Strict ">" keeps the earliest candidate on ties; a NaN AUC never wins.
        if not current_auc > best_auc:
            continue
        best_auc, best_params = current_auc, param_dict
        best_in_sample, best_out_sample = in_sample_metrics, out_sample_metrics

    lr_fast_results[balancing] = dict(
        best_params=best_params,
        in_sample=best_in_sample,
        out_sample=best_out_sample,
    )

    print(f"\nBest results for Logistic Regression with balancing '{balancing}':")
    print(f"  Best parameters: {best_params}")
    for heading, scores in (("  In-sample metrics:", best_in_sample),
                            ("  Out-of-sample metrics:", best_out_sample)):
        print(heading)
        for metric, value in scores.items():
            print(f"    {metric}: {value:.4f}")

print("\nFinal Faster Logistic Regression Tuning Results:")
print(lr_fast_results)

In [None]:
import numpy as np
import itertools
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone

# Final RandomForest search space: 3 x 3 x 2 x 3 = 54 candidates.
param_grid_rf_final = dict(
    n_estimators=[50, 100, 150],
    max_depth=[2, 3, 4],
    min_samples_split=[5, 10],
    max_features=[None, 'sqrt', 'log2'],
)

balancing_methods = ['none', 'smote', 'class_weight']
rf_final_results = {}

for balancing in balancing_methods:
    print(f"\nFinal Tuning: RandomForest with balancing strategy: '{balancing}'")
    grid = list(itertools.product(*param_grid_rf_final.values()))
    total_fits = len(grid)
    print(f"Total parameter combinations to evaluate: {total_fits}")

    # Best candidate seen so far, ranked by mean out-of-sample AUC.
    best_auc, best_params = -np.inf, None
    best_in_sample = best_out_sample = None

    for params in grid:
        param_dict = {key: value for key, value in zip(param_grid_rf_final, params)}
        if balancing == 'class_weight':
            # Stored in param_dict, so it is also reported in best_params.
            param_dict['class_weight'] = 'balanced'

        rf_model = RandomForestClassifier(random_state=42, **param_dict)
        in_sample_metrics, out_sample_metrics = evaluate_model(
            clone(rf_model), X, y, balancing, cv_splits=5
        )
        current_auc = out_sample_metrics['auc']

        # Strict ">" keeps the earliest candidate on ties; a NaN AUC never wins.
        if not current_auc > best_auc:
            continue
        best_auc, best_params = current_auc, param_dict
        best_in_sample, best_out_sample = in_sample_metrics, out_sample_metrics

    rf_final_results[balancing] = dict(
        best_params=best_params,
        in_sample=best_in_sample,
        out_sample=best_out_sample,
    )

    print(f"\nBest results for RandomForest with balancing '{balancing}':")
    print(f"  Best parameters: {best_params}")
    for heading, scores in (("  In-sample metrics:", best_in_sample),
                            ("  Out-of-sample metrics:", best_out_sample)):
        print(heading)
        for metric, value in scores.items():
            print(f"    {metric}: {value:.4f}")

print("\nFinal RandomForest Tuning Results:")
print(rf_final_results)


In [None]:
import numpy as np
import itertools
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone

# Narrow MLPClassifier search space: 2 x 2 x 2 = 8 candidates.
param_grid_nn_final = dict(
    hidden_layer_sizes=[(50,), (100,)],
    activation=['relu', 'tanh'],
    alpha=[0.0001, 0.001],
)

# Only "none" and "smote" are tried for the network, since class_weight is
# not supported.
balancing_methods_nn = ['none', 'smote']
nn_final_results = {}

for balancing in balancing_methods_nn:
    print(f"\nFinal Tuning: Neural Network (MLPClassifier) with balancing strategy: '{balancing}'")
    grid = list(itertools.product(*param_grid_nn_final.values()))
    total_fits = len(grid)
    print(f"Total parameter combinations to evaluate: {total_fits}")

    # Best candidate seen so far, ranked by mean out-of-sample AUC.
    best_auc, best_params = -np.inf, None
    best_in_sample = best_out_sample = None

    for params in grid:
        param_dict = {key: value for key, value in zip(param_grid_nn_final, params)}
        nn_model = MLPClassifier(random_state=42, max_iter=500, **param_dict)

        # Standardise the features before they reach the network.
        pipe = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('nn', nn_model),
        ])

        in_sample_metrics, out_sample_metrics = evaluate_model(
            clone(pipe), X, y, balancing, cv_splits=5
        )
        current_auc = out_sample_metrics['auc']

        # Strict ">" keeps the earliest candidate on ties; a NaN AUC never wins.
        if not current_auc > best_auc:
            continue
        best_auc, best_params = current_auc, param_dict
        best_in_sample, best_out_sample = in_sample_metrics, out_sample_metrics

    nn_final_results[balancing] = dict(
        best_params=best_params,
        in_sample=best_in_sample,
        out_sample=best_out_sample,
    )

    print(f"\nBest results for Neural Network with balancing '{balancing}':")
    print(f"  Best parameters: {best_params}")
    for heading, scores in (("  In-sample metrics:", best_in_sample),
                            ("  Out-of-sample metrics:", best_out_sample)):
        print(heading)
        for metric, value in scores.items():
            print(f"    {metric}: {value:.4f}")

print("\nFinal Neural Network Tuning Results:")
print(nn_final_results)


In [None]:
# -------------------------------
# Section 1: Further Tuning Experiments
# -------------------------------

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier

# Helper function to compute metrics.
def get_metrics(y_true, y_pred, y_proba):
    """Bundle the standard binary-classification scores into one dict.

    y_pred holds hard class predictions; y_proba holds positive-class scores
    used for the AUC, or None, in which case 'auc' is NaN. Precision, recall
    and f1 use zero_division=0.
    """
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'auc': np.nan if y_proba is None else roc_auc_score(y_true, y_proba),
    }

# Evaluation function: using stratified k-fold CV.
def evaluate_pipeline(estimator, X, y, cv_splits=5):
    """Cross-validate `estimator` and return averaged train/test metrics.

    A fresh clone of `estimator` is fitted on every fold, so the object passed
    in is never refitted: a caller holding a model trained on the full data
    keeps that fit.

    Parameters
    ----------
    estimator : scikit-learn compatible estimator or pipeline.
    X, y : features and labels; each may be a pandas object or an array
        (they are indexed independently, so mixed types are fine).
    cv_splits : int
        Number of stratified folds (shuffled, random_state=42).

    Returns
    -------
    (avg_train, avg_test) : tuple of dict
        Mean of each get_metrics entry across folds.
    """
    # Imported locally because this cell's import list does not include it.
    from sklearn.base import clone

    def take_rows(data, row_idx):
        # Positional row selection for pandas (.iloc) and array inputs alike.
        return data.iloc[row_idx] if hasattr(data, 'iloc') else data[row_idx]

    def split_metrics(fitted, X_part, y_part):
        y_pred = fitted.predict(X_part)
        try:
            y_proba = fitted.predict_proba(X_part)[:, 1]
        except Exception:
            # Best effort (unchanged): without probabilities the AUC is NaN.
            y_proba = None
        return get_metrics(y_part, y_pred, y_proba)

    skf = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)
    train_metrics_list = []
    test_metrics_list = []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = take_rows(X, train_idx), take_rows(X, test_idx)
        y_train, y_test = take_rows(y, train_idx), take_rows(y, test_idx)

        # Unfitted copy with the same hyperparameters; leaves `estimator` as is.
        fold_estimator = clone(estimator)
        fold_estimator.fit(X_train, y_train)

        train_metrics_list.append(split_metrics(fold_estimator, X_train, y_train))
        test_metrics_list.append(split_metrics(fold_estimator, X_test, y_test))

    avg_train = {k: np.mean([m[k] for m in train_metrics_list]) for k in train_metrics_list[0]}
    avg_test  = {k: np.mean([m[k] for m in test_metrics_list]) for k in test_metrics_list[0]}
    return avg_train, avg_test

# Convenience function to perform hyperparameter tuning.
def tune_and_evaluate(pipeline, param_grid, X, y, cv_search=3, cv_eval=5):
    """Grid-search `pipeline` on ROC AUC, report CV metrics, return the winner.

    Parameters
    ----------
    pipeline : estimator or pipeline to tune.
    param_grid : dict
        Grid passed to GridSearchCV.
    X, y : features and labels.
    cv_search : int
        Folds used inside the grid search.
    cv_eval : int
        Folds used by evaluate_pipeline for the reported metrics.

    Returns
    -------
    The grid search's best_estimator_, still carrying the fit GridSearchCV
    gave it (refit on all of X, y under GridSearchCV's default refit=True).

    Notes
    -----
    error_score='raise' makes any failing fit abort the search instead of
    being scored as NaN.
    """
    # Imported locally because this cell's import list does not include it.
    from sklearn.base import clone

    grid = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=cv_search, n_jobs=-1, error_score='raise')
    grid.fit(X, y)
    best_estimator = grid.best_estimator_
    print("Best parameters:", grid.best_params_)

    # Evaluate a clone: evaluate_pipeline fits whatever it is given on fold
    # subsets, which would otherwise overwrite the full-data fit returned below.
    in_sample, out_sample = evaluate_pipeline(clone(best_estimator), X, y, cv_splits=cv_eval)
    print("In-sample metrics:", in_sample)
    print("Out-of-sample metrics:", out_sample)
    return best_estimator

# (X and y are expected to exist already in the notebook environment.)

# -----------------------------------------
# RandomForest Further Tuning
# -----------------------------------------

# -- RandomForest with SMOTE --
# SMOTE is a pipeline step here, so resampling happens inside each fit.
print("Further tuning: RandomForest with SMOTE:")
pipeline_rf_smote = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42)),
])
param_grid_rf_smote = dict(
    classifier__n_estimators=[100, 150],
    classifier__max_depth=[4, 5, 6],
    classifier__min_samples_leaf=[1, 2],
    classifier__max_features=['sqrt', 'log2'],
)
best_rf_smote = tune_and_evaluate(pipeline_rf_smote, param_grid_rf_smote, X, y)

# -- RandomForest with Class-Weight --
# Same search space; imbalance is handled by class_weight='balanced' instead.
print("\nFurther tuning: RandomForest with Class-Weight:")
pipeline_rf_class = Pipeline(steps=[
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42)),
])
param_grid_rf_class = dict(
    classifier__n_estimators=[100, 150],
    classifier__max_depth=[4, 5, 6],
    classifier__min_samples_leaf=[1, 2],
    classifier__max_features=['sqrt', 'log2'],
)
best_rf_class = tune_and_evaluate(pipeline_rf_class, param_grid_rf_class, X, y)

# -----------------------------------------
# XGBoost Further Tuning
# -----------------------------------------

# -- XGBoost with SMOTE --
print("\nFurther tuning: XGBoost with SMOTE:")
pipeline_xgb_smote = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('classifier', xgb.XGBClassifier(eval_metric='logloss', random_state=42)),
])
param_grid_xgb_smote = dict(
    classifier__n_estimators=[50, 75, 100],
    classifier__max_depth=[3, 4, 5],
    classifier__learning_rate=[0.01],
    classifier__reg_alpha=[0.05, 0.1, 0.2],
    classifier__reg_lambda=[1.5, 2, 2.5],
    classifier__subsample=[0.9, 1.0],
)
best_xgb_smote = tune_and_evaluate(pipeline_xgb_smote, param_grid_xgb_smote, X, y)

# -- XGBoost with Class-Weight --
print("\nFurther tuning: XGBoost with Class-Weight:")
# scale_pos_weight is the negative/positive count ratio over all of y
# (falls back to 1 when there are no positives).
pos = np.sum(y == 1)
neg = np.sum(y == 0)
scale_weight = neg / pos if pos > 0 else 1
pipeline_xgb_class = Pipeline(steps=[
    (
        'classifier',
        xgb.XGBClassifier(
            eval_metric='logloss',
            random_state=42,
            scale_pos_weight=scale_weight,
        ),
    ),
])
param_grid_xgb_class = dict(
    classifier__n_estimators=[100, 125],
    classifier__max_depth=[3, 4],
    classifier__learning_rate=[0.01],
    classifier__reg_alpha=[0.005, 0.01, 0.02],
    classifier__reg_lambda=[1.5, 2],
    classifier__subsample=[0.8, 0.9],
)
best_xgb_class = tune_and_evaluate(pipeline_xgb_class, param_grid_xgb_class, X, y)

# -----------------------------------------
# NeuralNetwork Further Tuning
# -----------------------------------------
# MLPs are sensitive to feature scale, and the earlier NN tuning cell wraps the
# network in a StandardScaler pipeline. Both pipelines below now do the same so
# the results are comparable. The scaler is a pipeline step, so it is re-fitted
# on the training portion of every CV split (no leakage from held-out rows).
from sklearn.preprocessing import StandardScaler  # not among this cell's imports

print("\nFurther tuning: NeuralNetwork with SMOTE:")
pipeline_nn_smote = Pipeline([
    ('scaler', StandardScaler()),
    # SMOTE runs on the standardised features.
    ('smote', SMOTE(random_state=42)),
    ('classifier', MLPClassifier(max_iter=500, random_state=42, early_stopping=True))
])
param_grid_nn_smote = {
    'classifier__hidden_layer_sizes': [(50,), (100,), (150,)],
    'classifier__alpha': [0.0005, 0.001, 0.005]
}
best_nn_smote = tune_and_evaluate(pipeline_nn_smote, param_grid_nn_smote, X, y)

print("\nFurther tuning: NeuralNetwork baseline (no balancing):")
pipeline_nn_baseline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', MLPClassifier(max_iter=500, random_state=42, early_stopping=True))
])
param_grid_nn_baseline = {
    'classifier__hidden_layer_sizes': [(150,)],  # Zooming in on the best from previous run.
    'classifier__alpha': [0.01]
}
best_nn_baseline = tune_and_evaluate(pipeline_nn_baseline, param_grid_nn_baseline, X, y)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# -------------------------------------------------
# 1. Store final results in a dictionary
# -------------------------------------------------
final_results = {
    'RF': {'accuracy': 0.751752752064637, 'precision': 0.667384661328898,
                       'recall': 0.5131008881958099, 'f1': 0.5797464995220165,
                       'auc': 0.7845614024309545},
    'XGB': {'accuracy': 0.7610659836335729, 'precision': 0.7017065805345479,
                        'recall': 0.49389789138246665, 'f1': 0.5795234222357906,
                        'auc': 0.7842233882483632},
    'LR': {'accuracy': 0.7381631557635625, 'precision': 0.6346159477343123,
                       'recall': 0.5090345108142925, 'f1': 0.5645008419194599,
                       'auc': 0.7323059439650166}}

# -------------------------------------------------
# 2. Convert results to a pandas DataFrame
# -------------------------------------------------
# Kept under its own name so the dataset loaded into `df` at the top of the
# notebook is not overwritten by this 3-row summary table.
results_df = (
    pd.DataFrame(final_results)
    .T                        # models become rows
    .rename_axis("Model")
    .reset_index()            # move model names into a column
)
# results_df columns: Model, accuracy, precision, recall, f1, auc

# Melt the data for easy plotting with seaborn
df_melt = results_df.melt(id_vars="Model", var_name="Metric", value_name="Value")

# -------------------------------------------------
# 3. Define a custom dark-red-based palette
# -------------------------------------------------
custom_palette = [
    "#840000",  # darkest red
    "#b53232",
    "#ce4b4b",
    "#f08080"   # lightest red
]

# Map each model to one fixed colour (models sorted alphabetically). The dict
# itself is passed as `palette` below, so seaborn looks colours up by model
# name and both charts use the same colour for the same model regardless of
# the order in which the hue levels appear in the data.
model_names_sorted = sorted(results_df['Model'].unique())
color_map = dict(zip(model_names_sorted, custom_palette))

# -------------------------------------------------
# 4. Plot a Bar Chart for all metrics across models
# -------------------------------------------------
fig_bar, ax_bar = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=df_melt,
    x='Metric',
    y='Value',
    hue='Model',
    palette=color_map,
    ax=ax_bar)
ax_bar.set_title('Performance Metrics Comparison (Bar Chart)', fontsize=14, color="#840101")
ax_bar.set_ylim(0, 1)
ax_bar.set_ylabel('Metric Value', fontsize=12)
ax_bar.set_xlabel('Metric', fontsize=12)
ax_bar.legend(title='Model', fontsize=10)
ax_bar.grid(axis='y', alpha=0.3)
# Show data labels. Labelling the bar containers (rather than every entry in
# ax.patches) avoids stray "0.000" labels on zero-height legend proxy patches
# that some seaborn versions add to the axes.
for bar_group in ax_bar.containers:
    ax_bar.bar_label(bar_group, fmt="%.3f", fontsize=8, padding=2)

fig_bar.tight_layout()
plt.show()

# -------------------------------------------------
# 5. Plot a Line Chart for the same metrics
# -------------------------------------------------
fig_line, ax_line = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=df_melt,
    x='Metric',
    y='Value',
    hue='Model',
    style='Model',
    markers=True,
    dashes=False,
    palette=color_map,  # one colour per hue level, not one per data row
    linewidth=2,
    markersize=10,
    ax=ax_line
)
ax_line.set_title('Performance Metrics Comparison (Line Chart)', fontsize=14, color="#840101")
ax_line.set_ylim(0, 1)
ax_line.set_ylabel('Metric Value', fontsize=12)
ax_line.set_xlabel('Metric', fontsize=12)
ax_line.legend(title='Model', fontsize=10)
ax_line.grid(alpha=0.3)
fig_line.tight_layout()
plt.show()



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, roc_curve)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import shap


# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Final Logistic Regression configuration chosen from tuning ("none"
# balancing): standardise the features, then fit the classifier.
lr_final = Pipeline(steps=[
    ('scaler', StandardScaler()),
    (
        'lr',
        LogisticRegression(
            solver='saga',
            C=1,
            max_iter=3000,
            random_state=42,
        ),
    ),
])

# Fit the final model on the training partition only.
lr_final.fit(X_train, y_train)

# -------------------------------
# 2. Evaluate final models & save metrics
# -------------------------------
def get_metrics(y_true, y_pred, y_proba):
    """Return accuracy, precision, recall, f1 and auc for one set of predictions.

    Same contract as the get_metrics defined in the earlier tuning cell; once
    this cell runs, this definition replaces it. 'auc' is NaN when y_proba is
    None.
    """
    scores = {}
    scores['accuracy'] = accuracy_score(y_true, y_pred)
    scores['precision'] = precision_score(y_true, y_pred, zero_division=0)
    scores['recall'] = recall_score(y_true, y_pred, zero_division=0)
    scores['f1'] = f1_score(y_true, y_pred, zero_division=0)
    if y_proba is None:
        scores['auc'] = np.nan
    else:
        scores['auc'] = roc_auc_score(y_true, y_proba)
    return scores

final_metrics = {}
models = {'LogisticRegression': lr_final}

for name, model in models.items():
    # In-sample: score the rows the model was trained on.
    y_train_pred = model.predict(X_train)
    y_train_proba = None
    try:
        y_train_proba = model.predict_proba(X_train)[:, 1]
    except Exception:
        pass  # no probabilities available -> AUC is reported as NaN
    in_metrics = get_metrics(y_train, y_train_pred, y_train_proba)

    # Out-of-sample: score the held-out rows.
    y_test_pred = model.predict(X_test)
    y_test_proba = None
    try:
        y_test_proba = model.predict_proba(X_test)[:, 1]
    except Exception:
        pass  # no probabilities available -> AUC is reported as NaN
    out_metrics = get_metrics(y_test, y_test_pred, y_test_proba)

    final_metrics[name] = dict(in_sample=in_metrics, out_sample=out_metrics)

# One row per (model, sample) pair, one column per metric.
scores_by_model_sample = {
    (model, sample): final_metrics[model][sample]
    for model in final_metrics.keys()
    for sample in ['in_sample', 'out_sample']
}
metrics_df = pd.DataFrame(scores_by_model_sample).T.reset_index()
metrics_df.columns = ['Model', 'Sample', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC']
print("Final Performance Metrics:")
print(metrics_df)


# -------------------------------
# 4. SHAP Explanations
# -------------------------------
# Feature labels for the plot: the column names when X has them, otherwise
# generated placeholders f0, f1, ...
feature_names = (
    list(X.columns)
    if hasattr(X, 'columns')
    else [f"f{i}" for i in range(X.shape[1])]
)

# Only the first 100 training rows are explained, to keep SHAP fast.
X_shap = X_train[:100]


# Logistic Regression SHAP

# The 'lr' step was trained on standardised inputs, so the sample is pushed
# through the fitted 'scaler' step before being handed to the explainer.
X_shap_scaled = lr_final.named_steps['scaler'].transform(X_shap)
explainer_lr = shap.LinearExplainer(
    lr_final.named_steps['lr'],
    X_shap_scaled,
    feature_perturbation="interventional",
)
shap_values_lr = explainer_lr.shap_values(X_shap_scaled)

plt.figure()
# show=False leaves the figure open so the title can be set afterwards.
shap.summary_plot(
    shap_values_lr,
    X_shap_scaled,
    feature_names=feature_names,
    plot_type="bar",
    color_bar_label="SHAP value",
    plot_size=(8,6),
    show=False,
)
plt.title('Logistic Regression SHAP Summary', fontsize=14, color='#006400')
plt.tight_layout()
plt.show()
