# Medicare Fraud - Stacking Models

Mustapha Mbengue, Peyton Nash, Bradley Stoller, Kyler Rosen

3/9/25

Purpose: Specifies, trains and evaluates stacking models to classify cases of Medicare fraud.

### Setup

In [58]:
# Import packages
from data_loading import load_data
from feature_engineering import apply_feature_engineering
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.utils.validation import check_is_fitted
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import matplotlib.pyplot as plt

In [1]:
# Load the dataset with the project-local `load_data` helper (imported from
# data_loading in the setup cell) and preview the first rows
df = load_data()
df.head()

NameError: name 'load_data' is not defined

In [4]:
# Apply feature engineering to the loaded frame.
# NOTE(review): `df` is overwritten with the engineered result, so re-running this
# cell feeds already-engineered data back into the helper — confirm the helper is
# idempotent, or re-run the load cell first.
df = apply_feature_engineering(df)

Starting feature engineering...
Adding datetime features...
Added datetime features. Time elapsed: 14.90s
Discretizing age...
Discretized age. Time elapsed: 14.91s
Filling in missing values...
Filled in missing values. Time elapsed: 16.54s
Transforming skewed distributions...
Transformed skewed distributions. Time elapsed: 16.55s
Encoding categorical columns...
Encoded categorical columns. Time elapsed: 19.53s
Dropping unnecessary columns...
Dropped unnecessary columns. Time elapsed: 19.65s
Feature engineering complete!


### Train base models

In [10]:
# Create function to train baseline models
def train_optimized_classification_models(data, target_column, n_iter=10):
    """Tune, fit and score the base classifiers on a stratified train/validation split.

    The data is split 80/20 into train+validation and test, and the first part is
    split 80/20 again into train and validation (both stratified, random_state=42).
    The test portion is not used here. Callers that need the matching validation or
    test rows must repeat the split with the same arguments.

    Logistic Regression, Random Forest and XGBoost are tuned with RandomizedSearchCV
    (5-fold, F1 scoring). Gaussian Naïve Bayes is fitted with its default settings;
    its parameter grid is kept in the config but is not searched.

    Args:
        data: DataFrame holding the features and the target column.
        target_column: Name of the label column in ``data``.
        n_iter: Number of parameter settings sampled per randomized search.

    Returns:
        dict mapping each model name to ``{"model": fitted_estimator, "metrics": dict}``.
        ``metrics`` holds the best parameters (None when no search was run), the mean
        5-fold cross-validation F1 on the training set, and accuracy, precision,
        recall, F1 and ROC AUC on the validation set. ROC AUC is None when the
        estimator has no ``predict_proba``.
    """
    x = data.drop(columns=[target_column])
    y = data[target_column]

    # The held-out test rows are intentionally discarded here (bound to `_`).
    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

    param_distributions = {
        "Logistic Regression": {
            "model": LogisticRegression(),
            "params": {"C": np.logspace(-3, 3, 10), "penalty": ["l1", "l2"], "solver": ["liblinear"]}
        },
        "Random Forest": {
            "model": RandomForestClassifier(random_state=42),
            "params": {"n_estimators": np.arange(50, 300, 50), "max_depth": [5, 10, None], "min_samples_split": [2, 5, 10]}
        },
        "XGBoost": {
            "model": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
            "params": {"n_estimators": np.arange(50, 300, 50), "learning_rate": np.linspace(0.01, 0.2, 5), "max_depth": [3, 5, 10]}
        },
        "Gaussian Naïve Bayes": {
            "model": GaussianNB(),
            "params": {"var_smoothing": np.logspace(-9, -6, 10)}
        },
    }

    best_models = {}
    for name, config in param_distributions.items():
        print(f"\nTraining and tuning {name}...")

        if name == "Gaussian Naïve Bayes":
            # Fitted as-is: no hyperparameter search is run for this model.
            model = config["model"]
            model.fit(x_train, y_train)
            best_model = model
            best_params = None
        else:
            random_search = RandomizedSearchCV(
                config["model"], config["params"], n_iter=n_iter, cv=5, scoring="f1", n_jobs=-1, random_state=42
            )
            random_search.fit(x_train, y_train)
            best_model = random_search.best_estimator_
            best_params = random_search.best_params_

        # cross_val_score fits clones, so `best_model` itself stays fitted on x_train.
        cv_scores = cross_val_score(best_model, x_train, y_train, cv=5, scoring='f1')

        y_pred = best_model.predict(x_val)
        y_prob = best_model.predict_proba(x_val)[:, 1] if hasattr(best_model, "predict_proba") else None
        metrics = {
            "Best Params": best_params,
            "Cross-Validation F1 (Mean)": np.mean(cv_scores),
            "Accuracy": accuracy_score(y_val, y_pred),
            "Precision": precision_score(y_val, y_pred),
            "Recall": recall_score(y_val, y_pred),
            "F1 Score": f1_score(y_val, y_pred),
            "ROC AUC": roc_auc_score(y_val, y_prob) if y_prob is not None else None
        }

        best_models[name] = {"model": best_model, "metrics": metrics}

        print(f"\n{name} Best Parameters: {best_params}")
        for metric, value in metrics.items():
            if metric == "Best Params":
                continue
            # ROC AUC may be None; formatting None with ":.4f" would raise TypeError.
            shown = "n/a" if value is None else f"{value:.4f}"
            print(f"{metric}: {shown}")

    return best_models

In [None]:
# Tune and evaluate the base models on the engineered data; `results` maps each
# model name to {"model": fitted estimator, "metrics": validation metrics}
results = train_optimized_classification_models(df, 'PotentialFraud')

### Create stacking classifier

In [130]:
# Separate the fraud label from the feature columns
y = df['PotentialFraud']
x = df.drop(columns=['PotentialFraud'])

# Stratified 80/20 hold-out for the test set, using the same arguments as the
# split inside train_optimized_classification_models
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Second stratified 80/20 split of the remaining rows into train and validation
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)

In [123]:
# Create function to create a stacking classifier
def stacking_classifier(x_val, y_val, x_test, y_test, base:list, meta_model, gs_grid=None, **gs_args):
    """Fit a meta-model on the base models' predictions and score it on the test set.

    Each already-fitted base model predicts class labels for ``x_val`` and ``x_test``.
    The predictions are stacked column-wise (one column per base model, in the order
    of ``base``) and used as the meta-model's features.

    Args:
        x_val, y_val: Validation features/labels used to fit the meta-model.
        x_test, y_test: Test features/labels used to score the meta-model.
        base: List of fitted estimators exposing ``predict``.
        meta_model: Unfitted estimator to train on the stacked predictions.
        gs_grid: Optional parameter grid. When given, ``meta_model`` is tuned with
            GridSearchCV and the best estimator is used; when None, ``meta_model``
            is fitted directly.
        **gs_args: Extra keyword arguments forwarded to GridSearchCV (e.g. ``cv``,
            ``scoring``, ``verbose``). Ignored, with a warning, if ``gs_grid`` is None.

    Returns:
        dict with ``'model'`` (the fitted meta-model) and ``'metrics'`` (accuracy,
        precision, recall, F1 and ROC AUC on the test set; ROC AUC is None when the
        fitted model has no ``predict_proba``).
    """
    import warnings

    # Meta-features are the base models' hard class predictions, not probabilities.
    meta_features = np.column_stack([model.predict(x_val) for model in base])
    test_meta_features = np.column_stack([model.predict(x_test) for model in base])

    # Fit model
    if gs_grid is None:
        if gs_args:
            # Surface the most likely mistake: a misspelled grid keyword ends up in
            # gs_args and would otherwise silently disable the search.
            warnings.warn(
                f"gs_grid is None, so grid-search options {sorted(gs_args)} are ignored",
                stacklevel=2,
            )
        best_model = meta_model.fit(meta_features, y_val)
    else:
        # Search options must be unpacked as keywords; passing the dict positionally
        # would hand it to GridSearchCV as a single argument.
        meta_gs = GridSearchCV(meta_model, gs_grid, **gs_args)
        meta_gs.fit(meta_features, y_val)
        best_model = meta_gs.best_estimator_

    # Predict classes and probability (checked on the fitted model actually used)
    final_pred = best_model.predict(test_meta_features)
    final_prob = best_model.predict_proba(test_meta_features)[:, 1] if hasattr(best_model, "predict_proba") else None

    # Calculate classification metrics
    metrics = {
            "Accuracy": accuracy_score(y_test, final_pred),
            "Precision": precision_score(y_test, final_pred),
            "Recall": recall_score(y_test, final_pred),
            "F1 Score": f1_score(y_test, final_pred),
            "ROC AUC": roc_auc_score(y_test, final_prob) if final_prob is not None else None
        }

    # Combine model and metrics
    final_model = {'model': best_model, 'metrics': metrics}

    # Output the final results
    return final_model

In [129]:
# Pull the tuned base estimators out of the training results
lr, rf, xgb, gnb = (
    results[model_name]['model']
    for model_name in ('Logistic Regression', 'Random Forest', 'XGBoost', 'Gaussian Naïve Bayes')
)

# Collect them in the order used for the meta-feature columns
base_models = [lr, rf, xgb, gnb]

In [None]:
# Logistic-regression meta-model with default hyperparameters (no grid search)
meta_lr = LogisticRegression()

# Fit on the base models' validation predictions and score on the test set
logit_results = stacking_classifier(
    x_val, y_val, x_test, y_test,
    base=base_models,
    meta_model=meta_lr,
)

In [120]:
# Create meta model
meta_svm = SVC()

# Define the grid for the SVM ('degree' only affects the 'poly' kernel)
svc_params = {'kernel':['rbf', 'poly'],
              'degree':[2, 3, 4],
              'C':[.01, .1, 1],
              'gamma':['scale', 'auto']}

# SVM meta-model. The grid has to be passed as `gs_grid`, the parameter name in
# stacking_classifier's signature; under any other keyword it falls into **gs_args,
# gs_grid stays None and the SVC is fitted without a search.
svm_results = stacking_classifier(x_val, y_val, x_test, y_test, base_models, meta_svm, gs_grid=svc_params, cv=5, scoring='f1', verbose=2)

In [133]:
# Display the fitted logistic-regression meta-model and its test-set metrics
logit_results

{'model': LogisticRegression(),
 'metrics': {'Accuracy': 0.9763699364149461,
  'Precision': 0.9981608966871287,
  'Recall': 0.9398151398151399,
  'F1 Score': 0.968109723762233,
  'ROC AUC': 0.9730804793874362}}

In [2]:
# Display the fitted SVM meta-model and its test-set metrics
# (requires the SVM meta-model cell to have been run in this kernel session)
svm_results

NameError: name 'svm_results' is not defined