# Medicare Fraud - Stacking Models

Mustapha Mbengue, Peyton Nash, Bradley Stoller, Kyler Rosen

3/9/25

Purpose: Specifies, trains and evaluates stacking models to classify cases of medicare fraud.

### Setup

In [1]:
# Import packages
import pandas as pd
import numpy as np

# Function to pre-process this data
from adsp31017_group4_data_preprocessing import process_data

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.utils.validation import check_is_fitted
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import matplotlib.pyplot as plt

In [2]:
# Apply data processing script
df = process_data()

# Check DataFrame head
df.head()

Starting data preprocessing...
Merging raw data...
Merged raw data. Time elapsed: 10.00s
Correcting codes...


KeyboardInterrupt: 

In [None]:
# Split the data into train, test and validation sets
x = df.drop(columns=['PotentialFraud'])
y = df['PotentialFraud']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

### Train base models

In [7]:
# Create function to train baseline models
def train_optimized_classification_models(X_train, X_test, y_train, y_test, n_iter=10):
    param_distributions = {
        "Logistic Regression": {
            "model": LogisticRegression(),
            "params": {"C": np.logspace(-3, 3, 10), "penalty": ["l1", "l2"], "solver": ["liblinear"]}
        },
        "Random Forest": {
            "model": RandomForestClassifier(random_state=42),
            "params": {"n_estimators": np.arange(50, 300, 50), "max_depth": [5, 10, None], "min_samples_split": [2, 5, 10]}
        },
        "XGBoost": {
            "model": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
            "params": {"n_estimators": np.arange(50, 300, 50), "learning_rate": np.linspace(0.01, 0.2, 5), "max_depth": [3, 5, 10]}
        },
        "Gaussian Naïve Bayes": {
            "model": GaussianNB(),
            "params": {"var_smoothing": np.logspace(-9, -6, 10)}
        },
    }

    best_models = {}
    for name, config in param_distributions.items():
        print(f"\nTraining and tuning {name}...")

        if name == "Gaussian Naïve Bayes":
            model = config["model"]
            model.fit(x_train, y_train)
            best_model = model
            best_params = None  
        else:
            random_search = RandomizedSearchCV(
                config["model"], config["params"], n_iter=n_iter, cv=5, scoring="f1", n_jobs=-1, random_state=42
            )
            random_search.fit(x_train, y_train)
            best_model = random_search.best_estimator_
            best_params = random_search.best_params_

        cv_scores = cross_val_score(best_model, x_train, y_train, cv=5, scoring='f1')

        y_pred = best_model.predict(x_val)
        y_prob = best_model.predict_proba(x_val)[:, 1] if hasattr(best_model, "predict_proba") else None
        metrics = {
            "Best Params": best_params,
            "Cross-Validation F1 (Mean)": np.mean(cv_scores),
            "Accuracy": accuracy_score(y_val, y_pred),
            "Precision": precision_score(y_val, y_pred),
            "Recall": recall_score(y_val, y_pred),
            "F1 Score": f1_score(y_val, y_pred),
            "ROC AUC": roc_auc_score(y_val, y_prob) if y_prob is not None else None
        }

        best_models[name] = {"model": best_model, "metrics": metrics}

        print(f"\n{name} Best Parameters: {best_params}")
        for metric, value in metrics.items():
            if metric != "Best Params":
                print(f"{metric}: {value:.4f}")

    return best_models

In [None]:
# Get the results of the optimized classification models
results = train_optimized_classification_models(x_train, x_test, y_train, y_test)

### Create Voting Classifier

In [None]:
# Create a function to train and predict ensemble model metrics
def train_voting_ensemble(models, X_train, X_test, y_train, y_test):
    voting_clf = VotingClassifier(
        estimators=[
            ("Random Forest", models["Random Forest"]["model"]),
            ("Logistic Regression", models["Logistic Regression"]["model"]),
            ("XGBoost", models["XGBoost"]["model"])
        ],
        voting="soft"  # Soft voting considers probability predictions
    )

    voting_clf.fit(X_train, y_train)
    return voting_clf

In [None]:
# Create a function to evaluate the ensemble model
def evaluate_final_model(model, X_train, y_train, X_test, y_test, cv=5):
    # Predict the values for each model
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    # Calculate cross-validation scores    
    cv_f1_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')
    cross_val_f1_mean = np.mean(cv_f1_scores)

    # Create metrics dictionary
    metrics = {
        "Cross-Validation F1 (Mean)": cross_val_f1_mean,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "Balanced Accuracy": balanced_accuracy_score(y_test, y_pred),
        "Cohen's Kappa": cohen_kappa_score(y_test, y_pred),
        "Matthews Corr Coeff (MCC)": matthews_corrcoef(y_test, y_pred),
        "Log Loss": log_loss(y_test, y_prob) if y_prob is not None else None,
        "ROC AUC Score": roc_auc_score(y_test, y_prob) if y_prob is not None else None
    }

    # Print classification report
    print("\nFinal Model Performance on Test Set:")
    print(classification_report(y_test, y_pred))

    # Print metrics
    for metric, value in metrics.items():
        if value is not None:
            print(f"{metric}: {value:.4f}")

    return metrics


In [None]:
# Estimate the voting model classifier
voting_model = train_voting_ensemble(results, x_train, x_test, y_train, y_test)

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Calculate metrics for the voting model classifier
final_metrics = evaluate_final_model(voting_model, x_train, y_train, x_test, y_test)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Final Model Performance on Test Set:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     69241
           1       1.00      0.87      0.93     42735

    accuracy                           0.95    111976
   macro avg       0.96      0.93      0.94    111976
weighted avg       0.95      0.95      0.95    111976

Cross-Validation F1 (Mean): 0.9195
Accuracy: 0.9476
Precision: 0.9973
Recall: 0.8651
F1 Score: 0.9265
Balanced Accuracy: 0.9318
Cohen's Kappa: 0.8862
Matthews Corr Coeff (MCC): 0.8916
Log Loss: 0.2067
ROC AUC Score: 0.9882


### Create stacking classifier

In [30]:
# Create function to create a stacking classifier
def stacking_classifier(x_val, y_val, x_test, y_test, base:list, meta_model, gs_grid=None, **gs_args):
    # Create DataFrame of predicted values for validation and test sets
    meta_features = np.column_stack([model.predict(x_val) for model in base])
    test_meta_features = np.column_stack([model.predict(x_test) for model in base])

    # Fit model
    if gs_grid==None:
        best_model = meta_model.fit(meta_features, y_val)
    else:
        meta_gs = GridSearchCV(meta_model, gs_grid, gs_args)
        meta_gs.fit(meta_features, y_val)
        best_model = meta_gs.best_estimator_

    # Predict classes and probability        
    final_pred = best_model.predict(test_meta_features)
    final_prob = best_model.predict_proba(test_meta_features)[:, 1] if hasattr(meta_model, "predict_proba") else None

    # Calculate classification metrics
    metrics = {
            "Accuracy": accuracy_score(y_test, final_pred),
            "Precision": precision_score(y_test, final_pred),
            "Recall": recall_score(y_test, final_pred),
            "F1 Score": f1_score(y_test, final_pred),
            "ROC AUC": roc_auc_score(y_test, final_prob) if final_prob is not None else None
        }

    # Combine model and metrics
    final_model = {'model': best_model, 'metrics': metrics, 'pred': final_pred, 'pred_prob': final_prob}

    # Output the final results
    return final_model

In [13]:
# Define base models
lr = results['Logistic Regression']['model']
rf = results['Random Forest']['model']
xgb = results['XGBoost']['model']
gnb = results['Gaussian Naïve Bayes']['model']

# Create base models list
base_models = [
    lr,
    rf,
    xgb,
    gnb
]

In [31]:
# Create meta model
meta_lr = LogisticRegression()

# Get results
logit_results = stacking_classifier(x_val, y_val, x_test, y_test, base_models, meta_lr)

In [32]:
# Create meta model
meta_svm = SVC()

# Define the grid for the SVM
svc_params = {'kernel':['rbf', 'poly'],
              'degree':[2, 3, 4,],
              'C':[.01, .1, 1], 
              'gamma':['scale', 'auto']}

# SVM meta-model
svm_results = stacking_classifier(x_val, y_val, x_test, y_test, base_models, meta_svm, meta_gs=svc_params, cv=5, scoring='f1', verbose=2)

In [15]:
# Get logit meta model results
logit_results

{'model': LogisticRegression(),
 'metrics': {'Accuracy': 0.9763699364149461,
  'Precision': 0.9981608966871287,
  'Recall': 0.9398151398151399,
  'F1 Score': 0.968109723762233,
  'ROC AUC': 0.9730974921483434}}

In [17]:
# Get SVM meta-model results
svm_results

{'model': SVC(),
 'metrics': {'Accuracy': 0.9764592412659856,
  'Precision': 0.9978891951625319,
  'Recall': 0.9403065403065403,
  'F1 Score': 0.9682424943376223,
  'ROC AUC': None}}

In [35]:
# Predict contender models on test set
xgb_pred = xgb.predict(x_test)
stack_log_pred = logit_results['pred']
stack_svm_pred = svm_results['pred']

# Add labels and predictions to x_test
test_full = x_test.copy()
test_full['label'] = y_test
test_full['xgb_pred'] = xgb_pred
test_full['stack_log_pred'] = stack_log_pred
test_full['stack_svm_pred'] = stack_svm_pred
test_full['tot_reimburse'] = test_full['IPAnnualReimbursementAmt'] + test_full['OPAnnualReimbursementAmt']

In [62]:
# Get the total amount of fraudulent reimbursements on the test data
tot_fraud = test_full[test_full['label'] == 1]['tot_reimburse'].sum()

# Create a list to store results
monetary = []

# Calculate monetary value of each model
for col in ['xgb_pred', 'stack_log_pred', 'stack_svm_pred']:
    # Amount of fraud predicted as fraud
    pred_fraud = test_full[(test_full['label'] == 1) & (test_full[col] == 1)]['tot_reimburse'].sum()

    # Amount of fraud predicted as not-fraud
    miss_fraud = test_full[(test_full['label'] == 1) & (test_full[col] == 0)]['tot_reimburse'].sum()

    # Percentage of fraud predicted as fraud
    pct_pred_fraud = pred_fraud/tot_fraud

    # Percentage of fraud predicted as not-fraud
    pct_miss_fraud = miss_fraud/tot_fraud

    # Add the results to the list
    monetary.append({'model':col, 'tot_fraud':tot_fraud, 'pred_fraud':pred_fraud, 'miss_fraud':miss_fraud, 'pct_pred_fraud':pct_pred_fraud, 'pct_miss_fraud':pct_miss_fraud})

In [92]:
# Check monetary values
df_monetary = pd.DataFrame(monetary)
df_monetary

Unnamed: 0,model,tot_fraud,pred_fraud,miss_fraud,pct_pred_fraud,pct_miss_fraud
0,xgb_pred,347745110,326034460,21710650,0.937567,0.062433
1,stack_log_pred,347745110,326034460,21710650,0.937567,0.062433
2,stack_svm_pred,347745110,326185120,21559990,0.938001,0.061999
