In [1]:
#pip install mlxtend

In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, chi2, f_classif
import statsmodels.api as sm

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.calibration import calibration_curve
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import mean_absolute_error, log_loss, brier_score_loss, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

# Disable pandas' truncation so displayed frames show every row and column.
# NOTE(review): with max_rows=None, displaying a large frame prints all of it; prefer .head().
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Data Load

In [3]:
# Read the cleaned CSV file (the path is relative to the notebook's working directory)
fraud_data = pd.read_csv('final_challenge_data.csv')

# Display the first few rows
fraud_data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'final_challenge_data.csv'

In [None]:
# (rows, columns) of the data as loaded
fraud_data.shape

In [None]:
# Set aside the rows whose fraud_bool label is missing (kept as a separate copy),
# then continue with the labelled rows only.

missing_bool = fraud_data[fraud_data['fraud_bool'].isna()].copy()
fraud_data = fraud_data.dropna(subset=['fraud_bool'])

In [None]:
# (rows, columns) after the unlabelled rows were removed
fraud_data.shape

In [None]:
# Percentage of missing values per column in the labelled data
missing = fraud_data.isna().sum()/ len(fraud_data)*100
missing

In [None]:
# Column dtypes before the integer columns are cast to float
print(fraud_data.dtypes)

In [None]:
# Cast the integer-typed columns to float.
# NOTE(review): include='int' selects the platform default integer only; if some
# integer columns are expected to be missed (e.g. int32 vs int64), confirm and
# consider include='integer'.
int_cols = fraud_data.select_dtypes(include='int').columns
fraud_data[int_cols] = fraud_data[int_cols].astype(float)

In [None]:
# Column dtypes after the cast, to confirm the integer columns are now float
print(fraud_data.dtypes)

In [None]:
# Remove the credit_limit_to_salary column from the working data
fraud_data = fraud_data.drop(['credit_limit_to_salary'],axis=1)

In [None]:
# Class balance: share of each fraud_bool value among the labelled rows.
# The count table is built once and divided by its own column total.
class_counts = pd.crosstab(index=fraud_data['fraud_bool'], columns="prop")
fraud_prop = class_counts / class_counts.sum()
print(fraud_prop)

In [None]:
# Row count per fraud_bool class in the labelled data
pd.crosstab(index = fraud_data['fraud_bool'], columns = "count")

## Creating Test/Train & Undersampling

In [None]:
# Undersample: draw 6800 rows from each fraud_bool class (fixed random_state so the draw is repeatable)
train_o = fraud_data.groupby("fraud_bool").sample(n = 6800, random_state = 6)

# Every row whose index label was not drawn into train_o becomes the hold-out set.
# NOTE(review): this relies on fraud_data having unique index labels — confirm.
test_o = fraud_data.loc[fraud_data.index.difference(train_o.index)]

In [None]:
# Row count per class in the undersampled training set
pd.crosstab(index = train_o['fraud_bool'], columns = "count")

In [None]:
# Row count per class in the hold-out set
pd.crosstab(index = test_o['fraud_bool'], columns = "count")

## Feature Engineering

In [None]:
# Income Column

# Remember which rows had no income before anything is filled in
train_o['income_missing'] = train_o['income'].isna().astype(int)

# Missing income is back-filled with the row's salary decile (labelled 0.0 ... 0.9),
# where the decile edges are computed from the training salaries
salary_decile = pd.qcut(
    train_o['salary'],
    q=10,
    labels=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
).astype(float)
train_o['income'] = train_o['income'].fillna(salary_decile)

# salary is no longer needed; rows where income is still missing are removed
train_o = train_o.drop(columns=['salary']).dropna(subset=['income'])

# Store income as text so that one-hot encoding treats each level as a category
train_o['income'] = train_o['income'].astype(str)

In [None]:
# zip_count_4w Column

# Rows whose zip_count_4w text contains a hyphen are flagged in zip_count_4w_flag
# and their value is replaced by the sentinel -1; the column is then cast to int.
# Missing entries count as "no hyphen" (na=False).
bad_zip = train_o['zip_count_4w'].str.contains('-', na=False)
train_o['zip_count_4w_flag'] = bad_zip
train_o.loc[bad_zip, 'zip_count_4w'] = -1
train_o['zip_count_4w'] = train_o['zip_count_4w'].astype(int)

In [None]:
# Velocity Columns

velocity_cols = ['velocity_6h', 'velocity_24h', 'velocity_4w']

# Record where each velocity was missing before any value is filled in
for vel_col in velocity_cols:
    train_o[f'{vel_col}_missing'] = train_o[vel_col].isna().astype(int)

# Training-set medians, used as the fill value for the gaps
med_vel_6h, med_vel_24h, med_vel_4w = (train_o[vel_col].median() for vel_col in velocity_cols)

for vel_col, fill_value in zip(velocity_cols, (med_vel_6h, med_vel_24h, med_vel_4w)):
    train_o[vel_col] = train_o[vel_col].fillna(fill_value)

In [None]:
# Percentage of missing values per column in the training set after imputation
missing = train_o.isna().sum()/ len(train_o)*100
missing

In [None]:
# The hyphen flag was stored as a boolean mask; convert it to 0.0/1.0
train_o['zip_count_4w_flag'] = train_o['zip_count_4w_flag'].astype('float64')


# Cast the integer-typed columns (e.g. the *_missing flags created with astype(int)) to float.
# NOTE(review): include='int' matches the platform default integer type only — confirm it
# catches every integer column on the machine this runs on.
int_cols = train_o.select_dtypes(include='int').columns
train_o[int_cols] = train_o[int_cols].astype(float)

In [None]:
# Column dtypes of the engineered training set
print(train_o.dtypes)

In [None]:
# Design matrix: every column except the target, one-hot encoded with the
# first level of each categorical dropped, and everything cast to float
predictors = (
    pd.get_dummies(train_o.drop(columns=['fraud_bool']), drop_first=True)
    .astype(float)
)

X, y = predictors, train_o['fraud_bool']

In [None]:
# (rows, columns) of the encoded design matrix
X.shape

### Feature Selection

In [None]:
# Low Variability – Numeric
# Works on the numeric columns of fraud_data, leaving out indicator columns
# recognised by their name suffix.
flag_suffixes = ('_missing', '_OutOfRange', '_flagged')
fraud_data_num = fraud_data.select_dtypes(include=['number'])
non_flag_cols = [name for name in fraud_data_num.columns if not name.endswith(flag_suffixes)]

# Fit the variance filter with a 0.01 threshold
threshold = 0.01
selector = VarianceThreshold(threshold=threshold).fit(fraud_data_num[non_flag_cols])

# get_support() marks the columns the filter keeps; the complement is the low-variance list
flag = selector.get_support()
all_features = fraud_data_num[non_flag_cols].columns
low_variability_features = all_features[~flag]
print(low_variability_features.tolist())

In [None]:
# Low Variability – Categorical
# Prints every object-typed column whose most frequent value covers more than 95% of the rows.

for col in fraud_data.select_dtypes(include = 'object'): 
    top_freq = fraud_data[col].value_counts(normalize = True).iloc[0] # relative frequency of the most common value (value_counts sorts descending)
                                                            
    if top_freq > 0.95: # a single category above 95% of the data: candidate for removal
        print(f"{col} ({top_freq:.1%})")

In [None]:
# Remove the two `source` dummy columns from the design matrix.
# NOTE(review): presumably a consequence of the low-variability check on `source` — confirm.
X = X.drop(['source_TELEAPP'],axis=1)
X = X.drop(['source_UNKNOWN'],axis=1)

In [None]:
# Split the design matrix by number of distinct values: two-valued columns are
# treated as categorical (dummy) features, columns with more than two as continuous
distinct_counts = X.nunique()
categorical_features = distinct_counts.index[distinct_counts == 2].tolist()
continuous_features = distinct_counts.index[distinct_counts > 2].tolist()

X_cat, X_cont = X[categorical_features], X[continuous_features]

# Chi-square score of every categorical feature against the target;
# k='all' means nothing is discarded here, the selector is only used for scoring
selector = SelectKBest(score_func=chi2, k='all').fit(X_cat, y)

scores_cat_df = pd.DataFrame({
    'Feature': X_cat.columns,
    'Chi2_score': selector.scores_,
    'p_value': selector.pvalues_,
})

# Keep the features with p-value < 0.002 (taken before sorting, so the
# selected names stay in design-matrix order)
is_significant_cat = scores_cat_df['p_value'] < 0.002
selected_cat_features = scores_cat_df.loc[is_significant_cat, 'Feature']

# Strongest association first, for reading
scores_cat_df = scores_cat_df.sort_values(by='Chi2_score', ascending=False)
print(scores_cat_df)

In [None]:
# ANOVA F-test of every continuous feature against the target;
# k='all' means nothing is discarded here, the selector is only used for scoring
selector = SelectKBest(score_func=f_classif, k='all').fit(X_cont, y)

# One row per continuous feature with its F-score and p-value
scores_cont_df = pd.DataFrame({
    'Feature': X_cont.columns,
    'F_score': selector.scores_,
    'p_value': selector.pvalues_,
})

# Keep the features with p-value < 0.002
is_significant_cont = scores_cont_df['p_value'] < 0.002
selected_cont_features = scores_cont_df.loc[is_significant_cont, 'Feature']

In [None]:
# Keep only the columns that passed the univariate screens:
# significant categorical features first, then significant continuous ones
X_reduced = X[selected_cat_features.tolist() + selected_cont_features.tolist()]

X_reduced.head()

In [None]:
def check_quasi_complete_separation(X, y):
    """
    Flag predictors whose cross-tabulation with the target contains an empty cell.

    For every column of X a contingency table (column value x target value) is
    built. If any cell of that table is zero, i.e. some level of the predictor
    never occurs together with one of the observed outcomes, the column is
    reported (name and table are printed) and collected.

    Parameters:
    - X: pd.DataFrame of predictors (categorical variables)
    - y: pd.Series of binary target variable (e.g., 0/1 or True/False)

    Returns:
    - List of column names that have at least one empty cell, in column order
    """
    flagged = []

    for name, values in X.items():
        table = pd.crosstab(values, y)

        # No empty cell anywhere in the table: nothing to report for this column
        if not (table.values == 0).any():
            continue

        print(f"Quasi-complete separation detected in '{name}'")
        print(table)
        print()
        flagged.append(name)

    return flagged

In [None]:
# Run the separation check on the selected categorical (dummy) features only
X_cat_reduced = X_reduced[selected_cat_features.tolist()]

problem_vars = check_quasi_complete_separation(X_cat_reduced, y)

In [None]:
# Remove the columns flagged by the separation check
X_reduced = X_reduced.drop(problem_vars, axis = 1)

## Model Building

### Initial Logistic Regression Model

In [None]:
# Sanity check: the three objects must have the same number of rows before modelling
print(train_o.shape)
print(X_reduced.shape)
print(y.shape)

In [None]:
# Weights that undo the undersampling.
# Fraud rows keep weight 1; each non-fraud row is weighted by
#   (non-fraud : fraud ratio in the full labelled data) /
#   (non-fraud : fraud ratio in the modelling sample)
# The ratios are computed from the data instead of being typed in, so they
# stay correct when the data changes or when rows are removed from train_o
# after sampling (e.g. rows dropped for missing income).
weight_1 = 1

population_is_fraud = fraud_data['fraud_bool'] == 1
sample_is_fraud = train_o['fraud_bool'] == 1

population_ratio = (~population_is_fraud).sum() / population_is_fraud.sum()
sample_ratio = (~sample_is_fraud).sum() / sample_is_fraud.sum()

weight_0 = population_ratio / sample_ratio

print(weight_0, weight_1)

In [None]:
# Attach the sampling weight to every training row (fraud_bool 1 -> weight_1, 0 -> weight_0)
train_o['weight'] = train_o.fraud_bool.replace({1: weight_1, 0: weight_0}).astype(float)

# NOTE(review): this rebinds X to the reduced matrix plus an intercept column, so the
# earlier feature-selection cells cannot be re-run afterwards without rebuilding X.
X = sm.add_constant(X_reduced)

# Binomial GLM on all screened features, with the sampling weights passed as frequency weights
model_1 = sm.GLM(y, X, family = sm.families.Binomial(), freq_weights = train_o['weight'])
result_1 = model_1.fit()
print(result_1.summary())

### Stepwise Selection

In [None]:
# Standardise the screened features for the selector.
# The scaled frame keeps X_reduced's index so it stays row-aligned with y
# (y carries the original row labels, not 0..n-1).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reduced)
X_scaled_df = pd.DataFrame(X_scaled, columns=X_reduced.columns, index=X_reduced.index)

# Unpenalised logistic regression used as the scoring model inside the selector
logr = LogisticRegression(max_iter = 1000, solver = 'newton-cg', penalty = None) 

# Floating forward selection; "best" keeps the subset with the highest
# 10-fold cross-validated ROC AUC
sfs = SFS(logr,
          k_features = "best", 
          forward = True,
          floating = True,
          scoring = 'roc_auc',
          cv = 10)

sfs = sfs.fit(X_scaled_df, y)

# Get selected feature names
selected_features = list(sfs.k_feature_names_)
print("Selected features:", selected_features)

In [None]:
# Unscaled feature matrix restricted to the features chosen by the stepwise search
X_stepwise = X_reduced[selected_features]

In [None]:
# Sampling weight per training row (fraud_bool 1 -> weight_1, 0 -> weight_0)
train_o['weight'] = train_o.fraud_bool.replace({1: weight_1, 0: weight_0}).astype(float)

# Add the intercept column to the stepwise feature matrix
X_stepwise = sm.add_constant(X_stepwise).copy()

# Binomial GLM on the stepwise-selected features, with the weights as frequency weights
model_2 = sm.GLM(y, X_stepwise, family = sm.families.Binomial(), freq_weights = train_o['weight'])
result_2 = model_2.fit()
print(result_2.summary())

## Model Assessment

In [None]:
# Independent copy of the final design matrix (intercept included) for the assessment cells
X_selected = X_stepwise.copy()

### Calibration Curve

In [None]:
# Predicted fraud probability for every training row, from the stepwise model
train_o['pred_prob'] = result_2.predict(X_selected)

# Compute calibration curve: 10 bins with equal numbers of rows (quantile strategy)
prob_true, prob_pred = calibration_curve(train_o['fraud_bool'], train_o['pred_prob'], 
                                         n_bins = 10, strategy = 'quantile')

# Observed fraud frequency against mean predicted probability per bin,
# with the diagonal as the perfect-calibration reference
plt.figure(figsize = (6, 6))
plt.plot(prob_pred, prob_true, marker = 'o', label = 'Calibration curve')
plt.plot([0, 1], [0, 1], linestyle = '--', color = 'gray', label = 'Perfectly calibrated')
plt.xlabel('Predicted probability')
plt.ylabel('Observed frequency')
plt.title('Calibration Curve')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

### C-statistic (AUC)

In [None]:
# In-sample predictions of the stepwise model (predict() is called without new data)
train_o['pred_prob'] = result_2.predict()

# NOTE(review): the name `auc` is also imported from sklearn.metrics at the top of the
# notebook; this assignment replaces that function with a number for the rest of the session.
auc = roc_auc_score(y, train_o['pred_prob'])
print("C-statistic (AUC):", auc)

### Somers' D

In [None]:
# Derived from the C-statistic computed in the previous cell: D = 2 * AUC - 1
somer_d = 2 * auc - 1
print("Somer's D:", somer_d)

### Classification Table

In [None]:
# ROC points of the training predictions
fpr, tpr, thresholds = roc_curve(train_o['fraud_bool'], train_o['pred_prob'])

# One row per ROC threshold with Youden's J (TPR - FPR)
data = {'TPR': tpr, 'FPR': fpr, 'Cut-off': thresholds, 'Youden': tpr-fpr}
youden = pd.DataFrame(data)

# Highest J first; the result is displayed only, `youden` itself stays unsorted
youden.sort_values(by = ['Youden'], ascending = False)

In [None]:
# Class prediction at a fixed 0.5 cut-off, then the confusion table (actual rows x predicted columns)
train_o['pred'] = train_o['pred_prob'].map(lambda x: 1 if x > 0.5 else 0) # TODO: cut-off choice is still an open question

pd.crosstab(train_o['fraud_bool'], train_o['pred'])

In [None]:
# Area under the ROC curve of the training predictions (shown in the legend)
auc = roc_auc_score(train_o['fraud_bool'], train_o['pred_prob'])

# ROC curve from the fpr/tpr arrays computed earlier, with the diagonal as chance reference
plt.cla()
plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")  # chance line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid(True)
plt.show()

### K-S Stat

In [None]:
# ROC points of the training predictions
fpr, tpr, thresholds = roc_curve(train_o['fraud_bool'], train_o['pred_prob'])

# Create the Youden DataFrame: one row per ROC threshold
youden = pd.DataFrame({
    'Cut-off': thresholds,
    'TPR': tpr,
    'FPR': fpr,
    'Youden': tpr - fpr
})

# Sort by Cut-off; row labels are kept, which the idxmax/.loc lookup below relies on
youden = youden.sort_values(by='Cut-off', ascending=True)

# Long format for seaborn: one line per melted column.
# NOTE(review): 'Youden' is melted too, so it is drawn as a third line next to PR_T and PR_F.
ks_stat = youden.rename(columns={'TPR': 'PR_T', 'FPR': 'PR_F'})
ks_stat = ks_stat.melt(id_vars='Cut-off', var_name='PR', value_name='value')

# KS statistic = largest gap between TPR and FPR, and the cut-off where it occurs
ks_val = (youden['TPR'] - youden['FPR']).max()
ks_cutoff = youden.loc[(youden['TPR'] - youden['FPR']).idxmax(), 'Cut-off']

# Plot
plt.cla()
sns.lineplot(x='Cut-off', y='value', hue='PR', data=ks_stat)
plt.xlim(1, 0)  # x axis runs from 1 down to 0

plt.title("KS Plot (TPR vs. FPR)")
plt.grid(True)
plt.axvline(x=ks_cutoff, linestyle='--', color='red', label=f'KS = {ks_val:.2f}')
plt.legend()
plt.show()

### Taking into account the cost

In [None]:
# Taking into account the cost
# For each assumed false-positive cost, find the ROC threshold that minimises
#   C_FP * (number of false positives) + C_FN * (number of false negatives)
# on the training predictions. pandas, numpy and roc_curve come from the
# import cell at the top of the notebook.

# True labels and predicted probabilities
y_true = train_o['fraud_bool'].astype(int)
y_score = train_o['pred_prob'].astype(float)

# --- Business Costs ---
C_FN = 1200                              # Cost of false negative (missed fraud)
C_FP_values = np.arange(700, 1001, 100)  # Costs of false positive (700 → 1000)

# --- Check for constant target ---
if len(set(y_true)) < 2:
    raise ValueError("y_true has only one class. Need both positive and negative samples.")

# --- Compute ROC and base counts ---
fpr, tpr, thresholds = roc_curve(y_true, y_score)
total_pos = sum(y_true)
total_neg = len(y_true) - total_pos

results = pd.DataFrame({'Threshold': thresholds, 'TPR': tpr, 'FPR': fpr})

# --- One cost column per FP cost, and the cheapest threshold for each ---
best_thresholds = []
for C_FP in C_FP_values:
    col = f'Cost_FP{C_FP}'
    false_positive_cost = C_FP * results['FPR'] * total_neg
    false_negative_cost = C_FN * (1 - results['TPR']) * total_pos
    results[col] = false_positive_cost + false_negative_cost

    idx_min = results[col].idxmin()
    best_thresholds.append({
        'C_FP': C_FP,
        'Best_Threshold': results.loc[idx_min, 'Threshold'],
        'Min_Cost': results.loc[idx_min, col],
        'TPR': results.loc[idx_min, 'TPR'],
        'FPR': results.loc[idx_min, 'FPR']
    })

best_df = pd.DataFrame(best_thresholds)

print("✅ Optimal thresholds by false positive cost:")
print(best_df)



In [None]:
# Baseline compared to adjusting for cost:
# repeats the cost sweep above and additionally reports, for each false-positive
# cost, how much cheaper the cost-optimal threshold is than the 0.5 cut-off.
from sklearn.metrics import roc_curve
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- Inputs ---
y_true = train_o['fraud_bool'].astype(int)
y_score = train_o['pred_prob'].astype(float)

# --- Business Costs ---
C_FN = 1200                        # Cost of missing a fraud
C_FP_values = np.arange(700, 1001, 100)  # Cost range for false positives (700–1000)
baseline_threshold = 0.5           # Your current operational cutoff

# --- Checks ---
if len(set(y_true)) < 2:
    raise ValueError("y_true has only one class. Need both positive and negative samples.")

# --- ROC curve ---
fpr, tpr, thresholds = roc_curve(y_true, y_score)
total_pos = sum(y_true)
total_neg = len(y_true) - total_pos

results = pd.DataFrame({
    'Threshold': thresholds,
    'TPR': tpr,
    'FPR': fpr
})

# --- Compute cost and baseline savings for each FP cost ---
best_thresholds = []

# Baseline performance: the ROC point whose threshold is closest to 0.5.
# NOTE(review): this is the nearest threshold returned by roc_curve, which need not be
# exactly 0.5 — confirm it is close enough for the comparison.
baseline_idx = np.argmin(np.abs(thresholds - baseline_threshold))
baseline_tpr = tpr[baseline_idx]
baseline_fpr = fpr[baseline_idx]

for C_FP in C_FP_values:
    # Expected cost at each threshold: FP cost * false positives + FN cost * false negatives
    results[f'Cost_FP{C_FP}'] = (
        (C_FP * results['FPR'] * total_neg) + 
        (C_FN * (1 - results['TPR']) * total_pos)
    )

    # Baseline cost: same formula evaluated at the baseline ROC point
    baseline_cost = (
        (C_FP * baseline_fpr * total_neg) + 
        (C_FN * (1 - baseline_tpr) * total_pos)
    )

    # Find minimum cost and the threshold that achieves it
    idx_min = results[f'Cost_FP{C_FP}'].idxmin()
    min_cost = results.loc[idx_min, f'Cost_FP{C_FP}']
    best_threshold = results.loc[idx_min, 'Threshold']

    # Savings = baseline_cost - min_cost
    savings = baseline_cost - min_cost

    best_thresholds.append({
        'C_FP': C_FP,
        'Best_Threshold': best_threshold,
        'Baseline_Cost': baseline_cost,
        'Min_Cost': min_cost,
        'Savings': savings,
        'TPR': results.loc[idx_min, 'TPR'],
        'FPR': results.loc[idx_min, 'FPR']
    })

best_df = pd.DataFrame(best_thresholds)

# --- Display results ---
print("✅ Optimal thresholds and cost savings vs baseline (threshold = 0.5):\n")
print(best_df.round(2))

# --- Plot cost curves: one line per FP cost, baseline cut-off as a vertical marker ---
plt.figure(figsize=(8, 5))
for C_FP in C_FP_values:
    plt.plot(results['Threshold'], results[f'Cost_FP{C_FP}'], label=f'FP cost {C_FP}')
plt.axvline(baseline_threshold, color='gray', linestyle='--', label='Baseline = 0.5')
plt.xlabel('Threshold')
plt.ylabel('Expected Cost')
plt.title('Expected Cost vs. Threshold (Cost-Sensitive ROC)')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# ROC curve of the training predictions with two operating points marked:
# the 0.5 baseline cut-off and the threshold that minimises total business
# cost for C_FP = 820. roc_curve, roc_auc_score, numpy and matplotlib come
# from the import cell at the top of the notebook.

# --- True labels and predicted probabilities ---
y_true = train_o['fraud_bool'].astype(int)
y_score = train_o['pred_prob'].astype(float)

# --- ROC curve for standard model (all thresholds) ---
fpr_model, tpr_model, thresholds_model = roc_curve(y_true, y_score)
auc_model = roc_auc_score(y_true, y_score)

# --- Business cost setup ---
C_FN = 1200   # Cost of false negative (missed fraud)
C_FP = 820    # Cost of false positive (wrongly flagged as fraud)

# --- Class counts ---
total_pos = sum(y_true)
total_neg = len(y_true) - total_pos

# --- Compute total business cost for each threshold ---
# FP cost * number of false positives + FN cost * number of false negatives
costs = (C_FP * fpr_model * total_neg) + (C_FN * (1 - tpr_model) * total_pos)

# --- Identify optimal threshold minimizing total cost ---
idx_min = np.argmin(costs)
optimal_threshold = thresholds_model[idx_min]
optimal_fpr = fpr_model[idx_min]
optimal_tpr = tpr_model[idx_min]
optimal_cost = costs[idx_min]

# --- Print results ---
print("✅ Cost-Optimized Threshold (C_FP = 820)")
print(f"Optimal Threshold : {optimal_threshold:.4f}")
print(f"True Positive Rate: {optimal_tpr:.4f}")
print(f"False Positive Rate: {optimal_fpr:.4f}")
print(f"Minimum Expected Cost: ${optimal_cost:,.2f}")

# --- Plot ROC curves ---
plt.figure(figsize=(8, 6))

# Standard ROC curve
plt.plot(fpr_model, tpr_model, label=f"Model (AUC = {auc_model:.3f})", color='blue')

# Baseline marker: the ROC point whose threshold is closest to 0.5
baseline_idx = np.argmin(np.abs(thresholds_model - 0.5))
plt.scatter(
    fpr_model[baseline_idx],
    tpr_model[baseline_idx],
    color='red',
    label=f"Baseline threshold = 0.5",
    zorder=5
)

# Cost-optimized threshold marker
plt.scatter(
    optimal_fpr,
    optimal_tpr,
    color='green',
    label=f"Cost-optimal threshold = {optimal_threshold:.3f} (C_FP = 820)",
    zorder=5
)

# Chance line
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve with Cost-Optimized Threshold (C_FP = 820)")
# Without this call none of the labels defined above are drawn
plt.legend()

plt.grid(True)
plt.show()


### Precision, Recall, & Accuracy

In [None]:
# Precision, recall, accuracy and F1 of the training predictions at every
# cut-off from 0.00 to 0.99 (step 0.01).
#
# The loop variable is deliberately not called `y`: `y` is the training target
# used by the modelling cells and must not be overwritten. The per-cut-off
# predictions are kept in a local Series so train_o['pred'] (the 0.5 cut-off
# prediction) is left untouched.
precision_values, recall_values, accuracy_values, f1_values = [], [], [], []

for cutoff_pct in range(100):
    pred_at_cutoff = (train_o['pred_prob'] > cutoff_pct / 100).astype(int)

    # zero_division=0: at high cut-offs nothing may be predicted positive; the
    # score is then reported as 0 without emitting a warning per cut-off
    precision_values.append(precision_score(train_o['fraud_bool'], pred_at_cutoff, zero_division=0))
    recall_values.append(recall_score(train_o['fraud_bool'], pred_at_cutoff))
    accuracy_values.append(accuracy_score(train_o['fraud_bool'], pred_at_cutoff))
    f1_values.append(f1_score(train_o['fraud_bool'], pred_at_cutoff, zero_division=0))

precision = np.array(precision_values)
recall = np.array(recall_values)
accuracy = np.array(accuracy_values)
f1score = np.array(f1_values)

# 'Cut-off' is expressed in percent (0..99), matching the loop above
data = {'Precision': precision, 'Recall': recall, 'Accuracy': accuracy, 'Cut-off': range(100), 'F1': f1score}
f1_s = pd.DataFrame(data)

# Best F1 first
f1_s.sort_values(by = ['F1'], ascending = False)

In [None]:
def plot_lift_and_gains(y_true, y_proba, n_bins=10):
    """
    Draw the cumulative gains curve and the cumulative lift curve side by side.

    Rows are ranked from highest to lowest predicted probability. At the end of
    each of the n_bins equally sized slices of that ranking, the share of all
    positives captured so far (gains) and that share divided by the share of
    rows seen so far (lift) are plotted.

    Parameters:
    - y_true: array-like, true binary labels (0/1)
    - y_proba: array-like, predicted probabilities for the positive class
    - n_bins: number of bins/deciles to split data

    Returns:
    - None (plots the curves)
    """
    # Rank rows by score, best first, with a fresh 0..n-1 index
    ranked = (
        pd.DataFrame({'y_true': y_true, 'y_proba': y_proba})
        .sort_values(by='y_proba', ascending=False)
        .reset_index(drop=True)
    )

    total_samples = len(ranked)
    total_positives = ranked['y_true'].sum()

    # Running totals down the ranking
    ranked['cum_total'] = np.arange(1, total_samples + 1)
    ranked['cum_positives'] = ranked['y_true'].cumsum()

    # Gains: share of positives captured; lift: gains relative to share of rows seen
    ranked['cum_gains'] = ranked['cum_positives'] / total_positives
    ranked['cum_lift'] = ranked['cum_gains'] / (ranked['cum_total'] / total_samples)

    # Last row of every bin (the zero cut-off marks no row and is skipped)
    cutoffs = np.linspace(0, total_samples, n_bins + 1, dtype=int)
    bin_end_rows = cutoffs[cutoffs > 0] - 1
    plot_points = ranked.loc[bin_end_rows, ['cum_total', 'cum_gains', 'cum_lift']].copy()
    plot_points['percent_samples'] = plot_points['cum_total'] / total_samples * 100

    fig, (gains_ax, lift_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: cumulative gains against the random-selection diagonal
    gains_ax.plot(plot_points['percent_samples'], plot_points['cum_gains'],
                  marker='o', color='blue', label='Cumulative Gains')
    gains_ax.plot([0, 100], [0, 1], linestyle='--', color='blue', alpha=0.5, label='Random Gains')
    gains_ax.set_xlabel('Percent of Sample')
    gains_ax.set_ylabel('Cumulative Gains')
    gains_ax.set_title('Cumulative Gains Curve')
    gains_ax.set_ylim(0, 1.05)
    gains_ax.grid(True)
    gains_ax.legend()

    # Right panel: cumulative lift against the random baseline of 1
    lift_ax.plot(plot_points['percent_samples'], plot_points['cum_lift'],
                 marker='o', color='red', label='Lift')
    lift_ax.axhline(1, linestyle='--', color='red', alpha=0.5, label='Random Lift')
    lift_ax.set_xlabel('Percent of Sample')
    lift_ax.set_ylabel('Cumulative Lift')
    lift_ax.set_title('Cumulative Lift Curve')
    lift_ax.set_ylim(0, plot_points['cum_lift'].max() * 1.1)
    lift_ax.grid(True)
    lift_ax.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# Lift and gains of the stepwise model on the training set
plot_lift_and_gains(train_o['fraud_bool'], train_o['pred_prob'])

## Test

In [None]:
# Prepare the hold-out set with the SAME feature engineering as train_o.
#
# The model was fitted on engineered columns (income_missing, income as a
# decile category, zip_count_4w as a number plus zip_count_4w_flag,
# velocity_*_missing). If the hold-out set is not given identical columns,
# those features are absent from the test design matrix and are silently
# scored as 0. Every statistic used here (salary decile edges, medians) is
# taken from the training rows so nothing is learned from the test data.
test_o = test_o.copy()

# --- income: flag, back-fill from the salary decile, drop rows that stay empty ---
test_o['income_missing'] = test_o['income'].isna().astype(int)

decile_labels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# train_o no longer carries 'salary', so the training salaries are read back from fraud_data
train_salary = fraud_data.loc[train_o.index, 'salary']
_, salary_edges = pd.qcut(train_salary, q=10, labels=decile_labels, retbins=True)

# Salaries outside the training range are clipped into the first / last decile
test_salary = test_o['salary'].clip(lower=salary_edges[0], upper=salary_edges[-1])
test_salary_decile = pd.cut(
    test_salary, bins=salary_edges, labels=decile_labels, include_lowest=True
).astype(float)
test_o['income'] = test_o['income'].fillna(test_salary_decile)
test_o = test_o.drop(columns=['salary'])

# As in training, rows with neither income nor salary cannot be encoded and are removed
rows_before = len(test_o)
test_o = test_o.dropna(subset=['income']).copy()
print(f"Dropped {rows_before - len(test_o)} test rows with no income and no salary")
test_o['income'] = test_o['income'].astype(str)

# --- zip_count_4w: flag hyphenated entries, replace them with -1, make numeric ---
bad_zip_test = test_o['zip_count_4w'].str.contains('-', na=False)
test_o['zip_count_4w_flag'] = bad_zip_test.astype(float)
test_o.loc[bad_zip_test, 'zip_count_4w'] = -1
test_o['zip_count_4w'] = test_o['zip_count_4w'].astype(float)

# --- velocity columns: same flag names as training, training medians as fill value ---
# (train_o's velocity columns were filled with their own median, which leaves the median unchanged)
for vel_col in ['velocity_6h', 'velocity_24h', 'velocity_4w']:
    test_o[f'{vel_col}_missing'] = test_o[vel_col].isna().astype(int)
    test_o[vel_col] = test_o[vel_col].fillna(train_o[vel_col].median())

# --- any other numeric gap: fill with the training median (no new flag column,
#     because the model has no such feature) ---
num_cols_test = test_o.select_dtypes(include='number').columns.intersection(train_o.columns)
for col in num_cols_test:
    if test_o[col].isnull().any():
        test_o[col] = test_o[col].fillna(train_o[col].median())

print(test_o.columns)

In [None]:
# Hold-out design matrix: every column except the target, one-hot encoded
# (first level dropped) and cast to float, mirroring how X was built for training
predictors_test = test_o.drop(columns=['fraud_bool'])
predictors_test = pd.get_dummies(predictors_test, drop_first=True)
predictors_test = predictors_test.astype(float)

X_test = predictors_test
y_test = test_o['fraud_bool']
X_test.head()

In [None]:
# Score the hold-out set with the stepwise model (result_2) and report
# probability-quality metrics. All libraries and metric functions used here
# come from the import cell at the top of the notebook.

# --- Align test columns with the training features ---
# Same columns in the same order as the model was fitted on; a dummy level that
# never occurs in the test data becomes an all-zero column. X_test itself is
# left unchanged.
X_test1 = X_test.reindex(columns=selected_features, fill_value=0.0)
X_test1 = sm.add_constant(X_test1, has_constant='add')

# --- Predict probabilities using weighted logistic model ---
y_pred_prob = result_2.predict(X_test1)

# --- Evaluate performance ---
metrics = {
    'MAE': mean_absolute_error(y_test, y_pred_prob),
    'LogLoss': log_loss(y_test, y_pred_prob),
    'BrierScore': brier_score_loss(y_test, y_pred_prob),
    'ROC_AUC': roc_auc_score(y_test, y_pred_prob)
}

print(pd.Series(metrics).round(4))


In [None]:
# --- Convert probabilities to binary predictions using best threshold ---
# TODO: replace the literal with the optimal threshold computed in the cost analysis
best_threshold = 0.0072
y_pred_binary = (y_pred_prob >= best_threshold).astype(int)

# --- Compute metrics ---
# Probability-based metrics use y_pred_prob; accuracy / recall / confusion
# matrix use the thresholded predictions
mae = mean_absolute_error(y_test, y_pred_prob)
ll = log_loss(y_test, y_pred_prob)
brier = brier_score_loss(y_test, y_pred_prob)
accuracy = accuracy_score(y_test, y_pred_binary)
auc = roc_auc_score(y_test, y_pred_prob)
recall = recall_score(y_test, y_pred_binary)

# --- Display results ---
# Four decimals: with two, a threshold such as 0.0072 would be shown as 0.01
print(f"Best Threshold: {best_threshold:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Log Loss: {ll:.4f}")
print(f"Brier Score: {brier:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")
print(f"Recall: {recall:.4f}")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_binary))
