In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


In [5]:

## LOAD FEATURE ENGINEERING DATASET


In [6]:
# Read the feature-engineered dataset from the processed-data folder and
# display its (rows, columns) shape as the cell output.
fe_csv_path = "../data/processed/insurance_fraud_fe.csv"
df = pd.read_csv(fe_csv_path)
df.shape

(1000, 46)

In [7]:
# Encode the target: 'Y' (fraud reported) -> 1, 'N' -> 0.
y = df['fraud_reported'].map({'Y': 1, 'N': 0})

# Series.map yields NaN for any label that is missing from the mapping. Fail fast
# here with a clear message instead of passing NaN targets on to the split and
# model cells.
assert y.notna().all(), "fraud_reported contains values other than 'Y'/'N'"

# Everything except the target column is used as a feature.
X = df.drop(columns=['fraud_reported'])

In [8]:
# Split the feature columns by dtype so each group can get its own transformer.
# Selecting on the generic 'number' kind and treating everything else as
# categorical guarantees that every column of X lands in exactly one list.
# With hard-coded dtype names (int64 / float64 / object), a column of any other
# dtype (e.g. bool, int32, category, string) would be in neither list, and a
# ColumnTransformer left at its default remainder='drop' would silently discard it.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(exclude='number').columns.tolist()

len(num_cols), len(cat_cols)


(27, 18)

## Preprocessing + Model (Pipeline)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Column-wise preprocessing: numeric columns are standardized, categorical
# columns are one-hot encoded. handle_unknown='ignore' makes categories that
# were not seen during fit encode as all zeros instead of raising.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
preprocess = ColumnTransformer(transformers=[numeric_step, categorical_step])

# class_weight='balanced' re-weights the classes inversely to their frequency;
# max_iter is raised above the library default to give the solver more iterations.
model = LogisticRegression(class_weight='balanced', max_iter=1000, n_jobs=None)

# Chain preprocessing and estimator so both are fitted together on the same data.
clf = Pipeline(steps=[('preprocess', preprocess), ('model', model)])


In [10]:

## TRAIN TEST SPLIT


In [11]:
# Hold out 25% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both parts; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [12]:
# Fit the preprocessing steps and the logistic regression on the training split only.
clf.fit(X_train, y=y_train)

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Score the held-out split. Column 1 of predict_proba is the probability of the
# positive class (label 1); it is kept in y_prob for the threshold analysis below.
y_prob = clf.predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)

lr_report = classification_report(y_test, y_pred)
lr_auc = roc_auc_score(y_test, y_prob)
lr_confusion = confusion_matrix(y_test, y_pred)

print(lr_report)
print("ROC-AUC:", lr_auc)
print("Confusion Matrix:\n", lr_confusion)


              precision    recall  f1-score   support

           0       0.88      0.86      0.87       188
           1       0.61      0.65      0.62        62

    accuracy                           0.81       250
   macro avg       0.74      0.75      0.75       250
weighted avg       0.81      0.81      0.81       250

ROC-AUC: 0.8169183253260124
Confusion Matrix:
 [[162  26]
 [ 22  40]]


# Threshold Tuning 

Objective:
Optimize the classification threshold to improve recall for fraudulent claims,
since missing fraud is more costly than false positives.


In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score


def eval_at_threshold(th, probs=None, y_true=None):
    """Score a probability cut-off against true labels.

    Parameters
    ----------
    th : float
        Decision threshold; a row is predicted positive (1) when its
        probability is >= th.
    probs : array-like, optional
        Positive-class probabilities. Defaults to the notebook-level ``y_prob``.
    y_true : array-like, optional
        True 0/1 labels aligned with ``probs``. Defaults to the notebook-level
        ``y_test``.

    Returns
    -------
    dict
        Keys "threshold", "precision", "recall" and "f1".
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if probs is None:
        probs = y_prob
    if y_true is None:
        y_true = y_test

    preds = (np.asarray(probs) >= th).astype(int)

    # zero_division=0 makes an undefined metric (e.g. precision when the threshold
    # is so high that nothing is predicted positive) come back as 0.0 without
    # emitting an UndefinedMetricWarning.
    return {
        "threshold": th,
        "precision": precision_score(y_true, preds, zero_division=0),
        "recall": recall_score(y_true, preds, zero_division=0),
        "f1": f1_score(y_true, preds, zero_division=0)
    }


for th in [0.2, 0.3, 0.4, 0.5]:
    print(eval_at_threshold(th))


{'threshold': 0.2, 'precision': 0.4722222222222222, 'recall': 0.8225806451612904, 'f1': 0.6}
{'threshold': 0.3, 'precision': 0.5319148936170213, 'recall': 0.8064516129032258, 'f1': 0.6410256410256411}
{'threshold': 0.4, 'precision': 0.5487804878048781, 'recall': 0.7258064516129032, 'f1': 0.625}
{'threshold': 0.5, 'precision': 0.6060606060606061, 'recall': 0.6451612903225806, 'f1': 0.625}


### Threshold Selection Conclusion

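A threshold of 0.3 was selected as the operating point: it achieves about 81% recall on fraudulent claims at about 53% precision (F1 ≈ 0.64), compared with about 65% recall at the default threshold of 0.5. This trade-off is suitable for insurance fraud use cases where missing fraud is more costly than investigating false positives. Caveat: the threshold was chosen on the same test set that these metrics are reported on, so the figures are likely optimistic; the choice should be confirmed on a separate validation split or with cross-validation.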


## Feature importance (Logistic)

In [15]:
# Pull the fitted pieces out of the pipeline once.
fitted_preprocess = clf.named_steps['preprocess']
fitted_model = clf.named_steps['model']

# Expanded one-hot column names from the fitted categorical encoder.
ohe = fitted_preprocess.named_transformers_['cat']
cat_feature_names = ohe.get_feature_names_out(cat_cols)

# The ColumnTransformer lists 'num' before 'cat', so the model's input columns
# are the numeric names followed by the one-hot names.
feature_names = [*num_cols, *cat_feature_names]

# Binary problem: coef_ has a single row of weights.
coefficients = fitted_model.coef_[0]

# Label each weight with its feature and order by absolute size, largest first.
coef_by_feature = pd.Series(coefficients, index=feature_names)
feature_importance = coef_by_feature.sort_values(
    key=lambda weights: weights.abs(), ascending=False
)

feature_importance.head(15)


insured_hobbies_chess                   3.898170
insured_hobbies_cross-fit               3.025860
incident_severity_Major Damage          2.761997
insured_hobbies_sleeping               -1.490872
incident_location_7582 Pine Drive       1.347271
incident_location_8782 3rd St           1.298456
incident_location_4981 Weaver St        1.258960
incident_location_8204 Pine Lane        1.228863
insured_occupation_handlers-cleaners   -1.227005
auto_model_Malibu                      -1.175803
incident_severity_Trivial Damage       -1.169440
incident_location_5779 2nd Lane         1.167055
insured_hobbies_dancing                -1.129725
incident_severity_Minor Damage         -1.067633
incident_location_6939 3rd Hwy          1.019627
dtype: float64

In [16]:
# Sanity check: there should be exactly one coefficient per expanded feature name.
tuple(len(seq) for seq in (feature_names, coefficients))


(922, 922)

### Feature Importance Insights

Positive coefficients indicate higher likelihood of fraud, while negative coefficients reduce fraud probability.
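In the top-15 list above, the largest coefficients belong to insured hobbies (chess, cross-fit) and incident severity (Major Damage); all fifteen entries are one-hot-encoded category levels rather than numeric features.
Some high-cardinality categorical features (e.g., exact incident locations) showed large coefficients due to dataset-specific patterns that the model is likely memorizing rather than generalizing from; for production these features should be dropped or grouped into coarser categories, or the model should be regularized more strongly than the default.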


## Random Forest Model 
Logistic Regression provides a linear decision boundary. 
To capture non-linear interactions between policy, claim, and incident features,
a Random Forest model was trained and evaluated.



In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

In [18]:
# Preprocessing for the random forest: numeric columns are passed through
# unchanged (no scaling step), categorical columns are one-hot encoded, and
# categories unseen during fit encode as all zeros.
rf_numeric_step = ('num', 'passthrough', num_cols)
rf_categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
preprocess_rf = ColumnTransformer(transformers=[rf_numeric_step, rf_categorical_step])

In [19]:

## Random Forest Pipeline


In [20]:
# Hyperparameters gathered in one place so they are easy to read and tune.
rf_params = dict(
    n_estimators=300,         # number of trees
    max_depth=None,           # no explicit depth limit
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',  # re-weight classes inversely to their frequency
    random_state=42,          # fixed seed for repeatable results
    n_jobs=-1,                # use all available cores
)
rf_model = RandomForestClassifier(**rf_params)

# Same two-step layout as the logistic pipeline: preprocessing, then estimator.
rf_clf = Pipeline(steps=[('preprocess', preprocess_rf), ('model', rf_model)])


In [21]:

## Train Random Forest


In [22]:
# Fit the random-forest pipeline on the same training split as the logistic model.
rf_clf.fit(X_train, y=y_train)

In [23]:

## Evaluate Random Forest


In [24]:
# Score the held-out split with the random forest. Column 1 of predict_proba is
# the positive-class probability; it is kept in rf_prob for the threshold sweep below.
rf_prob = rf_clf.predict_proba(X_test)[:, 1]
rf_pred = rf_clf.predict(X_test)

rf_report = classification_report(y_test, rf_pred)
rf_auc = roc_auc_score(y_test, rf_prob)
rf_confusion = confusion_matrix(y_test, rf_pred)

print(rf_report)
print("ROC-AUC:", rf_auc)
print("Confusion Matrix:\n", rf_confusion)


              precision    recall  f1-score   support

           0       0.79      0.92      0.85       188
           1       0.52      0.26      0.34        62

    accuracy                           0.76       250
   macro avg       0.65      0.59      0.60       250
weighted avg       0.72      0.76      0.72       250

ROC-AUC: 0.820006863417982
Confusion Matrix:
 [[173  15]
 [ 46  16]]


In [26]:

## RF Threshold Tuning


In [27]:
from sklearn.metrics import precision_score, recall_score, f1_score

def rf_eval_threshold(th, probs=None, y_true=None):
    """Score a probability cut-off for the random-forest predictions.

    Parameters
    ----------
    th : float
        Decision threshold; a row is predicted positive (1) when its
        probability is >= th.
    probs : array-like, optional
        Positive-class probabilities. Defaults to the notebook-level ``rf_prob``.
    y_true : array-like, optional
        True 0/1 labels aligned with ``probs``. Defaults to the notebook-level
        ``y_test``.

    Returns
    -------
    dict
        Keys "threshold", "precision", "recall" and "f1".
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if probs is None:
        probs = rf_prob
    if y_true is None:
        y_true = y_test

    preds = (np.asarray(probs) >= th).astype(int)

    # zero_division=0 makes an undefined metric (e.g. precision when nothing is
    # predicted positive) come back as 0.0 without an UndefinedMetricWarning.
    return {
        "threshold": th,
        "precision": precision_score(y_true, preds, zero_division=0),
        "recall": recall_score(y_true, preds, zero_division=0),
        "f1": f1_score(y_true, preds, zero_division=0)
    }


for t in [0.1, 0.2, 0.3, 0.4, 0.5]:
    print(rf_eval_threshold(t))


{'threshold': 0.1, 'precision': 0.24899598393574296, 'recall': 1.0, 'f1': 0.3987138263665595}
{'threshold': 0.2, 'precision': 0.2830188679245283, 'recall': 0.967741935483871, 'f1': 0.43795620437956206}
{'threshold': 0.3, 'precision': 0.43902439024390244, 'recall': 0.8709677419354839, 'f1': 0.5837837837837838}
{'threshold': 0.4, 'precision': 0.5974025974025974, 'recall': 0.7419354838709677, 'f1': 0.6618705035971223}
{'threshold': 0.5, 'precision': 0.5161290322580645, 'recall': 0.25806451612903225, 'f1': 0.34408602150537637}


## Final Model Selection

Two models were evaluated for insurance fraud detection.

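Logistic Regression, after threshold tuning to 0.3, achieved ~81% fraud recall at ~53% precision — higher recall than Random Forest at its best-F1 threshold of 0.4 (~74%) — and was selected as the primary model due to the high cost of missed fraud cases. Note that Random Forest at a threshold of 0.3 reached higher recall still (~87%), but at lower precision (~44%).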

Random Forest, which can capture non-linear patterns, achieved better precision at a threshold of 0.4 (~60% precision, ~74% recall, F1 ≈ 0.66) than Logistic Regression at 0.3 (~53% precision), making it a strong challenger model.

The final decision prioritizes recall while maintaining acceptable precision, aligning with real-world insurance fraud risk management.
