In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier

df_clean = pd.read_parquet("../data/processed/df_clean.parquet")

df_clean.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,insured_occupation_farming-fishing,insured_occupation_handlers-cleaners,insured_occupation_machine-op-inspct,insured_occupation_other-service,insured_occupation_priv-house-serv,insured_occupation_prof-specialty,insured_occupation_protective-serv,insured_occupation_sales,insured_occupation_tech-support,insured_occupation_transport-moving
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,0,0,0,0,0,0,0,0,0,0
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,0,0,1,0,0,0,0,0,0,0
2,134,29,687698,2000-06-09,OH,100/300,2000,1413.14,5000000,430632,...,0,0,0,0,0,0,0,1,0,0
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,0,0,0,0,0,0,0,0,0,0
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,0,0,0,0,0,0,0,1,0,0


# 1. Define target and features

In [3]:
# Target
y = df_clean['fraud_reported']

# Variables
#selected_X = ['policy_deductable', 'policy_annual_premium', 'total_claim_amount', 'police_report_available_YES', 'auto_model_risk',
#              'male', 'education_advanced', 'months_as_customer']
#X = df_clean[selected_X]

selected_X = [
    'policy_deductable',
    'policy_annual_premium',
    'total_claim_amount',
    'months_as_customer',
    'auto_model_risk',
    'male',
    'education_advanced',

    'police_report_available_YES',
    
    'collision_type_Rear Collision',
    'collision_type_Side Collision',
    'collision_type_UNKNOWN',

    'property_damage_UNKNOWN',
    'property_damage_YES',

    'incident_severity_Minor Damage',
    'incident_severity_Total Loss',

    'authorities_contacted_Fire',
    'authorities_contacted_None',
    'authorities_contacted_Other',
    
    'insured_relationship_not-in-family',
    'insured_relationship_other-relative',
    'insured_relationship_own-child',
    'insured_relationship_unmarried',
    'insured_relationship_wife',
    
    'insured_occupation_exec-managerial',
    'insured_occupation_farming-fishing',
    'insured_occupation_transport-moving',
    'insured_occupation_tech-support',
    'insured_occupation_craft-repair',
]
X = df_clean[selected_X]

# 2. Train/test split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

# 3. Choose and train model

## Random Forest

In [5]:
model = RandomForestClassifier(random_state=1, class_weight='balanced')
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [6]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1] #Probability of fraud
y_pred_thresh = (y_proba >= 0.25).astype(int) #probability of fraud if >0.25%

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print(f"\nROC AUC Score: {roc_auc_score(y_test, y_proba):.4f}")

Confusion Matrix:
 [[126  25]
 [ 34  15]]

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.83      0.81       151
           1       0.38      0.31      0.34        49

    accuracy                           0.70       200
   macro avg       0.58      0.57      0.57       200
weighted avg       0.69      0.70      0.69       200


ROC AUC Score: 0.7163


## XGBClassifier

In [5]:
from xgboost import XGBClassifier

model = XGBClassifier(
    random_state=42,
    scale_pos_weight=3,  # à ajuster en fonction du déséquilibre
    use_label_encoder=False,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='logloss', gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=6,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, objective='binary:logistic',
              predictor='auto', random_state=42, reg_alpha=0, ...)

# 4. Predictions & evaluation

In [6]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1] #Probability of fraud
y_pred_thresh = (y_proba >= 0.25).astype(int) #probability of fraud if >0.25%

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print(f"\nROC AUC Score: {roc_auc_score(y_test, y_proba):.4f}")

Confusion Matrix:
 [[118  33]
 [ 20  29]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.78      0.82       151
           1       0.47      0.59      0.52        49

    accuracy                           0.73       200
   macro avg       0.66      0.69      0.67       200
weighted avg       0.76      0.73      0.74       200


ROC AUC Score: 0.7041
