# Vehicle Accident Insurance Fraud Prediction System

This notebook implements a supervised ML-based fraud detection system with feature engineering, SMOTE, and model comparison.

## 1. Import Libraries

In [20]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import joblib



## 2. Load Dataset

In [21]:

df = pd.read_csv("fraud_oracle.csv")
df.head()


Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


## 3. Data Cleaning

In [22]:

df = df.drop_duplicates()
df = df.fillna(0)


## 4. Feature Engineering

In [23]:

# Convert string columns to numeric
if 'PastNumberOfClaims' in df.columns:
    df['PastNumberOfClaims'] = pd.to_numeric(df['PastNumberOfClaims'], errors='coerce').fillna(0)
if 'PolicyTenure' in df.columns:
    df['PolicyTenure'] = pd.to_numeric(df['PolicyTenure'], errors='coerce').fillna(0)

# Claim frequency feature
if 'PastNumberOfClaims' in df.columns and 'PolicyTenure' in df.columns:
    df['Claim_Frequency'] = df['PastNumberOfClaims'] / (df['PolicyTenure'] + 1)

# Risk score feature
df['Risk_Score'] = 0
if 'DrunkDriving' in df.columns:
    df.loc[df['DrunkDriving'] == 1, 'Risk_Score'] += 3
if 'AccidentType' in df.columns:
    df.loc[df['AccidentType'] == 3, 'Risk_Score'] += 2
if 'PolicyType' in df.columns:
    df.loc[df['PolicyType'] == -1, 'Risk_Score'] += 5
if 'PastNumberOfClaims' in df.columns:
    df.loc[df['PastNumberOfClaims'] >= 3, 'Risk_Score'] += 2


## 5. Encoding & Feature Selection

In [24]:

target_col = 'FraudFound_P'
X = df.drop(columns=[target_col])
y = df[target_col]

X = pd.get_dummies(X, drop_first=True)


## 6. Handle Class Imbalance (SMOTE)

In [25]:

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


## 7. Train-Test Split & Scaling

In [26]:

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


## 8. Train Multiple Models

In [27]:

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier()
}

results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    results.append([
        name,
        accuracy_score(y_test, preds),
        precision_score(y_test, preds),
        recall_score(y_test, preds)
    ])

results_df = pd.DataFrame(results, columns=["Model","Accuracy","Precision","Recall"])
results_df


Unnamed: 0,Model,Accuracy,Precision,Recall
0,Logistic Regression,0.966374,0.999631,0.933103
1,Random Forest,0.973961,0.999274,0.948621
2,Gradient Boosting,0.963442,0.997778,0.928966


## 9. Save Final Model & Artifacts

In [28]:

final_model = RandomForestClassifier(n_estimators=200, random_state=42)
final_model.fit(X_train_scaled, y_train)

joblib.dump(final_model, "insurance_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(X.columns.tolist(), "feature_columns.pkl")

print("Artifacts saved successfully")


Artifacts saved successfully


## 10. Conclusion

This project demonstrates a real-world insurance fraud detection system using supervised learning.