In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

In [2]:
file_path = "carclaim.csv"  
carclaims_df = pd.read_csv(file_path)

In [3]:
carclaims_df['FraudFound'] = carclaims_df['FraudFound'].map({'No': 0, 'Yes': 1})

In [4]:
def range_to_average(value):
    if isinstance(value, str) and 'to' in value:
        start, end = map(int, value.split(' to '))
        return (start + end) / 2
    elif isinstance(value, str) and value.strip().isdigit():
        return int(value)
    return value

In [5]:
range_columns = ['AgeOfVehicle', 'AgeOfPolicyHolder', 'NumberOfCars']
for col in range_columns:
    carclaims_df[col] = carclaims_df[col].apply(range_to_average)

KeyError: 'AgeOfVehicle'

In [26]:
irrelevant_cols = ['PolicyNumber', 'RepNumber']
carclaims_df = carclaims_df.drop(columns=irrelevant_cols)

In [27]:
categorical_cols = carclaims_df.select_dtypes(include='object').columns
carclaims_encoded = pd.get_dummies(carclaims_df, columns=categorical_cols, drop_first=True)

In [28]:
X = carclaims_encoded.drop(columns=['FraudFound'])
y = carclaims_encoded['FraudFound']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [30]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_balanced, y_train_balanced)

In [31]:
model_path = "fraud_detection_model.pkl"
joblib.dump(rf_classifier, model_path)
print(f"Model saved to {model_path}")

Model saved to fraud_detection_model.pkl


In [32]:
y_pred = rf_classifier.predict(X_test)

In [33]:
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [34]:
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_rep)

Accuracy: 93.61%

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.97      2899
           1       0.29      0.04      0.08       185

    accuracy                           0.94      3084
   macro avg       0.61      0.52      0.52      3084
weighted avg       0.90      0.94      0.91      3084



In [37]:
# Input for dynamic prediction
important_features = ['AgeOfVehicle', 'AgeOfPolicyHolder', 'Deductible', 'DriverRating']

print("\nEnter values for the following important features:")
user_data = {}
for feature in important_features:
    value = input(f"{feature}: ")
    try:
        user_data[feature] = float(value)
    except ValueError:
        print(f"Invalid input for {feature}, setting to 0 by default.")
        user_data[feature] = 0.0

# Create a DataFrame for user input
user_input_df = pd.DataFrame([user_data])

# Align columns with the trained model's input features 
for col in X.columns:
    if col not in user_input_df:
        user_input_df[col] = 0

# Ensure column order matches the training data
user_input_df = user_input_df[X.columns]

# Load the saved model
rf_classifier = joblib.load("fraud_detection_model.pkl")

# Make prediction
fraud_probability = rf_classifier.predict_proba(user_input_df)[0][1]
print(f"Fraud Probability: {fraud_probability:.2f}")
prediction_result = "Fraudulent Claim" if fraud_probability > 0.5 else "Non-Fraudulent Claim"
print(f"Prediction Result: {prediction_result}")



Enter values for the following important features:


  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0
  user_input_df[col] = 0


Fraud Probability: 1.00
Prediction Result: Fraudulent Claim
