In [1]:
%pip install joblib

Collecting joblib
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
Installing collected packages: joblib
Successfully installed joblib-1.5.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# model.ipynb - Cell 1

import pandas as pd
import numpy as np
import joblib
import os
from backend.ml.fraud.fraud_detection import FraudDetector 
# Assuming FraudDetector class is defined and accessible

# --- 1. Define Relative File Path ---
# The notebook is in backend/ml. We use '../../' to step up to the VeriClaim root, 
# and then access the 'data' folder.
CSV_FILE_NAME = "health-insurance-coverage-2019.csv"
DATA_PATH = os.path.join("..", "..", "data", CSV_FILE_NAME) 

print(f"Attempting to load data from: {os.path.abspath(DATA_PATH)}")

# --- 2. Load the Data and Define Features ---
# We use the existing utility function from FraudDetector, 
# ensuring it loads the data needed for training.

# In a real scenario, you'd ensure this function uses DATA_PATH internally.
# For simplicity, we'll assume it handles loading its data internally 
# or use a direct manual load if the internal function isn't working:

try:
    # Option A: Use the existing function (Best if it handles paths correctly)
    df = FraudDetector.sample_training_dataframe(n_samples=5000) 
    print(f"Dataframe loaded using FraudDetector utility with {len(df)} samples.")
except Exception as e:
    print(f"Error using utility: {e}. Falling back to manual CSV load...")
    # Option B: Manual Load (If utility fails due to path)
    df = pd.read_csv(DATA_PATH)
    print(f"Manually loaded {len(df)} samples from CSV.")

# Define the features for training
FEATURE_COLS = ["amount", "days_since_last_claim", "num_previous_claims", "patient_age"]

# Check if required columns are present (Crucial step)
for col in FEATURE_COLS:
    if col not in df.columns:
        print(f"❌ ERROR: Column '{col}' missing from loaded data. Check your CSV header!")
        
print("\nProceed to Cell 2 for training...")

ModuleNotFoundError: No module named 'pandas'

In [3]:
%pip install pandas scikit-learn joblib

Collecting pandas
  Downloading pandas-2.3.2-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-2.3.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading pandas-2.3.2-cp313-cp313-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ------------------- -------------------- 5.2/11.0 MB 28.9 MB/s eta 0:00:01
   ------------------------------------- -- 10.2


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import numpy as np

In [5]:
# model.ipynb - NEW Cell (Verification Test)

# 1. Import libraries needed for loading
import joblib
import os
import pandas as pd
from backend.ml.fraud.fraud_detection import FraudDetector 
from sklearn.preprocessing import StandardScaler # (Required if your FraudDetector uses scaling)

# Define the path to the saved model
MODEL_LOAD_PATH = os.path.join(os.getcwd(), "fraud_model.joblib") 

print(f"Attempting to load model from: {MODEL_LOAD_PATH}")

try:
    # 2. Load the trained model object
    loaded_detector = joblib.load(MODEL_LOAD_PATH)
    print("✅ Model loaded successfully!")

    # 3. Define a Test Case (Example: Low Risk Claim)
    low_risk_claim = {
        "amount": 500,                  # Low amount
        "days_since_last_claim": 500,   # Long time since last claim
        "num_previous_claims": 1,       # Low number of claims
        "patient_age": 45               # Typical age
    }

    # 4. Predict the Anomaly Score for the low-risk case
    low_risk_score = loaded_detector.predict_anomaly_single(low_risk_claim)
    
    print("\n--- TEST CASE: LOW RISK ---")
    print(f"Claim Data: {low_risk_claim}")
    print(f"Anomaly Score (Closer to 0 is less fraud): {low_risk_score:.4f}")

    # 5. Define a High Risk Test Case (Example: High Risk Claim)
    high_risk_claim = {
        "amount": 95000,                # Very high amount
        "days_since_last_claim": 10,    # Very recent claim
        "num_previous_claims": 10,      # High number of claims
        "patient_age": 22               # Less typical age/claim profile
    }
    
    # 6. Predict the Anomaly Score for the high-risk case
    high_risk_score = loaded_detector.predict_anomaly_single(high_risk_claim)
    
    print("\n--- TEST CASE: HIGH RISK ---")
    print(f"Claim Data: {high_risk_claim}")
    print(f"Anomaly Score (Closer to 1 is high fraud risk): {high_risk_score:.4f}")

except FileNotFoundError:
    print(f"❌ ERROR: Model file not found at {MODEL_LOAD_PATH}. Did you run the saving cell?")
except Exception as e:
    print(f"❌ ERROR during prediction: {e}")

ModuleNotFoundError: No module named 'backend'

In [6]:
# model.ipynb - NEW Cell (Verification Test)

# 1. Import libraries needed for loading
import joblib
import os
import sys
import pandas as pd
from sklearn.preprocessing import StandardScaler  # (Required if your FraudDetector uses scaling)

# --- Add project root to sys.path so Python can find "backend" ---
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../.."))
if project_root not in sys.path:
    sys.path.append(project_root)

print("✅ Project root added to PYTHONPATH:", project_root)

# Now import your detector
from backend.ml.fraud.fraud_detection import FraudDetector 

# Define the path to the saved model
MODEL_LOAD_PATH = os.path.join(project_root, "fraud_model.joblib")

print(f"Attempting to load model from: {MODEL_LOAD_PATH}")

try:
    # 2. Load the trained model object
    loaded_detector = joblib.load(MODEL_LOAD_PATH)
    print("✅ Model loaded successfully!")

    # 3. Define a Test Case (Example: Low Risk Claim)
    low_risk_claim = {
        "amount": 500,                  # Low amount
        "days_since_last_claim": 500,   # Long time since last claim
        "num_previous_claims": 1,       # Low number of claims
        "patient_age": 45               # Typical age
    }

    # 4. Predict the Anomaly Score for the low-risk case
    low_risk_score = loaded_detector.predict_anomaly_single(low_risk_claim)
    
    print("\n--- TEST CASE: LOW RISK ---")
    print(f"Claim Data: {low_risk_claim}")
    print(f"Anomaly Score (Closer to 0 is less fraud): {low_risk_score:.4f}")

    # 5. Define a High Risk Test Case (Example: High Risk Claim)
    high_risk_claim = {
        "amount": 95000,                # Very high amount
        "days_since_last_claim": 10,    # Very recent claim
        "num_previous_claims": 10,      # High number of claims
        "patient_age": 22               # Less typical age/claim profile
    }
    
    # 6. Predict the Anomaly Score for the high-risk case
    high_risk_score = loaded_detector.predict_anomaly_single(high_risk_claim)
    
    print("\n--- TEST CASE: HIGH RISK ---")
    print(f"Claim Data: {high_risk_claim}")
    print(f"Anomaly Score (Closer to 1 is high fraud risk): {high_risk_score:.4f}")

except FileNotFoundError:
    print(f"❌ ERROR: Model file not found at {MODEL_LOAD_PATH}. Did you run the saving cell?")
except Exception as e:
    print(f"❌ ERROR during prediction: {e}")


✅ Project root added to PYTHONPATH: c:\Users\arpit\Desktop\VeriClaim


ModuleNotFoundError: No module named 'backend'

In [None]:
# model.ipynb - NEW Cell (Verification Test)

# 1. Import libraries needed for loading
import joblib
import os
import sys
import pandas as pd
from sklearn.preprocessing import StandardScaler  # (Required if your FraudDetector uses scaling)

# --- Add project root to sys.path so Python can find "backend" ---
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../.."))
if project_root not in sys.path:
    sys.path.append(project_root)

print("✅ Project root added to PYTHONPATH:", project_root)

# Now import your detector
from backend.ml.fraud.fraud_detection import FraudDetector 

# Define the path to the saved model
MODEL_LOAD_PATH = os.path.join(project_root, "fraud_model.joblib")

print(f"Attempting to load model from: {MODEL_LOAD_PATH}")

try:
    # 2. Load the trained model object
    loaded_detector = joblib.load(MODEL_LOAD_PATH)
    print("✅ Model loaded successfully!")

    # 3. Define a Test Case (Example: Low Risk Claim)
    low_risk_claim = {
        "amount": 500,                  # Low amount
        "days_since_last_claim": 500,   # Long time since last claim
        "num_previous_claims": 1,       # Low number of claims
        "patient_age": 45               # Typical age
    }

    # 4. Predict the Anomaly Score for the low-risk case
    low_risk_score = loaded_detector.predict_anomaly_single(low_risk_claim)
    
    print("\n--- TEST CASE: LOW RISK ---")
    print(f"Claim Data: {low_risk_claim}")
    print(f"Anomaly Score (Closer to 0 is less fraud): {low_risk_score:.4f}")

    # 5. Define a High Risk Test Case (Example: High Risk Claim)
    high_risk_claim = {
        "amount": 95000,                # Very high amount
        "days_since_last_claim": 10,    # Very recent claim
        "num_previous_claims": 10,      # High number of claims
        "patient_age": 22               # Less typical age/claim profile
    }
    
    # 6. Predict the Anomaly Score for the high-risk case
    high_risk_score = loaded_detector.predict_anomaly_single(high_risk_claim)
    
    print("\n--- TEST CASE: HIGH RISK ---")
    print(f"Claim Data: {high_risk_claim}")
    print(f"Anomaly Score (Closer to 1 is high fraud risk): {high_risk_score:.4f}")

except FileNotFoundError:
    print(f"❌ ERROR: Model file not found at {MODEL_LOAD_PATH}. Did you run the saving cell?")
except Exception as e:
    print(f"❌ ERROR during prediction: {e}")


In [1]:
# model.ipynb - NEW Cell (Verification Test)

# 1. Import libraries needed for loading
import joblib
import os
import sys
import pandas as pd
from sklearn.preprocessing import StandardScaler

# --- Add project root to sys.path so Python can find "backend" ---
project_root = os.path.abspath(os.getcwd())  # since notebook is now in VERICLAIM/
if project_root not in sys.path:
    sys.path.append(project_root)

print("✅ Project root added to PYTHONPATH:", project_root)

# Now import your detector
from backend.ml.fraud.fraud_detection import FraudDetector 

# Define the path to the saved model
MODEL_LOAD_PATH = os.path.join(project_root, "fraud_model.joblib")

print(f"Attempting to load model from: {MODEL_LOAD_PATH}")

try:
    # 2. Load the trained model object
    loaded_detector = joblib.load(MODEL_LOAD_PATH)
    print("✅ Model loaded successfully!")

    # 3. Define a Test Case (Example: Low Risk Claim)
    low_risk_claim = {
        "amount": 500,
        "days_since_last_claim": 500,
        "num_previous_claims": 1,
        "patient_age": 45
    }

    # 4. Predict the Anomaly Score for the low-risk case
    low_risk_score = loaded_detector.predict_anomaly_single(low_risk_claim)

    print("\n--- TEST CASE: LOW RISK ---")
    print(f"Claim Data: {low_risk_claim}")
    print(f"Anomaly Score (Closer to 0 is less fraud): {low_risk_score:.4f}")

    # 5. Define a High Risk Test Case (Example: High Risk Claim)
    high_risk_claim = {
        "amount": 95000,
        "days_since_last_claim": 10,
        "num_previous_claims": 10,
        "patient_age": 22
    }

    # 6. Predict the Anomaly Score for the high-risk case
    high_risk_score = loaded_detector.predict_anomaly_single(high_risk_claim)

    print("\n--- TEST CASE: HIGH RISK ---")
    print(f"Claim Data: {high_risk_claim}")
    print(f"Anomaly Score (Closer to 1 is high fraud risk): {high_risk_score:.4f}")

except FileNotFoundError:
    print(f"❌ ERROR: Model file not found at {MODEL_LOAD_PATH}. Did you run the saving cell?")
except Exception as e:
    print(f"❌ ERROR during prediction: {e}")


✅ Project root added to PYTHONPATH: c:\Users\arpit\Desktop\VeriClaim\VeriClaim
Attempting to load model from: c:\Users\arpit\Desktop\VeriClaim\VeriClaim\fraud_model.joblib
❌ ERROR: Model file not found at c:\Users\arpit\Desktop\VeriClaim\VeriClaim\fraud_model.joblib. Did you run the saving cell?


In [2]:
# model.ipynb - Verification Test

import joblib
import os
import sys
import pandas as pd

# --- Add project root to sys.path so Python can find "backend" ---
project_root = os.path.abspath(os.getcwd())  # since notebook is at VERICLAIM/
if project_root not in sys.path:
    sys.path.append(project_root)

print("✅ Project root added to PYTHONPATH:", project_root)

# Import FraudDetector
from backend.ml.fraud.fraud_detection import FraudDetector, MODEL_PATH

# Initialize FraudDetector and load the trained model
detector = FraudDetector()
try:
    detector.load()
    print(f"✅ Model loaded successfully from: {MODEL_PATH}")
except FileNotFoundError:
    print(f"❌ Model not found at {MODEL_PATH}. Training a new one...")
    # create synthetic training data if not trained yet
    df = FraudDetector.sample_training_dataframe(2000)
    feature_cols = ["amount", "days_since_last_claim", "num_previous_claims", "patient_age"]
    detector.fit(df, feature_cols)
    print(f"✅ Model trained and saved to: {MODEL_PATH}")

# Define feature columns
feature_cols = ["amount", "days_since_last_claim", "num_previous_claims", "patient_age"]

# --- Test Case: Low Risk ---
low_risk_claim = {
    "amount": 500,
    "days_since_last_claim": 500,
    "num_previous_claims": 1,
    "patient_age": 45
}

low_risk_result = detector.predict_anomaly(low_risk_claim, feature_cols)

print("\n--- TEST CASE: LOW RISK ---")
print(f"Claim Data: {low_risk_claim}")
print(f"Prediction: {low_risk_result}")

# --- Test Case: High Risk ---
high_risk_claim = {
    "amount": 95000,
    "days_since_last_claim": 10,
    "num_previous_claims": 10,
    "patient_age": 22
}

high_risk_result = detector.predict_anomaly(high_risk_claim, feature_cols)

print("\n--- TEST CASE: HIGH RISK ---")
print(f"Claim Data: {high_risk_claim}")
print(f"Prediction: {high_risk_result}")


✅ Project root added to PYTHONPATH: c:\Users\arpit\Desktop\VeriClaim\VeriClaim
✅ Model loaded successfully from: c:\Users\arpit\Desktop\VeriClaim\VeriClaim\backend\ml\fraud\isolation_forest.joblib

--- TEST CASE: LOW RISK ---
Claim Data: {'amount': 500, 'days_since_last_claim': 500, 'num_previous_claims': 1, 'patient_age': 45}
Prediction: {'is_anomaly': False, 'score': 0.1470987103307776}

--- TEST CASE: HIGH RISK ---
Claim Data: {'amount': 95000, 'days_since_last_claim': 10, 'num_previous_claims': 10, 'patient_age': 22}
Prediction: {'is_anomaly': True, 'score': -0.14801755710314712}


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:
import joblib
import pandas as pd


In [4]:
# Adjust the path if needed
model_path = "c:\\Users\\arpit\\Desktop\\VeriClaim\\VeriClaim\\backend\\ml\\fraud\\isolation_forest.joblib"
model = joblib.load(model_path)
print("Model loaded successfully!")


Model loaded successfully!


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [5]:
# Example claim data
sample_claim = {
    'amount': 5000,
    'days_since_last_claim': 45,
    'num_previous_claims': 2,
    'policy_age': 3,
    'claim_type': 1   # Example: 0 = minor, 1 = major
}

# Convert to DataFrame
claim_df = pd.DataFrame([sample_claim])


In [6]:
# Isolation Forest predicts -1 for anomaly (fraud) and 1 for normal
prediction = model.predict(claim_df)
print("Prediction:", prediction)

# Optional: interpret results
if prediction[0] == -1:
    print("⚠️ Fraudulent claim detected!")
else:
    print("✅ Claim seems normal.")


AttributeError: 'dict' object has no attribute 'predict'

In [7]:
print(type(model))
print(model.keys() if isinstance(model, dict) else "Not a dict")


<class 'dict'>
dict_keys(['model', 'features'])


In [8]:
# Example if it was saved in a dictionary
if isinstance(model, dict):
    actual_model = model['model']  # replace 'model' with the correct key if different
else:
    actual_model = model


In [9]:
prediction = actual_model.predict(claim_df)
print("Prediction:", prediction)

if prediction[0] == -1:
    print("⚠️ Fraudulent claim detected!")
else:
    print("✅ Claim seems normal.")




ValueError: X has 5 features, but IsolationForest is expecting 4 features as input.

In [10]:
print(actual_model.n_features_in_)


4


In [11]:
['amount', 'days_since_last_claim', 'num_previous_claims', 'policy_age']


['amount', 'days_since_last_claim', 'num_previous_claims', 'policy_age']

In [12]:
sample_claim = {
    'amount': 5000,
    'days_since_last_claim': 45,
    'num_previous_claims': 2,
    'policy_age': 3
}

claim_df = pd.DataFrame([sample_claim])


In [13]:
prediction = actual_model.predict(claim_df)
print("Prediction:", prediction)

if prediction[0] == -1:
    print("⚠️ Fraudulent claim detected!")
else:
    print("✅ Claim seems normal.")


Prediction: [1]
✅ Claim seems normal.




In [14]:
# Features your model was trained on
FEATURE_ORDER = ['amount', 'days_since_last_claim', 'num_previous_claims', 'policy_age']


In [15]:
import pandas as pd

# Example input with columns in any order or extra columns
input_claim = {
    'policy_age': 3,
    'claim_type': 1,  # extra column
    'num_previous_claims': 2,
    'days_since_last_claim': 45,
    'amount': 5000
}

claim_df = pd.DataFrame([input_claim])

# Keep only needed columns in correct order
claim_df = claim_df[FEATURE_ORDER]


In [16]:
prediction = actual_model.predict(claim_df)
print("Prediction:", prediction)

if prediction[0] == -1:
    print("⚠️ Fraudulent claim detected!")
else:
    print("✅ Claim seems normal.")


Prediction: [1]
✅ Claim seems normal.


