# Health Insurance Claim Fraud Detection

1) Load the Dataset

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Explanation:
# This program is designed to detect fraudulent health insurance claims using machine learning.
# We use the PaySim1 dataset, which simulates financial transactions, to train a fraud detection model.
# The model helps insurance companies identify potential fraud and prevent financial losses.

df = pd.read_csv(r"C:\Users\lenovo\Downloads\archive\PS_20174392719_1491204439457_log.csv")

# Display dataset info
df.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [3]:
df.head(10)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
5,1,PAYMENT,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0,0
6,1,PAYMENT,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0,0
7,1,PAYMENT,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,0,0
8,1,PAYMENT,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,0,0
9,1,DEBIT,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,0,0


In [4]:
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [6]:
# Keep only 30,000 rows, delete the rest
df = df.sample(n=100000, random_state=42).reset_index(drop=True)

In [7]:
# Select relevant columns for fraud detection
features = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
target = 'isFraud'

# Splitting the dataset into training and testing sets
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
X

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
0,278,330218.42,20866.00,351084.42,452419.57,122201.15
1,15,11647.08,30370.00,18722.92,0.00,0.00
2,10,152264.21,106589.00,258853.21,201303.01,49038.80
3,403,1551760.63,0.00,0.00,3198359.45,4750120.08
4,206,78172.30,2921331.58,2999503.88,415821.90,337649.60
...,...,...,...,...,...,...
99995,235,371042.71,1699365.85,2070408.56,4301572.86,3930530.15
99996,357,19494.57,1921089.11,1940583.68,75635.52,56140.95
99997,369,141519.24,0.00,0.00,12853274.23,12994793.47
99998,154,635412.68,58610.00,694022.68,0.00,0.00


In [9]:
# Training the fraud detection model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

In [10]:
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Function to Predict Fraud
# This function takes transaction details and predicts if it is fraudulent or not.
def predict_fraud(transaction):
    transaction_df = pd.DataFrame([transaction])
    prediction = model.predict(transaction_df)
    return "Fraud" if prediction[0] == 1 else "Not Fraud"

# Example Usage
example_transaction = {
    "step": 50,
    "amount": 5000,
    "oldbalanceOrg": 10000,
    "newbalanceOrig": 5000,
    "oldbalanceDest": 20000,
    "newbalanceDest": 25000
}

print("Prediction for example transaction:", predict_fraud(example_transaction))

Accuracy: 99.95
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19972
           1       0.95      0.68      0.79        28

    accuracy                           1.00     20000
   macro avg       0.97      0.84      0.90     20000
weighted avg       1.00      1.00      1.00     20000

Prediction for example transaction: Not Fraud


In [12]:
transaction1001 = {
    "step": 788,
    "amount": 4856,
    "oldbalanceOrg": 122000,
    "newbalanceOrig": 8800,
    "oldbalanceDest": 88060,
    "newbalanceDest": 39000
}

print("Prediction for example transaction:", predict_fraud(transaction1001))

Prediction for example transaction: Fraud
