In [1]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Explanation:
# This program is designed to detect fraudulent health insurance claims using machine learning.
# We use the PaySim1 dataset, which simulates financial transactions, to train a fraud detection model.
# The model helps insurance companies identify potential fraud and prevent financial losses.


# Location of the PaySim CSV export. The default is the path this notebook was
# originally run with; set the PAYSIM_CSV environment variable to point at a
# different copy instead of editing the cell.
DATA_PATH = os.environ.get(
    "PAYSIM_CSV",
    r"C:\Users\lenovo\Downloads\archive\PS_20174392719_1491204439457_log.csv",
)
SAMPLE_SIZE = 200_000  # number of rows kept for modelling
RANDOM_SEED = 42       # fixes which rows the sample picks

# Read the full log, then keep a seeded random sample of SAMPLE_SIZE rows with a
# fresh 0..N-1 index. Chaining avoids holding a second reference to the full frame.
df = (
    pd.read_csv(DATA_PATH)
    .sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

# Display dataset info (column names, non-null counts, dtypes, memory usage)
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            200000 non-null  int64  
 1   type            200000 non-null  object 
 2   amount          200000 non-null  float64
 3   nameOrig        200000 non-null  object 
 4   oldbalanceOrg   200000 non-null  float64
 5   newbalanceOrig  200000 non-null  float64
 6   nameDest        200000 non-null  object 
 7   oldbalanceDest  200000 non-null  float64
 8   newbalanceDest  200000 non-null  float64
 9   isFraud         200000 non-null  int64  
 10  isFlaggedFraud  200000 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 16.8+ MB


In [2]:
# Model inputs: the numeric transaction columns. The 'type', 'nameOrig',
# 'nameDest' and 'isFlaggedFraud' columns are not used.
features = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
target = 'isFraud'

# Hold out 20% of the sampled rows for evaluation; the seed makes the split repeatable.
# NOTE(review): the split is not stratified — if isFraud is heavily imbalanced
# (not verified here), consider passing stratify=y.
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [3]:
# Preview the first rows of the sampled frame
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,278,CASH_IN,330218.42,C632336343,20866.0,351084.42,C834976624,452419.57,122201.15,0,0
1,15,PAYMENT,11647.08,C1264712553,30370.0,18722.92,M215391829,0.0,0.0,0,0
2,10,CASH_IN,152264.21,C1746846248,106589.0,258853.21,C1607284477,201303.01,49038.8,0,0
3,403,TRANSFER,1551760.63,C333676753,0.0,0.0,C1564353608,3198359.45,4750120.08,0,0
4,206,CASH_IN,78172.3,C813403091,2921331.58,2999503.88,C1091768874,415821.9,337649.6,0,0


In [4]:
# Show the column labels of the sampled frame
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [5]:
# Random Forest: 100 seeded trees, fitted on the training split
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the held-out split and record the share of correct labels
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)


In [6]:
# Logistic Regression: iteration cap raised to 1000, seeded, fitted on the training split
log_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Predict the held-out split and record the share of correct labels
y_pred_log = log_model.predict(X_test)
accuracy_log = accuracy_score(y_true=y_test, y_pred=y_pred_log)

In [7]:
# XGBoost: log-loss evaluation metric, seeded, fitted on the training split
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss').fit(X_train, y_train)

# Predict the held-out split and record the share of correct labels
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)

In [8]:

# Report each model's held-out accuracy as a percentage, one line per model
print(
    "Model Performance:",
    f"Random Forest Accuracy: {accuracy_rf*100}%",
    f"Logistic Regression Accuracy: {accuracy_log*100}%",
    f"XGBoost Accuracy: {accuracy_xgb*100}%",
    sep="\n",
)

Model Performance:
Random Forest Accuracy: 99.95%
Logistic Regression Accuracy: 99.9325%
XGBoost Accuracy: 99.95%


In [9]:
# Function to Predict Fraud
def predict_fraud(transaction, model):
    """Classify a single transaction as "Fraud" or "Not Fraud".

    Parameters
    ----------
    transaction : mapping
        Feature name -> value for one transaction.
    model : fitted estimator
        Any object with a ``predict`` method that accepts a one-row DataFrame.
        If it exposes ``feature_names_in_``, the row's columns are selected and
        ordered to match it, so the key order of ``transaction`` does not
        matter and keys the model was not trained on are dropped.

    Returns
    -------
    str
        "Fraud" when the first predicted label equals 1, otherwise "Not Fraud".

    Raises
    ------
    ValueError
        If the model exposes ``feature_names_in_`` and ``transaction`` lacks
        one or more of those features.
    """
    transaction_df = pd.DataFrame([transaction])

    # A dict's insertion order becomes the column order. Estimators that
    # validate feature names can reject a frame whose columns are ordered
    # differently from training, so align explicitly when the names are known.
    trained_columns = getattr(model, "feature_names_in_", None)
    if trained_columns is not None:
        trained_columns = list(trained_columns)
        missing = [col for col in trained_columns if col not in transaction_df.columns]
        if missing:
            raise ValueError(f"transaction is missing feature(s): {missing}")
        transaction_df = transaction_df[trained_columns]

    prediction = model.predict(transaction_df)
    return "Fraud" if prediction[0] == 1 else "Not Fraud"

# Example usage: one hand-written transaction, given as feature name -> value
example_transaction = dict([
    ("step", 50),
    ("amount", 5000),
    ("oldbalanceOrg", 25000),
    ("newbalanceOrig", 25000),
    ("oldbalanceDest", 50000),
    ("newbalanceDest", 25000),
])




In [10]:
# Pick the fitted model with the highest held-out accuracy; when accuracies tie,
# max() keeps the first candidate listed.
scored_models = (
    (accuracy_rf, rf_model),
    (accuracy_log, log_model),
    (accuracy_xgb, xgb_model),
)
best_model = max(scored_models, key=lambda scored: scored[0])[1]

# Classify the example transaction with that model
example_verdict = predict_fraud(example_transaction, best_model)
print("Prediction for example transaction:", example_verdict)

Prediction for example transaction: Not Fraud


In [11]:
# Held-out accuracy of each model as a percentage, keyed by display name
model_accuracies = {
    label: fraction * 100
    for label, fraction in [
        ("Random Forest", accuracy_rf),
        ("Logistic Regression", accuracy_log),
        ("XGBoost", accuracy_xgb),
    ]
}

# Highest percentage wins; on a tie the entry inserted first is kept
best_model_name, best_accuracy = max(model_accuracies.items(), key=lambda entry: entry[1])

print("\n===== Model Performance Summary =====")
for model, acc in model_accuracies.items():
    print(f"{model}: {acc:.4f}%")

print(f"\nBest Model: {best_model_name} with Accuracy: {best_accuracy:.4f}%")



===== Model Performance Summary =====
Random Forest: 99.9500%
Logistic Regression: 99.9325%
XGBoost: 99.9500%

Best Model: Random Forest with Accuracy: 99.9500%


In [12]:
# Load saved per-row predictions from disk and preview the first five rows.
# NOTE(review): no cell shown in this notebook writes 'predictions.csv' —
# confirm where it is produced, otherwise a fresh Run All will fail here.
prediction = pd.read_csv('predictions.csv')
prediction.head(5)

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,Actual_Label,Predicted_RF,Predicted_Log,Predicted_XGB
0,286,1004.54,5915.0,4910.46,0.0,0.0,0,0,0,0
1,202,105458.55,0.0,0.0,148402.44,253860.99,0,0,0,0
2,135,384594.44,126519.0,0.0,513945.83,898540.27,0,0,0,0
3,300,406869.11,217746.0,624615.11,1697567.5,1290698.39,0,0,0,0
4,358,6260.37,41886.0,35625.63,0.0,0.0,0,0,0,0


In [13]:
# Load the saved model/accuracy table from disk and preview it.
# NOTE(review): no cell shown in this notebook writes 'model_performance.csv' —
# confirm where it is produced, otherwise a fresh Run All will fail here.
mp = pd.read_csv('model_performance.csv')
mp.head()


Unnamed: 0,Model,Accuracy
0,Random Forest,0.9995
1,Logistic Regression,0.999325
2,XGBoost,0.9995


In [14]:
# Function to check overfitting/underfitting
def check_overfitting(model, X_train, X_test, y_train, y_test, model_name):
    """Print train/test accuracy for a fitted model plus a coarse fit verdict.

    Both splits are narrowed to the columns named in the notebook-level
    ``features`` list before predicting, and each prediction set is scored
    with ``accuracy_score``. The verdict depends only on the gap between the
    two accuracies (as fractions):

    * training exceeds testing by more than 0.05 -> printed as "Overfitting"
    * testing exceeds training by more than 0.05 -> printed as "Underfitting"
    * anything else                              -> printed as "Balanced"

    Everything is written to stdout; nothing is returned.
    """
    # Score each split on the training feature columns only
    train_acc = accuracy_score(y_train, model.predict(X_train[features]))
    test_acc = accuracy_score(y_test, model.predict(X_test[features]))

    print(f"\n{model_name} Performance:")
    print(f"Training Accuracy: {train_acc*100:.4f}%")
    print(f"Testing Accuracy: {test_acc*100:.4f}%")

    # Pick the verdict line from the accuracy gap, then emit it once
    if train_acc > test_acc + 0.05:
        verdict = f"{model_name} is **Overfitting** (Training > Testing by {train_acc - test_acc:.4f})"
    elif train_acc < test_acc - 0.05:
        verdict = f"{model_name} is **Underfitting** (Testing > Training by {test_acc - train_acc:.4f})"
    else:
        verdict = f"{model_name} is **Balanced**"
    print(verdict)

# Run the train-vs-test accuracy comparison for every fitted model, in the
# same order the models were trained
for fitted_model, display_name in (
    (rf_model, "Random Forest"),
    (log_model, "Logistic Regression"),
    (xgb_model, "XGBoost"),
):
    check_overfitting(fitted_model, X_train, X_test, y_train, y_test, display_name)



Random Forest Performance:
Training Accuracy: 99.9994%
Testing Accuracy: 99.9500%
Random Forest is **Balanced**

Logistic Regression Performance:
Training Accuracy: 99.9456%
Testing Accuracy: 99.9325%
Logistic Regression is **Balanced**

XGBoost Performance:
Training Accuracy: 99.9925%
Testing Accuracy: 99.9500%
XGBoost is **Balanced**


In [15]:
# Record the installed package versions into requirements.txt.
# NOTE(review): this line is not Python; it presumably runs only because IPython
# automagic treats `pip` as the %pip magic — confirm, and consider writing
# `%pip` explicitly.
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
