### FRAUD DETECTION SYSTEM 
    MACHINE LEARNING PIPELINE
        1. Import Libraries
        2. Load Dataset
        3. Feature & Target Selection
        4. Train-Test Split (Stratified)
        5. Feature Scaling
        6. Train Multiple Models
        7. Model Evaluation
        8. Ensemble Learning (Soft Voting)
        9. Final Model Selection
        10. Prediction on New Data
        11. Model Saving for Deployment

In [40]:
# Import pandas for data handling
import pandas as pd

# Import numpy for numerical operations
import numpy as np

# Import joblib to save and load trained models
import joblib

# Split dataset into training and testing sets
from sklearn.model_selection import train_test_split

# Scale numerical features for distance-based models
from sklearn.preprocessing import StandardScaler

# Logistic Regression â€“ baseline classification model
from sklearn.linear_model import LogisticRegression

# Decision Tree â€“ rule-based classification model
from sklearn.tree import DecisionTreeClassifier

# Random Forest â€“ ensemble of multiple decision trees
# Voting Classifier â€“ combines multiple models
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Support Vector Machine â€“ margin-based classifier
from sklearn.svm import SVC

# K-Nearest Neighbors â€“ distance-based classifier
from sklearn.neighbors import KNeighborsClassifier

# Accuracy â€“ overall prediction correctness
# Precision â€“ correctness of fraud predictions
# Recall â€“ ability to detect fraud cases
# F1 Score â€“ balance between precision and recall
# ROC-AUC â€“ model discrimination ability
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the fraud detection dataset
df = pd.read_csv("fraud_detection.csv")

print(df.head())

print("Dataset Shape:", df.shape);  

   TransactionAmount  TransactionHour  TransactionFrequency  AccountAgeMonths  \
0           10995.27                8                     8                17   
1            2310.90               18                     2                73   
2            6704.00               17                     8                17   
3            2618.91               21                     4                64   
4            1877.31               11                     4                14   

   IsInternational  Fraud  
0                0      1  
1                0      0  
2                1      1  
3                0      0  
4                0      0  
Dataset Shape: (2000, 6)


### FEATURE â€“ TARGET SEPARATION

In [42]:
X = df[
    [
        "TransactionAmount",      # Amount of the transaction
        "TransactionHour",        # Hour when transaction occurred
        "TransactionFrequency",   # Number of recent transactions
        "AccountAgeMonths",       # Age of account in months
        "IsInternational"         # International transaction flag
    ]
]

y = df["Fraud"]; 

### TRAINâ€“TEST SPLIT

In [44]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
);

### FEATURE SCALING

In [46]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### DEFINE MULTIPLE ML MODELS

In [48]:
models = {
    # max_iter=1000 ensures proper convergence
    "Logistic Regression": LogisticRegression(max_iter=1000),

    # Decision Tree â†’ rule-based model
    # max_depth=5 controls tree complexity and prevents overfitting
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42),

    # Random Forest â†’ ensemble of multiple decision trees
    # n_estimators=100 means 100 trees are used
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),

    # Support Vector Machine â†’ margin-based classifier
    # probability=True enables probability predictions for ROC-AUC
    "SVM": SVC(kernel="rbf", probability=True),

    # K-Nearest Neighbors â†’ distance-based classifier
    # n_neighbors=5 uses 5 nearest points for prediction
    "KNN": KNeighborsClassifier(n_neighbors=5)
}; 

### TRAIN & EVALUATE MODELS 

In [50]:
# Store results of all models
results = []

print("\n================ MODEL PERFORMANCE =================\n")

# Train and evaluate each model
for name, model in models.items():

    # Models that need scaled data
    if name in ["Logistic Regression", "SVM", "KNN"]:
        model.fit(X_train_scaled, y_train)        # Train model
        y_pred = model.predict(X_test_scaled)     # Predict class
        y_prob = model.predict_proba(X_test_scaled)[:, 1]  # Predict probability

    # Models that do not need scaling
    else:
        model.fit(X_train, y_train)               # Train model
        y_pred = model.predict(X_test)            # Predict class
        y_prob = model.predict_proba(X_test)[:, 1]  # Predict probability

    # Store evaluation metrics
    results.append([
        name,                                     # Model name
        accuracy_score(y_test, y_pred),           # Accuracy
        precision_score(y_test, y_pred),          # Precision
        recall_score(y_test, y_pred),             # Recall
        f1_score(y_test, y_pred),                 # F1 Score
        roc_auc_score(y_test, y_prob)             # ROC-AUC
    ]); 

    print(f"{name} completed"); 



Logistic Regression completed
Decision Tree completed
Random Forest completed
SVM completed
KNN completed


## ENSEMBLE LEARNING â€“ SOFT VOTING

In [52]:
final_model = VotingClassifier(                      # Combines multiple models into one strong model
    estimators=[
        ("lr", LogisticRegression(max_iter=1000)),   # Logistic Regression: allowed 1000 tries to learn properly
        ("rf", RandomForestClassifier(n_estimators=100,random_state=42)),   # Random Forest uses 100 decision trees
        ("svm", SVC(kernel="rbf",  # RBF kernel creates curved decision boundary
            probability=True))  # Enables confidence scores (needed for soft voting)
    ],
    voting="soft"                                    # Final decision based on probability 
)

final_model.fit(X_train_scaled, y_train)             # Trains all models together using scaled data

### Convert model evaluation results into a table

In [54]:
results_df = pd.DataFrame(
    results,                                              # List containing all model metrics
    columns=["Model", "Accuracy", "Precision", "Recall",  # Names of each column
             "F1 Score", "ROC-AUC"]
)

print("\n================ FINAL RESULTS =================\n")  # Heading for output
print(results_df)                                        # Display final comparison table



                 Model  Accuracy  Precision  Recall  F1 Score   ROC-AUC
0  Logistic Regression     0.995        1.0    0.99  0.994975  1.000000
1        Decision Tree     1.000        1.0    1.00  1.000000  1.000000
2        Random Forest     1.000        1.0    1.00  1.000000  1.000000
3                  SVM     0.995        1.0    0.99  0.994975  1.000000
4                  KNN     0.995        1.0    0.99  0.994975  0.997487


In [55]:
import pandas as pd

# ------------------ STEP 1: TAKE USER INPUT ------------------
def get_transaction_input():
    print("\nEnter Transaction Details")

    account_age = float(input("Account Age (months): "))
    is_international = int(input("International? (0 = No, 1 = Yes): "))
    amount = float(input("Transaction Amount: "))
    frequency = float(input("Transaction Frequency: "))
    hour = int(input("Transaction Hour (0â€“23): "))

    # Put input values into DataFrame (same order as model training)
    data = pd.DataFrame(
        [[account_age, is_international, amount, frequency, hour]],
        columns=scaler.feature_names_in_
    )

    return data


# ------------------ STEP 2: GET INPUT DATA ------------------
transaction = get_transaction_input()

# ------------------ STEP 3: SCALE DATA ------------------
transaction_scaled = scaler.transform(transaction)

# ------------------ STEP 4: PREDICT ------------------
prediction = final_model.predict(transaction_scaled)[0]
probability = final_model.predict_proba(transaction_scaled)[0][1]

# ------------------ STEP 5: RESULT ------------------
print("\nPrediction:", "ðŸš¨ FRAUD" if prediction == 1 else "âœ… NO FRAUD")
print("Fraud Probability:", round(probability, 3))


Enter Transaction Details


Account Age (months):  1
International? (0 = No, 1 = Yes):  0
Transaction Amount:  10999
Transaction Frequency:  12
Transaction Hour (0â€“23):  8



Prediction: ðŸš¨ FRAUD
Fraud Probability: 0.944


In [56]:
# Saves the trained fraud detection model to a file
joblib.dump(final_model, "fraud_detection_model.pkl")

# Saves the scaler so new data is scaled the same way
joblib.dump(scaler, "scaler.pkl")

# Saves feature names to keep column order correct during prediction
joblib.dump(X.columns.tolist(), "feature_names.pkl")

# Confirms that all required files are saved successfully
print("\nModel, Scaler & Feature Names Saved Successfully!");  


Model, Scaler & Feature Names Saved Successfully!
