In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    confusion_matrix,
)
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

ModuleNotFoundError: No module named 'sklearn.utils._metadata_requests'

In [None]:
# Location of the input data inside the Kaggle environment.
# NOTE(review): this path has no file extension and looks like a dataset folder
# rather than a CSV file; pandas cannot read a folder directly — confirm and, if
# needed, append the CSV file name.
CSV_URL = "/kaggle/input/fraud-detection-in-transactions-dataset"


In [None]:
# Load the transactions table.
# CSV_URL may name a CSV file (or URL) directly, or a folder such as a Kaggle
# dataset directory. pandas cannot read a folder, so in that case pick the single
# CSV file it contains and fail loudly if the choice would be ambiguous.
csv_source = CSV_URL
if Path(CSV_URL).is_dir():
    csv_files = sorted(Path(CSV_URL).glob("*.csv"))
    if len(csv_files) != 1:
        raise FileNotFoundError(
            f"Expected exactly one CSV file in {CSV_URL}, "
            f"found {len(csv_files)}: {[p.name for p in csv_files]}"
        )
    csv_source = csv_files[0]

df = pd.read_csv(csv_source)

In [None]:
# Preview the first rows of the raw DataFrame (pandas shows 5 rows by default)
df.head()

In [None]:
# Report the dataset dimensions: df.shape is a (rows, columns) tuple
n_rows, n_cols = df.shape
print(f'Dataset contains {n_rows} rows and {n_cols} columns')

In [None]:
# Print a concise summary of the DataFrame: column names, dtypes and non-null counts
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Count missing (NaN/None) values per column
df.isnull().sum()

## --- Exploratory Data Analysis (EDA) ---

### Class Distribution of 'label' (Fraud vs. Not Fraud)

In [None]:
# Number of rows per class in the target column; value_counts() orders the
# result by frequency (most common class first). Reused by the pie chart below.
label_counts = df['label'].value_counts()
print(label_counts)

In [None]:
# Pie chart of the class distribution.
# value_counts() orders classes by frequency, not by label value, so the slice
# names and colours are looked up by class value instead of relying on position.
class_names = {0: 'Not Fraud (0)', 1: 'Fraud (1)'}
class_colors = {0: 'lightgreen', 1: 'salmon'}

# Fix the slice order to [0, 1]; a class absent from the data gets a zero count
ordered_counts = label_counts.reindex(list(class_names), fill_value=0)
labels = [class_names[cls] for cls in ordered_counts.index]
colors = [class_colors[cls] for cls in ordered_counts.index]

plt.figure(figsize=(5, 5))
# explode pulls the second slice (fraud) slightly out of the pie
plt.pie(ordered_counts, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90, explode=[0, 0.1])
plt.title('Fraud vs Not Fraud Distribution')
plt.show()

### Analysis of 'amount' Column

In [None]:
# Flag 'amount' outliers with the IQR rule: anything more than 1.5 * IQR
# below the first quartile or above the third quartile.
Q1, Q3 = df['amount'].quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean mask of rows falling outside [lower_bound, upper_bound]
is_outlier = df['amount'].lt(lower_bound) | df['amount'].gt(upper_bound)
outliers = df.loc[is_outlier]
print(f"Number of outliers in 'amount': {len(outliers)}")


In [None]:
# Boxplot of transaction amounts: shows the spread and the points beyond the whiskers
ax = sns.boxplot(data=df, x='amount')
ax.set_title('Boxplot of Transaction Amounts')
plt.show()


In [None]:
# Histogram (50 bins) with a KDE overlay for the raw transaction amounts
ax = sns.histplot(df['amount'], bins=50, kde=True)
ax.set_title('Transaction Amount Distribution')
ax.set_xlabel('Amount')
ax.set_ylabel('Frequency')
plt.show()

#### The amount column is highly skewed 

### Fraud Rate by Categorical Features

In [None]:
# Categorical columns whose per-category fraud rate we want to compare
cat_features = ['merchant_type', 'device_type']

# The mean of the 0/1 label within each category is that category's fraud rate;
# draw one bar chart per column, highest rate first.
for col in cat_features:
    fraud_rate = df.groupby(col)['label'].mean().sort_values(ascending=False)
    ax = fraud_rate.plot.bar()
    ax.set_title(f'Fraud Rate by {col}')
    ax.set_ylabel('Fraud Rate')
    plt.show()


### Fraud Rate by Transaction Amount Range

In [None]:
# Bin edges and labels for grouping transaction amounts into ranges.
# The last edge is open-ended (np.inf) instead of df['amount'].max(): using the
# data maximum makes the edges non-increasing (and pd.cut raise) whenever the
# largest amount is 800 or less.
bins = [0, 50, 100, 200, 400, 800, np.inf]
labels = ['0-50', '50-100', '100-200', '200-400', '400-800', '800+']

# Create 'amount_range' by binning 'amount'. Bins are right-closed, so
# include_lowest=True is needed to keep amounts of exactly 0 in the first bin
# instead of turning them into NaN.
df['amount_range'] = pd.cut(df['amount'], bins=bins, labels=labels, include_lowest=True)

# Mean of the 0/1 label per bin = fraud rate per amount range
# (observed=True keeps only the bins that actually contain rows)
fraud_rate_by_amount = df.groupby('amount_range', observed=True)['label'].mean()


## Fraud rate with respect to transaction amount range


In [None]:
# Bar chart of the fraud rate in each transaction amount range
ax = fraud_rate_by_amount.plot.bar(color='teal')
ax.set_title('Fraud Rate by Transaction Amount Range')
ax.set_xlabel('Amount Range')
ax.set_ylabel('Fraud Rate')
plt.xticks(rotation=45)  # slant the range labels so they do not overlap
ax.grid(True)
plt.tight_layout()
plt.show()


## 🧠 Exploratory Data Analysis (EDA) – Key Insights & Interpretations

### ✅ Dataset Summary:
- **Rows**: 1000  
- **Features**: 5  
- **Target (`label`)**: Binary (0 = non-fraud, 1 = fraud)  
- **Missing Values**: None

---

### 🔍 Class Distribution:
- **Non-Fraud (label = 0)**: 950  
- **Fraud (label = 1)**: 50  
- ⚠️ **Highly imbalanced** (~5% fraud rate)

---

### 📊 Fraud Rate by `amount` Range:

| Amount Range | Fraud Rate |
|--------------|------------|
| 100–200      | **~8%** 🔺 |
| 400–800      | **~6%** 🔺 |
| Other ranges | < 5%       |

**Interpretation:**  
Fraud tends to concentrate in **mid to upper ranges of transaction amounts**, likely because:
- Low amounts aren’t attractive enough for fraud
- Extremely high amounts may trigger additional security

---

### 🛍️ Fraud Rate by `merchant_type`:

| Merchant Type | Fraud Rate |
|---------------|------------|
| Others        | **~7%** 🔺 |
| Electronics   | ~5%        |
| Remaining types (groceries, travel, clothing) | < 4% |

**Interpretation:**  
The "others" category could include high-risk or uncategorized vendors. Electronics is also slightly riskier — potentially due to resale value.

---

### 📱 Fraud Rate by `device_type`:

| Device Type | Fraud Rate |
|-------------|------------|
| Mobile      | **~5%**    |
| Tablet      | ~5%        |
| Desktop     | ~4%        |

**Interpretation:**  
Slightly more frauds occur on **mobile and tablet devices**, possibly due to easier spoofing or less secure access compared to desktop.

---

### 💡 Actionable Insights:

- `amount`, `merchant_type`, and `device_type` show useful variance with fraud label — ✅ relevant for modeling.
- Handle class imbalance during modeling via `class_weight` or `SMOTE`.


## --- Data Preprocessing ---

### Log Transformation of 'amount'

In [None]:
# Apply log1p, i.e. log(1 + amount), to reduce the skew of 'amount';
# the result is stored in a new 'log_amount' column and 'amount' is kept as is.
df['log_amount'] = np.log1p(df['amount'])

In [None]:
# Experiment kept for reference (intentionally disabled): a binary feature flagging
# the mid-to-high risk amount range. It did not improve the results.
# df['is_mid_high_amount'] = ((df['amount'] > 100) & (df['amount'] < 800)).astype(int)

In [None]:
# Plot the distribution of the log-transformed 'amount'.
# The x-axis shows log1p(amount), not the raw amount, so it is labelled accordingly.
sns.histplot(df['log_amount'], bins=50, kde=True)
plt.title('Transaction log Amount Distribution')
plt.xlabel('log(1 + Amount)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Compare the skewness of 'amount' before and after the log transform
for caption, column in (("Original skew:", 'amount'), ("Log skew:", 'log_amount')):
    print(caption, df[column].skew())


### Scaling 'log_amount'

In [None]:
# Initialize MinMaxScaler (with default settings it rescales values to [0, 1])
scaler = MinMaxScaler()

# Apply Min-Max scaling to the 'log_amount' column.
# NOTE(review): the scaler is fitted on every row of df, i.e. before the
# train/test split performed later in this notebook, so rows that end up in the
# test set influence the min/max used for scaling. Consider fitting on the
# training rows only.
df['log_amount_scaled'] = scaler.fit_transform(df[['log_amount']])

### Feature Dropping

In [None]:
# Drop 'transaction_id' (an identifier) and 'amount_range' (only used for EDA).
# errors='ignore' keeps this cell idempotent: df is reassigned here, so without it
# re-running the cell would raise KeyError because the columns are already gone.
df = df.drop(columns=['transaction_id', 'amount_range'], errors='ignore')


### One-Hot Encoding for Categorical Features


In [None]:
# Apply One-Hot Encoding to 'merchant_type' and 'device_type'.
# 'drop_first=True' drops one dummy column per feature to prevent multicollinearity;
# 'dtype=int' stores the dummies as 0/1 integers.
# The result goes to a new frame (df_encoded); df itself is left unchanged.
df_encoded = pd.get_dummies(df,columns=['merchant_type', 'device_type'], drop_first=True, dtype=int)

In [None]:
# Preview the first rows of the one-hot encoded DataFrame
df_encoded.head()

### Correlation Heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between all encoded columns
fig, ax = plt.subplots(figsize=(10, 6))
corr_matrix = df_encoded.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

## --- Model Training and Evaluation ---

In [None]:
# Define a function to prepare features (X) and target (y)

def prepare_features(df):
    """
    Split a DataFrame into a feature matrix and a target vector.

    Parameters
    ----------
    df : DataFrame containing at least 'amount', 'log_amount' and 'label'.

    Returns
    -------
    (X, y) : X is df without 'amount', 'log_amount' and 'label';
             y is the 'label' column.
    """
    target = df['label']
    # Raw and log amounts are excluded from the features, as is the target itself
    non_feature_columns = ['amount', 'log_amount', 'label']
    features = df.drop(columns=non_feature_columns)
    return features, target


In [None]:
# Define a function for hyperparameter tuning using GridSearchCV
def tune_model(model, param_grid, X_train, y_train, model_name="Model", scoring='f1', cv=5):
    """
    Tune a model's hyperparameters with an exhaustive grid search.

    Parameters
    ----------
    model : estimator to tune.
    param_grid : parameter grid passed to GridSearchCV (dict or list of dicts).
    X_train, y_train : data the cross-validated search is run on.
    model_name : label used in the printed summary.
    scoring : metric used to rank candidates (default 'f1', as before).
    cv : number of cross-validation folds or a CV splitter (default 5, as before).

    Returns
    -------
    The best estimator found by the search (GridSearchCV's best_estimator_).
    """
    # n_jobs=-1 runs the candidate fits in parallel on all available cores
    grid = GridSearchCV(model, param_grid, scoring=scoring, cv=cv, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    print(f"Best parameters for {model_name}: {grid.best_params_}")
    return grid.best_estimator_


In [None]:
# Define a function for stratified train-test splitting
def stratified_split(X, y, test_size=0.2, random_state=42):
    """
    Split features and target into train and test sets, preserving the class
    proportions of y in both parts (stratified split).

    Parameters
    ----------
    X, y : features and target to split.
    test_size : fraction of the data held out for testing (default 0.2).
    random_state : seed for the shuffle; the default 42 reproduces the
        previously hard-coded behaviour.

    Returns
    -------
    X_train, X_test, y_train, y_test (in that order).
    """
    return train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )


In [None]:
# Define a function to apply SMOTE for handling class imbalance
def apply_smote(X_train, y_train, random_state=42):
    """
    Oversample the minority class with SMOTE (Synthetic Minority Over-sampling
    Technique). Intended for training data only.

    Parameters
    ----------
    X_train, y_train : training features and target.
    random_state : seed for SMOTE; the default 42 reproduces the previously
        hard-coded behaviour.

    Returns
    -------
    (X_resampled, y_resampled) as returned by SMOTE.fit_resample.
    """
    smote = SMOTE(random_state=random_state)
    return smote.fit_resample(X_train, y_train)


In [None]:
# Define a function to evaluate a given model
def evaluate_model(model, X_test, y_test, threshold=0.5):
    """
    Evaluate a binary classifier on held-out data.

    Works with scikit-learn style estimators (via predict_proba or
    decision_function) and with models whose predict() returns the positive-class
    probability directly (e.g. a Keras model with a single sigmoid output).

    Parameters
    ----------
    model : fitted classifier.
    X_test, y_test : evaluation features and true 0/1 labels.
    threshold : probability cut-off for predicting the positive class
        (default 0.5, as before). Not used for decision_function scores, which
        are cut at 0 (see below).

    Returns
    -------
    dict with 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC' (floats),
    'ConfusionMatrix' (array) and 'ROC' (the (fpr, tpr, thresholds) tuple
    returned by roc_curve).
    """
    if hasattr(model, "predict_proba"):
        # Probability of the positive class (second column)
        y_score = model.predict_proba(X_test)[:, 1]
        y_pred = (y_score >= threshold).astype(int)
    elif hasattr(model, "decision_function"):
        # Margin-based models without predict_proba (e.g. SVC(probability=False)).
        # The margin is a signed distance whose class boundary is 0, so labels are
        # cut at 0. The previous min-max rescaling followed by a 0.5 cut moved the
        # boundary arbitrarily and divided by zero when all scores were equal.
        # AUC/ROC only depend on the ranking, so the raw margin is used for them.
        y_score = np.ravel(model.decision_function(X_test))
        y_pred = (y_score > 0).astype(int)
    else:
        # Assume predict() outputs the positive-class probability (Keras-style)
        y_score = np.ravel(model.predict(X_test))
        y_pred = (y_score >= threshold).astype(int)

    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'AUC': roc_auc_score(y_test, y_score),
        'ConfusionMatrix': confusion_matrix(y_test, y_pred),
        'ROC': roc_curve(y_test, y_score),
    }
    return metrics


In [None]:
# Define a function to plot the confusion matrix
def plot_confusion_matrix(cm, title):
    """
    Draw a confusion matrix as an annotated seaborn heatmap.

    cm    : confusion matrix of integer counts (rows = actual, columns = predicted).
    title : model name appended to the figure title.
    """
    fig, ax = plt.subplots(figsize=(4, 3))
    # fmt="d" prints the cell counts as integers
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_title(f"Confusion Matrix - {title}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.tight_layout()
    plt.show()


In [None]:
# Define a function to plot the ROC curve
def plot_roc_curve(fpr, tpr, model_name, auc_score):
    """
    Draw a ROC curve together with the chance diagonal.

    fpr, tpr   : false/true positive rates (as returned by roc_curve).
    model_name : used in the legend entry and the title.
    auc_score  : area under the curve, shown in the legend with two decimals.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    ax.plot(fpr, tpr, label=f"{model_name} (AUC = {auc_score:.2f})")
    # Dashed diagonal = performance of a random classifier
    ax.plot([0, 1], [0, 1], "k--")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title(f"ROC Curve - {model_name}")
    ax.legend()
    plt.tight_layout()
    plt.show()

In [None]:
# Define a function to run and evaluate multiple classification models
def run_all_models(X_train, X_test, y_train, y_test, models=None):
    """
    Train and evaluate a set of classifiers.

    For each model: fit on the training data, evaluate on the test data with
    evaluate_model, print the scalar metrics, and plot the confusion matrix and
    ROC curve.

    Parameters
    ----------
    X_train, y_train : training data.
    X_test, y_test : evaluation data.
    models : optional dict mapping a display name to an unfitted estimator.
        When omitted, the default set below is used (Logistic Regression,
        Decision Tree, Random Forest, KNN, SVM), as before.

    Returns
    -------
    dict mapping each model name to the metrics dict returned by evaluate_model.
    """
    if models is None:
        models = {
            'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
            'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=42),
            'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
            # KNN has no class_weight option; it relies on the caller's resampling
            'KNN': KNeighborsClassifier(),
            # probability=True gives SVC a predict_proba method for evaluate_model
            'SVM': SVC(probability=True, class_weight='balanced', random_state=42),
        }

    results = {}

    for name, model in models.items():
        print(f"\n--- Training {name} ---")
        model.fit(X_train, y_train)
        metrics = evaluate_model(model, X_test, y_test)
        results[name] = metrics

        print(f"\nüîç {name} Evaluation:")
        for k, v in metrics.items():
            # The matrix and the ROC arrays are plotted below, not printed
            if k not in ['ConfusionMatrix', 'ROC']:
                print(f"{k}: {v:.4f}")
        plot_confusion_matrix(metrics['ConfusionMatrix'], name)
        fpr, tpr, _ = metrics['ROC']
        plot_roc_curve(fpr, tpr, name, metrics['AUC'])

    return results

# --- Main execution flow for model training and evaluation ---

In [None]:
# Prepare features (X) and target (y) from the encoded DataFrame
X, y = prepare_features(df_encoded)

# Split the data into training and testing sets using stratified sampling
X_train, X_test, y_train, y_test = stratified_split(X, y)

# Re-compute 'log_amount_scaled' with a scaler fitted on the training rows only.
# The column built during preprocessing used min/max values taken from the whole
# dataset, which leaks information about the test rows into the features.
# Rows are matched through the index, which the split preserves from df_encoded.
train_scaler = MinMaxScaler().fit(df_encoded.loc[X_train.index, ['log_amount']])
X_train = X_train.assign(
    log_amount_scaled=train_scaler.transform(df_encoded.loc[X_train.index, ['log_amount']]).ravel()
)
# Test values may fall slightly outside [0, 1]; that is expected
X_test = X_test.assign(
    log_amount_scaled=train_scaler.transform(df_encoded.loc[X_test.index, ['log_amount']]).ravel()
)

# Apply SMOTE to the training data only, so the test set keeps its real class balance
X_train_sm, y_train_sm = apply_smote(X_train, y_train)

# Run and evaluate all defined classification models
results = run_all_models(X_train_sm, X_test, y_train_sm, y_test)



### Hyperparameter Tuning

In [None]:
# Tune Random Forest Classifier: 2 * 3 * 2 * 2 = 24 parameter combinations.
# NOTE(review): the search cross-validates on the SMOTE-resampled training set,
# so validation folds contain synthetic samples and the CV scores may be
# optimistic — consider resampling inside each fold instead (TODO confirm).
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
best_rf = tune_model(rf, rf_param_grid, X_train_sm, y_train_sm, model_name="Random Forest")


In [None]:
# Tune Support Vector Machine (SVM) Classifier.
# probability=True is kept so the tuned model exposes predict_proba for evaluation.
svm = SVC(probability=True, class_weight='balanced', random_state=42)

# 'gamma' only affects the rbf kernel here; the linear kernel ignores it. The grid
# is therefore split per kernel so each linear candidate is fitted once rather than
# once per gamma value. The set of distinct models searched is unchanged.
svm_param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]
best_svm = tune_model(svm, svm_param_grid, X_train_sm, y_train_sm, model_name="SVM")


### Evaluate Tuned Models

In [None]:
# Report test-set metrics and plots for the tuned Random Forest and SVM
tuned_models = {'Tuned Random Forest': best_rf, 'Tuned SVM': best_svm}

for name, model in tuned_models.items():
    print(f"\n--- Evaluating {name} ---")
    metrics = evaluate_model(model, X_test, y_test)

    print(f"\nüîß {name} Evaluation:")
    for k, v in metrics.items():
        # Non-scalar entries are plotted below instead of printed
        if k in ('ConfusionMatrix', 'ROC'):
            continue
        print(f"{k}: {v:.4f}")

    plot_confusion_matrix(metrics['ConfusionMatrix'], name)
    fpr, tpr, _ = metrics['ROC']
    plot_roc_curve(fpr, tpr, name, metrics['AUC'])



## --- Artificial Neural Network (ANN) ---

In [None]:
# Convert DataFrame to NumPy arrays for Keras.
# Training arrays come from the SMOTE-resampled training set; test arrays come
# from the original (non-resampled) test split.
X_train_ann = X_train_sm.to_numpy()
X_test_ann = X_test.to_numpy()
y_train_ann = y_train_sm.to_numpy()
y_test_ann = y_test.to_numpy()


In [None]:
# Build the ANN layer by layer: 32 -> dropout -> 16 -> 1
n_features = X_train_ann.shape[1]  # one input per feature column

ann = Sequential()
ann.add(Dense(32, input_dim=n_features, activation='relu'))  # first dense layer, takes the raw features
ann.add(Dropout(0.3))  # drops 30% of activations during training to limit overfitting
ann.add(Dense(16, activation='relu'))  # hidden layer
ann.add(Dense(1, activation='sigmoid'))  # single sigmoid unit: output in (0, 1) for binary classification


In [None]:
# Compile the ANN model: attaches the optimizer, the loss and the metric to report
ann.compile(
    optimizer='adam', # Adam optimizer
    loss='binary_crossentropy', # Binary cross-entropy loss for binary classification
    metrics=['accuracy'] # Monitor accuracy during training
)



In [None]:
# Define Early Stopping callback to prevent overfitting: monitors the validation
# loss with a patience of 5 epochs, and restore_best_weights=True asks Keras to
# roll the model back to the weights of the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [None]:
# Train the ANN model
print("\n--- Training Artificial Neural Network (ANN) ---")

# Hold out a shuffled, stratified 20% of the training data for validation.
# Keras' validation_split takes the LAST rows of the arrays without shuffling.
# The training arrays come from SMOTE, which (in imblearn, verify for your
# version) appends the synthetic minority rows after the original ones, so that
# tail would consist of synthetic fraud samples only. val_loss, and therefore
# early stopping, would be measured on a single class.
X_fit_ann, X_val_ann, y_fit_ann, y_val_ann = train_test_split(
    X_train_ann, y_train_ann,
    test_size=0.2,
    stratify=y_train_ann,
    random_state=42,
)

history = ann.fit(
    X_fit_ann, y_fit_ann,
    validation_data=(X_val_ann, y_val_ann), # Explicit, class-balanced validation set
    epochs=50, # Maximum number of epochs
    batch_size=32, # Batch size for training
    callbacks=[early_stop], # Apply early stopping
    verbose=1 # Show training progress
)


In [None]:
# Get predicted probabilities from the ANN model; flatten() turns the model
# output into a 1-D array with one score per test row.
# NOTE(review): evaluate_model_keras (defined below) calls predict() again itself;
# this array is only consumed by the thresholding cell that follows.
y_pred_prob_ann = ann.predict(X_test_ann).flatten()


In [None]:
# Convert probabilities to binary predictions using a 0.5 threshold.
# NOTE(review): y_pred_ann is not referenced by any later cell visible in this
# notebook — keep only if it is needed for ad-hoc inspection.
y_pred_ann = (y_pred_prob_ann >= 0.5).astype(int)

In [None]:
# Keras-specific evaluation helper.
# This is a separate function (it does not replace evaluate_model): it always
# takes the positive-class probability straight from model.predict().
def evaluate_model_keras(model, X_test, y_test, threshold=0.5):
    """
    Evaluate a Keras binary classifier with a single probability output.

    Parameters
    ----------
    model : fitted model whose predict() returns one probability per row.
    X_test, y_test : evaluation features and true 0/1 labels.
    threshold : probability cut-off for predicting the positive class
        (default 0.5, matching the previously hard-coded value).

    Returns
    -------
    dict with 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC' (floats),
    'ConfusionMatrix' (array) and 'ROC' (the (fpr, tpr, thresholds) tuple
    returned by roc_curve).
    """
    # flatten() gives a 1-D score vector, as the metric functions expect
    y_proba = model.predict(X_test).flatten()
    y_pred = (y_proba >= threshold).astype(int)

    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        # AUC and the ROC curve use the continuous scores, not the hard labels
        'AUC': roc_auc_score(y_test, y_proba),
        'ConfusionMatrix': confusion_matrix(y_test, y_pred),
        'ROC': roc_curve(y_test, y_proba),
    }
    return metrics


In [None]:
# Evaluate the ANN model on the held-out test arrays; the returned dict holds
# the scalar metrics plus the confusion matrix and ROC data used by the cells below
print("\n--- ANN Evaluation ---")
metrics_ann = evaluate_model_keras(ann, X_test_ann, y_test_ann)

In [None]:
# Print the scalar ANN metrics; the confusion matrix and ROC data are plotted separately
scalar_metrics = {
    key: value
    for key, value in metrics_ann.items()
    if key not in ('ConfusionMatrix', 'ROC')
}
for metric_name, metric_value in scalar_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Plot Confusion Matrix for ANN (test-set counts computed by evaluate_model_keras)
plot_confusion_matrix(metrics_ann['ConfusionMatrix'], "ANN")


In [None]:
# Plot ROC Curve for ANN; the third element of the ROC tuple (the thresholds)
# is not needed for the plot and is discarded
fpr, tpr, _ = metrics_ann['ROC']
plot_roc_curve(fpr, tpr, "ANN", metrics_ann['AUC'])


## Model Performance & Conclusion

This section summarizes the performance of various classification models on the fraud detection task, highlighting challenges and offering insights for future improvements.

---

## Model Evaluation Metrics Overview

The following table presents key evaluation metrics for all trained models, including those after hyperparameter tuning.

> ⚠️ **Note**: Due to the highly imbalanced nature of the dataset (95% non-fraud, 5% fraud), **Accuracy** is a misleading metric. Instead, **Precision**, **Recall**, **F1-Score**, and **AUC** are critical for assessing fraud detection effectiveness.

| Model                 | Accuracy | Precision | Recall | F1-Score | AUC    |
|----------------------|----------|-----------|--------|----------|--------|
| Logistic Regression  | 0.4200   | 0.0431    | 0.5000 | 0.0794   | 0.4100 |
| Decision Tree        | 0.6850   | 0.0656    | 0.4000 | 0.1127   | 0.5500 |
| Random Forest        | 0.6900   | 0.0667    | 0.4000 | 0.1143   | 0.4589 |
| KNN                  | 0.6550   | 0.0462    | 0.3000 | 0.0800   | 0.4024 |
| SVM                  | 0.6000   | 0.0139    | 0.1000 | 0.0244   | 0.4295 |
| Tuned Random Forest  | 0.6550   | 0.0462    | 0.3000 | 0.0800   | 0.4650 |
| Tuned SVM            | 0.6150   | 0.0282    | 0.2000 | 0.0494   | 0.3937 |
| ANN                  | 0.9500   | 0.0000    | 0.0000 | 0.0000   | 0.4416 |

---

## Analysis of Model Performance & Why Models Struggled

### Dominant Impact of Class Imbalance

- Despite using techniques like **SMOTE** and `class_weight='balanced'`, the **extreme imbalance** (95% non-fraud vs. 5% fraud) severely affected test performance.
- Models show **very low F1-Scores**, reflecting poor trade-offs between precision and recall for fraud cases.
- The **ANN's high accuracy (0.95)** but zero precision and recall is a classic sign of a model always predicting the majority class, missing fraud completely.
- This underscores why **accuracy is a poor metric** for imbalanced datasets.

### Insufficiently Discriminative Features

- The primary limitation lies in the lack of **strong predictive signals** in the current features.

#### `log_amount_scaled` Distribution Overlap

- Visualizations revealed **significant overlap** between fraud and non-fraud transactions.
- Even after transformation, transaction amount doesn't serve as a clear signal.

#### Limited Categorical Feature Impact

- Features like `merchant_type` and `device_type` showed **minor fraud rate differences** during EDA.
- The "Others" category in `merchant_type` had a higher fraud rate but lacked specificity.

### Limitations of Current Approach

- **SMOTE**: Creates synthetic samples from existing minority class points. If these points are not informative, synthetic data can be **noisy** or lead to **overfitting**.
- **Model Limitations**: Both simple (Logistic Regression, SVM-linear) and complex models (Random Forest, ANN, RBF-SVM) failed, suggesting the issue is **data quality**, not model choice.

---

## Future Recommendations for Improvement

To improve fraud detection, the focus should shift toward **richer, more discriminative features**.

### Rich Feature Engineering

- **Time-Based Features** (if timestamp available):
  - `time_of_day`, `day_of_week`, `transaction_frequency_per_user`

- **Behavioral Features**:
  - Average transaction amount over time windows (e.g., last 1hr, 24hr)
  - Number of unique merchants/devices used recently
  - Ratio of current amount to user’s historical average
  - Indicators of new or unusual locations

### External Data Integration

- Use **blacklists**, or public data about **high-risk merchant categories**.

### Anomaly Detection Techniques

- Shift from supervised learning to:
  - **Isolation Forest**
  - **One-Class SVM**
  - **Autoencoders**
- Better suited for **rare outliers** like fraud in imbalanced data.

### Deep Dive into Feature Interactions

- Explore **feature combinations** or interactions that may highlight subtle fraud patterns not visible in isolation.

---

> By focusing on feature quality and exploring alternative detection strategies, there is significant potential to improve fraud detection on challenging, imbalanced datasets.

In [None]:
# --- For Numerical Feature: log_amount_scaled ---
# Overlay the per-class density of the scaled log amount to see how much the
# fraud and non-fraud distributions overlap.
fig, ax = plt.subplots(figsize=(10, 6))

class_styles = [
    (0, 'blue', 'Non-Fraud (0)'),
    (1, 'red', 'Fraud (1)'),
]
for class_value, class_color, legend_label in class_styles:
    class_values = df_encoded[df_encoded['label'] == class_value]['log_amount_scaled']
    # stat='density' normalises each class separately, so the small fraud class stays visible
    sns.histplot(class_values, color=class_color, label=legend_label, kde=True, stat='density', alpha=0.6, common_norm=False, ax=ax)

ax.set_title('Distribution of log_amount_scaled by Fraud Status')
ax.set_xlabel('Scaled Log Amount')
ax.set_ylabel('Density')
ax.legend()
plt.show()

