In [1]:
import pandas as pd
import numpy as np
import sys
import os
import joblib 
import shap
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sys.path.append(os.path.abspath('../scripts'))

import preprocess as preprocessing
import feature_engineering as engineering
import model_training as train
import visualize 

In [3]:
# --- Data Preparation ---
credit_df = preprocessing.load_data('../data/creditcard.csv')
credit_df = preprocessing.clean_credit_card_data(credit_df)
X_credit = credit_df.drop('Class', axis=1)
y_credit = credit_df['Class']
X_credit_train, X_credit_test, y_credit_train, y_credit_test = train.split_data(X_credit, y_credit)

In [4]:
# Apply scaling and SMOTE as in Task 2
scaler = preprocessing.StandardScaler()
X_credit_train[['Time', 'Amount']] = scaler.fit_transform(X_credit_train[['Time', 'Amount']])
X_credit_test[['Time', 'Amount']] = scaler.transform(X_credit_test[['Time', 'Amount']])
X_credit_train_res, y_credit_train_res = preprocessing.handle_imbalance(X_credit_train, y_credit_train)

In [5]:
# --- Retrain the Best Model (Random Forest) ---
rf_credit_model = train.train_random_forest(X_credit_train_res, y_credit_train_res)
print("Credit Card Fraud Model (Random Forest) is ready.")

Credit Card Fraud Model (Random Forest) is ready.


In [8]:
print("\n--- SHAP Summary Plot for Credit Card Fraud Model ---")

# Use the sampling strategy to ensure speed
if len(X_credit_test) > 2000:
    X_credit_test_sample = X_credit_test.sample(n=2000, random_state=42)
else:
    X_credit_test_sample = X_credit_test
    
credit_shap_values = visualize.generate_summary_plot(
    rf_credit_model,
    X_credit_test_sample,
    feature_names=X_credit_test_sample.columns,
    plot_title="Global Feature Importance for Credit Card Fraud (on 2000 Samples)",
    check_additivity=False # Setting to False is recommended for speed
)




--- SHAP Summary Plot for Credit Card Fraud Model ---
Calculating SHAP values for 2000 samples. This may take a moment...


TypeError: TreeExplainer.__init__() got an unexpected keyword argument 'check_additivity'

In [None]:
# --- Interpretation of the Summary Plot ---
print("""
**Interpretation of the Credit Card Fraud Summary Plot:**

The SHAP summary plot reveals the most significant features driving the model's predictions for credit card fraud.
- **Top Features:** Features like `V14`, `V12`, `V10`, and `V17` are the most impactful. This is expected, as these PCA-transformed features are designed to capture underlying transactional patterns.
- **Impact Direction:**
  - For `V14`, `V12`, and `V10`, lower values (indicated by the blue color) have a high positive SHAP value, meaning they strongly push the model's prediction towards fraud (Class 1).
  - For `V17`, higher values (red color) are associated with a higher risk of fraud.
- **Business Insight:** This tells us that specific, anonymized patterns in the transaction data are extremely predictive of fraud. While we don't know the exact nature of these 'V' features, we can confirm that the model has learned to identify high-risk anomalies based on them.
""")


In [None]:
# --- Local Prediction Explanation (Force Plot) ---
print("\n--- SHAP Force Plot for a Single Fraudulent Transaction ---")
# Find an actual fraudulent transaction in the test set
fraudulent_indices = y_credit_test[y_credit_test == 1].index
if not fraudulent_indices.empty:
    visualize.generate_force_plot(
        rf_credit_model,
        X_credit_test,
        instance_index=fraudulent_indices[0],
        plot_title=f"Force Plot for a True Positive Fraud Case (Index: {fraudulent_indices[0]})"
    )

In [None]:
# --- Interpretation of the Force Plot ---
print("""
**Interpretation of the Force Plot:**
The force plot above breaks down a single prediction.
- **Base Value:** This is the average prediction over the entire dataset.
- **Driving Features:** Features shown in **red** (like `V14`, `V10`) are pushing the prediction higher (towards fraud). Features in **blue** are pushing it lower.
- **Final Prediction:** The combination of these forces results in the final prediction score. For this fraudulent case, we can see that the low values of several 'V' features were the primary contributors to the model flagging it as fraud, overcoming the features that suggested it was a legitimate transaction.
""")

In [None]:
# --- Data Preparation ---
fraud_df = preprocessing.load_data('../data/Fraud_Data.csv')
fraud_df = preprocessing.clean_fraud_data(fraud_df)
fraud_df = engineering.create_time_features(fraud_df)
fraud_df = engineering.time_since_signup(fraud_df)
fraud_df = fraud_df.drop(['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address'], axis=1)
X_fraud = fraud_df.drop('class', axis=1)
y_fraud = fraud_df['class']

In [None]:
categorical_features = ['source', 'browser', 'sex']
numerical_features = ['purchase_value', 'age', 'hour_of_day', 'day_of_week', 'time_since_signup']
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train.split_data(X_fraud, y_fraud)

In [None]:
# --- Retrain the Best Model (Random Forest Pipeline) ---
rf_fraud_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessing.ColumnTransformer(transformers=[
        ('num', preprocessing.StandardScaler(), numerical_features),
        ('cat', preprocessing.OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)])),
    ('classifier', train.RandomForestClassifier(random_state=42, n_estimators=100))
])
rf_fraud_pipeline.fit(X_fraud_train, y_fraud_train)
print("\nE-commerce Fraud Model (Random Forest Pipeline) is ready.")

In [None]:
# --- Global Feature Importance (Summary Plot) ---
processed_feature_names = (numerical_features +
                           rf_fraud_pipeline.named_steps['preprocessor']
                           .named_transformers_['cat']
                           .get_feature_names_out(categorical_features).tolist())


In [None]:
# Transform the test data to get the features the model actually sees
X_fraud_test_processed = pd.DataFrame(
    rf_fraud_pipeline.named_steps['preprocessor'].transform(X_fraud_test),
    columns=processed_feature_names,
    index=X_fraud_test.index
)

In [None]:
print("\n--- SHAP Summary Plot for E-commerce Fraud Model ---")
visualize.generate_summary_plot(
    rf_fraud_pipeline.named_steps['classifier'],
    X_fraud_test_processed,
    feature_names=processed_feature_names,
    plot_title="Global Feature Importance for E-commerce Fraud"
)

In [None]:
# --- Interpretation of the Summary Plot ---
print("""
**Interpretation of the E-commerce Fraud Summary Plot:**

This plot shows what drives fraud detection in our e-commerce dataset.
- **Top Feature:** The most dominant feature is `time_since_signup`. The plot clearly shows that low values (blue dots) have very high SHAP values, indicating that a **short duration between signing up and making a purchase is the single biggest predictor of fraud**.
- **Other Key Features:**
  - `purchase_value`: High purchase values are also associated with a higher risk of fraud.
  - `age`: Younger users (lower age) seem to have a slightly higher fraud risk.
- **Business Insight:** This provides a direct, actionable insight for Adey Innovations. Transactions from newly created accounts should be subject to higher scrutiny. The fraud team can now focus their efforts on this specific user segment, potentially implementing additional verification steps for them.
""")