# Train model

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

# --- Configuration ---
# Everything a reader might want to tune for this run lives here.
FILE_NAME = 'data/dataset.csv'       # CSV path, resolved relative to the working directory
TARGET_COLUMN = 'target_variable'    # Name of the label column in the CSV
RANDOM_STATE = 42                    # Shared seed for the split and the forest
TEST_SIZE = 0.2                      # Fraction of rows held out for testing

print("--- Starting GTM ML Explainability Analysis (80/20 Split) ---")

# 1. Data Loading and Preparation
try:
    df = pd.read_csv(FILE_NAME)
except FileNotFoundError as err:
    # Raise instead of calling exit(): the failure then shows up as a traceback on
    # this cell and the notebook session stays usable for a retry.
    raise FileNotFoundError(
        f"The file '{FILE_NAME}' was not found. "
        "Please check that the path is correct relative to the working directory."
    ) from err

# Drop the 'id' column as it's an identifier, not a feature
X = df.drop(columns=['id', TARGET_COLUMN])
y = df[TARGET_COLUMN]

# Split into train/test sets; stratify on the label so both parts keep the
# same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# Percent labels are derived from TEST_SIZE so they cannot drift from the config.
print(f"\nTraining set size: {X_train.shape[0]} samples ({1 - TEST_SIZE:.0%})")
print(f"Testing set size: {X_test.shape[0]} samples ({TEST_SIZE:.0%})")
print("-" * 50)

# 2. Model Training: Random Forest Classifier
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # Weight classes inversely to their frequency
    random_state=RANDOM_STATE,
    n_jobs=-1  # Use all available cores
)

print("Training Random Forest Classifier...")
model.fit(X_train, y_train)
print("Training complete.")
print("-" * 50)

# 3. Model Evaluation
y_pred = model.predict(X_test)
# Column 1 of predict_proba is taken as the positive-class probability
# (assumes a binary target -- TODO confirm against the dataset).
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Calculate Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)


print("## Model Performance Metrics (80/20 Split)")
print(f"Accuracy:  {accuracy:.10f}")
print(f"Precision: {precision:.10f}")
print(f"Recall:    {recall:.10f}")
print(f"F1 Score:  {f1:.10f}  <-- Key metric for business classification")
print(f"ROC AUC:   {roc_auc:.10f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("-" * 50)

# 4. Explainability (SHAP Analysis)

print("## SHAP Analysis for Explainability")

  from .autonotebook import tqdm as notebook_tqdm


--- Starting GTM ML Explainability Analysis (80/20 Split) ---

Training set size: 28719 samples (80%)
Testing set size: 7180 samples (20%)
--------------------------------------------------
Training Random Forest Classifier...
Training complete.
--------------------------------------------------
## Model Performance Metrics (80/20 Split)
Accuracy:  0.8435933148
Precision: 0.8311306902
Recall:    0.8377738307
F1 Score:  0.8344390388  <-- Key metric for business classification
ROC AUC:   0.9312661934

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      3802
           1       0.83      0.84      0.83      3378

    accuracy                           0.84      7180
   macro avg       0.84      0.84      0.84      7180
weighted avg       0.84      0.84      0.84      7180

--------------------------------------------------
## SHAP Analysis for Explainability


## Save model

In [3]:
import joblib

# Serialize the fitted `model` object to disk, then confirm the location.
joblib.dump(value=model, filename='rootForest.pkl')
print('Model saved to rootForest.pkl')

Model saved to rootForest.pkl


# SHAP analysis

In [None]:
# Build a tree-specific SHAP explainer for the fitted random forest.
explainer = shap.TreeExplainer(model)

# Calculate SHAP values on a sample of the test data for faster computation
# Using a sample size up to 5000 from the test set.
X_sample = X_test.sample(n=min(5000, X_test.shape[0]), random_state=RANDOM_STATE)
raw_shap_values = explainer.shap_values(X_sample.to_numpy())

# `shap_values()` returns plain arrays, not an Explanation object, so there is
# no `.values` attribute to read. For a classifier the result is per-class and
# its layout depends on the SHAP version: either a list with one
# (n_samples, n_features) array per class, or a single array with a trailing
# class axis. Reduce it to one 2-D array for class index 1, the same column
# the evaluation step takes from predict_proba.
if isinstance(raw_shap_values, list):
    shap_values = np.asarray(raw_shap_values[1] if len(raw_shap_values) > 1 else raw_shap_values[0])
else:
    shap_values = np.asarray(raw_shap_values)
    if shap_values.ndim == 3:
        shap_values = shap_values[:, :, 1]

# --- Plot 1: SHAP Summary Plot (Feature Importance and Impact) ---
plt.figure(figsize=(10, 6))
# This plot summarizes feature importance globally.
shap.summary_plot(shap_values, X_sample, show=False)
plt.title("SHAP Summary Plot (80/20 Split)", fontsize=16)
plt.tight_layout()
summary_plot_path = 'rf_shap_summary_plot_80_20.png'
plt.savefig(summary_plot_path, bbox_inches='tight')
print(f"SHAP Summary Plot saved to: {summary_plot_path}")
plt.close()

# --- Plot 2: SHAP Dependence Plot (Feature Interaction) ---

# Rank features by mean absolute SHAP value. Column names come from X_sample,
# the frame the SHAP values were computed on, so the two stay aligned.
shap_df = pd.DataFrame({
    'feature': X_sample.columns,
    'mean_abs_shap': np.abs(shap_values).mean(axis=0)
}).sort_values(by='mean_abs_shap', ascending=False)

most_important_feature = shap_df.iloc[0]['feature']
print(f"Most important feature (for Dependence Plot): {most_important_feature}")

# This plot shows how the most important feature affects the prediction.
plt.figure(figsize=(10, 6))
shap.dependence_plot(
    most_important_feature,
    shap_values,
    X_sample,
    interaction_index='auto', # Automatically finds the best feature to show interaction
    show=False
)
plt.title(f"SHAP Dependence Plot for: {most_important_feature} (80/20 Split)", fontsize=16)
plt.tight_layout()
dependence_plot_path = 'rf_shap_dependence_plot_80_20.png'
plt.savefig(dependence_plot_path, bbox_inches='tight')
print(f"SHAP Dependence Plot saved to: {dependence_plot_path}")
plt.close()

print("\n--- Script finished successfully ---")

 # SHAP (not working)


In [1]:
# Progress message only; this cell does not itself create the explainer.
print("Loading SHAP explainer...")

Loading SHAP explainer...


In [2]:
# Build a SHAP TreeExplainer for `model`. Both `shap` and `model` must already
# exist in the kernel namespace; the NameError recorded under this cell shows
# that `shap` had not been imported in the session that ran it.
explainer = shap.TreeExplainer(model)

NameError: name 'shap' is not defined

In [4]:

# Status message for the explainer step.
print("Explainer loaded.")

# Draw a seeded random subset of the test rows, capped at 5000 rows.
sample_size = min(5000, len(X_test))
X_sample = X_test.sample(n=sample_size, random_state=RANDOM_STATE)


Explainer loaded.


In [None]:

print("Sampled test data loaded.")

# Explain the sampled test rows, not the training set: the plotting code that
# follows pairs these values with X_sample, so the rows must match one-to-one.
explainer = shap.explainers.Tree(model)
shap_values = explainer(X_sample)

# For binary classification: choose the positive class.
# The explainer returns an Explanation object; its `.values` array carries a
# trailing per-class axis for a classifier, so take class index 1 to get a
# 2-D (n_samples, n_features) array. A 2-D result is used unchanged.
shap_class1 = np.asarray(shap_values.values)
if shap_class1.ndim == 3:
    shap_class1 = shap_class1[:, :, 1]


In [None]:
# --- Plot 1: summary ---
# Global view of the SHAP values in `shap_class1` for the rows in `X_sample`.
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_class1, X_sample, show=False)
plt.savefig("rf_shap_summary_plot_80_20.png", bbox_inches='tight')
print("SHAP Summary Plot saved to: rf_shap_summary_plot_80_20.png")
plt.close()

# --- Plot 2: dependence plot ---
# Rank features by mean absolute SHAP value, largest first.
mean_abs_by_feature = np.abs(shap_class1).mean(axis=0)
shap_df = pd.DataFrame(
    {'feature': X_sample.columns, 'mean_abs_shap': mean_abs_by_feature}
)
shap_df = shap_df.sort_values(by='mean_abs_shap', ascending=False)

# The top-ranked feature drives the dependence plot.
most_important_feature = shap_df['feature'].iloc[0]

plt.figure(figsize=(10, 6))
shap.dependence_plot(
    most_important_feature,
    shap_class1,
    X_sample,
    interaction_index='auto',
    show=False,
)
plt.savefig("rf_shap_dependence_plot_80_20.png", bbox_inches='tight')
plt.close()


# LIME