In [7]:
import pandas as pd
import lightgbm as lgb
import shap
from sklearn.metrics import accuracy_score

In [8]:

# Step 1: Load train and test data
train_path = "../../data/processed/train_data.csv"  # Replace with your train file path
test_path = "../../data/processed/test_data.csv"    # Replace with your test file path

# Assuming the target column is named 'target'
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Split features and target
X_train = train_data.drop(columns=["target"])  # Replace "target" with the actual name
y_train = train_data["target"]

X_test = test_data.drop(columns=["target"])
y_test = test_data["target"]


In [9]:
# Step 2: Train an initial LightGBM model
lgbm_model = lgb.LGBMClassifier(random_state=42)
lgbm_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 416, number of negative: 430
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028526 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 191576
[LightGBM] [Info] Number of data points in the train set: 846, number of used features: 754
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.491726 -> initscore=-0.033100
[LightGBM] [Info] Start training from score -0.033100


In [10]:
# Step 3: Compute SHAP values
explainer = shap.TreeExplainer(lgbm_model)
shap_values = explainer.shap_values(X_train)[1]  # For binary classification, class 1



In [11]:
# Calculate mean absolute SHAP values
mean_shap_values = abs(shap_values).mean(axis=0)
shap_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Mean_SHAP_Value': mean_shap_values
}).sort_values(by='Mean_SHAP_Value', ascending=False)

print("Top Features by SHAP Importance:\n", shap_importance)

Top Features by SHAP Importance:
     Feature  Mean_SHAP_Value
753     753         0.019974
0         0         0.019974
737     737         0.019974
736     736         0.019974
735     735         0.019974
..      ...              ...
6         6         0.019974
5         5         0.019974
4         4         0.019974
3         3         0.019974
2         2         0.019974

[754 rows x 2 columns]


In [12]:
# Visualize SHAP summary plot
shap.summary_plot(shap_values, X_train, feature_names=X_train.columns)

AssertionError: Summary plots need a matrix of shap_values, not a vector.

In [None]:
# Step 4: Select top features based on SHAP
top_n = 10  # Adjust this number based on your analysis
selected_features = shap_importance['Feature'].iloc[:top_n].tolist()

In [None]:
# Filter datasets to only use selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

In [None]:
# Step 5: Retrain LightGBM with selected features
lgbm_model_selected = lgb.LGBMClassifier(random_state=42)
lgbm_model_selected.fit(X_train_selected, y_train)

In [None]:
# Step 6: Evaluate the final model
y_pred = lgbm_model_selected.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print("\nFinal Model Accuracy with Selected Features:", accuracy)