In [None]:
pip install PyALE imblearn lime matplotlib numpy pandas seaborn shap scikit-learn --quiet

# LIBRARIES

In [None]:
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the original CSV file
df = pd.read_csv("keypoints_per_frame.csv")

# Binarise the target: 'Pull Shot' -> 1, every other label -> 0
df["label"] = (df["label"] == "Pull Shot").astype("int64")

# Save the binarised CSV (optional). Create the output folder first so the
# cell also works on a fresh checkout where the folder does not exist yet.
binary_csv_path = Path("Keypoint datasets/pull_shot_keypoints_per_frame.csv")
binary_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(binary_csv_path, index=False)

# Prepare features and labels: everything except the target and the
# "video" / "frame" columns is used as a model input
X = df.drop(columns=["label", "video", "frame"])
y = df["label"]
print(df["label"].value_counts())

# Preview the result. A bare expression is only rendered when it is the last
# statement of the cell, so it lives at the end.
df.head()

# LOAD DATASET AND PREPARE DATA

In [None]:
# Step 4: Split the dataset 80/20 with a fixed seed.
# stratify=y keeps the positive/negative ratio identical in both splits, which
# matters because the one-vs-rest label built above is typically imbalanced.
# NOTE(review): rows look like per-frame samples and the "video" column is
# dropped from X; a row-level split can place frames of the same video in both
# train and test. Consider a group-aware split keyed on video - TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# RF TRAINING

In [None]:
# Fit a 100-tree random forest on the training split (fit() returns the
# estimator itself, so construction and fitting can be chained)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the held-out split
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)

# Overall accuracy, followed by the per-class precision / recall / F1 table
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


# FEATURE IMPORTANCE

In [None]:
# Impurity-based importances of the fitted forest, one value per input column
importances = rf.feature_importances_
indices = importances.argsort()[::-1]  # column positions, most important first
top_n = 51  # number of bars to draw; the slices below tolerate fewer columns

# Names and scores of the top-ranked features, in plotting order
top_features = [X.columns[i] for i in indices[:top_n]]
top_importances = importances[indices[:top_n]]

# Explicit figure/axes instead of the implicit pyplot state; the figure is
# tall so that every feature label stays readable
fig, ax = plt.subplots(figsize=(12, 30))
sns.barplot(x=top_importances, y=top_features, ax=ax)

# Annotate each bar with its importance value, slightly right of the bar end
for row, value in enumerate(top_importances):
    ax.text(value + 0.001, row, f"{value:.4f}", va="center")

ax.set_title("Feature Importances")
ax.set_xlabel("Importance (mean decrease in impurity)")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()


# PERMUTATION IMPORTANCE

In [None]:
from sklearn.inspection import permutation_importance

# Shuffle each test-set column 10 times (seeded) and record the score change
result = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42)

# Print the ten features with the largest mean importance, best first
mean_importance = result.importances_mean
sorted_idx = mean_importance.argsort()[::-1]
for feature_pos in sorted_idx[:10]:
    print(f"{X_test.columns[feature_pos]}: {mean_importance[feature_pos]:.4f}")

# SHAP

In [None]:
# Adding SHAP to the RF classifier

# Create the SHAP explainer for the Random Forest model
explainer = shap.TreeExplainer(rf)

# Calculate SHAP values for the test set
shap_values = explainer.shap_values(X_test)

# Older SHAP releases return a list with one (n_samples, n_features) array per
# class, newer ones a single (n_samples, n_features, n_classes) array. Every
# later cell indexes shap_values[:, :, 1], so normalise to the 3-D layout here.
if isinstance(shap_values, list):
    shap_values = np.stack(shap_values, axis=-1)

print(shap_values.shape)



In [None]:
shap.initjs()

# Global view: SHAP summary plot of the class-1 contributions over the test set
shap.summary_plot(shap_values[:, :, 1], X_test)

# Test-set row explained by the two local plots below
sample_idx = 5

# Force plot for the chosen prediction. The default JavaScript plot is only
# rendered when it is the last expression of a cell; here it sits mid-cell and
# its return value was discarded, so nothing was shown. matplotlib=True draws
# the plot immediately instead.
shap.plots.force(
    explainer.expected_value[1],
    shap_values[sample_idx, :, 1],
    X_test.iloc[sample_idx],
    matplotlib=True,
)

# Waterfall plot for the same prediction
shap.plots.waterfall(
    shap.Explanation(
        values=shap_values[sample_idx, :, 1],
        base_values=explainer.expected_value[1],
        data=X_test.iloc[sample_idx],
        feature_names=X_test.columns.tolist()
    )
)

# TOP CONFIDENT PREDICTIONS

In [None]:
# Class-1 probability for every test sample
probs = rf.predict_proba(X_test)[:, 1]

# Confidence is measured as the distance of the probability from 0.5;
# keep the N samples that lie furthest from it
N = 5
distance_from_half = np.abs(probs - 0.5)
top_indices = np.argsort(-distance_from_half)[:N]

# One waterfall plot (class-1 SHAP values) per selected sample
feature_names = X_test.columns.tolist()
for idx in top_indices:
    print(f"Waterfall for test sample #{idx} (prob = {probs[idx]:.2f})")
    explanation = shap.Explanation(
        values=shap_values[idx, :, 1],
        base_values=explainer.expected_value[1],
        data=X_test.iloc[idx],
        feature_names=feature_names,
    )
    shap.plots.waterfall(explanation)

# TOP POSITIVE PREDICTIONS

In [None]:
# Class-1 probabilities and hard class labels for the test set
probs = rf.predict_proba(X_test)[:, 1]
preds = rf.predict(X_test)

# Positions of the samples that the model assigns to class 1
class_1_indices = np.flatnonzero(preds == 1)

# Confidence of those predictions: distance of the probability from 0.5
confidences = np.abs(probs[class_1_indices] - 0.5)

# The N most confident class-1 predictions, most confident first
N = 10
ranking = np.argsort(-confidences)[:N]
top_indices = class_1_indices[ranking]

# One waterfall plot (class-1 SHAP values) per selected sample
feature_names = X_test.columns.tolist()
for idx in top_indices:
    print(f"Waterfall for test sample #{idx} (prob = {probs[idx]:.2f})")
    explanation = shap.Explanation(
        values=shap_values[idx, :, 1],
        base_values=explainer.expected_value[1],
        data=X_test.iloc[idx],
        feature_names=feature_names,
    )
    shap.plots.waterfall(explanation)


# TOP NEGATIVE PREDICTIONS

In [None]:
# Get predicted probabilities and labels
probs = rf.predict_proba(X_test)[:, 1]       # Probability of class 1
preds = rf.predict(X_test)                   # Predicted class labels

# Keep only indices where the predicted class is 0 (negative predictions).
# A dedicated name is used so the class-1 indices computed in the previous
# cell are not silently overwritten with class-0 positions.
class_0_indices = np.where(preds == 0)[0]

# Compute confidence as distance from 0.5 (for class 0 this means the
# lowest class-1 probabilities rank first)
confidences = np.abs(probs[class_0_indices] - 0.5)

# Get top N class 0 predictions by confidence
N = 10
top_indices = class_0_indices[np.argsort(-confidences)[:N]]

# Plot SHAP waterfall for each. The plotted values are still the class-1 SHAP
# values, i.e. they show what pushed the class-1 probability down.
for idx in top_indices:
    print(f"Waterfall for test sample #{idx} (prob = {probs[idx]:.2f})")
    shap.plots.waterfall(
        shap.Explanation(
            values=shap_values[idx, :, 1],  # Class 1 SHAP values
            base_values=explainer.expected_value[1],
            data=X_test.iloc[idx],
            feature_names=X_test.columns.tolist()
        )
    )


# MOST INFLUENTIAL FEATURE

In [None]:
# Step 1: class-1 slice of the SHAP values, one row per test sample
shap_class1 = shap_values[:, :, 1]

# Step 2: the most influential feature is the one with the largest
# mean absolute SHAP value across all test samples
mean_abs_shap = np.abs(shap_class1).mean(axis=0)
top_feature_index = np.argmax(mean_abs_shap)
top_feature_name = X_test.columns[top_feature_index]

print(f"Top contributing feature: {top_feature_name}")

# Step 3: the 10 samples in which that feature has the largest SHAP value
# in absolute terms (positive or negative contribution)
top_feature_magnitude = np.abs(shap_class1[:, top_feature_index])
top_10_indices = np.argsort(-top_feature_magnitude)[:10]

# Step 4: for each of those samples print the class-1 probability and the
# feature's signed SHAP value, then draw the full waterfall
feature_names = X_test.columns.tolist()
for idx in top_10_indices:
    prob = rf.predict_proba(X_test.iloc[[idx]])[0, 1]
    shap_val = shap_class1[idx, top_feature_index]
    print(f"Sample #{idx} — Prob(class 1): {prob:.2f}, SHAP[{top_feature_name}] = {shap_val:.3f}")

    explanation = shap.Explanation(
        values=shap_class1[idx],
        base_values=explainer.expected_value[1],
        data=X_test.iloc[idx],
        feature_names=feature_names,
    )
    shap.plots.waterfall(explanation)


# GROUPED SHAP BY JOINT

In [None]:
# Map every keypoint prefix to the positions of its columns. The prefix is the
# column name with its last "_suffix" removed (split from the right, so a name
# such as 'LeftEyeInner' stays intact). Resulting shape, for example:
# {
#     'LeftElbow': [24, 25, 26],
#     'RightShoulder': [18, 19, 20],
#     ...
# }
groups = defaultdict(list)
for col_pos, col_name in enumerate(X_test.columns):
    groups[col_name.rsplit("_", 1)[0]].append(col_pos)

# SHAP values for class 1
shap_vals = shap_values[:, :, 1]  # shape: (n_samples, n_features)

# One score per keypoint: the mean of |SHAP| over all samples and all columns
# that belong to the keypoint
group_names = list(groups)
group_shap_values = [
    np.abs(shap_vals[:, col_positions]).mean() for col_positions in groups.values()
]

# Order keypoints from most to least important
sorted_indices = np.argsort(group_shap_values)[::-1]
group_names_sorted = [group_names[pos] for pos in sorted_indices]
group_values_sorted = [group_shap_values[pos] for pos in sorted_indices]

# Horizontal bar chart, most important keypoint on top
plt.figure(figsize=(10, 6))
sns.barplot(x=group_values_sorted, y=group_names_sorted)
plt.title("Grouped SHAP Feature Importance (Class 1)")
plt.xlabel("Mean |SHAP value|")
plt.ylabel("Keypoint")
plt.tight_layout()
plt.show()

# GROUPED SHAP BY LIMB

In [None]:
# Body regions and the keypoints that belong to each of them
limb_mapping = {
    "Head": [
        "Nose", "LeftEyeInner", "LeftEye", "LeftEyeOuter",
        "RightEyeInner", "RightEye", "RightEyeOuter",
        "LeftEar", "RightEar", "MouthLeft", "MouthRight"
    ],
    "Left Arm": ["LeftShoulder", "LeftElbow", "LeftWrist"],
    "Right Arm": ["RightShoulder", "RightElbow", "RightWrist"]
}

# Reverse lookup keypoint -> limb. setdefault keeps the first limb that lists
# a keypoint, matching a first-match scan over limb_mapping.
keypoint_to_limb = {}
for limb, keypoints in limb_mapping.items():
    for keypoint in keypoints:
        keypoint_to_limb.setdefault(keypoint, limb)

# Collect the column positions of every limb; columns whose keypoint is not
# listed in limb_mapping are left out
limb_groups = defaultdict(list)
for col_pos, col_name in enumerate(X_test.columns):
    limb = keypoint_to_limb.get(col_name.rsplit("_", 1)[0])  # e.g., "LeftElbow"
    if limb is not None:
        limb_groups[limb].append(col_pos)

shap_vals = shap_values[:, :, 1]  # class 1 SHAP values

# One score per limb: mean of |SHAP| over all samples and all of the limb's columns.
# NOTE(review): limbs differ in column count and this is a mean, not a
# per-sample sum - confirm that this is the intended aggregation.
limb_names = list(limb_groups)
limb_shap_values = [
    np.abs(shap_vals[:, col_positions]).mean() for col_positions in limb_groups.values()
]

# Order limbs from largest to smallest effect
sorted_idx = np.argsort(limb_shap_values)[::-1]
limb_names_sorted = [limb_names[pos] for pos in sorted_idx]
limb_values_sorted = [limb_shap_values[pos] for pos in sorted_idx]

plt.figure(figsize=(8, 5))
sns.barplot(x=limb_values_sorted, y=limb_names_sorted)
plt.title("Grouped SHAP Importance by Limb (Class 1)")
plt.xlabel("Mean |SHAP value|")
plt.ylabel("Body Region")
plt.tight_layout()
plt.show()

# LIME

In [None]:
import lime
import lime.lime_tabular

# Step 1: Build the tabular LIME explainer from the training data; class names
# are the model's class labels rendered as strings, and the seed is fixed
lime_feature_names = X_train.columns.tolist()
lime_class_names = list(map(str, rf.classes_))
limeExplainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train.values,
    feature_names=lime_feature_names,
    class_names=lime_class_names,
    mode='classification',
    random_state=42,
)

# Step 2: Prediction wrapper that restores the training column names
def predict_fn_with_names(x):
    """Return the forest's class probabilities for a raw 2-D array of samples.

    The array is wrapped in a DataFrame carrying the training column names
    before it is passed to ``rf.predict_proba``.
    """
    named_samples = pd.DataFrame(x, columns=X_train.columns)
    return rf.predict_proba(named_samples)

# Step 3: Choose the test sample to explain (positional index into X_test)
i = 285
instance_values = X_test.iloc[i].values
exp = limeExplainer.explain_instance(
    data_row=instance_values,               # feature values of the chosen sample
    predict_fn=predict_fn_with_names,       # probability function defined above
    num_features=10,                        # number of top features to show
)

# Step 4: Render the explanation inline
exp.show_in_notebook()

# ALE PLOTS

In [None]:
!pip install PyALE

In [None]:
class Class1Wrapper:
    """Expose a classifier's class-1 probability through ``predict``.

    Adapts a model that offers ``predict_proba`` for callers that only call
    ``predict`` and expect one number per sample.
    """

    def __init__(self, model, feature_names=None):
        """Store the wrapped model and the column names used for its inputs.

        Parameters
        ----------
        model : object
            Fitted classifier exposing ``predict_proba``.
        feature_names : sequence of str, optional
            Column names applied to inputs before prediction. When omitted,
            the model's ``feature_names_in_`` attribute is used if present,
            so the wrapper does not depend on any notebook-level variable.
        """
        self.model = model
        if feature_names is None:
            feature_names = getattr(model, "feature_names_in_", None)
        self.feature_names = None if feature_names is None else list(feature_names)

    def predict(self, X):
        """Return the probability of class 1 (column 1 of ``predict_proba``).

        ``X`` may be a 2-D array or a DataFrame; it is converted to a
        DataFrame with the stored column names (a DataFrame input is
        re-selected / re-ordered to those columns).
        """
        if self.feature_names is None:
            frame = pd.DataFrame(X)
        else:
            frame = pd.DataFrame(X, columns=self.feature_names)
        return self.model.predict_proba(frame)[:, 1]  # only class 1 probability

In [None]:
from PyALE import ale


# Wrap the forest so that predict() yields the class-1 probability
wrapped_model = Class1Wrapper(rf)

# ALE of a single feature on the test set, with confidence interval
ale_feature = ["LeftWrist_y"]  # must be a list
ale_eff = ale(X=X_test, model=wrapped_model, feature=ale_feature, include_CI=True)

# SAVE IN CSV

In [None]:
# NOTE: this export cell is intentionally disabled (everything below is
# commented out). When re-enabled it reads y_test, y_pred, y_proba, X_test and
# shap_values from earlier cells and writes one CSV into "Shap_result_tables/".
# NOTE(review): the output folder presumably has to exist beforehand - confirm.

# # Extract SHAP values for class 1
# shap_class_1 = shap_values[:, :, 1]  # shape: (n_samples, n_features)

# # Prepare the prediction info
# prediction_info = pd.DataFrame({
#     "Sample": range(len(X_test)),
#     "True Label": y_test.values,
#     "Predicted Label": y_pred,
#     "Prob Class 0": y_proba[:, 0],
#     "Prob Class 1": y_proba[:, 1]
# })

# # Extract SHAP values for class 1
# shap_class_1 = shap_values[:, :, 1]  # shape: (n_samples, n_features)

# # Create column names for SHAP values
# shap_columns = [f"shap_{name}" for name in X_test.columns]

# # Turn SHAP values into a DataFrame
# shap_df = pd.DataFrame(shap_class_1, columns=shap_columns)

# # Reset index for consistency
# shap_df = shap_df.reset_index(drop=True)

# # Final DataFrame with everything
# full_results = pd.concat([prediction_info, X_test.reset_index(drop=True), shap_df], axis=1)

# # Save to CSV
# full_results.to_csv("Shap_result_tables/pull_shot_shap_predictions_rf.csv", index=False)