In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
import numpy as np
import matplotlib
matplotlib.use('Agg') # Use a non-interactive backend
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shap



In [2]:
# --- Setup Output Directory ---
output_folder = 'rf_outputs'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
    print(f"Created directory: {output_folder}")

# Read data from dataset.csv
df = pd.read_csv('dataset.csv')

# Assuming the last column is the target variable and others are features
X = df.iloc[:, 1:-1]
y = df.iloc[:, -1]

# Split data into training, validation, and testing sets (80%, 10%, 10%)
# First, split into training (80%) and a temporary set (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Then, split the temporary set into validation (10%) and testing (10%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# --- Hyperparameter Tuning (Pruning) ---
# Find the best max_depth using the validation set to prevent overfitting
# print("--- Finding Best Max Depth for Random Forest ---")
# depths = range(30, 40) 
# val_scores = []
# for depth in depths:
#     # Using n_estimators=100 is a common and robust starting point
#     model = RandomForestClassifier(n_estimators=100, max_depth=depth, random_state=42, n_jobs=-1)
#     model.fit(X_train, y_train)
#     val_scores.append(model.score(X_val, y_val))

best_depth = 35
print(f"Best max_depth found: {best_depth}")

# Initialize and fit the final model with the best hyperparameter
print("\n--- Training Final Random Forest Model ---")
model = RandomForestClassifier(n_estimators=65, max_depth=best_depth, random_state=42, min_samples_split=6, min_samples_leaf= 1, n_jobs=-1)
model.fit(X_train, y_train)

# --- Final Model Evaluation on the Test Set ---
print("\n--- Final Evaluation on Test Set ---")
y_pred = model.predict(X_test)

# Calculate and print key metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Model Accuracy on Test Set: {accuracy}")
print(f"F1 Score on Test Set: {f1}")



Best max_depth found: 35

--- Training Final Random Forest Model ---

--- Final Evaluation on Test Set ---
Model Accuracy on Test Set: 0.843175487465181
F1 Score on Test Set: 0.8432577771614429


In [3]:
df.head()

Unnamed: 0,id,product_A_sold_in_the_past,product_B_sold_in_the_past,product_A_recommended,product_A,product_C,product_D,cust_hitrate,cust_interactions,cust_contracts,opp_month,opp_old,competitor_Z,competitor_X,competitor_Y,cust_in_iberia,target_variable
0,1,-0.25992,-0.34794,-0.1097,-0.0891,-0.02372,-0.04247,0.12643,0.30731,-0.34997,-1.41464,-0.28185,0,0,0,1,1
1,2,-0.25992,-0.28599,-0.1097,-0.0891,-0.02372,-0.04247,0.42465,3.26349,0.92184,-1.11923,3.54793,1,0,0,0,0
2,3,-0.25992,-0.18998,-0.1097,-0.0891,-0.02372,-0.04247,0.72559,1.29633,-0.34997,0.65323,-0.28185,0,0,0,0,0
3,4,0.65945,3.35306,-0.1097,-0.0891,-0.02372,-0.04247,1.55091,-0.6817,3.46544,1.24404,-0.28185,0,0,0,1,0
4,5,-0.25992,-0.24623,-0.1097,-0.0891,-0.02372,-0.04247,-0.50193,0.6551,-0.03202,1.53945,-0.28185,1,0,0,0,0


In [3]:
# --- Model Interpretation and Explainability with SHAP ---

# 1. SHAP Feature Importance
print("\n--- Generating SHAP Feature Importance ---")
# For tree-based models like Random Forest, TreeExplainer is much more efficient
explainer = shap.TreeExplainer(model)

# # To speed up the calculation, we can explain a representative sample of the test data
# X_test_sample = shap.sample(X_test, 200, random_state=42)
# shap_values = explainer.shap_values(X_test_sample)

# plt.figure()
# # Note: We use the same sampled data for plotting
# shap.summary_plot(shap_values, X_test_sample, feature_names=X_test.columns.tolist(),
#     max_display=20,  # Show top 20 most important features
#     show=False)
# plt.title('SHAP Feature Importance for Random Forest')
# plt.savefig(os.path.join(output_folder, 'rf_shap_importance.png'), dpi=300)
# print(f"\nSaved SHAP feature importance plot to {output_folder}/rf_shap_importance.png")
# plt.close()

# explainer = shap.TreeExplainer(model)
 
shap.TreeExplainer(
    model, X_train, feature_names=X_train.columns.tolist()
)




--- Generating SHAP Feature Importance ---


<shap.explainers._tree.TreeExplainer at 0x7236336faad0>

In [6]:
rf_explainer = shap.TreeExplainer(
    model, X_train, feature_names=X_train.columns.tolist()
)
rf_explainer

<shap.explainers._tree.TreeExplainer at 0x72361aa507d0>

In [None]:
%%time
shap_values = rf_explainer.shap_values(X_train, y_train)

  7%|=                   | 3866/57438 [02:01<27:56]       

In [None]:
shap_values = shap_values[:, :-1]

pd.DataFrame(shap_values, columns=X_train.columns.tolist()).head()

In [None]:
shap.summary_plot(
    shap_values, X_train, feature_names=X_train.columns, plot_type="bar"
)

In [4]:
# 2. Standard Feature Importance
print("\n--- Feature Importance ---")
importances = model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

print(feature_importance_df)

# Plotting Feature Importance
plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importance_df, palette='viridis')
plt.title('Feature Importance from Random Forest')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.tight_layout()
plt.savefig(os.path.join(output_folder, 'rf_feature_importance.png'), dpi=300)
print(f"\nSaved feature importance plot to {output_folder}/rf_feature_importance.png")



--- Feature Importance ---
                       feature  importance
6                 cust_hitrate    0.324188
7            cust_interactions    0.191203
9                    opp_month    0.135442
1   product_B_sold_in_the_past    0.118977
0   product_A_sold_in_the_past    0.054755
8               cust_contracts    0.054115
10                     opp_old    0.034537
13                competitor_Y    0.019282
2        product_A_recommended    0.018506
14              cust_in_iberia    0.018105
11                competitor_Z    0.017960
3                    product_A    0.007887
5                    product_D    0.002732
12                competitor_X    0.001403
4                    product_C    0.000908



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='importance', y='feature', data=feature_importance_df, palette='viridis')



Saved feature importance plot to rf_outputs/rf_feature_importance.png


In [5]:

# 3. Confusion Matrix
print("\n--- Confusion Matrix ---")
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=model.classes_, yticklabels=model.classes_)
plt.title('Confusion Matrix for Random Forest')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.savefig(os.path.join(output_folder, 'rf_confusion_matrix.png'), dpi=300)
print(f"Saved confusion matrix plot to {output_folder}/rf_confusion_matrix.png")


--- Confusion Matrix ---
Saved confusion matrix plot to rf_outputs/rf_confusion_matrix.png
