# Imports and Setup

In [None]:
import sys
from pathlib import Path

import joblib
import numpy as np
import shap
from matplotlib import pyplot as plt

# Put the parent of the current working directory on sys.path so that the
# `src` package imported right after this can be resolved.
# NOTE(review): this presumes the notebook is launched from a folder that sits
# directly under the repository root — confirm.
ROOT_PATH = Path().resolve().parent
sys.path.append(str(ROOT_PATH))

from src.utils.config import (
    SAVE_PATH,
    label,
    primary_metric
)

from src.utils.modeling import prepare_data
from src.utils.evaluation import evaluate_pipeline, summarize_model_results

# Read the persisted splits back from disk
saved_splits = joblib.load(SAVE_PATH / "data.pkl")

# Pull the five pieces out in the same order they are bundled in below
split_keys = ("X", "X_train", "y_train", "X_test", "y_test")
X, X_train, y_train, X_test, y_test = (saved_splits[key] for key in split_keys)

# Bundle into one variable for evaluation
data = [X, X_train, y_train, X_test, y_test]

# Training

### DummyClassifier

In [2]:
# dummy_pipeline, dummy_data = evaluate_pipeline('dummy_classifier', data)

### Logistic Regression

In [3]:
# logistic_pipeline, logistic_data = evaluate_pipeline('logistic_regression', data)

### Random Forest

In [4]:
# forest_pipeline, forest_data = evaluate_pipeline('random_forest', data)

### LightGBM

In [5]:
# Fit and evaluate the LightGBM candidate. The two returned objects are stored
# under the 'lightgbm' key of `fitted_pipelines` and `model_data` in the
# Analysis section.
# NOTE(review): the output recorded for this cell is a ValueError — all 175
# fits failed because category_encoders found that `X` and `y` carry indexes
# that do not match. While that persists `light_pipeline` / `light_data` are
# never bound, so the Analysis cells cannot run; verify the indexes of the
# X_train / y_train objects handed in through `data`.
light_pipeline, light_data = evaluate_pipeline('lightgbm', data)

ValueError: 
All the 175 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
175 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\pipeline.py", line 423, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\pipeline.py", line 377, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\pipeline.py", line 957, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\compose\_column_transformer.py", line 754, in fit_transform
    result = self._fit_transform(X, y, _fit_transform_one)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\compose\_column_transformer.py", line 681, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\parallel.py", line 65, in __call__
    return super().__call__(iterable_with_config)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 1986, in __call__
    return output if self.return_generator else list(output)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 1914, in _get_sequential_output
    res = func(*args, **kwargs)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\parallel.py", line 127, in __call__
    return self.function(*args, **kwargs)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\pipeline.py", line 957, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 919, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\category_encoders\utils.py", line 298, in fit
    X, y = convert_inputs(X, y)
  File "c:\Users\rober\AppData\Local\Programs\Python\Python39\lib\site-packages\category_encoders\utils.py", line 89, in convert_inputs
    raise ValueError(msg)
ValueError: `X` and `y` both have indexes, but they do not match. If you are shuffling your input data on purpose (e.g. via permutation_test_score) use np arrays instead of data frames / series


# Analysis

In [None]:
# Metric results returned by evaluate_pipeline, keyed by model name.
# Re-enable an entry once the matching training cell has been run.
model_data = {
    # 'dummy_classifier': dummy_data,
    # 'logistic_regression': logistic_data,
    # 'random_forest': forest_data,
    'lightgbm': light_data
}

# Fitted pipelines, stored under the same keys as model_data
fitted_pipelines = {
    # 'dummy_classifier': dummy_pipeline,
    # 'logistic_regression': logistic_pipeline,
    # 'random_forest': forest_pipeline,
    'lightgbm': light_pipeline
}

# Every reported metric is rendered as a percentage with two decimals
metric_formats = {
    metric_name: '{:.2%}'
    for metric_name in ('F1', 'Accuracy', 'Precision', 'Roc_auc')
}

# Move the primary metric to the front; the other metrics keep their order
primary_key = primary_metric.capitalize()
metrics_to_display = {primary_key: metric_formats.pop(primary_key)}
metrics_to_display.update(metric_formats)

best_model, best_pipeline = summarize_model_results(
    model_data, primary_metric, metrics_to_display, fitted_pipelines
)

In [None]:
# NOTE: despite its name this is the raw training frame; the name is kept
# because a later cell reads it.
X_train_preprocessed = X_train

# Get predictions on training data
y_train_pred = best_pipeline.predict(X_train)

# Only get wrong predictions.
# Compare position by position on plain arrays: `y_train` itself is left
# untouched (re-indexing it here would desynchronise it from `X_train` for any
# later cell), and a boolean *array* makes `.loc` select by position instead of
# trying to align a freshly numbered mask against X_train's own index.
y_train_values = y_train.to_numpy()
wrong_mask = y_train_pred != y_train_values
wrong_examples = X_train_preprocessed.loc[wrong_mask]

label_map = {
    0: f"Not {label}",
    1: label.capitalize()
}

# NOTE: this name shadows IPython's `display()` helper for the rest of the
# session; it is kept because the SHAP cell further down looks rows up in it.
display = wrong_examples.copy()
display['Actual'] = y_train_values[wrong_mask]
display['Predicted'] = y_train_pred[wrong_mask]

display['Actual'] = display['Actual'].map(label_map)
display['Predicted'] = display['Predicted'].map(label_map)

# Show a few wrong examples (all of them when fewer than five exist, since
# sampling more rows than available raises)
display.sample(min(5, len(display)), random_state=42)

#### Model performance metrics
Summarize the key evaluation metrics and what they say about the model's overall predictive power. Highlight any strengths or weaknesses revealed by these numbers

#### Feature importance analysis
Describe which features contribute most to the model's decisions. Include insights from feature importance scores, SHAP values, and more. Explain why certain features might be especially influential

#### Overfitting/underfitting and generalization
Discuss evidence of overfitting/underfitting, if any. Use training vs. validation scores, learning curves, or cross-validation results to support analysis. Explain how well the model is expected to perform on unseen data

#### Comparison to baseline models
Compare the model's performance to the baseline models (Dummy and Logistic Regression). Highlight the improvements and explain why this model is a better choice than those options

#### Error analysis
Common failure cases (e.g., certain classes, edge cases) and examples of misclassified instances

#### Model deployment considerations
Inference time and scalability. Will it work for real time predictions?

#### Data quality and preprocessing impact
Effect of missing data handling and impact of feature engineering

#### Summary
Brief summary containing the most important points from the above information

In [None]:
# Explain the first misclassified training row with a SHAP waterfall plot.
wrong_row_idx = wrong_examples.index[0]

row = display.loc[wrong_row_idx]
actual_value = row['Actual']
predicted_value = row['Predicted']

model = best_pipeline.named_steps['model']

# The explainer wraps only the final estimator, so it must be fed the matrix
# that estimator was fitted on rather than the raw frame: push the row through
# every pipeline step ahead of the model (assumes 'model' is the last step).
preprocessor = best_pipeline[:-1]
shap_input = preprocessor.transform(X_train.loc[[wrong_row_idx]])
if hasattr(shap_input, 'toarray'):
    # Densify sparse transformer output so it can be flattened for plotting
    shap_input = shap_input.toarray()

# Label the bars with the transformed feature names when they are available
if hasattr(shap_input, 'columns'):
    feature_names = shap_input.columns.tolist()
else:
    try:
        feature_names = list(preprocessor.get_feature_names_out())
    except (AttributeError, TypeError, ValueError):
        # Not every transformer can report its output names
        feature_names = [f"Feature {i}" for i in range(shap_input.shape[1])]

explainer = shap.Explainer(model)
shap_values = explainer.shap_values(shap_input)

# Depending on the SHAP release, a binary classifier yields either one array
# per class (a list), a (rows, features, classes) array, or a single
# (rows, features) array. Reduce all of them to the single explained row.
if isinstance(shap_values, list):
    instance_values = np.asarray(shap_values[1])[0]
    base_value = np.ravel(explainer.expected_value)[1]
else:
    shap_values = np.asarray(shap_values)
    if shap_values.ndim == 3:
        instance_values = shap_values[0, :, 1]
        base_value = np.ravel(explainer.expected_value)[1]
    else:
        instance_values = shap_values[0]
        base_value = np.ravel(explainer.expected_value)[-1]

shap_values_instance = shap.Explanation(
    values=np.ravel(instance_values),  # 1D, one contribution per feature
    base_values=float(base_value),  # Base value for the positive class
    data=np.ravel(np.asarray(shap_input)),  # 1D, one input value per feature
    feature_names=feature_names
)

print(f"\nSHAP explanation: (Pred: {predicted_value}, Actual: {actual_value})")

shap.plots.waterfall(shap_values_instance, show=False)

ax = plt.gca()

# Add custom labels under the left and right ends of the x axis so the
# direction of each contribution can be read off the plot
# Left label
ax.annotate(f"Not {label}", xy=(0, -.1), xycoords='axes fraction',
            horizontalalignment='left', verticalalignment='bottom', fontsize=12, color='black')

# Right label
ax.annotate(label.capitalize(), xy=(1, -.1), xycoords='axes fraction',
            horizontalalignment='right', verticalalignment='bottom', fontsize=12, color='black')

plt.show()

Analyze what went wrong and why

# Summary

#### Key insights

#### Limitations and possible improvements

#### Business implications