# Imports and Setup

In [None]:
from pathlib import Path
import sys
import yaml
import joblib
import warnings
import shap
import pandas as pd
from matplotlib import pyplot as plt
from IPython.display import display, Markdown

# Project root is the parent of the notebook's working directory; the
# artifacts loaded below live under <root>/saved.
ROOT = Path().resolve().parent
SAVE_DIR = ROOT / "saved"
# Make the project-local `utils` package importable from this notebook.
sys.path.append(str(ROOT))

# Resolve the config through ROOT (the same file as "../config.yaml") so the
# cell has a single notion of where the project lives, and pin the encoding so
# parsing does not depend on the platform's default locale.
with open(ROOT / "config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

label = config["general"]["label"]
value_mappings = config["preprocessing"]["value_mappings"]
primary_metric = config["general"]["primary_metric"]

# Imported here rather than at the top because it relies on the sys.path
# entry added above.
from utils.utils import (
    evaluate_pipeline,
    summarize_model_results,
    cleaning_pipeline,
    preprocessor
)


def _load_split(name):
    """Load the pickled artifact ``<name>.pkl`` from SAVE_DIR."""
    return joblib.load(SAVE_DIR / f"{name}.pkl")


def _as_feature_frame(matrix):
    """Wrap preprocessor output in a DataFrame labelled with its feature names."""
    return pd.DataFrame(matrix, columns=preprocessor.get_feature_names_out())


X = _load_split("X")
X_train = _load_split("X_train")
y_train = _load_split("y_train")
X_test = _load_split("X_test")
y_test = _load_split("y_test")
X_val = _load_split("X_val")
y_val = _load_split("y_val")


# Fit the cleaning and preprocessing steps on the training split only; the
# test and validation splits below are transformed with the fitted objects.
X_train_cleaned = cleaning_pipeline.fit_transform(X_train)
# Drop the original index of the labels. Presumably this aligns them with the
# default index of the DataFrame built below -- TODO confirm the preprocessor
# returns a plain array rather than an indexed frame.
y_train = y_train.reset_index(drop=True)
X_train_preprocessed = _as_feature_frame(
    preprocessor.fit_transform(X_train_cleaned, y_train)
)


# Optional resampling experiments (currently disabled).
# from imblearn.under_sampling import RandomUnderSampler
# rus = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
# X_train_preprocessed, y_train = rus.fit_resample(X_train_preprocessed, y_train)

# from imblearn.combine import SMOTEENN
# sampler = SMOTEENN(sampling_strategy=0.5, random_state=42)
# X_train_preprocessed, y_train = sampler.fit_resample(X_train_preprocessed, y_train)


# NOTE(review): unlike y_train, the indexes of y_test and y_val are not reset,
# so they may not line up with the frames built here -- confirm that
# evaluate_pipeline never relies on index alignment for these splits.
X_test_cleaned = cleaning_pipeline.transform(X_test)
X_test_preprocessed = _as_feature_frame(preprocessor.transform(X_test_cleaned))

X_val_cleaned = cleaning_pipeline.transform(X_val)
X_val_preprocessed = _as_feature_frame(preprocessor.transform(X_val_cleaned))

# Bundle into one variable for evaluation
data = [X, X_train_preprocessed, y_train, X_test_preprocessed, y_test, X_val_preprocessed, y_val]

# Training

### DummyClassifier

In [None]:
# Baseline: the dummy classifier is one of the reference points the other
# models are compared against in the analysis section.
dummy_pipeline, dummy_data = evaluate_pipeline(
    'dummy_classifier',
    data,
)

### Logistic Regression

In [None]:
# Second baseline: logistic regression, evaluated on the same bundled splits.
logistic_pipeline, logistic_data = evaluate_pipeline(
    'logistic_regression',
    data,
)

### Random Forest

In [None]:
# Random forest candidate, evaluated on the same bundled splits.
forest_pipeline, forest_data = evaluate_pipeline(
    'random_forest',
    data,
)

### LightGBM

In [None]:
# LightGBM candidate, evaluated on the same bundled splits.
light_pipeline, light_data = evaluate_pipeline(
    'lightgbm',
    data,
)

# Analysis

In [None]:
# One row per evaluated model: (fitted pipeline, precomputed metric results).
# Both lookup tables below are derived from this single table so their keys
# always match.
_evaluated = {
    'dummy_classifier': (dummy_pipeline, dummy_data),
    'logistic_regression': (logistic_pipeline, logistic_data),
    'random_forest': (forest_pipeline, forest_data),
    'lightgbm': (light_pipeline, light_data),
}

# Precomputed metric results from evaluation, keyed by model name
model_data = {name: metrics for name, (_, metrics) in _evaluated.items()}

# Fitted pipelines, keyed by the same model names
fitted_pipelines = {name: pipeline for name, (pipeline, _) in _evaluated.items()}

# Metric name -> format string used when the metric is displayed
metrics_to_display = dict.fromkeys(('F1', 'Accuracy', 'Precision'), '{:.2%}')

# Pick the winner according to the primary metric configured in config.yaml
best_model, best_pipeline = summarize_model_results(
    model_data,
    primary_metric,
    metrics_to_display,
    fitted_pipelines,
)

In [None]:
# Get predictions on training data
y_train_pred = best_pipeline.predict(X_train_preprocessed)

# Boolean mask selecting only the misclassified training rows
wrong_mask = y_train_pred != y_train
wrong_examples = X_train_preprocessed[wrong_mask.values]

# Human-readable names for the two classes
label_map = {
    0: f"Not {label}",
    1: label.capitalize()
}

# NOTE(review): this name shadows the `display` helper imported from
# IPython.display at the top of the notebook. It is kept because the SHAP cell
# further down reads `display.loc[...]`; rename both together when refactoring.
display = wrong_examples.copy()
display['Actual'] = y_train[wrong_mask].values
display['Predicted'] = y_train_pred[wrong_mask]

display['Actual'] = display['Actual'].map(label_map)
display['Predicted'] = display['Predicted'].map(label_map)

# Show a few wrong examples. The sample size is capped at the number of
# misclassified rows because DataFrame.sample raises when asked for more rows
# than exist (sampling is without replacement).
display.sample(min(5, len(display)), random_state=42)

#### Model performance metrics
Summarize the key evaluation metrics and what they say about the model's overall predictive power. Highlight any strengths or weaknesses revealed by these numbers.

#### Feature importance analysis
Describe which features contribute most to the model's decisions. Include insights from feature importance scores, SHAP values, and more. Explain why certain features might be especially influential

#### Overfitting/underfitting and generalization
Discuss evidence of overfitting/underfitting, if any. Use training vs. validation scores, learning curves, or cross-validation results to support the analysis. Explain how well the model is expected to perform on unseen data.

#### Comparison to baseline models
Compare the model's performance to the baseline models, Dummy and Logistic Regression. Highlight improvements and explain why this model is a better choice than those options.

#### Error analysis
Common failure cases (e.g., certain classes, edge cases) and examples of misclassified instances

#### Model deployment considerations
Inference time and scalability. Will it work for real time predictions?

#### Data quality and preprocessing impact
Effect of missing data handling and impact of feature engineering

#### Summary
Brief summary containing the most important points from the above information

In [None]:
# Local import: numpy is only needed here, to normalise the SHAP output shapes.
import numpy as np

# Explain the first misclassified training row.
wrong_row_idx = wrong_examples.index[0]

actual_value = display.loc[wrong_row_idx, 'Actual']
predicted_value = display.loc[wrong_row_idx, 'Predicted']

model = best_pipeline.named_steps['model']

# Double brackets keep the single row as a one-row DataFrame.
shap_input = X_train_preprocessed.loc[[wrong_row_idx]]
# NOTE(review): TreeExplainer is meant for tree-based models; if the best
# pipeline turns out to be a non-tree model this call will presumably fail --
# confirm before relying on this cell.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(shap_input)

# The layout of `shap_values` depends on the SHAP version and the model:
#   * list with one (n_samples, n_features) array per class (legacy API)
#   * one (n_samples, n_features, n_classes) array
#   * one (n_samples, n_features) array for single-output models
# Reduce each of them to the positive-class contributions of the single row.
if isinstance(shap_values, list):
    positive_class_values = np.asarray(shap_values[1])[0]
else:
    shap_array = np.asarray(shap_values)
    if shap_array.ndim == 3:
        positive_class_values = shap_array[0, :, 1]
    else:
        positive_class_values = shap_array[0]

# expected_value is either one entry per class or a single scalar.
expected_values = np.ravel(explainer.expected_value)
base_value = expected_values[1] if expected_values.size > 1 else expected_values[0]

shap_values_instance = shap.Explanation(
    values=np.ravel(positive_class_values),  # 1D: one contribution per feature
    base_values=base_value,  # Base value for the positive class
    data=shap_input.values.flatten(),  # Flatten the input values to match the expected shape
    feature_names=X_train_preprocessed.columns.tolist()
)

print(f"\nSHAP explanation: (Pred: {predicted_value}, Actual: {actual_value})")

# show=False keeps the figure open so the class labels can be added below.
shap.plots.waterfall(shap_values_instance, show=False)

ax = plt.gca()

# Add custom labels on the left and right side of the plot
# Left label
ax.annotate(f"Not {label}", xy=(0, -.1), xycoords='axes fraction',
            horizontalalignment='left', verticalalignment='bottom', fontsize=12, color='black')

# Right label
ax.annotate(label.capitalize(), xy=(1, -.1), xycoords='axes fraction',
            horizontalalignment='right', verticalalignment='bottom', fontsize=12, color='black')

plt.show()

Analyze what went wrong and why

# Summary

#### Key insights

#### Limitations and possible improvements

#### Business implications