In [None]:
# Import python packages
import streamlit as st
import pandas as pd
import numpy as np

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
# Active Snowflake session; later cells reuse it for session.table(...) and
# session.sql(...).
session = get_active_session()

# Snowpark handle to the ctm.public.nba table. A later cell rebinds `df` to
# the pandas result of .to_pandas(), so this name changes type further down.
df = session.table("ctm.public.nba")


In [None]:
-- List the columns and data types of the source table. The name is fully
-- qualified, like every other query in this notebook, so the cell does not
-- depend on the session's current database/schema.
DESCRIBE TABLE ctm.public.nba;

# Summary Null Handling Strategy by Column

- **CustomerID (Identifier)**
  - **Strategy:** Drop row or disregard
  - **Rationale:** CustomerID serves as a unique identifier and is critical for data integrity and accurate record linkage. Missing values could compromise join operations and result in unreliable analyses.

- **Age (Continuous)**
  - **Strategy:** Impute with median or mean
  - **Rationale:** Age is a continuous variable. Imputation with the median is preferred to mitigate the influence of outliers, while the mean provides a suitable alternative for normally distributed data, ensuring robust statistical analysis.

- **Segment (Categorical)**
  - **Strategy:** Impute with mode or introduce a “Missing” category
  - **Rationale:** Segment is a categorical variable. Imputing with the mode preserves the most representative value, while adding a “Missing” category maintains transparency regarding unknown values for further investigation.

- **Recency (Categorical)**
  - **Strategy:** Impute with mode or introduce a "Missing" category
  - **Rationale:** Recency is a categorical segmentation variable with defined engagement states (Active, Inactive, Dormant). Imputing with the mode preserves the most common engagement pattern while maintaining data distribution integrity. Alternatively, introducing a "Missing" category retains data completeness and allows for potential analysis of unclassified customer behavior patterns.
 
- **PriorInterest (Binary)**
  - **Strategy:** Impute with 0 (no interest)
  - **Rationale:** PriorInterest is a binary variable. Imputing missing values as 0 (no interest) is a conservative approach that avoids overestimating customer engagement and maintains model stability.

- **DidPurchase (Binary)**
  - **Strategy:** Exclude rows with missing values (if used as the modeling target)
  - **Rationale:** DidPurchase is the target variable for predictive modeling. Rows with missing values must be omitted to ensure the integrity of model training and avoid bias from incomplete outcomes.


In [None]:
-- Are there missing values in the segment column?
-- Row count per distinct segment value. COUNT(*) is used instead of
-- COUNT(segment) so that a true SQL NULL group reports its real size rather
-- than 0. In the original run, 493 rows held the placeholder string 'NULL'.
SELECT
  segment,
  COUNT(*) AS row_count
FROM ctm.public.nba
GROUP BY segment;

In [None]:
-- How many rows carry the placeholder string 'NULL' in segment?
-- (The original run reported 493.)
SELECT
  COUNT(segment)
FROM
  ctm.public.nba
WHERE
  segment IN ('NULL');

In [None]:
-- Recode unknown segments into the explicit 'Missing' category.
-- Both the placeholder string 'NULL' and true SQL NULLs are covered, so no
-- unknown value survives the recode. The statement is safe to re-run: a second
-- execution matches no rows.
UPDATE ctm.public.nba
SET segment = 'Missing'
WHERE segment = 'NULL'
   OR segment IS NULL;

In [None]:
-- Number of rows with a missing age.
-- COUNT(*) is required here: COUNT(age) skips NULLs, so counting it over rows
-- where age IS NULL always returns 0 and could never reveal a problem.
-- Expect 0 when there are no missing values.
SELECT COUNT(*) AS missing_age_rows
FROM ctm.public.nba
WHERE age IS NULL;

In [None]:
-- Number of rows with a missing customer id.
-- COUNT(*) replaces SUM(customerid): summing a column over rows where it is
-- NULL always yields NULL, regardless of how many such rows exist.
-- Expect 0 when there are no missing values.
SELECT COUNT(*) AS missing_customerid_rows
FROM ctm.public.nba
WHERE customerid IS NULL;

In [None]:
-- Row count per distinct recency value. COUNT(*) is used instead of
-- COUNT(recency) so that a true SQL NULL group reports its real size rather
-- than 0. In the original run, 493 rows held the placeholder string 'NULL';
-- the following cell recodes them to 'Missing'.
SELECT
  recency,
  COUNT(*) AS row_count
FROM ctm.public.nba
GROUP BY recency;

In [None]:
-- Recode unknown recency values into the explicit 'Missing' category.
-- Both the placeholder string 'NULL' and true SQL NULLs are covered. The
-- statement is safe to re-run: a second execution matches no rows.
UPDATE ctm.public.nba
SET recency = 'Missing'
WHERE recency = 'NULL'
   OR recency IS NULL;

In [None]:
-- Row count per distinct DidPurchase value. COUNT(*) makes a NULL group show
-- its real size (COUNT(didpurchase) would report it as 0).
-- Original run: only 0 and 1 were present.
SELECT
  didpurchase,
  COUNT(*) AS row_count
FROM ctm.public.nba
GROUP BY didpurchase;

In [None]:
-- Row count per distinct PriorInterest value. COUNT(*) makes a NULL group show
-- its real size (COUNT(priorinterest) would report it as 0).
-- Original run: no NULL group was present.
SELECT
  priorinterest,
  COUNT(*) AS row_count
FROM ctm.public.nba
GROUP BY priorinterest;

## Key metrics:

- Overall purchase rate (DidPurchase mean)
- Purchase rate by Segment, Recency, and Age cohorts


Statistical relationships:


- Correlation between PriorInterest and DidPurchase
- Conversion lift for high-interest vs. low-interest customers


Visualizations:


- Bar charts: Purchase rate by segment/recency , age bins
- Heatmap: Correlation matrix of features


In [None]:
-- Overall purchase conversion rate: mean of DidPurchase as a percentage,
-- rounded to two decimals and rendered with a trailing '%'.
WITH overall AS (
  SELECT AVG(DidPurchase) * 100 AS conversion_pct
  FROM ctm.public.nba
)
SELECT
  ROUND(conversion_pct, 2) || '%' AS conversion_rate
FROM overall;

In [None]:
-- Purchase conversion rate per segment, highest first.
-- The displayed value is a formatted string ('51.41%'), so the sort is done on
-- the underlying numeric average; sorting the string alias would order rows
-- lexicographically (e.g. '9.50%' ahead of '51.41%').
SELECT
  Segment,
  ROUND(AVG(DidPurchase) * 100, 2) || '%' AS conversion_rate
FROM ctm.public.nba
GROUP BY Segment
ORDER BY AVG(DidPurchase) DESC;

In [None]:
import matplotlib.pyplot as plt

# Hard-coded conversion rates (%) per segment.
# NOTE(review): presumably transcribed from the segment query output; confirm
# after any data change.
segments = ['A', 'B', 'C']
conversion_rates = [51.41, 32.17, 21.81]

# One bar per segment, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(segments, conversion_rates, color=['#1f77b4', '#ff7f0e', '#2ca02c'])

# Print each bar's value just above its top edge.
for rect in bars:
    top = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, top,
            f'{top:.2f}%', ha='center', va='bottom')

ax.set_title('Purchase Conversion Rates by Segment')
ax.set_xlabel('Segment')
ax.set_ylabel('Conversion Rate (%)')
ax.set_ylim(0, 60)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Reading the chart
#   - Segment A converts best (51.41%).
#   - Segment B sits in the middle (32.17%).
#   - Segment C converts worst (21.81%).
#
# Insight
#   - Targeting Segment A offers the highest potential for conversions.
#   - Segments B and C may benefit from tailored engagement strategies to
#     boost their conversion rates.

In [None]:
-- Purchase conversion rate per recency state, highest first.
-- The displayed value is a formatted string, so the sort uses the underlying
-- numeric average; sorting the string alias would be lexicographic.
SELECT
  recency,
  ROUND(AVG(DidPurchase) * 100, 2) || '%' AS conversion_rate
FROM ctm.public.nba
GROUP BY recency
ORDER BY AVG(DidPurchase) DESC;

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Hard-coded conversion rates (%) per recency state.
# NOTE(review): presumably transcribed from the recency query output; confirm
# after any data change.
recency_data = pd.DataFrame({
    'recency': ['Active', 'Inactive', 'Dormant'],
    'conversion_rate': [41.38, 20.07, 12.04]
})

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(
    recency_data['recency'],
    recency_data['conversion_rate'],
    color=['#1f77b4', '#ff7f0e', '#2ca02c'],
)

# Print each bar's value just above its top edge.
for rect in bars:
    top = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, top,
            f'{top:.2f}%', ha='center', va='bottom')

ax.set_title('Purchase Conversion Rates by Recency Segment')
ax.set_xlabel('Recency Segment')
ax.set_ylabel('Conversion Rate (%)')
ax.set_ylim(0, 60)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Insight: targeting Active customers is most effective, while Inactive and
# Dormant segments may require reactivation strategies to boost conversions.

In [None]:
-- Purchase conversion rate per age bin, highest first.
-- The CTE keeps the numeric percentage under its own name (conversion_pct) so
-- the final ORDER BY sorts numbers, not the formatted '%' string that shares
-- the conversion_rate alias.
WITH binned_data AS (
  SELECT
    CASE
      WHEN age <= 20 THEN '0-20'
      WHEN age <= 30 THEN '21-30'
      WHEN age <= 40 THEN '31-40'
      WHEN age <= 50 THEN '41-50'
      WHEN age <= 60 THEN '51-60'
      WHEN age <= 70 THEN '61-70'
      -- Catch-all: every age above 70 (and any NULL age) lands here.
      ELSE '71-80'
    END AS age_bin,
    AVG(DidPurchase) * 100 AS conversion_pct
  FROM ctm.public.nba
  GROUP BY age_bin
)
SELECT
  age_bin,
  ROUND(conversion_pct, 2) || '%' AS conversion_rate
FROM binned_data
ORDER BY conversion_pct DESC;


In [None]:
import matplotlib.pyplot as plt

# Conversion rates (%) per age bin, copied from the age-bin query result.
age_bins = ['0-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80']
conversion_rates = [12.38, 30.34, 36.47, 32.91, 24.54, 15.73, 11.71]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(age_bins, conversion_rates, color='skyblue')

# Print each bar's value just above its top edge.
for rect in bars:
    top = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, top,
            f'{top:.2f}%', ha='center', va='bottom')

ax.set_title('Conversion Rate by Age Group', fontsize=14)
ax.set_xlabel('Age Group', fontsize=12)
ax.set_ylabel('Conversion Rate (%)', fontsize=12)
ax.set_ylim(0, 40)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Key insights from the binned data
#
# Peak conversion:
#   - Ages 31-40 convert best (36.47%),
#   - followed by 41-50 (32.91%) and 21-30 (30.34%).
#
# Lowest conversion:
#   - Seniors (71-80) convert at 11.71%.
#   - The youngest bin (0-20) converts at 12.38%.
#
# Business implications:
#   - Target marketing to the 31-50 age groups.
#   - Investigate low conversion in the youth/senior segments.
#   - Create age-specific product recommendations.

In [None]:
-- Correlation between prior interest and purchase over the whole table,
-- computed with Snowflake's CORR aggregate.
SELECT CORR(nba.PRIORINTEREST, nba.DIDPURCHASE) AS correlation_coefficient
FROM ctm.public.nba AS nba;

In [None]:
import pandas as pd
from snowflake.snowpark.context import get_active_session

# Pull the whole table into pandas; from here on `df` is a pandas DataFrame.
session = get_active_session()
df = session.table("ctm.public.nba").to_pandas()

# Correlation of the two binary flags, computed client-side as a cross-check
# of the SQL CORR result.
prior_interest = df['PRIORINTEREST']
did_purchase = df['DIDPURCHASE']
correlation = prior_interest.corr(did_purchase)
print(f"Correlation: {correlation:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-segment correlation between DIDPURCHASE and PRIORINTEREST, restricted to
# the three named segments (rows recoded to 'Missing' are excluded by the IN
# filter).
query = """
SELECT 
    SEGMENT,
    CORR(DIDPURCHASE, PRIORINTEREST) AS correlation
FROM ctm.public.nba
WHERE SEGMENT IN ('A', 'B', 'C')
    AND PRIORINTEREST IS NOT NULL
    AND DIDPURCHASE IS NOT NULL
GROUP BY SEGMENT
"""

corr_df = session.sql(query).to_pandas()
print(corr_df)
plt.figure(figsize=(10, 6))
# `hue` mirrors `x` and the legend is disabled, matching the barplot call used
# later in this notebook; passing `palette` without `hue` is deprecated in
# recent seaborn releases.
ax = sns.barplot(
    x='SEGMENT', 
    y='CORRELATION', 
    hue='SEGMENT',
    data=corr_df,
    palette='viridis',
    legend=False
)

# Add data labels
for p in ax.patches:
    ax.annotate(f"{p.get_height():.3f}", 
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', 
                xytext=(0, 10), 
                textcoords='offset points')

# Correlations can be negative: extend the lower limit when needed instead of
# clipping those bars at 0. min(0, nan) evaluates to 0, so an empty or all-NaN
# result keeps the original (0, 1) range.
y_lower = min(0, corr_df['CORRELATION'].min() - 0.1)

plt.title('Correlation: DIDPURCHASE vs. PRIORINTEREST by Segment', fontsize=14)
plt.xlabel('Segment', fontsize=12)
plt.ylabel('Correlation Coefficient', fontsize=12)
plt.ylim(y_lower, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from snowflake.snowpark.context import get_active_session

# Per-recency-state correlation between DIDPURCHASE and PRIORINTEREST,
# restricted to the three named states (rows recoded to 'Missing' are excluded
# by the IN filter). Relies on the `session` created in an earlier cell.
corr_df = session.sql("""
    SELECT 
        RECENCY,
        CORR(DIDPURCHASE, PRIORINTEREST) AS correlation
    FROM ctm.public.nba
    WHERE RECENCY IN ('Active', 'Inactive', 'Dormant')
        AND PRIORINTEREST IS NOT NULL
        AND DIDPURCHASE IS NOT NULL
    GROUP BY RECENCY
    ORDER BY correlation DESC
""").to_pandas()

# Plot results
plt.figure(figsize=(10, 6))
# `hue` mirrors `x` and the legend is disabled, matching the barplot call used
# later in this notebook; passing `palette` without `hue` is deprecated in
# recent seaborn releases.
ax = sns.barplot(
    x='RECENCY', 
    y='CORRELATION', 
    hue='RECENCY',
    data=corr_df,
    palette='coolwarm',
    legend=False
)

# Add data labels
for p in ax.patches:
    ax.annotate(f"{p.get_height():.3f}", 
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', 
                xytext=(0, 10), 
                textcoords='offset points',
                fontsize=10)

# Correlations can be negative: extend the lower limit when needed instead of
# clipping those bars at 0. min(0, nan) evaluates to 0, so an empty or all-NaN
# result keeps the original (0, 1) range.
y_lower = min(0, corr_df['CORRELATION'].min() - 0.1)

plt.title('Correlation: DIDPURCHASE vs. PRIORINTEREST by Recency', fontsize=14)
plt.xlabel('Recency Category', fontsize=12)
plt.ylabel('Correlation Coefficient', fontsize=12)
plt.ylim(y_lower, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [None]:
-- Relative conversion lift (%) of customers with prior interest over customers
-- without it: (rate_1 - rate_0) / rate_0 * 100.
WITH rates AS (
  SELECT
    PriorInterest,
    AVG(DidPurchase) AS conversion_rate
  FROM ctm.public.nba
  GROUP BY PriorInterest
),
-- Pivot the two per-group rates onto a single row so each is computed once.
pivoted AS (
  SELECT
    MAX(CASE WHEN PriorInterest = 1 THEN conversion_rate END) AS high_rate,
    MAX(CASE WHEN PriorInterest = 0 THEN conversion_rate END) AS low_rate
  FROM rates
)
SELECT
  -- NULLIF turns a zero baseline into NULL, so the query returns NULL instead
  -- of failing with a division-by-zero error.
  (high_rate - low_rate) / NULLIF(low_rate, 0) * 100 AS conversion_lift_percent
FROM pivoted;


In [None]:
# Mean purchase rate within each PRIORINTEREST group of the pandas frame `df`.
purchase_flag = df['DIDPURCHASE']
high_interest_rate = purchase_flag[df['PRIORINTEREST'] == 1].mean()
low_interest_rate = purchase_flag[df['PRIORINTEREST'] == 0].mean()

# Relative lift of the interested group over the baseline, in percent.
rate_gap = high_interest_rate - low_interest_rate
conversion_lift = (rate_gap / low_interest_rate) * 100
print(f"Conversion Lift: {conversion_lift:.2f}%")


In [None]:
from scipy.stats import ttest_ind

# Split the purchase flag by prior-interest group.
has_interest = df['PRIORINTEREST'] == 1
no_interest = df['PRIORINTEREST'] == 0
group1 = df.loc[has_interest, 'DIDPURCHASE']
group0 = df.loc[no_interest, 'DIDPURCHASE']

# Two-sample t-test on the group means (scipy defaults).
# A p-value below 0.05 is read here as a significant difference.
t_stat, p_value = ttest_ind(group1, group0)
print(f"p-value: {p_value:.4f}")


## **Quadrant Names for PriorInterest vs. DidPurchase**

| PriorInterest | DidPurchase | Quadrant Name (Descriptive) | Quadrant Name (Creative)        |
|---------------|-------------|-----------------------------|---------------------------------|
| 0             | 0           | Uninterested & Didn't Buy   | Missed Opportunity              |
| 0             | 1           | Uninterested & Bought       | Surprise Win                    |
| 1             | 0           | Interested & Didn't Buy     | Lost Conversion                 |
| 1             | 1           | Interested & Bought         | Conversion Success              |

## **Variations & Suggestions**

- **Uninterested & Didn't Buy (0,0):**
  - **Missed Opportunity**
  - **Low Intent, No Action**
  - **Passive Audience**
- **Uninterested & Bought (0,1):**
  - **Surprise Win**
  - **Impulse Purchase**
  - **Unexpected Conversion**
- **Interested & Didn't Buy (1,0):**
  - **Lost Conversion**
  - **High Intent, No Sale**
  - **Abandoned Cart**
- **Interested & Bought (1,1):**
  - **Conversion Success**
  - **High Intent, High Action**
  - **Ideal Customer**

## **Example Table with Quadrant Names**

| Quadrant Name     | PriorInterest | DidPurchase | Description                      |
|-------------------|---------------|-------------|----------------------------------|
| Missed Opportunity| 0             | 0           | No interest, no purchase         |
| Surprise Win      | 0             | 1           | No interest, but purchased       |
| Lost Conversion   | 1             | 0           | Interested, but didn't purchase  |
| Conversion Success| 1             | 1           | Interested and purchased         |


## **Summary Table**

| Quadrant Name     | Targeting Strategy Examples                                  |
|-------------------|-------------------------------------------------------------|
| Lost Conversion   | Retargeting ads, abandoned cart emails, dynamic creatives   |
| Conversion Success| Upsell/cross-sell, loyalty programs, personalized follow-up |
| Missed Opportunity| Awareness campaigns, educational content, lookalike targeting|
| Surprise Win      | Post-purchase engagement, feedback requests, cross-sell     |

In [None]:
-- Distinct customers in each (PriorInterest, DidPurchase) quadrant.
-- GROUP BY / ORDER BY use the positions of the first two select columns.
SELECT
    nba.PRIORINTEREST,
    nba.DIDPURCHASE,
    COUNT(DISTINCT nba.CUSTOMERID) AS customer_count
FROM ctm.public.nba AS nba
GROUP BY 1, 2
ORDER BY 1, 2

In [None]:
-- Share of customers in each (PriorInterest, DidPurchase) quadrant.
WITH counts AS (
  -- Distinct customers per quadrant.
  SELECT
    PRIORINTEREST,
    DIDPURCHASE,
    COUNT(DISTINCT CUSTOMERID) AS customer_count
  FROM ctm.public.nba
  GROUP BY PRIORINTEREST, DIDPURCHASE
),
total AS (
  -- Single-row grand total used as the denominator.
  SELECT SUM(customer_count) AS total_customers
  FROM counts
)
SELECT
  c.PRIORINTEREST,
  c.DIDPURCHASE,
  c.customer_count,
  ROUND((c.customer_count / t.total_customers) * 100, 2) || '%' AS percentage
FROM counts AS c
CROSS JOIN total AS t
ORDER BY c.PRIORINTEREST, c.DIDPURCHASE;


In [None]:
-- Profile of customers who showed interest but did not purchase, broken down
-- by segment, age bin and recency.
-- (Original run: 2602 customers, 2.6% of the base.)
-- The denominator is computed from the data instead of being hard-coded, so
-- the percentages stay correct when the table changes.
WITH interest_no_purchase AS (
  SELECT
    CUSTOMERID,
    SEGMENT,
    RECENCY,
    -- Cascading upper bounds leave no gaps: a fractional age such as 29.5
    -- lands in '20-29' rather than falling through to '60+'.
    -- NULL ages still fall into the ELSE bucket.
    CASE
      WHEN AGE < 20 THEN '0-19'
      WHEN AGE < 30 THEN '20-29'
      WHEN AGE < 40 THEN '30-39'
      WHEN AGE < 50 THEN '40-49'
      WHEN AGE < 60 THEN '50-59'
      ELSE '60+'
    END AS age_bin
  FROM ctm.public.nba
  WHERE PRIORINTEREST = 1 AND DIDPURCHASE = 0
),
group_total AS (
  -- Single-row total of interested non-buyers.
  SELECT COUNT(DISTINCT CUSTOMERID) AS total_customers
  FROM interest_no_purchase
)
SELECT
  SEGMENT,
  age_bin,
  RECENCY,
  COUNT(DISTINCT CUSTOMERID) AS customer_count,
  -- MAX() only lifts the single total into the grouped query.
  ROUND((COUNT(DISTINCT CUSTOMERID) / MAX(group_total.total_customers)) * 100, 1) AS percent_of_group
FROM interest_no_purchase
CROSS JOIN group_total
GROUP BY SEGMENT, age_bin, RECENCY
ORDER BY customer_count DESC;


In [None]:
-- Interested non-buyers by segment, with each segment's share of that group.
-- The denominator is computed from the data instead of the hard-coded 2602.
WITH interest_no_purchase AS (
  SELECT CUSTOMERID, SEGMENT
  FROM ctm.public.nba
  WHERE PRIORINTEREST = 1 AND DIDPURCHASE = 0
),
group_total AS (
  -- Single-row total of interested non-buyers.
  SELECT COUNT(DISTINCT CUSTOMERID) AS total_customers
  FROM interest_no_purchase
)
SELECT
  SEGMENT,
  COUNT(DISTINCT CUSTOMERID) AS customer_count,
  -- MAX() only lifts the single total into the grouped query.
  ROUND((COUNT(DISTINCT CUSTOMERID) / MAX(group_total.total_customers)) * 100, 1) || '%' AS percent_of_group
FROM interest_no_purchase
CROSS JOIN group_total
GROUP BY SEGMENT
ORDER BY customer_count DESC;


In [None]:
-- Interested non-buyers by recency state, with each state's share of that
-- group. The denominator is computed from the data instead of the hard-coded
-- 2602.
WITH interest_no_purchase AS (
  SELECT CUSTOMERID, RECENCY
  FROM ctm.public.nba
  WHERE PRIORINTEREST = 1 AND DIDPURCHASE = 0
),
group_total AS (
  -- Single-row total of interested non-buyers.
  SELECT COUNT(DISTINCT CUSTOMERID) AS total_customers
  FROM interest_no_purchase
)
SELECT
  RECENCY,
  COUNT(DISTINCT CUSTOMERID) AS customer_count,
  -- MAX() only lifts the single total into the grouped query.
  ROUND((COUNT(DISTINCT CUSTOMERID) / MAX(group_total.total_customers)) * 100, 1) || '%' AS percent_of_group
FROM interest_no_purchase
CROSS JOIN group_total
GROUP BY RECENCY
ORDER BY customer_count DESC;


In [None]:
-- Interested non-buyers by age bin, with each bin's share of that group.
-- The denominator is computed from the data instead of the hard-coded 2602.
WITH interest_no_purchase AS (
  SELECT
    CUSTOMERID,
    -- Cascading upper bounds leave no gaps: a fractional age such as 29.5
    -- lands in '20-29' rather than falling through to '60+'.
    -- NULL ages still fall into the ELSE bucket.
    CASE
      WHEN AGE < 20 THEN '0-19'
      WHEN AGE < 30 THEN '20-29'
      WHEN AGE < 40 THEN '30-39'
      WHEN AGE < 50 THEN '40-49'
      WHEN AGE < 60 THEN '50-59'
      ELSE '60+'
    END AS age_bin
  FROM ctm.public.nba
  WHERE PRIORINTEREST = 1 AND DIDPURCHASE = 0
),
group_total AS (
  -- Single-row total of interested non-buyers.
  SELECT COUNT(DISTINCT CUSTOMERID) AS total_customers
  FROM interest_no_purchase
)
SELECT
  age_bin,
  COUNT(DISTINCT CUSTOMERID) AS customer_count,
  -- MAX() only lifts the single total into the grouped query.
  ROUND((COUNT(DISTINCT CUSTOMERID) / MAX(group_total.total_customers)) * 100, 1) || '%' AS percent_of_group
FROM interest_no_purchase
CROSS JOIN group_total
GROUP BY age_bin
ORDER BY customer_count DESC;


In [None]:
# Hard-coded 2x2 contingency table of customer counts.
# Rows: prior interest (no / yes). Columns: purchase outcome (no / yes).
ct = pd.DataFrame(
    [
        [67398, 23424],   # No Prior Interest
        [2602, 6576],     # Prior Interest
    ],
    index=['No Prior Interest', 'Prior Interest'],
    columns=['Did Not Purchase', 'Did Purchase'],
)

# Annotated heatmap of the raw counts.
plt.figure(figsize=(8, 6))
sns.heatmap(ct, annot=True, fmt='d', cmap='Blues')
plt.title('Prior Interest vs. Purchase Counts')
plt.xlabel('Purchase Status')
plt.ylabel('Prior Interest')
plt.show()

print(ct)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Work on an upper-cased view of the column names without mutating `df`, so
# re-running this cell (or other cells) never sees a silently altered frame.
df_upper = df.rename(columns=str.upper)

# Mean purchase rate per PRIORINTEREST value, with readable group labels.
conversion_rates = df_upper.groupby('PRIORINTEREST')['DIDPURCHASE'].mean().reset_index()
conversion_rates['PRIORINTEREST'] = conversion_rates['PRIORINTEREST'].map(
    {0: 'No Prior Interest', 1: 'Prior Interest'}
)

# Plot
plt.figure(figsize=(8, 5))
ax = sns.barplot(
    x='PRIORINTEREST', 
    y='DIDPURCHASE', 
    hue='PRIORINTEREST',  # Assign x variable to hue
    data=conversion_rates, 
    palette='viridis',
    legend=False  # Disable legend to avoid duplication
)

# Relative lift of the interested group over the baseline, in percent.
high_rate = conversion_rates.loc[conversion_rates['PRIORINTEREST'] == 'Prior Interest', 'DIDPURCHASE'].values[0]
low_rate = conversion_rates.loc[conversion_rates['PRIORINTEREST'] == 'No Prior Interest', 'DIDPURCHASE'].values[0]
lift = (high_rate - low_rate) / low_rate * 100

# Bars are drawn in row order, so the row position of the 'Prior Interest'
# group is also its x coordinate on the categorical axis.
x_pos = conversion_rates['PRIORINTEREST'].tolist().index('Prior Interest')
# The '+' format flag prints an explicit sign for both positive and negative
# lifts (a literal '+' prefix would render a negative lift as '+-12.3%').
plt.text(x_pos, high_rate + 0.02, f"Lift: {lift:+.1f}%", 
         ha='center', fontsize=12, fontweight='bold')

# Formatting
plt.title('Conversion Rate by Prior Interest', fontsize=14)
plt.xlabel('Prior Interest Level', fontsize=12)
plt.ylabel('Conversion Rate', fontsize=12)
plt.ylim(0, min(1, high_rate * 1.3))  # Headroom above the tallest bar, capped at 100%
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.0%}'))
plt.tight_layout()
plt.show()

# Takeaway (original run): both groups have been on the website, but customers
# who visit with prior interest are 177% more likely to purchase than
# first-time users.

## Predictive Modeling
Train purchase propensity model:
- Features: Age, Segment, Recency, PriorInterest
- Target: DidPurchase

Algorithm: Logistic Regression or Random Forest

Evaluate model:

- Metrics: AUC-ROC, precision-recall


Feature importance analysis (e.g., SHAP values)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


# Load Snowflake table into Snowpark DataFrame
df_snowpark = session.table("ctm.public.nba")

# Convert to Pandas DataFrame
df = df_snowpark.to_pandas()

# Prepare categorical features
categorical_cols = ['SEGMENT', 'RECENCY']

# One-hot encode, dropping the first level of each feature as the reference
# category. sparse_output=False returns a dense array.
encoder = OneHotEncoder(drop='first', sparse_output=False)  # Use sparse_output instead of sparse

# Fit and transform categorical features
# NOTE(review): the encoder is fitted on the full table before the train/test
# split; confirm this is acceptable for the evaluation.
categorical_features = encoder.fit_transform(df[categorical_cols])

# Wrap the encoded array in a DataFrame that shares df's index. Without an
# explicit index the frame gets a fresh 0..n-1 RangeIndex, and the
# column-wise concat below would misalign rows (and introduce NaNs) whenever
# df carries any other index.
categorical_df = pd.DataFrame(categorical_features, 
                              columns=encoder.get_feature_names_out(categorical_cols),
                              index=df.index)

# Combine numeric/binary features with the encoded categoricals
X = pd.concat([
    df[['AGE', 'PRIORINTEREST']],
    categorical_df
], axis=1)
y = df['DIDPURCHASE']

# 80/20 split, stratified on the target so both parts keep the same purchase
# rate; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

# Fit a 100-tree random forest with a fixed seed on the training split.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Hard class labels at the default 0.5 threshold, plus the predicted
# probability of the positive class (second column of predict_proba).
y_pred = model.predict(X_test)
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Threshold-free ranking metrics on the held-out split.
roc_auc = roc_auc_score(y_test, y_pred_proba)
pr_auc = average_precision_score(y_test, y_pred_proba)

for metric_label, metric_value in (("ROC-AUC", roc_auc), ("PR-AUC", pr_auc)):
    print(f"{metric_label}: {metric_value:.4f}")


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion matrix of the default-threshold predictions on the test split.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(
    cm,
    display_labels=['No Purchase', 'Purchase'],
)
disp.plot(values_format='d', cmap='Blues')
plt.title('Confusion Matrix (Threshold = 0.5)')
plt.show()

# How to read the grid (rows = actual, columns = predicted):
#   Top-left:     true negatives  (non-purchasers correctly predicted)
#   Bottom-right: true positives  (purchasers correctly predicted)
#   Top-right:    false positives (Type I errors)
#   Bottom-left:  false negatives (Type II errors)

Evaluating the model

In [None]:
# 3. Threshold Analysis
# ROC curve of the positive-class probabilities on the test split.
from sklearn.metrics import roc_curve, roc_auc_score
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Plot ROC curve.
# The former `if callable(plt.plot)` guard was removed: its else branch could
# never run, because a broken `plt` raises AttributeError on `plt.plot`
# before the check is evaluated.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.3f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Threshold Analysis')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Calibration curve: observed positive rate against mean predicted
# probability, over 10 bins of the test-set predictions.
prob_true, prob_pred = calibration_curve(y_test, y_pred_proba, n_bins=10)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(prob_pred, prob_true, marker='o', label='Random Forest')
# Reference diagonal: a perfectly calibrated model lies on this line.
ax.plot([0, 1], [0, 1], linestyle='--', label='Perfectly calibrated')
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('True Probability')
ax.set_title('Calibration Curve')
ax.legend()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# Precision-recall curve of the positive-class probabilities.
# The thresholds get their own name (pr_thresholds) so this cell does not
# overwrite the ROC `thresholds` array produced by the ROC cell; a later cell
# indexes that array with ROC-derived positions.
precision, recall, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)
ap = average_precision_score(y_test, y_pred_proba)

# Plot curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'AP = {ap:.3f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Recompute the ROC arrays here so that fpr, tpr and the thresholds are
# guaranteed to come from the same roc_curve call. Reusing a global
# `thresholds` is unsafe: the precision-recall cell rebinds that name to an
# array of a different length, which made the lookup below return a threshold
# unrelated to the selected ROC point.
fpr, tpr, roc_thresholds = roc_curve(y_test, y_pred_proba)

# Pick the threshold that maximises TPR - FPR (Youden's J statistic).
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = roc_thresholds[optimal_idx]
print(f"Optimal Threshold: {optimal_threshold:.3f}")


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

# The importlib.reload(matplotlib) workaround was removed: reloading the
# package mid-session is not needed to plot, and matplotlib.pyplot is already
# imported above.

# Sweep 1000 evenly spaced thresholds over [0, 1] and record the metrics of
# the resulting hard predictions.
threshold_values = np.linspace(0, 1, 1000)
metrics = []

for thresh in threshold_values:
    y_pred_thresh = (y_pred_proba >= thresh).astype(int)
    # labels=[0, 1] pins the matrix to 2x2, so the four-way unpack cannot
    # fail when a threshold yields a single predicted class.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1]).ravel()
    
    # zero_division=0 keeps the sweep quiet at thresholds where a metric's
    # denominator is zero; the recorded value for those cases is 0.
    metrics.append({
        'threshold': thresh,
        'f1_score': f1_score(y_test, y_pred_thresh, zero_division=0),
        'precision': precision_score(y_test, y_pred_thresh, zero_division=0),
        'recall': recall_score(y_test, y_pred_thresh, zero_division=0),
        'fpr': fp / (fp + tn) if (fp + tn) > 0 else 0
    })

metrics_df = pd.DataFrame(metrics)

# Precision and recall against the threshold, with the previously computed
# `optimal_threshold` marked as a vertical line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(metrics_df['threshold'], metrics_df['precision'], label='Precision', linewidth=2)
ax.plot(metrics_df['threshold'], metrics_df['recall'], label='Recall', linewidth=2)
ax.axvline(optimal_threshold, color='r', linestyle='--', label=f'Optimal Threshold ({optimal_threshold:.3f})')
ax.set_xlabel('Classification Threshold')
ax.set_ylabel('Score')
ax.set_title('Precision-Recall Tradeoff')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import shap


def _per_class_shap(values, class_index):
    """Return the (n_samples, n_features) SHAP matrix for one class.

    Accepts both layouts a tree explainer may return for a classifier:
    a list with one 2-D array per class, or a single 3-D array shaped
    (n_samples, n_features, n_classes).

    Raises:
        ValueError: if `values` is neither of those layouts.
    """
    if isinstance(values, list):
        return np.asarray(values[class_index])
    values = np.asarray(values)
    if values.ndim == 3:
        return values[:, :, class_index]
    raise ValueError(
        f"Unexpected SHAP values layout: ndim={values.ndim}, shape={values.shape}"
    )


# Initialize SHAP explainer
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Report the raw structure (list of per-class arrays, or one 3-D array)
print(f"SHAP values structure: {type(shap_values)} with {len(shap_values)} elements")

# Per-class matrices. Indexing shap_values[1] directly is only correct for the
# list layout; on a 3-D array it would select sample 1 instead of class 1.
shap_negative = _per_class_shap(shap_values, 0)
shap_positive = _per_class_shap(shap_values, 1)

# Baseline (expected value) of the positive class.
positive_base_value = np.ravel(explainer.expected_value)[1]

# Global feature importance for the positive class
shap.summary_plot(shap_positive, X_test, plot_type="bar")

# Individual prediction explanation (first 3 samples, or fewer if the test
# set is smaller)
for i in range(min(3, len(X_test))):
    shap.force_plot(
        positive_base_value,
        shap_positive[i],  # Use positive class SHAP values
        X_test.iloc[i],
        matplotlib=True
    )

print(f"SHAP type: {type(shap_values)}")
print(f"SHAP length: {len(shap_values)}")
print(f"Element 0 shape: {shap_negative.shape}")
print(f"Element 1 shape: {shap_positive.shape}")

# Sanity check: the SHAP matrix must have one column per model feature.
print(f"X_test features: {X_test.shape[1]}")
print(f"SHAP features: {shap_positive.shape[1]}")
