In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score

# 1. Load and Clean Dataset
# Read the lead export; the literal string 'NA' and empty cells are parsed
# as missing values (NaN), as suggested by reviewer feedback.
df = pd.read_csv('Senior_Living_Lead_Intent_Dataset.csv', na_values=['NA', ''])

# 2. Define Target (Outcome)
# A lead is labelled High Intent (1) when its 'Lead Status' is one of the
# statuses listed in pos_status; every other status, including a missing
# one, is labelled 0.
pos_status = ['Connected', 'Discovery Completed', 'In Progress', 'Open Deal']
# Vectorized membership test instead of a row-by-row apply(lambda ...);
# NaN never matches, so missing statuses still map to 0.
df['Actual_Success'] = df['Lead Status'].isin(pos_status).astype(int)

# 3. Train-Test Split (Academic Rigor)
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
# NOTE(review): train_df is not referenced again in this section - confirm
# whether a model is actually fitted on it elsewhere.
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)

# 4. Model Evaluation (scored on the 20% hold-out rows only)
y_true = test_df['Actual_Success']
y_scores = test_df['intent_score']
# A lead is predicted High Intent when its score is at or above 0.5
y_pred = y_scores.ge(0.5).astype(int)

# 5. Calculate Metrics
# AUC is computed from the raw scores, not from the thresholded predictions.
auc_score = roc_auc_score(y_true, y_scores)
print(f"Model Test Set AUC: {auc_score:.2f}")

# 6. Generate Validated Confusion Matrix
# Create the figure and axes explicitly and hand the axes to scikit-learn.
# Without ax=, from_predictions opens its own figure, which leaves the
# figure created here blank and ignores the requested 8x6 size.
fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay.from_predictions(
    y_true,
    y_pred,
    # Pin the class order so the display labels always line up with 0 / 1.
    labels=[0, 1],
    display_labels=['Low Intent', 'High Intent'],
    cmap='Blues',
    ax=ax,
)
ax.set_title(f'Confusion Matrix (Test Set) - AUC: {auc_score:.2f}')
ax.grid(False)

# 7. Save the scientific visual
# Written to the current working directory; save before show() so the
# rendered figure is what ends up in the file.
fig.savefig('confusion_matrix.png')
plt.show()

# 8. Final Privacy Scrub for Export
# Overwrite every value in 'Associated Note' with a fixed placeholder so no
# free-text note is exported.
df['Associated Note'] = "Note redacted for privacy compliance"
# Drop the derived label column before export. The output path is the same
# file the script loads from, so the source CSV is overwritten in place.
export_df = df.drop(columns=['Actual_Success'])
export_df.to_csv('Senior_Living_Lead_Intent_Dataset.csv', index=False)