# Purchase Propensity Model

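This notebook develops a model to predict customer purchase propensity from customer features (age, income, and region). The target used here is synthetic — it is derived from income and age rather than from actual purchase history — so the results illustrate the modelling workflow, not real purchase behavior.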

## Objectives:
- Predict likelihood of future purchases
- Identify key drivers of purchase behavior
- Segment customers by purchase potential
- Provide targeting recommendations

In [None]:
# Import libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import snowflake.connector
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# NOTE(review): pd_read_sql is not a documented name in snowflake.connector.pandas_tools
# (which documents write_pandas / pd_writer) - confirm this import resolves in your environment.
from snowflake.connector.pandas_tools import pd_read_sql

# Plot styling used by every figure below: matplotlib's 'default' style
# combined with seaborn's "husl" colour palette.
# NOTE(review): the palette is set after the style on purpose is an assumption -
# presumably applying a style would override a palette set earlier; keep this order.
plt.style.use('default')
sns.set_palette("husl")

print("Libraries imported successfully!")

In [None]:
# Connect to Snowflake
# Connection settings are read from environment variables so that no secret is
# stored in (or committed with) the notebook. Only SNOWFLAKE_PASSWORD is
# mandatory; the remaining settings fall back to the workshop defaults.
snowflake_password = os.environ.get('SNOWFLAKE_PASSWORD')
if not snowflake_password:
    raise RuntimeError(
        "Set the SNOWFLAKE_PASSWORD environment variable before running this notebook."
    )

# The connector expects the bare account identifier; it must not include the
# ".snowflakecomputing.com" domain suffix, so strip it if it was supplied.
snowflake_account = os.environ.get('SNOWFLAKE_ACCOUNT', 'dnb65599')
snowflake_account = snowflake_account.replace('.snowflakecomputing.com', '')

conn_params = {
    'user': os.environ.get('SNOWFLAKE_USER', 'workshop_user'),
    'password': snowflake_password,
    'account': snowflake_account,
    'warehouse': os.environ.get('SNOWFLAKE_WAREHOUSE', 'ANYCOMPANY_WH'),
    'database': os.environ.get('SNOWFLAKE_DATABASE', 'ANYCOMPANY_LAB'),
    'schema': os.environ.get('SNOWFLAKE_SCHEMA', 'ANALYTICS')
}

conn = snowflake.connector.connect(**conn_params)
print("Connected to Snowflake!")

In [None]:
# Load customer data with synthetic target
# Note: In real scenario, target would be based on actual purchase history
query = """
SELECT 
    *,
    -- Synthetic target: high propensity if high income and young age
    CASE WHEN annual_income > 60000 AND age < 50 THEN 1 ELSE 0 END AS purchase_propensity
FROM ANALYTICS.customer_ml_features
"""

# Fetch the result set straight into a DataFrame through the connector's
# cursor API; the cursor is closed as soon as the data has been read.
with conn.cursor() as cur:
    df = cur.execute(query).fetch_pandas_all()

# Snowflake reports unquoted identifiers in upper case, while the rest of this
# notebook addresses columns in lower case ('age', 'purchase_propensity', ...).
# Normalising here is a no-op if the names are already lower case.
df.columns = df.columns.str.lower()

print(f"Loaded {len(df)} customer records")
print(f"Target distribution: {df['purchase_propensity'].value_counts(normalize=True)}")
df.head()

In [None]:
# Feature selection
features = ['age', 'annual_income', 'age_group_encoded', 'income_segment_encoded',
           'region_north', 'region_south', 'region_east', 'region_west']

X = df[features]
y = df['purchase_propensity']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"Selected features: {features}")
print(f"Feature matrix shape: {X_scaled.shape}")

In [None]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

In [None]:
# Train Logistic Regression model
# class_weight='balanced' is passed so the two target classes are weighted
# rather than treated by raw frequency; random_state fixes the seed.
lr_model = LogisticRegression(random_state=42, class_weight='balanced')
lr_model.fit(X_train, y_train)

# Predictions
# Hard class labels feed the classification report; the positive-class
# probability (column 1 of predict_proba) feeds ROC AUC and the ROC curve.
y_pred_lr = lr_model.predict(X_test)
y_pred_proba_lr = lr_model.predict_proba(X_test)[:, 1]

# roc_auc_score comes from sklearn.metrics (imported in the imports cell).
lr_auc = roc_auc_score(y_test, y_pred_proba_lr)

print("Logistic Regression trained!")
print(f"ROC AUC: {lr_auc:.3f}")

In [None]:
# Train Random Forest model
# 100 trees, fixed seed, and the same class_weight='balanced' setting as the
# logistic regression so both models are trained under the same class weighting.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)

# Predictions
# Hard class labels for the classification report; positive-class probability
# (column 1 of predict_proba) for ROC AUC and the ROC curve.
y_pred_rf = rf_model.predict(X_test)
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# roc_auc_score comes from sklearn.metrics (imported in the imports cell).
rf_auc = roc_auc_score(y_test, y_pred_proba_rf)

print("Random Forest trained!")
print(f"ROC AUC: {rf_auc:.3f}")

In [None]:
# Model comparison
# Parallel lists: position i in each list refers to the same model.
# `models` and `probabilities` are reused by the ROC-curve cell below.
models = ['Logistic Regression', 'Random Forest']
predictions = [y_pred_lr, y_pred_rf]
probabilities = [y_pred_proba_lr, y_pred_proba_rf]

# For each model print the per-class precision/recall/F1 report (from the hard
# labels) followed by ROC AUC (from the positive-class probabilities).
# roc_auc_score comes from sklearn.metrics (imported in the imports cell).
for name, pred, proba in zip(models, predictions, probabilities):
    print(f"\n{name}:")
    print(classification_report(y_test, pred))
    print(f"ROC AUC: {roc_auc_score(y_test, proba):.3f}")

In [None]:
# ROC Curves
# One curve per model on shared axes, plus the diagonal that a random
# classifier would trace, so the models can be compared at a glance.
fig, ax = plt.subplots(figsize=(8, 6))

for model_name, scores in zip(models, probabilities):
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, scores)
    area = auc(false_pos_rate, true_pos_rate)
    ax.plot(false_pos_rate, true_pos_rate, label=f'{model_name} (AUC = {area:.3f})')

ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves - Purchase Propensity Models')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Feature importance (Random Forest)
# Pair each feature name with the importance reported by the fitted forest
# and order the table from most to least important.
feature_importance = (
    pd.DataFrame({'feature': features, 'importance': rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart, most important feature at the top.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance - Purchase Propensity')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

print("Top features driving purchase propensity:")
print(feature_importance.head())

In [None]:
# Generate propensity scores for all customers
# The features are passed through the already-fitted scaler before scoring;
# column 1 of predict_proba is the probability of the positive class.
df['propensity_score'] = rf_model.predict_proba(scaler.transform(df[features]))[:, 1]

# Create propensity segments
# Quartiles are cut on the RANK of the score rather than on the raw score:
# forest probabilities can be heavily tied, and pd.qcut raises
# "Bin edges must be unique" when several quantile edges coincide.
# rank(method='first') gives every row a distinct rank, so four equal-sized
# segments can always be formed. Caveat: customers with identical scores that
# straddle a quartile boundary are separated by row order.
score_rank = df['propensity_score'].rank(method='first')
df['propensity_segment'] = pd.qcut(score_rank, q=4, labels=['Low', 'Medium', 'High', 'Very High'])

print("Propensity score distribution:")
print(df['propensity_segment'].value_counts().sort_index())

# Visualize propensity distribution
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='propensity_score', hue='propensity_segment', multiple='stack')
plt.title('Customer Purchase Propensity Distribution')
plt.xlabel('Propensity Score')
plt.ylabel('Number of Customers')
plt.show()

## Business Recommendations

### Key Insights:
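1. **Top Drivers**: Income and age are the primary drivers of purchase propensity. Note that this holds by construction here, because the synthetic target is defined from income and age; re-check the drivers once a real purchase-based target is available.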
2. **High Propensity Segments**: Focus marketing efforts on high-propensity customers
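3. **Model Performance**: See the ROC AUC values printed in the model comparison section above. Because the synthetic target is a deterministic function of two of the input features, very high scores are expected and do not indicate real-world predictive power.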

### Actionable Recommendations:
1. **Targeted Marketing**: Prioritize high-propensity customers for campaigns
2. **Personalization**: Tailor offers based on propensity scores
3. **Retention Focus**: Develop retention strategies for high-propensity segments
4. **Acquisition Strategy**: Target similar profiles for customer acquisition

### Implementation:
- Integrate propensity scores into CRM
- Use scores for campaign targeting
- Monitor score changes over time
- Update model with new data quarterly

In [None]:
# Collect results
# Only the customer key, score and segment are kept. The frame stays in
# memory - nothing is written to disk or back to Snowflake in this cell.
result_columns = ['customer_id', 'propensity_score', 'propensity_segment']
results_df = df[result_columns]
print(f"Results ready: {len(results_df)} customer propensity scores")

# Close connection
# The Snowflake session is no longer needed once all data has been loaded.
conn.close()
print("Purchase propensity analysis completed!")