<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [3]</a>'.</span>

In [1]:
from datetime import datetime

# Stamp the run with the local wall-clock time at which execution began.
print(f"Execution Start Time: {datetime.now():%Y-%m-%d %H:%M:%S}")


Execution Start Time: 2025-11-11 22:21:56


In [2]:
# No package upgrade is performed in this cell; the notebook runs against
# whatever sagemaker_studio version the environment already provides.
print("Using default sagemaker_studio package from environment")


Using default sagemaker_studio package from environment


# Test 06: Customer Purchase Analytics using DuckDB

This notebook performs data analytics on synthetic customer purchase data to identify key features and trends.

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [3]:
# NOTE(review): in the recorded papermill run this import raised
# "ImportError: cannot import name 'sqlutils' from 'sagemaker_studio'", i.e. the
# sagemaker_studio package installed in the environment does not export `sqlutils`.
# The preceding cell skips the package upgrade and uses the environment default,
# so presumably a sagemaker_studio release that ships `sqlutils` must be installed
# before this notebook can run — TODO confirm the required version.
# Every later cell that calls `sqlutils.sql(...)` depends on this import succeeding.
from sagemaker_studio import sqlutils

ImportError: cannot import name 'sqlutils' from 'sagemaker_studio' (/opt/conda/lib/python3.11/site-packages/sagemaker_studio/__init__.py)

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the synthetic dataset is identical on every run.
# The order of the np.random calls below matters for reproducibility.
np.random.seed(42)

# Create synthetic customer purchase data: one row per customer.
n_customers = 1000
customers = pd.DataFrame({
    "customer_id": np.arange(1, n_customers + 1),  # IDs are 1-based; the row index is 0-based
    "age": np.random.randint(18, 70, size=n_customers),
    "gender": np.random.choice(["Male", "Female"], size=n_customers),
    "annual_income": np.random.normal(70000, 15000, n_customers).round(2),
    "spending_score": np.random.randint(1, 100, n_customers),
    "purchase_count": np.random.poisson(15, n_customers),
    "avg_purchase_value": np.random.uniform(20, 500, n_customers).round(2),
    "churned": np.random.choice([0, 1], size=n_customers, p=[0.8, 0.2])
})

# Derived feature: total spend = number of purchases * average purchase value.
# (Previously this read from an undefined `df`, which raised NameError.)
customers["total_spent"] = (
    customers["purchase_count"] * customers["avg_purchase_value"]
).round(2)
customers.head()

In [None]:
# Row count of the `customers` table referenced by the query
# (presumably resolved to the in-memory `customers` DataFrame — TODO confirm).
sql_output_aixx = sqlutils.sql(
    "SELECT COUNT(*) AS total_customers FROM customers"
)
sql_output_aixx

In [None]:
# Headline aggregates over all customers: count, mean age, mean income,
# mean total spend, and the number of churned customers.
# The adjacent literals concatenate to exactly the original query text.
sql_output_4rnb = sqlutils.sql(
    "SELECT \n"
    "    COUNT(*) AS total_customers,\n"
    "    AVG(age) AS avg_age,\n"
    "    AVG(annual_income) AS avg_income,\n"
    "    AVG(total_spent) AS avg_spent,\n"
    "    SUM(churned) AS churned_customers\n"
    "FROM customers;"
)
sql_output_4rnb

In [None]:
# Per-gender averages (income, spending score, total spend), rounded to two
# decimals and sorted by average total spend, highest first.
# The adjacent literals concatenate to exactly the original query text.
sql_output_avi8 = sqlutils.sql(
    "SELECT \n"
    "    gender,\n"
    "    ROUND(AVG(annual_income), 2) AS avg_income,\n"
    "    ROUND(AVG(spending_score), 2) AS avg_spending_score,\n"
    "    ROUND(AVG(total_spent), 2) AS avg_total_spent\n"
    "FROM customers\n"
    "GROUP BY gender\n"
    "ORDER BY avg_total_spent DESC;"
)
sql_output_avi8

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the mean `total_spent` per gender, one bar per group.
plt.figure(figsize=(8, 5))
(
    customers
    .groupby("gender")["total_spent"]
    .mean()
    .plot.bar(color=["#4C72B0", "#55A868"])
)
plt.title("Average Total Spending by Gender")
plt.ylabel("Average Total Spent ($)")
plt.show()

In [None]:
# Customers whose total spend is strictly above the 90th percentile of
# `total_spent`, listed from highest spender down.
# The adjacent literals concatenate to exactly the original query text.
sql_output_cciq = sqlutils.sql(
    "SELECT *\n"
    "FROM customers\n"
    "WHERE total_spent > (\n"
    "    SELECT PERCENTILE_CONT(0.9) WITHIN GROUP (ORDER BY total_spent) FROM customers\n"
    ")\n"
    "ORDER BY total_spent DESC;"
)
sql_output_cciq

In [None]:
# Bucket customers into spending tiers by `total_spent`
# (< 2000 low, 2000-6000 inclusive medium, otherwise high) and report the
# size and average income of each tier, largest tier first.
# The adjacent literals concatenate to exactly the original query text.
sql_output_6pi2 = sqlutils.sql(
    "SELECT \n"
    "    CASE \n"
    "        WHEN total_spent < 2000 THEN 'Low Spender'\n"
    "        WHEN total_spent BETWEEN 2000 AND 6000 THEN 'Medium Spender'\n"
    "        ELSE 'High Spender'\n"
    "    END AS spending_tier,\n"
    "    COUNT(*) AS num_customers,\n"
    "    ROUND(AVG(annual_income), 2) AS avg_income\n"
    "FROM customers\n"
    "GROUP BY spending_tier\n"
    "ORDER BY num_customers DESC;"
)
sql_output_6pi2

In [None]:
# Customer count per spending tier, using the same tier thresholds as the
# previous query; the result feeds the pie chart in the next cell.
# The adjacent literals concatenate to exactly the original query text.
sql_output_dk2q = sqlutils.sql(
    "SELECT \n"
    "    CASE \n"
    "        WHEN total_spent < 2000 THEN 'Low Spender'\n"
    "        WHEN total_spent BETWEEN 2000 AND 6000 THEN 'Medium Spender'\n"
    "        ELSE 'High Spender'\n"
    "    END AS spending_tier,\n"
    "    COUNT(*) AS num_customers\n"
    "FROM customers\n"
    "GROUP BY spending_tier"
)
sql_output_dk2q

In [None]:
# Pie chart of the tier sizes returned by the spending-tier query:
# wedge sizes come from `num_customers`, wedge labels from `spending_tier`.
plt.figure(figsize=(7, 5))
plt.pie(
    sql_output_dk2q["num_customers"],
    labels=sql_output_dk2q["spending_tier"],
    startangle=140,
    autopct="%1.1f%%",
)
plt.title("Customer Spending Segments")
plt.show()

In [None]:
import seaborn as sns

# Pairwise correlation matrix of the numeric customer features,
# rendered as an annotated heatmap.
corr = customers[
    [
        "age",
        "annual_income",
        "spending_score",
        "purchase_count",
        "avg_purchase_value",
        "total_spent",
    ]
].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(data=corr, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Features and target for churn prediction, taken from the `customers` frame.
# (Previously these read from an undefined `df`, which raised NameError.)
# Note: `total_spent` is derived from purchase_count * avg_purchase_value, so it
# is not independent of those two features.
X = customers[["age", "annual_income", "spending_score", "purchase_count", "avg_purchase_value", "total_spent"]]
y = customers["churned"]

# Hold out 20% of the rows for evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test, y_pred))

In [None]:
import numpy as np

# Feature importances of the fitted model, labelled by feature column and
# sorted ascending so the most important feature is drawn at the top of the
# horizontal bar chart.
importances = (
    pd.Series(model.feature_importances_, index=X.columns)
    .sort_values(ascending=True)
)
plt.figure(figsize=(8, 5))
importances.plot.barh(color="#66c2a5")
plt.title("Feature Importance for Churn Prediction")
plt.show()

In [None]:
# One row per held-out customer: true churn label next to the model's prediction.
# `X_test.index` holds the 0-based row labels of `customers`, whereas
# `customer_id` is 1-based, so the real IDs are looked up by row label instead
# of using the index values directly (which would be off by one).
churn_results = pd.DataFrame({
    "customer_id": customers.loc[X_test.index, "customer_id"].to_numpy(),
    "actual": y_test.values,
    "predicted": y_pred
})

In [None]:
# Summary of the held-out predictions in `churn_results`: number of rows
# tested, number predicted as churn, and accuracy as a percentage
# (matching rows * 100.0 / total rows).
# The adjacent literals concatenate to exactly the original query text.
sql_output_3n8o = sqlutils.sql(
    "SELECT \n"
    "    COUNT(*) AS total_tested,\n"
    "    SUM(predicted) AS predicted_churns,\n"
    "    SUM(CASE WHEN actual = predicted THEN 1 ELSE 0 END) * 100.0 / COUNT(*) AS accuracy\n"
    "FROM churn_results"
)
sql_output_3n8o