# 📊 Customer Segmentation Analysis with AWS Glue & Spark

This notebook demonstrates how to:
- Connect to AWS Glue Data Catalog
- Query the `sagemaker_sample_db.churn` table
- Perform customer segmentation analysis using PySpark
- Generate insights and visualizations for business strategy

**Dataset:** Telecom customer data with usage patterns and service metrics

## ⚡ Compute Environment: AWS Glue Serverless

This notebook runs on **AWS Glue serverless compute** with PySpark for distributed data processing.

In [None]:
# Install required packages
!pip install -q -r requirements.txt

In [None]:
%%pyspark default.spark
# Session sizing for the Spark connection: the value 10 is passed to the
# %number_of_workers magic, with -f supplied as a flag.
# NOTE(review): the --name argument "(default.spark.compatibility)" is parenthesized
# and differs from the "default.spark" connection named on the %%pyspark line above
# -- confirm this is the intended connection name.
%number_of_workers --name (default.spark.compatibility) 10 -f

## 📥 Setup: Import Libraries and Load Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Global look-and-feel applied to every figure drawn later in the notebook.
PLOT_STYLE = 'seaborn-v0_8-poster'
SEABORN_CONTEXT = 'talk'
SEABORN_FONT_SCALE = 1.25

plt.style.use(PLOT_STYLE)
sns.set_context(SEABORN_CONTEXT, font_scale=SEABORN_FONT_SCALE)

In [None]:
%%pyspark default.spark
# Spark SQL column functions; `F` is the alias the %%pyspark analysis cells use.
import pyspark.sql.functions as F  # noqa: E402
# NOTE: this binds the name `round` to Spark's column-rounding function, which
# shadows Python's built-in round() in this namespace. Prefer F.round in new code.
from pyspark.sql.functions import round

In [None]:
%%pyspark default.spark

# Load data from Glue table
df_raw = spark.table("sagemaker_sample_db.churn")

# Filter out all header rows and invalid data
df = df_raw.filter(
    ~F.col("state").isin(["State", "state"]) & 
    ~F.col("vmail_plan").isin(["VMail Plan", "vmail_plan"]) &
    ~F.col("intl_plan").isin(["Int'l Plan", "intl_plan"]) &
    ~F.col("custserv_calls").isin(["CustServ Calls", "custserv_calls"]) &
    ~F.col("account_length").isin(["Account Length", "account_length"]) &
    F.col("account_length").rlike("^[0-9]+$")  # Only numeric account lengths
)
df.persist()

print(f"üìä Dataset loaded successfully!")
print(f"üìà Raw records: {df_raw.count():,}")
print(f"üìà Clean records: {df.count():,}")
print(f"üìã Total columns: {len(df.columns)}")

# Show dataset
print("\nüîç Dataset:")
df

## 🔍 Data Exploration

In [None]:
%%pyspark default.spark

# Show a few sample records with selected columns only
print("üìã Sample Customer Records:")
df.select("state", "account_length", "intl_plan", "vmail_plan", round("day_mins", 2).alias("day_mins"), "custserv_calls").limit(3)

In [None]:
%%pyspark default.spark

# Basic dataset overview
print("üìä DATASET OVERVIEW")
print("=" * 25)

total_customers = df.count()
print(f"üìà Total Customers: {total_customers:,}")
print(f"üìã Ready for customer segmentation analysis!")


## 📈 Customer Segmentation Analysis

In [None]:
%%pyspark default.spark

print("üó∫Ô∏è CUSTOMERS BY STATE")
print("=" * 25)
# Customer distribution by state

state_customers = df.groupBy("state").agg(
        F.count("*").alias("total_customers"),
        F.round(F.avg("day_mins"), 2).alias("avg_day_mins"),
        F.round(F.avg("day_charge"), 2).alias("avg_day_charge")
    ).orderBy(F.desc("total_customers"))

print("\nüîù Top 10 States by Customer Count:")
state_customers.limit(10)

In [None]:
%%pyspark default.spark

print("üåç INTERNATIONAL PLAN ANALYSIS")
print("=" * 40)
# International plan analysis

intl_plan_stats = df.groupBy("intl_plan").agg(
        F.count("*").alias("customers"),
        F.round(F.avg("intl_mins"), 2).alias("avg_intl_mins"),
        F.round(F.avg("intl_charge"), 2).alias("avg_intl_charge")
    )

intl_plan_stats

In [None]:
%%pyspark default.spark
# Voicemail plan analysis
print("\nüìß VOICEMAIL PLAN ANALYSIS")

vmail_plan_stats = df.groupBy("vmail_plan").agg(
        F.count("*").alias("customers"),
        F.round(F.avg("vmail_message"), 1).alias("avg_vmail_messages")
    )

vmail_plan_stats

In [None]:
%%pyspark default.spark

print("üìû CUSTOMER SERVICE ANALYSIS")
print("=" * 40)
# Customer service calls analysis

service_calls_dist = df.groupBy("custserv_calls").count().orderBy("custserv_calls")

service_calls_dist.show()

# Convert to pandas for visualization
service_calls_pd = service_calls_dist.toPandas()
%push service_calls_pd --force

In [None]:
%pop service_calls_pd

plt.figure(figsize=(10, 6))
sns.barplot(data=service_calls_pd, x="custserv_calls", y="count", palette="Reds_r")
plt.title("Distribution of Customer Service Calls")
plt.xlabel("Number of Service Calls")
plt.ylabel("Number of Customers")
plt.tight_layout()
plt.show()

In [None]:
%%pyspark default.spark

# Summary insights

# Calculate missing percentages for summary
high_usage = df.filter(
    (F.col("day_mins") > 200) | 
    (F.col("eve_mins") > 200) | 
    (F.col("night_mins") > 100)
)
high_usage_count = high_usage.count()
high_usage_pct = (high_usage_count / total_customers) * 100

high_service_calls = df.filter(F.col("custserv_calls") >= 4)
high_service_count = high_service_calls.count()
high_service_pct = (high_service_count / total_customers) * 100
print("üí° KEY INSIGHTS")
print("=" * 20)

print(f"üìä Total customers analyzed: {total_customers:,}")
print(f"üìà High usage customers: {high_usage_pct:.1f}%")
print(f"‚ö†Ô∏è High service call customers: {high_service_pct:.1f}%")

# Clean up
df.unpersist()
print("‚úÖ Analysis complete!")