
Customer Transaction Analysis Project

Problem Statement:
This PySpark script solves a business analytics problem where we need to:
1. Calculate total transaction amounts per customer from order data
2. Rank customers based on their spending
3. Identify the 3rd highest-spending customer for targeted marketing

Input Data:
- Customers DataFrame: Contains customer details (id, first_name, last_name, city, address, phone_number)
- Card_Orders DataFrame: Contains transaction records (order_id, cust_id, order_date, order_details, total_order_cost)

Key Operations:
1. Joins customer data with transaction records
2. Aggregates total spending per customer
3. Uses window functions to rank customers by spending
4. Filters to extract the 3rd highest spender

Business Value:
- Enables customer segmentation by value
- Supports loyalty/rewards program decisions
- Identifies high-value customers for retention efforts

Technical Stack:
- PySpark (DataFrames, SQL functions, Window operations)
- Local Spark session (can scale to cluster)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Spark session: local master, application name "CustomerTransactions".
# getOrCreate() returns the active session when one already exists.
spark = (
    SparkSession.builder
    .master("local")
    .appName("CustomerTransactions")
    .getOrCreate()
)

# Customers: one row per customer.
customers_columns = [
    "id",
    "first_name",
    "last_name",
    "city",
    "address",
    "phone_number",
]
customers_data = [
    (1, "Jill", "Doe", "New York", "123 Main St", "555-1234"),
    (2, "Henry", "Smith", "Los Angeles", "456 Oak Ave", "555-5678"),
    (3, "William", "Johnson", "Chicago", "789 Pine Rd", "555-8765"),
    (4, "Emma", "Daniel", "Houston", "321 Maple Dr", "555-4321"),
    (5, "Charlie", "Davis", "Phoenix", "654 Elm St", "555-6789"),
]
customers_df = spark.createDataFrame(customers_data, schema=customers_columns)

# Card orders: one row per order; cust_id refers to customers.id.
card_orders_columns = [
    "order_id",
    "cust_id",
    "order_date",
    "order_details",
    "total_order_cost",
]
card_orders_data = [
    (1, 1, "2024-11-01 10:00:00", "Electronics", 200),
    (2, 2, "2024-11-02 11:30:00", "Groceries", 150),
    (3, 1, "2024-11-03 15:45:00", "Clothing", 120),
    (4, 3, "2024-11-04 09:10:00", "Books", 90),
    (8, 3, "2024-11-08 10:20:00", "Groceries", 130),
    (9, 1, "2024-11-09 12:00:00", "Books", 180),
    (10, 4, "2024-11-10 11:15:00", "Electronics", 200),
    (11, 5, "2024-11-11 14:45:00", "Furniture", 150),
    (12, 2, "2024-11-12 09:30:00", "Furniture", 180),
]
# order_date is supplied as a string, so it is cast to a timestamp right
# after the DataFrame is built.
card_orders_df = spark.createDataFrame(
    card_orders_data, schema=card_orders_columns
).withColumn("order_date", F.col("order_date").cast("timestamp"))

# Step 1: Total transaction amount per customer.
# Inner join on customers.id == card_orders.cust_id, so customers without any
# order do not appear in the result.
customer_orders = customers_df.join(
    card_orders_df,
    on=customers_df["id"] == card_orders_df["cust_id"],
    how="inner",
)

# One output row per customer, with the summed order cost.
customer_transactions = customer_orders.groupBy(
    customers_df["id"],
    customers_df["first_name"],
    customers_df["last_name"],
).agg(
    F.sum(card_orders_df["total_order_cost"]).alias("total_transaction_amount")
)
customer_transactions.show()

# Step 2: Rank the customers based on their total transaction amount,
# highest spender first.
#
# dense_rank() is used rather than rank(): rank() leaves gaps after ties
# (e.g. 1, 2, 2, 4), so a tie for second place would leave no row with
# rank == 3 and Step 3 would return nothing. dense_rank() numbers the distinct
# totals consecutively (1, 2, 2, 3), so rank 3 is always the third-highest
# total whenever at least three distinct totals exist.
#
# NOTE: the window has no partitionBy, so Spark evaluates it on a single
# partition. That is acceptable here because the input has already been
# reduced to one row per customer.
window_spec = Window.orderBy(F.desc('total_transaction_amount'))

ranked_transactions = customer_transactions.withColumn(
    'rank', F.dense_rank().over(window_spec)
)
ranked_transactions.show()

# Step 3: Select the customer(s) with the third-highest total transaction
# amount. More than one row is returned when several customers share that
# total; no row is returned when there are fewer than three distinct totals.
third_highest_customer = (
    ranked_transactions
    .filter(F.col('rank') == 3)
    .select('id', 'first_name', 'last_name')
)

third_highest_customer.show()

