# Transforming Data with Joins & Aggregations

In [0]:
# Import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum, avg, round, countDistinct

In [0]:

# Load our streaming events dataset
events_path = "/pyspark/video-streaming-data/module3-transform/joins_aggregations/streaming_events.csv"
events_df = spark.read.option("header", "true").option("inferSchema", "true").csv(events_path)

# Load our reference datasets
users_path = "/pyspark/video-streaming-data/module3-transform/joins_aggregations/users.csv"
users_df = spark.read.option("header", "true").option("inferSchema", "true").csv(users_path)

content_path = "/pyspark/video-streaming-data/module3-transform/joins_aggregations/content_catalog.csv"
content_df = spark.read.option("header", "true").option("inferSchema", "true").csv(content_path)

In [0]:
# Look at the schema of our datasets to understand the join keys
print("Events Schema (preview):")
events_df.printSchema()

print("\nUsers Schema (preview):")
users_df.printSchema()

print("\nContent Schema (preview):")
content_df.printSchema()

Events Schema (preview):
root
 |-- event_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- content_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- duration_seconds: integer (nullable = true)
 |-- device_type: string (nullable = true)
 |-- quality: string (nullable = true)
 |-- buffering_count: integer (nullable = true)
 |-- error_type: string (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- country: string (nullable = true)
 |-- session_id: string (nullable = true)


Users Schema (preview):
root
 |-- user_id: string (nullable = true)
 |-- signup_date: date (nullable = true)
 |-- subscription_tier: string (nullable = true)
 |-- last_billing_date: date (nullable = true)
 |-- account_status: string (nullable = true)
 |-- preferred_genres: string (nullable = true)
 |-- age_group: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- language: string (nullable = true)


Content Schema (preview):
root
 |-- con

### Joins

In [0]:
# Basic INNER JOIN to bring user information to our events
events_with_users = events_df.join(
    users_df,
    on="user_id",
    how="inner"
)

print("Events joined with Users:")
events_with_users.select("event_id", "user_id", "subscription_tier", "age_group").limit(3).display()

# Check how many events we have before and after the join
print(f"Events before join: {events_df.count()}")
print(f"Events after inner join with users: {events_with_users.count()}")

Events joined with Users:


event_id,user_id,subscription_tier,age_group
EVT10019,USR11535,Standard,18-24
EVT10022,USR11351,Basic,55-64
EVT10043,USR11021,Basic,"""""romance"""""


Events before join: 50000
Events after inner join with users: 5054


In [0]:
# LEFT JOIN to keep all events, even if the user isn't in our users table
events_with_users_left = events_df.join(
    users_df,
    on="user_id",
    how="left"
)

print(f"Events after left join with users: {events_with_users_left.count()}")

# Join with the content catalog to get content details
events_enriched = events_with_users_left.join(
    content_df,
    on="content_id",
    how="left"
)

print("Fully enriched events:")
events_enriched.select("event_id", "user_id", "subscription_tier", "content_id", "title", "genre").limit(3).display()


Events after left join with users: 50000
Fully enriched events:


event_id,user_id,subscription_tier,content_id,title,genre
EVT10000,USR41813,,CON10763,Business-focused even-keeled data-warehouse,Thriller
EVT10001,USR46484,,CON12784,,
EVT10002,USR37573,,CON16367,,


### Aggregations

In [0]:
# Basic aggregation: Count events by subscription tier
events_by_tier = events_enriched.groupBy("subscription_tier") \
    .agg(count("*").alias("event_count")) \
    .orderBy(col("event_count").desc())

print("Events by subscription tier:")
events_by_tier.display()

Events by subscription tier:


subscription_tier,event_count
,44946
Premium,1711
Standard,1693
Basic,1650


In [0]:
# Average streaming duration by content genre
avg_duration_by_genre = events_enriched.groupBy("genre") \
    .agg(
        count("*").alias("event_count"),
        round(avg("duration_seconds"), 2).alias("avg_duration_seconds")
    ) \
    .orderBy(col("avg_duration_seconds").desc())

print("Average streaming duration by content genre:")
avg_duration_by_genre.limit(5).display()

Average streaming duration by content genre:


genre,event_count,avg_duration_seconds
Sci-Fi,342,5137.73
Comedy,261,5024.74
Thriller,408,4601.58
Documentary,349,4563.66
Family,373,4474.03


In [0]:
# Complex aggregation: Content performance by subscription tier and genre
content_performance = events_enriched.groupBy("subscription_tier", "genre") \
    .agg(
        countDistinct("user_id").alias("unique_viewers"),
        round(avg("duration_seconds") / 60, 2).alias("avg_minutes_per_view"),
        round(sum("duration_seconds") / 3600, 2).alias("total_hours_watched")
    ) \
    .orderBy(col("subscription_tier"), col("total_hours_watched").desc())

print("Content performance by subscription tier and genre:")
content_performance.limit(5).display()

Content performance by subscription tier and genre:


subscription_tier,genre,unique_viewers,avg_minutes_per_view,total_hours_watched
,,26624,71.64,48266.94
,Thriller,370,77.24,477.58
,Sci-Fi,292,85.99,419.94
,Family,342,73.12,419.23
,Animation,350,71.18,417.58


In [0]:
# Save our enriched dataset for downstream analytics
output_path = "pyspark/video-streaming-data/module3-transform/joins_aggregations/enriched_events"

# Save as Parquet (columnar format)
events_enriched.write.mode("overwrite").parquet(output_path)

print(f"Enriched data saved to {output_path}")

Enriched data saved to pyspark/video-streaming-data/module3-transform/joins_aggregations/enriched_events
