In [1]:
# Demo: Read YouTube Bronze Data with PySpark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, LongType

# Configure a local Spark session that uses every available core ("local[*]").
_session_builder = SparkSession.builder.master("local[*]")
_session_builder = _session_builder.appName("youtube-bronze-demo")

# getOrCreate() hands back an already-running session if one exists.
spark = _session_builder.getOrCreate()

print(f"Spark version: {spark.version}")
# Leave the session as the cell's last expression so the notebook renders it.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Spark version: 3.5.4-amzn-0


In [3]:
import glob as pyglob

BRONZE_PATH = "/home/hadoop/workspace/data/bronze/metadata/source=channel"

# Load every compacted Bronze JSONL file under BRONZE_PATH into `raw_df`.
# The files are read on the driver with plain Python and handed to Spark as an
# RDD of JSON strings, one record per line.
# NOTE(review): presumably this avoids spark.read.json(<path>) because Spark's
# file sources skip files whose names start with "_" -- confirm before changing.

# sorted() makes the load order reproducible; glob's own order depends on the
# filesystem.
files = sorted(pyglob.glob(f"{BRONZE_PATH}/dt=*/*/_compacted.jsonl"))
if not files:
    # Fail here with a clear message: with no input, Spark would infer no
    # columns and the failure would only surface in a later cell.
    raise FileNotFoundError(
        f"No _compacted.jsonl files found under {BRONZE_PATH}/dt=*/*/"
    )

lines = []
for path in files:
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(path, encoding="utf-8") as fh:
        # Keep only non-blank lines, stripped of surrounding whitespace.
        lines.extend(stripped for stripped in map(str.strip, fh) if stripped)

raw_df = spark.read.json(spark.sparkContext.parallelize(lines))


                                                                                

In [5]:
# Project the nested Bronze records into a flat, Silver-like analytical view.
def _as_long(source_path, output_name):
    """Return the column at `source_path` cast to LongType, named `output_name`."""
    return F.col(source_path).cast(LongType()).alias(output_name)


_silver_columns = [
    F.col("id").alias("video_id"),
    F.col("snippet.channelId").alias("channel_id"),
    F.col("snippet.channelTitle").alias("channel_name"),
    F.col("snippet.title").alias("title"),
    F.col("snippet.publishedAt").alias("published_at"),
    _as_long("statistics.viewCount", "view_count"),
    _as_long("statistics.likeCount", "like_count"),
    _as_long("statistics.commentCount", "comment_count"),
    F.col("contentDetails.duration").alias("duration"),
]

# Likes per view, rounded to four decimal places.
_like_view_ratio = F.round(F.col("like_count") / F.col("view_count"), 4)

silver_df = raw_df.select(*_silver_columns).withColumn("like_view_ratio", _like_view_ratio)

silver_df.show(truncate=50)

+-----------+------------------------+---------------------+--------------------------------------------------+--------------------+----------+----------+-------------+----------+---------------+
|   video_id|              channel_id|         channel_name|                                             title|        published_at|view_count|like_count|comment_count|  duration|like_view_ratio|
+-----------+------------------------+---------------------+--------------------------------------------------+--------------------+----------+----------+-------------+----------+---------------+
|7ysrM5BMe6Q|UC8butISFwT-Wl7EV0hUK0BQ|     freeCodeCamp.org|Don't get so wrapped up in the details that you...|2026-02-12T12:52:59Z|      2542|        20|            1|     PT58S|         0.0079|
|KUbbb0T1YZ8|UC8butISFwT-Wl7EV0hUK0BQ|     freeCodeCamp.org| Have you ever wondered where Python got its name?|2026-02-11T13:29:30Z|      3660|        18|            1|     PT12S|         0.0049|
|UJjKVceWmdM|UC8butI

In [None]:
# Rank videos by like/view ratio, highest first, and print the first ten rows.
print("=== Top 10 Videos by Engagement ===")

_engagement_columns = ["channel_name", "title", "view_count", "like_count", "like_view_ratio"]
_ranked_by_engagement = silver_df.orderBy(F.desc("like_view_ratio"))

# Cell values longer than 60 characters are truncated in the printed table.
_ranked_by_engagement.select(*_engagement_columns).show(10, truncate=60)

In [None]:
# Roll the per-video rows up to one summary row per channel.
print("=== Channel Summary ===")

_channel_metrics = [
    F.count("*").alias("video_count"),
    F.sum("view_count").alias("total_views"),
    F.sum("like_count").alias("total_likes"),
    # Mean of the per-video like/view ratios, rounded to four decimal places.
    F.round(F.avg("like_view_ratio"), 4).alias("avg_engagement"),
]
_per_channel = silver_df.groupBy("channel_name").agg(*_channel_metrics)

# Channels with the most total views come first; print without truncation.
_per_channel.orderBy(F.desc("total_views")).show(truncate=False)

In [None]:
# Stop the Spark session created at the top of the notebook now that the demo is done.
spark.stop()