In [1]:
import os
import sys
import math

import subprocess
import altair as alt
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
try:
    java_home = subprocess.check_output(['/usr/libexec/java_home', '-v', '17']).decode('utf-8').strip()
    os.environ["JAVA_HOME"] = java_home
    print(f"Java found at: {java_home}")
except subprocess.CalledProcessError:
    print("JAVA 17 NOT FOUND. Please run 'brew install openjdk@17' in your terminal.")

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

Java found at: /opt/homebrew/Cellar/openjdk@17/17.0.17/libexec/openjdk.jdk/Contents/Home


In [4]:
spark = (
    SparkSession.builder
    .appName('Airport Traffic')
    .master('local[4]')
    .config('spark.executor.memory', '2g')
    .config('spark.executor.cores', '2')
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "1")
    .config("spark.dynamicAllocation.maxExecutors", "4")
    .config('spark.executor.memoryOverhead', '512m')
    .config("spark.driver.memory", "2g")
    .config("spark.driver.maxResultSize", "2g")
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '64mb')
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config('spark.dynamicAllocation.executorIdleTimeout', '60s')
    .config('spark.sql.autoBroadcastJoinThreshold', '512mb')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/02 14:44:44 WARN Utils: Your hostname, Zygimantass-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.43.73.162 instead (on interface en0)
25/12/02 14:44:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/02 14:44:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
df_youtube_path = '/Users/zygimantas/Documents/DataSets/youtube_tech_videos_20251120_133004.csv'
df_youtube_path_v2 = '/Users/zygimantas/Documents/DataSets/youtube_tech_channels_20251120_133753.csv'

In [6]:
df_youtube = spark.read.csv(df_youtube_path, header=True, inferSchema=True)

In [15]:
df_youtube.createOrReplaceTempView('youtube')

In [7]:
df_youtube.show(5)

+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|   video_id|               title|       published_at|  views| likes|comments|  duration|           thumbnail|           video_url|          channel_id|   channel_name|          scraped_at|duration_readable|
+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|zzlS5dtzKKA|Most Important LT...|2025-11-19 00:47:05| 211782|  4012|      88|     PT27S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|2025-11-20 13:37:...|             0:27|
|Cmnfwabz0bA|This TV Factory i...|2025-11-18 21:20:33| 572162| 29769|    1876|  PT13M43S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|

In [8]:
df_youtube.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- published_at: timestamp (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- comments: integer (nullable = true)
 |-- duration: string (nullable = true)
 |-- thumbnail: string (nullable = true)
 |-- video_url: string (nullable = true)
 |-- channel_id: string (nullable = true)
 |-- channel_name: string (nullable = true)
 |-- scraped_at: timestamp (nullable = true)
 |-- duration_readable: string (nullable = true)



In [9]:
df_youtube.count()

1300

In [17]:
spark.sql(
    "select count(*) from youtube"
).show()

+--------+
|count(1)|
+--------+
|    1300|
+--------+



In [19]:
spark.sql(
    "SELECT * FROM youtube"
).show(5)

+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|   video_id|               title|       published_at|  views| likes|comments|  duration|           thumbnail|           video_url|          channel_id|   channel_name|          scraped_at|duration_readable|
+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|zzlS5dtzKKA|Most Important LT...|2025-11-19 00:47:05| 211782|  4012|      88|     PT27S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|2025-11-20 13:37:...|             0:27|
|Cmnfwabz0bA|This TV Factory i...|2025-11-18 21:20:33| 572162| 29769|    1876|  PT13M43S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|

In [22]:
spark.sql(
    "SELECT title, views, published_at FROM youtube"
).show(5)

+--------------------+-------+-------------------+
|               title|  views|       published_at|
+--------------------+-------+-------------------+
|Most Important LT...| 211782|2025-11-19 00:47:05|
|This TV Factory i...| 572162|2025-11-18 21:20:33|
|The Hardest Secre...| 523009|2025-11-15 08:24:56|
|Correct Way to Ap...|3002514|2025-11-14 20:27:41|
|Valve’s New Conso...|4399002|2025-11-13 15:54:48|
+--------------------+-------+-------------------+
only showing top 5 rows


<llm-snippet-file>youtube_tech_videos.ipynb</llm-snippet-file>


In [26]:
df_youtube.select(
    'title', 'views', 'published_at'
).show(5)


+--------------------+-------+-------------------+
|               title|  views|       published_at|
+--------------------+-------+-------------------+
|Most Important LT...| 211782|2025-11-19 00:47:05|
|This TV Factory i...| 572162|2025-11-18 21:20:33|
|The Hardest Secre...| 523009|2025-11-15 08:24:56|
|Correct Way to Ap...|3002514|2025-11-14 20:27:41|
|Valve’s New Conso...|4399002|2025-11-13 15:54:48|
+--------------------+-------+-------------------+
only showing top 5 rows


In [30]:
df_youtube.describe()

DataFrame[summary: string, video_id: string, title: string, views: string, likes: string, comments: string, duration: string, thumbnail: string, video_url: string, channel_id: string, channel_name: string, duration_readable: string]

In [31]:
df_youtube.summary()

DataFrame[summary: string, video_id: string, title: string, views: string, likes: string, comments: string, duration: string, thumbnail: string, video_url: string, channel_id: string, channel_name: string, duration_readable: string]

In [34]:
df_youtube.filter(
    F.col('views') > 10000000
).show(truncate=False)

+-----------+------------------------------------------------------+-------------------+--------+------+--------+----------+------------------------------------------------+-------------------------------------------+------------------------+------------------+--------------------------+-----------------+
|video_id   |title                                                 |published_at       |views   |likes |comments|duration  |thumbnail                                       |video_url                                  |channel_id              |channel_name      |scraped_at                |duration_readable|
+-----------+------------------------------------------------------+-------------------+--------+------+--------+----------+------------------------------------------------+-------------------------------------------+------------------------+------------------+--------------------------+-----------------+
|5UGIbURj508|Top 4 agentic experiences for Gemini in Android Studio|2025-11-11 

<llm-snippet-file>youtube_tech_videos.ipynb</llm-snippet-file>


In [36]:
spark.sql(
    "SELECT * FROM youtube "
    "where views > 10000000"
).show()


+-----------+--------------------+-------------------+--------+------+--------+----------+--------------------+--------------------+--------------------+------------------+--------------------+-----------------+
|   video_id|               title|       published_at|   views| likes|comments|  duration|           thumbnail|           video_url|          channel_id|      channel_name|          scraped_at|duration_readable|
+-----------+--------------------+-------------------+--------+------+--------+----------+--------------------+--------------------+--------------------+------------------+--------------------+-----------------+
|5UGIbURj508|Top 4 agentic exp...|2025-11-11 02:06:37|79601182|164140|      14|    PT1M6S|https://i.ytimg.c...|https://www.youtu...|UCVHFbqXqoYvEWM1D...|Android Developers|2025-11-20 13:37:...|             1:06|
|MnbZLgjfLhQ|The Carpenter Who...|2025-08-28 16:03:46|10335657|477913|    6761|   PT2M32S|https://i.ytimg.c...|https://www.youtu...|UC68KSmHePPePCjW4...

In [39]:
spark.sql("""
    select * from youtube
""").show(10)

+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|   video_id|               title|       published_at|  views| likes|comments|  duration|           thumbnail|           video_url|          channel_id|   channel_name|          scraped_at|duration_readable|
+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|zzlS5dtzKKA|Most Important LT...|2025-11-19 00:47:05| 211782|  4012|      88|     PT27S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|2025-11-20 13:37:...|             0:27|
|Cmnfwabz0bA|This TV Factory i...|2025-11-18 21:20:33| 572162| 29769|    1876|  PT13M43S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|

In [40]:
df_youtube.filter(
    F.col('channel_name') == "Linus Tech Tips"
).show()

+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|   video_id|               title|       published_at|  views| likes|comments|  duration|           thumbnail|           video_url|          channel_id|   channel_name|          scraped_at|duration_readable|
+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|zzlS5dtzKKA|Most Important LT...|2025-11-19 00:47:05| 211782|  4012|      88|     PT27S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|2025-11-20 13:37:...|             0:27|
|Cmnfwabz0bA|This TV Factory i...|2025-11-18 21:20:33| 572162| 29769|    1876|  PT13M43S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|

In [42]:
spark.sql("""
    SELECT * FROM youtube
    WHERE channel_name = 'Linus Tech Tips'
""").show(5)

+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|   video_id|               title|       published_at|  views| likes|comments|  duration|           thumbnail|           video_url|          channel_id|   channel_name|          scraped_at|duration_readable|
+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|zzlS5dtzKKA|Most Important LT...|2025-11-19 00:47:05| 211782|  4012|      88|     PT27S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|2025-11-20 13:37:...|             0:27|
|Cmnfwabz0bA|This TV Factory i...|2025-11-18 21:20:33| 572162| 29769|    1876|  PT13M43S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|

In [43]:
df_youtube.printSchema

<bound method DataFrame.printSchema of DataFrame[video_id: string, title: string, published_at: timestamp, views: int, likes: int, comments: int, duration: string, thumbnail: string, video_url: string, channel_id: string, channel_name: string, scraped_at: timestamp, duration_readable: string]>

In [47]:
df_youtube.filter(
    (F.col('views') > 500_000) & (F.col('likes') > 10_000)
).show(5)

+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|   video_id|               title|       published_at|  views| likes|comments|  duration|           thumbnail|           video_url|          channel_id|   channel_name|          scraped_at|duration_readable|
+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|Cmnfwabz0bA|This TV Factory i...|2025-11-18 21:20:33| 572162| 29769|    1876|  PT13M43S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|2025-11-20 13:37:...|            13:43|
|dnKy4UEdiyo|The Hardest Secre...|2025-11-15 08:24:56| 523009| 10052|    1798|PT4H30M52S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|

In [50]:
spark.sql("""
          SELECT *
          FROM youtube
          where views > 500000
            and likes > 10000
          """).show(5)

+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|   video_id|               title|       published_at|  views| likes|comments|  duration|           thumbnail|           video_url|          channel_id|   channel_name|          scraped_at|duration_readable|
+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|Cmnfwabz0bA|This TV Factory i...|2025-11-18 21:20:33| 572162| 29769|    1876|  PT13M43S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|2025-11-20 13:37:...|            13:43|
|dnKy4UEdiyo|The Hardest Secre...|2025-11-15 08:24:56| 523009| 10052|    1798|PT4H30M52S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|

In [54]:
df_youtube.filter(
    F.col('title').isNotNull()
).show(5)

+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|   video_id|               title|       published_at|  views| likes|comments|  duration|           thumbnail|           video_url|          channel_id|   channel_name|          scraped_at|duration_readable|
+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|zzlS5dtzKKA|Most Important LT...|2025-11-19 00:47:05| 211782|  4012|      88|     PT27S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|2025-11-20 13:37:...|             0:27|
|Cmnfwabz0bA|This TV Factory i...|2025-11-18 21:20:33| 572162| 29769|    1876|  PT13M43S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|

In [55]:
spark.sql("""
  select * from youtube
  where title is not null
""").show(5)

+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|   video_id|               title|       published_at|  views| likes|comments|  duration|           thumbnail|           video_url|          channel_id|   channel_name|          scraped_at|duration_readable|
+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+---------------+--------------------+-----------------+
|zzlS5dtzKKA|Most Important LT...|2025-11-19 00:47:05| 211782|  4012|      88|     PT27S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|2025-11-20 13:37:...|             0:27|
|Cmnfwabz0bA|This TV Factory i...|2025-11-18 21:20:33| 572162| 29769|    1876|  PT13M43S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|Linus Tech Tips|

In [56]:
df_youtube.orderBy('views', ascending=False).show(5)

+-----------+--------------------+-------------------+--------+------+--------+-----------+--------------------+--------------------+--------------------+------------------+--------------------+-----------------+
|   video_id|               title|       published_at|   views| likes|comments|   duration|           thumbnail|           video_url|          channel_id|      channel_name|          scraped_at|duration_readable|
+-----------+--------------------+-------------------+--------+------+--------+-----------+--------------------+--------------------+--------------------+------------------+--------------------+-----------------+
|5UGIbURj508|Top 4 agentic exp...|2025-11-11 02:06:37|79601182|164140|      14|     PT1M6S|https://i.ytimg.c...|https://www.youtu...|UCVHFbqXqoYvEWM1D...|Android Developers|2025-11-20 13:37:...|             1:06|
|G3e-cpL7ofc|HTML & CSS Full C...|2022-02-05 18:41:52|17244882|313427|   16969| PT6H31M24S|https://i.ytimg.c...|https://www.youtu...|UCB6dvaWu0N8uVq

In [57]:
spark.sql("""
    SELECT * FROM youtube
      ORDER BY views DESC
""").show(5)

+-----------+--------------------+-------------------+--------+------+--------+-----------+--------------------+--------------------+--------------------+------------------+--------------------+-----------------+
|   video_id|               title|       published_at|   views| likes|comments|   duration|           thumbnail|           video_url|          channel_id|      channel_name|          scraped_at|duration_readable|
+-----------+--------------------+-------------------+--------+------+--------+-----------+--------------------+--------------------+--------------------+------------------+--------------------+-----------------+
|5UGIbURj508|Top 4 agentic exp...|2025-11-11 02:06:37|79601182|164140|      14|     PT1M6S|https://i.ytimg.c...|https://www.youtu...|UCVHFbqXqoYvEWM1D...|Android Developers|2025-11-20 13:37:...|             1:06|
|G3e-cpL7ofc|HTML & CSS Full C...|2022-02-05 18:41:52|17244882|313427|   16969| PT6H31M24S|https://i.ytimg.c...|https://www.youtu...|UCB6dvaWu0N8uVq