In [6]:
import os
import sys
import math

import subprocess
import altair as alt
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [7]:
# Set both PySpark interpreter variables to the Python executable running this kernel.
for env_key in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[env_key] = sys.executable

In [8]:
# Ask the `/usr/libexec/java_home` helper for a Java 17 installation and export
# the result as JAVA_HOME. Failures are reported but never abort the cell, so the
# interpreter variables below are always set.
try:
    java_home = subprocess.check_output(['/usr/libexec/java_home', '-v', '17']).decode('utf-8').strip()
except subprocess.CalledProcessError:
    # The helper ran but exited non-zero, i.e. it found no matching JDK.
    print("JAVA 17 NOT FOUND. Please run 'brew install openjdk@17' in your terminal.")
except OSError as lookup_error:
    # The helper itself could not be executed (missing or not executable, as on
    # hosts that do not ship it). Keep whatever JAVA_HOME is already configured.
    print(
        f"Could not run /usr/libexec/java_home ({lookup_error}); "
        f"leaving JAVA_HOME as {os.environ.get('JAVA_HOME', '<unset>')!r}."
    )
else:
    # Only export JAVA_HOME when the lookup actually produced a path.
    os.environ["JAVA_HOME"] = java_home
    print(f"Java found at: {java_home}")

# (Re)assert the interpreter variables so this cell is self-contained.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

Java found at: /opt/homebrew/Cellar/openjdk@17/17.0.17/libexec/openjdk.jdk/Contents/Home


In [9]:
# Spark options for this session, kept in a single mapping so they can be read
# and tuned in one place. Keys and values are handed to the builder verbatim.
# NOTE(review): the master is 'local[4]'; confirm that the executor and
# dynamic-allocation options below have any effect in local mode.
spark_options = {
    'spark.executor.memory': '2g',
    'spark.executor.cores': '2',
    'spark.dynamicAllocation.enabled': 'true',
    'spark.dynamicAllocation.minExecutors': '1',
    'spark.dynamicAllocation.maxExecutors': '4',
    'spark.executor.memoryOverhead': '512m',
    'spark.driver.memory': '2g',
    'spark.driver.maxResultSize': '2g',
    'spark.sql.adaptive.enabled': 'true',
    'spark.sql.adaptive.coalescePartitions.enabled': 'true',
    'spark.sql.adaptive.advisoryPartitionSizeInBytes': '64mb',
    'spark.serializer': 'org.apache.spark.serializer.KryoSerializer',
    'spark.dynamicAllocation.executorIdleTimeout': '60s',
    'spark.sql.autoBroadcastJoinThreshold': '512mb',
}

# Name the application and pick the master first, then apply every option in order.
session_builder = SparkSession.builder.appName('Airport Traffic').master('local[4]')
for option_key, option_value in spark_options.items():
    session_builder = session_builder.config(option_key, option_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/02 14:06:55 WARN Utils: Your hostname, Zygimantass-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.43.73.162 instead (on interface en0)
25/12/02 14:06:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/02 14:06:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
# Directory that holds the scraped CSV exports. Set the YOUTUBE_DATA_DIR
# environment variable to run the notebook against another location; when it is
# unset the original directory is used, so existing setups keep working.
DATA_DIR = os.environ.get('YOUTUBE_DATA_DIR', '/Users/zygimantas/Documents/DataSets')

# Videos export.
df_youtube_path = os.path.join(DATA_DIR, 'youtube_tech_videos_20251120_133004.csv')
# Channels export (a different file from the one above; the `_v2` suffix is only a name).
df_youtube_path_v2 = os.path.join(DATA_DIR, 'youtube_tech_channels_20251120_133753.csv')

In [11]:
# Read the videos CSV with the header option on (first row supplies column names)
# and schema inference enabled.
csv_reader = spark.read.option('header', True).option('inferSchema', True)
df_youtube = csv_reader.csv(df_youtube_path)

In [14]:
# Print a text preview of the frame, with the row limit and truncation spelled out.
df_youtube.show(n=20, truncate=True)

+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+
|   video_id|               title|       published_at|  views| likes|comments|  duration|           thumbnail|           video_url|          channel_id|        channel_name|          scraped_at|duration_readable|
+-----------+--------------------+-------------------+-------+------+--------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+
|zzlS5dtzKKA|Most Important LT...|2025-11-19 00:47:05| 211782|  4012|      88|     PT27S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye...|     Linus Tech Tips|2025-11-20 13:37:...|             0:27|
|Cmnfwabz0bA|This TV Factory i...|2025-11-18 21:20:33| 572162| 29769|    1876|  PT13M43S|https://i.ytimg.c...|https://www.youtu...|UCXuqSBlHAE6Xw-ye