In [2]:
import os
import math

import altair as alt
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [3]:
os.environ['SPARK_HOME'] = r'C:\spark\spark-3.5.5-bin-hadoop3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [4]:
spark = (
    SparkSession.builder
    .appName('Airport Traffic')
    .master('local[20]')
    .config('spark.executor.memory', '12g')
    .config('spark.executor.cors', '4')
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.maxExecutors", "20")
    .config('spark.executors.memoryOverhead', '2g')
    .config("spark.driver.memory", "12g")
    .config("spark.driver.maxResultSize", "4g")
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '128mb')
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config('spark.dynamicAllocation.executorIdleTimeout', '120')
    .config('spark.sql.autoBroadcastJoinThreshold', '512mb')
    .getOrCreate()
)

In [5]:
df_movies_path = "F:\Datasets\CSV datasets\movies\movies.csv"
df_movies_reviews_path = "F:\Datasets\CSV datasets\movies\movies_reviews.csv"

In [6]:
movies_df = spark.read.csv(df_movies_path, header=True, inferSchema=True)

In [7]:
movies_reviews_df = spark.read.csv(df_movies_reviews_path, header=True, inferSchema=True)

In [9]:
movies_df.show(5)

+----------+--------------------+-----------+------+--------------------+--------------------+--------+--------------------+---------+---------------+-------------------+---------+---------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|        id|               title|releaseDate|rating|              genres|         description|duration|             tagline|metascore|metascore_count|metascore_sentiment|userscore|userscore_count|userscore_sentiment|production_companies|            director|              writer|            top_cast|
+----------+--------------------+-----------+------+--------------------+--------------------+--------+--------------------+---------+---------------+-------------------+---------+---------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|2000545497|      Dekalog (1988)| 1996-03-22| TV-MA|               Drama|This masterwork b...|   

In [10]:
movies_reviews_df.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  id|               title|               quote|               score|                date|              author|     publicationName|         review_type|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|          2000545497|      Dekalog (1988)|What a bitter pil...|                 100|          2023-11-22|                Siza|                NULL|                user|
|          2000545497|      Dekalog (1988)|The best 10 hours...|                NULL|                NULL|                NULL|                NULL|                NULL|
|Ten commandments....| decades before l...| Krzysztof Kieślo...| and more lavishl...| “Dekalog” would ...|                 100|          2021-05-01|  

In [13]:
joined_df = movies_reviews_df.join(F.broadcast(movies_df), movies_reviews_df.id == movies_df.id, 'inner')

In [19]:
joined_df.cache()

DataFrame[id: string, title: string, quote: string, score: string, date: string, author: string, publicationName: string, review_type: string, id: string, title: string, releaseDate: string, rating: string, genres: string, description: string, duration: string, tagline: string, metascore: string, metascore_count: string, metascore_sentiment: string, userscore: string, userscore_count: string, userscore_sentiment: string, production_companies: string, director: string, writer: string, top_cast: string]

In [24]:
joined_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- quote: string (nullable = true)
 |-- score: string (nullable = true)
 |-- date: string (nullable = true)
 |-- author: string (nullable = true)
 |-- publicationName: string (nullable = true)
 |-- review_type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- releaseDate: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- description: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- metascore: string (nullable = true)
 |-- metascore_count: string (nullable = true)
 |-- metascore_sentiment: string (nullable = true)
 |-- userscore: string (nullable = true)
 |-- userscore_count: string (nullable = true)
 |-- userscore_sentiment: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- director: string (nullable = true)
 |--

In [25]:
joined_df.show(5)

+----------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------+-----------+------+------+--------------------+--------+-------+---------+---------------+-------------------+---------+---------------+-------------------+--------------------+--------------------+----------+--------------------+
|        id|         title|               quote|               score|                date|              author|     publicationName|         review_type|        id|         title|releaseDate|rating|genres|         description|duration|tagline|metascore|metascore_count|metascore_sentiment|userscore|userscore_count|userscore_sentiment|production_companies|            director|    writer|            top_cast|
+----------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------+-