In [1]:
import os
import math

import altair as alt
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
# Point this process at the local Spark install and choose the Python
# executables PySpark should use. These are set before the SparkSession is
# built in the next cell.
# NOTE(review): PYSPARK_DRIVER_PYTHON / PYSPARK_DRIVER_PYTHON_OPTS are
# presumably only consulted when launching the `pyspark` shell script —
# confirm they are needed when the session is created from inside a notebook.
os.environ.update(
    SPARK_HOME=r'C:\spark\spark-3.5.5-bin-hadoop3',
    PYSPARK_DRIVER_PYTHON='jupyter',
    PYSPARK_DRIVER_PYTHON_OPTS='lab',
    PYSPARK_PYTHON='python',
)

In [3]:
# Build (or reuse) the SparkSession used by every later cell.
#
# The application name now matches what this notebook analyses (movies and
# their reviews); the previous 'Airport Traffic' label was a leftover from
# another notebook and mislabelled the job in the Spark UI.
#
# NOTE(review): the master is local[4]. The spark.executor.* and
# spark.dynamicAllocation.* settings below presumably have no effect in local
# mode — confirm against the Spark configuration docs before relying on them.
# NOTE(review): getOrCreate() returns an already-running session if one
# exists, in which case these settings may not be applied — restart the kernel
# after changing them.
spark = (
    SparkSession.builder
    .appName('Movie Reviews')
    .master('local[4]')
    # Executor sizing / dynamic allocation.
    .config('spark.executor.memory', '2g')
    .config('spark.executor.cores', '2')
    .config('spark.dynamicAllocation.enabled', 'true')
    .config('spark.dynamicAllocation.minExecutors', '1')
    .config('spark.dynamicAllocation.maxExecutors', '4')
    .config('spark.executor.memoryOverhead', '512m')
    # Driver sizing.
    .config('spark.driver.memory', '2g')
    .config('spark.driver.maxResultSize', '2g')
    # Adaptive query execution.
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '64mb')
    .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .config('spark.dynamicAllocation.executorIdleTimeout', '60s')
    # NOTE(review): a 512mb auto-broadcast threshold is large next to the 2g
    # driver configured above — confirm it is intentional.
    .config('spark.sql.autoBroadcastJoinThreshold', '512mb')
    .getOrCreate()
)

In [4]:
# Locations of the two input CSV files.
# Raw strings keep the Windows backslashes literal; in a normal string
# sequences such as "\D", "\C" and "\m" are invalid escapes that Python warns
# about (and that a path segment starting with e.g. "n" or "t" would silently
# turn into a control character).
df_movies_path = r"F:\Datasets\CSV datasets\movies\movies.csv"
df_movies_reviews_path = r"F:\Datasets\CSV datasets\movies\movies_reviews.csv"

In [5]:
# Load the movies CSV, using the first row as column names and asking Spark to
# infer column types.
# NOTE(review): the schema printed further down lists every column as string
# even though inferSchema is on — possibly quoted or multi-line text fields
# are confusing the parser; consider checking the multiLine / escape options.
movies_df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(df_movies_path)
)

In [6]:
# Load the movie-reviews CSV, using the first row as column names and asking
# Spark to infer column types.
# NOTE(review): the schema printed further down lists every column as string
# even though inferSchema is on — possibly quoted or multi-line text fields
# are confusing the parser; consider checking the multiLine / escape options.
movies_reviews_df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(df_movies_reviews_path)
)

In [9]:
# Inner-join every review to its movie on `id`, with a broadcast hint on the
# movies side.
# The result keeps the `id` and `title` columns from BOTH inputs (see the
# schema printed below), so a bare 'id' or 'title' reference on joined_df is
# ambiguous.
joined_df = movies_reviews_df.join(
    other=F.broadcast(movies_df),
    on=movies_reviews_df['id'] == movies_df['id'],
    how='inner',
)

In [10]:
# Mark the joined frame for caching so later cells can reuse it.
# The value returned by cache() is what this cell displays as its output.
# NOTE(review): caching is presumably lazy (filled on the first action) —
# confirm if timing matters.
joined_df.cache()

DataFrame[id: string, title: string, quote: string, score: string, date: string, author: string, publicationName: string, review_type: string, id: string, title: string, releaseDate: string, rating: string, genres: string, description: string, duration: string, tagline: string, metascore: string, metascore_count: string, metascore_sentiment: string, userscore: string, userscore_count: string, userscore_sentiment: string, production_companies: string, director: string, writer: string, top_cast: string]

In [11]:
# Print the column names and types of the joined frame to inspect the result
# of the join above.
joined_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- quote: string (nullable = true)
 |-- score: string (nullable = true)
 |-- date: string (nullable = true)
 |-- author: string (nullable = true)
 |-- publicationName: string (nullable = true)
 |-- review_type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- releaseDate: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- description: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- metascore: string (nullable = true)
 |-- metascore_count: string (nullable = true)
 |-- metascore_sentiment: string (nullable = true)
 |-- userscore: string (nullable = true)
 |-- userscore_count: string (nullable = true)
 |-- userscore_sentiment: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- director: string (nullable = true)
 |--

In [12]:
# Turn the comma-separated `director` string into one row per director.
#
# The split pattern is a regular expression: `\s*,\s*` consumes whitespace on
# both sides of each comma, and the outer trim() removes spaces at the ends of
# the whole string. Splitting on a bare ',' left names such as " Jane Doe"
# with a leading space, which then grouped separately from "Jane Doe".
#
# NOTE(review): this cell reassigns joined_df from itself, so it is not
# idempotent — re-running it explodes the already exploded rows again.
# Re-run the join cell first if this cell needs to be executed again.
joined_df = joined_df.withColumn(
    'director_array',
    F.split(F.trim(F.col('director')), r'\s*,\s*')
)

# One output row per element of `director_array`.
joined_df = joined_df.withColumn(
    'director_single',
    F.explode(F.col('director_array'))
)

In [13]:
# Display the active SparkSession as this cell's output.
spark

In [14]:
# Top 5 directors by mean `userscore`.
# `userscore` is a string column, so it is cast to double before averaging.
# NOTE(review): joined_df is built from reviews joined to movies, so it
# appears to hold one row per review; this mean would then be weighted by how
# many reviews each movie has — confirm that is the intended metric.
(
    joined_df
    .groupBy('director_single')
    .agg(F.avg(F.col('userscore').cast('double')).alias('director_avg_score'))
    .orderBy(F.desc('director_avg_score'))
    .show(5)
)

+-------------------+------------------+
|    director_single|director_avg_score|
+-------------------+------------------+
|       Lev Anderson|             100.0|
|      Brian W. Cook|             100.0|
|   Milos Loncarevic|              98.0|
|Nikolaus Geyrhalter|              98.0|
|      Linnea Saasen|              98.0|
+-------------------+------------------+
only showing top 5 rows

