In [1]:
import os
import math

import altair as alt
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
# Environment for PySpark: where the local Spark distribution lives and which
# Python executables Spark should use. Set here, ahead of the cell that builds
# the SparkSession.
os.environ.update({
    'SPARK_HOME': r'C:\spark\spark-3.5.5-bin-hadoop3',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

In [3]:
# Build (or reuse) the local SparkSession that every later cell relies on.
# The app name describes the movie-review data analysed in this notebook so the
# job is identifiable in the Spark UI.
# NOTE(review): with master('local[4]') the work runs inside the driver JVM, so
# the spark.executor.* and spark.dynamicAllocation.* settings below presumably
# have no effect -- confirm against the Spark configuration docs.
# NOTE(review): autoBroadcastJoinThreshold (512mb) is large relative to the 2g
# driver heap; confirm the broadcast table stays well below that size.
spark = (
    SparkSession.builder
    .appName('Movie Reviews')
    .master('local[4]')
    # Executor sizing and dynamic allocation (see note above about local mode).
    .config('spark.executor.memory', '2g')
    .config('spark.executor.cores', '2')
    .config('spark.dynamicAllocation.enabled', 'true')
    .config('spark.dynamicAllocation.minExecutors', '1')
    .config('spark.dynamicAllocation.maxExecutors', '4')
    .config('spark.executor.memoryOverhead', '512m')
    # Driver heap and the cap on results collected back to the driver.
    .config('spark.driver.memory', '2g')
    .config('spark.driver.maxResultSize', '2g')
    # Adaptive query execution with partition coalescing.
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '64mb')
    .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .config('spark.dynamicAllocation.executorIdleTimeout', '60s')
    .config('spark.sql.autoBroadcastJoinThreshold', '512mb')
    .getOrCreate()
)

In [4]:
# Locations of the two input CSV files.
# Raw strings keep the Windows backslashes literal; without the r-prefix,
# sequences such as "\D", "\C" and "\m" are invalid escape sequences that
# newer Python versions warn about.
df_movies_path = r"F:\Datasets\CSV datasets\movies\movies.csv"
df_movies_reviews_path = r"F:\Datasets\CSV datasets\movies\movies_reviews.csv"

In [5]:
# Load the movie metadata CSV; the first row supplies the column names and
# Spark is asked to infer column types from the data.
# NOTE(review): the joined schema printed further down lists every column as
# string despite inferSchema -- possibly quoted or multi-line fields being
# split; confirm whether extra reader options (e.g. multiLine/escape) are needed.
movies_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(df_movies_path)
)

In [6]:
# Load the per-review CSV; the first row supplies the column names and Spark
# is asked to infer column types from the data.
movies_reviews_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(df_movies_reviews_path)
)

In [7]:
# Attach each review to its movie's metadata. The movie table is hinted for
# broadcast, and only reviews whose id matches a movie id are kept (inner join).
# NOTE(review): both inputs carry `id` and `title`, so the result contains two
# columns of each name; referring to them by bare name would be ambiguous.
joined_df = movies_reviews_df.join(
    F.broadcast(movies_df),
    on=movies_reviews_df['id'] == movies_df['id'],
    how='inner',
)

In [8]:
# Mark the joined DataFrame for caching; the cells below build on it repeatedly.
# As the last expression of the cell, the returned DataFrame's repr is displayed.
joined_df.cache()

DataFrame[id: string, title: string, quote: string, score: string, date: string, author: string, publicationName: string, review_type: string, id: string, title: string, releaseDate: string, rating: string, genres: string, description: string, duration: string, tagline: string, metascore: string, metascore_count: string, metascore_sentiment: string, userscore: string, userscore_count: string, userscore_sentiment: string, production_companies: string, director: string, writer: string, top_cast: string]

In [9]:
# Print the column names and types of the joined DataFrame.
joined_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- quote: string (nullable = true)
 |-- score: string (nullable = true)
 |-- date: string (nullable = true)
 |-- author: string (nullable = true)
 |-- publicationName: string (nullable = true)
 |-- review_type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- releaseDate: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- description: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- metascore: string (nullable = true)
 |-- metascore_count: string (nullable = true)
 |-- metascore_sentiment: string (nullable = true)
 |-- userscore: string (nullable = true)
 |-- userscore_count: string (nullable = true)
 |-- userscore_sentiment: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- director: string (nullable = true)
 |--

In [10]:
# Produce one row per (review, director).
# `director` is a comma-separated string: split it into an array, explode the
# array into one row per name, then trim each name so that a value such as
# "A, B" does not yield " B" as a separate director (same handling as genres).
# NOTE(review): this cell reassigns `joined_df`; re-running it without first
# re-running the join cell explodes the already-exploded rows again.
joined_df = joined_df.withColumn(
    'director_array',
    F.split(F.col('director'), ',')
)

joined_df = joined_df.withColumn(
    'director_single',
    F.explode(F.col('director_array'))
)

# Trim in a separate step: a generator such as explode cannot be nested
# inside another expression.
joined_df = joined_df.withColumn(
    'director_single',
    F.trim(F.col('director_single'))
)

In [11]:
# Show the active SparkSession (the notebook renders the cell's last expression).
spark

In [12]:
# Five directors with the highest mean user score.
# `userscore` is a string column, so it is cast to double before averaging;
# the mean is taken over the exploded per-review rows.
(
    joined_df
    .groupBy('director_single')
    .agg(F.avg(F.col('userscore').cast('double')).alias('director_avg_score'))
    .orderBy(F.desc('director_avg_score'))
    .show(5)
)

+-------------------+------------------+
|    director_single|director_avg_score|
+-------------------+------------------+
|      Brian W. Cook|             100.0|
|       Lev Anderson|             100.0|
|Nikolaus Geyrhalter|              98.0|
|   Milos Loncarevic|              98.0|
|      Linnea Saasen|              98.0|
+-------------------+------------------+
only showing top 5 rows



In [16]:
# Preview five joined rows with full, untruncated cell contents.
joined_df.show(5, truncate=False)

+----------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [27]:
# Genre-level averages of critic score (metascore) and user score.
# `genres` is a comma-separated string: split it, emit one row per genre, and
# trim the whitespace left around each genre name.
# NOTE(review): if the director cell has run, `joined_df` already holds one row
# per director, so multi-director films contribute several rows per genre to
# these averages -- confirm whether that weighting is intended.
joined_df = joined_df.withColumn(
    'genre_array',
    F.split(F.col('genres'), ',')
)

joined_df = joined_df.withColumn(
    'individual_genre',
    F.explode(F.col('genre_array'))
)

joined_df = joined_df.withColumn(
    'individual_genre',
    F.trim(F.col('individual_genre'))
)

# Both score columns are strings; cast each one explicitly so the two
# aggregates are computed the same way instead of relying on implicit
# string-to-number coercion for one of them.
joined_df.groupBy(
    'individual_genre'
).agg(
    F.avg(F.col('metascore').cast('double')).alias('genre_avg_score'),
    F.avg(F.col('userscore').cast('double')).alias('genre_avg_userscore')
).orderBy(
    F.col('genre_avg_userscore').desc()
).show(10)

+----------------+------------------+-------------------+
|individual_genre|   genre_avg_score|genre_avg_userscore|
+----------------+------------------+-------------------+
|       Animation| 68.47672736340759|  72.53455674147177|
|          Family| 65.33807843876822|  70.41003572880197|
|         Musical| 67.69222197005406|  68.36792928057201|
|         Western| 66.10290581358218|  67.94736062925224|
|           Music| 70.16306394716601|  67.89234303047621|
|         Fantasy|63.837374181722254|  67.87679040230658|
|       Adventure|63.424157898135014|  67.75998269462038|
|       Film-Noir| 86.08825136612022|  67.47383879781421|
|          Comedy|62.475037579835366|  66.42924035583839|
|       Biography| 67.71024643280292|   66.2641447797247|
+----------------+------------------+-------------------+
only showing top 10 rows



In [28]:
# Register the current `joined_df` as the temp view `movie_reviews` so it can
# be queried with Spark SQL.
joined_df.createOrReplaceTempView("movie_reviews")

In [32]:
# Per-genre mean critic score, mean user score and row count, computed with
# Spark SQL against the `movie_reviews` temp view.
genre_summary_query = """
    SELECT
        avg(metascore) as meta_score,
        avg(userscore) as user_score,
        count(*) as review_count,
        individual_genre
    FROM
        movie_reviews
    GROUP BY
        individual_genre
"""
spark.sql(genre_summary_query).show()

+------------------+------------------+------------+----------------+
|        meta_score|        user_score|review_count|individual_genre|
+------------------+------------------+------------+----------------+
|61.000996911707766| 65.21876955512982|     3082731|           Crime|
| 62.50139281525923|62.029325889680365|     2249031|         Romance|
| 59.81715914339128| 64.05089318977791|     4603316|        Thriller|
|63.424157898135014| 67.75998269462038|     6426333|       Adventure|
|              NULL|              NULL|          10|              99|
| 65.36669922563719| 65.76451886434978|     6839816|           Drama|
| 65.51107291160743| 64.66925575305906|      907357|             War|
| 71.65365271750878| 53.99372289101932|      317838|     Documentary|
| 65.33807843876822| 70.41003572880197|     4064537|          Family|
|63.837374181722254| 67.87679040230658|     4399019|         Fantasy|
|              82.0|               0.0|         144|       Game-Show|
| 68.21889400921658|

In [35]:
# Absolute gap between user score and critic score for each row.
# Preview only: the widened DataFrame is shown, not assigned back to joined_df.
score_gap = F.abs(F.col('userscore') - F.col('metascore'))
joined_df.withColumn('user_meta_score_diff', score_gap).show()

+----------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------+-----------+------+------+--------------------+--------+-------+---------+---------------+-------------------+---------+---------------+-------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+-----------+----------------+--------------------+
|        id|         title|               quote|               score|                date|              author|     publicationName|         review_type|        id|         title|releaseDate|rating|genres|         description|duration|tagline|metascore|metascore_count|metascore_sentiment|userscore|userscore_count|userscore_sentiment|production_companies|            director|    writer|            top_cast|      director_array|     director_single|genre_array|individual_genre|user_meta_score_di