In [1]:
import os
import math

import altair as alt
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
os.environ['SPARK_HOME'] = r'C:\spark\spark-3.5.5-bin-hadoop3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [3]:
spark = (
    SparkSession.builder
    .appName('Airport Traffic')
    .master('local[4]')
    .config('spark.executor.memory', '2g')
    .config('spark.executor.cores', '2')
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "1")
    .config("spark.dynamicAllocation.maxExecutors", "4")
    .config('spark.executor.memoryOverhead', '512m')
    .config("spark.driver.memory", "2g")
    .config("spark.driver.maxResultSize", "2g")
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '64mb')
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config('spark.dynamicAllocation.executorIdleTimeout', '60s')
    .config('spark.sql.autoBroadcastJoinThreshold', '512mb')
    .getOrCreate()
)

In [4]:
df_movies_path = "F:\Datasets\CSV datasets\movies\movies.csv"
df_movies_reviews_path = "F:\Datasets\CSV datasets\movies\movies_reviews.csv"

In [5]:
movies_df = spark.read.csv(df_movies_path, header=True, inferSchema=True)

In [6]:
movies_df = movies_df.withColumnRenamed("title", "movie_title")
movies_df = movies_df.withColumnRenamed("id", "movie_id")

In [7]:
movies_reviews_df = spark.read.csv(df_movies_reviews_path, header=True, inferSchema=True)

In [8]:
joined_df = movies_reviews_df.join(
    F.broadcast(movies_df), movies_reviews_df.id == movies_df.movie_id, "inner"
)


In [9]:
joined_df.cache()

DataFrame[id: string, title: string, quote: string, score: string, date: string, author: string, publicationName: string, review_type: string, movie_id: string, movie_title: string, releaseDate: string, rating: string, genres: string, description: string, duration: string, tagline: string, metascore: string, metascore_count: string, metascore_sentiment: string, userscore: string, userscore_count: string, userscore_sentiment: string, production_companies: string, director: string, writer: string, top_cast: string]

In [10]:
joined_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- quote: string (nullable = true)
 |-- score: string (nullable = true)
 |-- date: string (nullable = true)
 |-- author: string (nullable = true)
 |-- publicationName: string (nullable = true)
 |-- review_type: string (nullable = true)
 |-- movie_id: string (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- releaseDate: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- description: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- metascore: string (nullable = true)
 |-- metascore_count: string (nullable = true)
 |-- metascore_sentiment: string (nullable = true)
 |-- userscore: string (nullable = true)
 |-- userscore_count: string (nullable = true)
 |-- userscore_sentiment: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- director: string (nullable 

In [11]:
joined_df = joined_df.withColumn(
    'director_array',
    F.split(F.col('director'), ',')
)

joined_df = joined_df.withColumn(
    'director_single',
    F.explode(F.col('director_array'))
)

In [12]:
spark

In [13]:
joined_df.groupBy(
    'director_single'
).agg(
    F.avg(F.col('userscore').cast('double')).alias('director_avg_score')
).orderBy(
    F.col('director_avg_score').desc()
).show(5)

+-------------------+------------------+
|    director_single|director_avg_score|
+-------------------+------------------+
|      Brian W. Cook|             100.0|
|       Lev Anderson|             100.0|
|Nikolaus Geyrhalter|              98.0|
|   Milos Loncarevic|              98.0|
|      Linnea Saasen|              98.0|
+-------------------+------------------+
only showing top 5 rows



In [14]:
joined_df.show(truncate=False, n=5)

+----------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
joined_df = joined_df.withColumn(
    'genre_array',
    F.split(F.col('genres'), ',')
)

joined_df = joined_df.withColumn(
    'individual_genre',
    F.explode(F.col('genre_array'))
)

joined_df = joined_df.withColumn(
    'individual_genre',
    F.trim(F.col('individual_genre'))
)

joined_df.groupBy(
    'individual_genre'
).agg(
    F.avg(F.col('metascore').cast('double')).alias('genre_avg_score'),
    F.avg('userscore').alias('genre_avg_userscore')
).orderBy(
    F.col('genre_avg_userscore').desc()
).show(10)

+----------------+------------------+-------------------+
|individual_genre|   genre_avg_score|genre_avg_userscore|
+----------------+------------------+-------------------+
|       Animation| 67.91979687116061|  72.26846910563879|
|          Family|62.955282400403426|  68.50639849213829|
|       Film-Noir| 85.96273291925466|  67.24596273291925|
|         Western| 66.97849462365592|  67.04609634551495|
|         Musical| 67.12175505653767|   66.5977772523055|
|       Adventure| 62.53759002326157|  66.49965046596503|
|         Fantasy| 61.89928556620473|  66.23475985708414|
|           Crime| 61.68818103199769|  65.38917786034105|
|          Sci-Fi|  61.7940662268872|  65.32193857431912|
|          Action| 59.69993538660349|  64.81016024938273|
+----------------+------------------+-------------------+
only showing top 10 rows



In [16]:
joined_df.createOrReplaceTempView("movie_reviews")

In [17]:
spark.sql("""
    SELECT
        avg(metascore) as meta_score,
        avg(userscore) as user_score,
        count(*) as review_count,
        individual_genre
    FROM
        movie_reviews
    GROUP BY
        individual_genre
""").show()

+------------------+------------------+------------+----------------+
|        meta_score|        user_score|review_count|individual_genre|
+------------------+------------------+------------+----------------+
| 61.68818103199769| 65.38917786034105|      216964|           Crime|
| 61.65832553097127|60.013149924773835|      171193|         Romance|
| 59.77647552176858|  64.1515089821001|      333072|        Thriller|
| 62.53759002326157| 66.49965046596503|      335687|       Adventure|
|              NULL|              NULL|          10|              99|
| 66.32661346016418| 63.78408013311783|      538216|           Drama|
| 67.36993859940259| 64.38319638455218|       51111|             War|
| 71.14466546112116| 47.35067027728666|       59740|     Documentary|
|62.955282400403426| 68.50639849213829|      151349|          Family|
| 61.89928556620473| 66.23475985708414|      189629|         Fantasy|
|              82.0|               0.0|           9|       Game-Show|
|  67.8639455782313|

In [18]:
joined_df = joined_df.withColumn("normalized_metascore", F.col("metascore").cast("double"))
joined_df = joined_df.withColumn("normalized_userscore", F.col("userscore").cast("double") * 10)

joined_df = joined_df.withColumn("score_difference", F.abs(F.col("normalized_metascore") - F.col
("normalized_userscore")))

result_df = joined_df.orderBy(F.col("score_difference").desc())
result_df.show(10)

+----------+----------------+--------------------+-----+----+------------------+--------------------+-----------+----------+----------------+-----------+---------+------------+--------------------+--------+--------------------+---------+---------------+-------------------+---------+---------------+-------------------+--------------------+-------------+--------------+--------------------+---------------+---------------+---------------+----------------+--------------------+--------------------+----------------+
|        id|           title|               quote|score|date|            author|     publicationName|review_type|  movie_id|     movie_title|releaseDate|   rating|      genres|         description|duration|             tagline|metascore|metascore_count|metascore_sentiment|userscore|userscore_count|userscore_sentiment|production_companies|     director|        writer|            top_cast| director_array|director_single|    genre_array|individual_genre|normalized_metascore|normalize

In [19]:
max_diff = joined_df.agg({"score_difference": "max"}).collect()[0][0]
movies_with_max_diff = joined_df.filter(F.col("score_difference") == F.lit(max_diff))
movies_with_max_diff.show()

+----------+----------------+--------------------+-----+----+------------------+--------------------+-----------+----------+----------------+-----------+---------+------------+--------------------+--------+--------------------+---------+---------------+-------------------+---------+---------------+-------------------+--------------------+-------------+--------------+--------------------+---------------+---------------+---------------+----------------+--------------------+--------------------+----------------+
|        id|           title|               quote|score|date|            author|     publicationName|review_type|  movie_id|     movie_title|releaseDate|   rating|      genres|         description|duration|             tagline|metascore|metascore_count|metascore_sentiment|userscore|userscore_count|userscore_sentiment|production_companies|     director|        writer|            top_cast| director_array|director_single|    genre_array|individual_genre|normalized_metascore|normalize

In [20]:
spark.sql("""
    SELECT
        *
    FROM
        movie_reviews
""").show(5)

+----------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------+-----------+------+------+--------------------+--------+-------+---------+---------------+-------------------+---------+---------------+-------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+-----------+----------------+
|        id|         title|               quote|               score|                date|              author|     publicationName|         review_type|  movie_id|   movie_title|releaseDate|rating|genres|         description|duration|tagline|metascore|metascore_count|metascore_sentiment|userscore|userscore_count|userscore_sentiment|production_companies|            director|    writer|            top_cast|      director_array|     director_single|genre_array|individual_genre|
+----------+--------------+-----------

In [21]:
window = Window.partitionBy('title').orderBy('date')

joined_df.withColumn(
    'review_number',
    F.row_number().over(window)
).show(10, truncate=False)

+----------+---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+----+------+---------------+-----------+----------+------------------------------+---------------------------------------------------------------------+------+--------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [22]:
spark.sql("""
    SELECT
        *,
        row_number() OVER (
            PARTITION BY title
            ORDER BY date
        ) AS review_number
    FROM
        movie_reviews
""").show(5, truncate=False)

+----------+---------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+----+------+---------------+-----------+----------+------------------------------+---------------------------------------------------------------------+------+--------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [23]:
author_genre_counts = joined_df.groupBy("author", "individual_genre").agg(
    F.count("*").alias("review_count")
)

window = Window.partitionBy("individual_genre").orderBy(F.col("review_count").desc())

ranked_reviewers = author_genre_counts.withColumn(
    'rank',
    F.row_number().over(window)
).where(F.col("author").isNotNull())

ranked_reviewers.filter(
    F.col("rank") <= 3
).show(5, truncate=False)

+--------------------+----------------+------------+----+
|author              |individual_genre|review_count|rank|
+--------------------+----------------+------------+----+
|Staff (Not Credited)|Action          |2302        |2   |
|James Berardinelli  |Action          |1882        |3   |
|Staff (Not Credited)|Adventure       |3028        |2   |
|James Berardinelli  |Adventure       |1707        |3   |
|Staff (Not Credited)|Animation       |676         |2   |
+--------------------+----------------+------------+----+
only showing top 5 rows



In [24]:
spark.sql("""
WITH ranked_reviewers AS (
    SELECT
        author,
        individual_genre,
        count(*) AS review_count,
        row_number() OVER (
            PARTITION BY author
            ORDER BY count(*) DESC
        ) AS rank
    FROM
        movie_reviews
    WHERE
        author IS NOT NULL
    GROUP BY
        author,
        individual_genre
) SELECT
    *,
    review_count
    FROM
        ranked_reviewers
    WHERE
        rank <= 3
""").show(5)

+--------------------+----------------+------------+----+------------+
|              author|individual_genre|review_count|rank|review_count|
+--------------------+----------------+------------+----+------------+
|  Hunger Games Do...|        Thriller|           1|   1|           1|
|  Hunger Games Do...|       Adventure|           1|   2|           1|
|  Hunger Games Do...|          Action|           1|   3|           1|
|  I get that this...|           Crime|           1|   1|           1|
|  I get that this...|          Sci-Fi|           1|   2|           1|
+--------------------+----------------+------------+----+------------+
only showing top 5 rows



In [25]:
joined_df = joined_df.withColumn(
    'days_since_release',
    F.datediff(F.to_date('date'), F.to_date('releaseDate'))
)

joined_df.withColumn(
    'years_since_release',
    F.round(F.col('days_since_release') / 365.25, 2)
).show(5)

+----------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------+-----------+------+------+--------------------+--------+-------+---------+---------------+-------------------+---------+---------------+-------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+-----------+----------------+--------------------+--------------------+----------------+------------------+-------------------+
|        id|         title|               quote|               score|                date|              author|     publicationName|         review_type|  movie_id|   movie_title|releaseDate|rating|genres|         description|duration|tagline|metascore|metascore_count|metascore_sentiment|userscore|userscore_count|userscore_sentiment|production_companies|            director|    writer|            top_cast|      direct

In [26]:
spark.sql("""
    SELECT
        *,
        DATEDIFF(date, releaseDate) AS days_since_release,
        ROUND(DATEDIFF(date, releaseDate) / 365.25, 2) AS years_since_release
    FROM
        movie_reviews
""").show(5)

+----------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------+-----------+------+------+--------------------+--------+-------+---------+---------------+-------------------+---------+---------------+-------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+-----------+----------------+------------------+-------------------+
|        id|         title|               quote|               score|                date|              author|     publicationName|         review_type|  movie_id|   movie_title|releaseDate|rating|genres|         description|duration|tagline|metascore|metascore_count|metascore_sentiment|userscore|userscore_count|userscore_sentiment|production_companies|            director|    writer|            top_cast|      director_array|     director_single|genre_array|individual_genre|

In [27]:
reviews1 = movies_reviews_df.alias("reviews1")
reviews2 = movies_reviews_df.alias("reviews2")

joined_reviews = reviews1.join(
    reviews2,
    (F.col("reviews1.title") == F.col("reviews2.title")) &
    (F.col("reviews1.author") != F.col("reviews2.author")),
    "inner"
)

joined_reviews = joined_reviews.withColumn(
    "score_difference",
    F.abs(F.col("reviews1.score").cast("double") - F.col("reviews2.score").cast("double"))
)

joined_reviews.select(
    F.col("reviews1.title").alias("movie_title"),
    F.col("reviews1.author").alias("author1"),
    F.col("reviews1.date").alias("date1"),
    F.col("reviews1.score").alias("score1"),
    F.col("reviews2.author").alias("author2"),
    F.col("reviews2.date").alias("date2"),
    F.col("reviews2.score").alias("score2"),
    "score_difference"
).show(5)

+--------------+-------+----------+------+----------------+-----+------+----------------+
|   movie_title|author1|     date1|score1|         author2|date2|score2|score_difference|
+--------------+-------+----------+------+----------------+-----+------+----------------+
|Dekalog (1988)|   Siza|2023-11-22|   100|   Kenneth Turan| NULL|   100|             0.0|
|Dekalog (1988)|   Siza|2023-11-22|   100|   David Ehrlich| NULL|   100|             0.0|
|Dekalog (1988)|   Siza|2023-11-22|   100|    Andrew Crump| NULL|   100|             0.0|
|Dekalog (1988)|   Siza|2023-11-22|   100|Lisa Schwarzbaum| NULL|   100|             0.0|
|Dekalog (1988)|   Siza|2023-11-22|   100|  Stephen Holden| NULL|   100|             0.0|
+--------------+-------+----------+------+----------------+-----+------+----------------+
only showing top 5 rows



In [28]:
movies_reviews_df.createOrReplaceTempView("movies_reviews")

In [29]:
spark.sql("""
    SELECT
        r1.title AS movie_title,
        r1.author AS author1,
        r1.date AS date1,
        r1.score AS score1,
        r2.author AS author2,
        r2.date AS date2,
        r2.score AS score2,
        ABS(CAST(r1.score AS DOUBLE) - CAST(r2.score AS DOUBLE)) AS score_difference
    FROM
        movies_reviews r1
    JOIN
        movies_reviews r2
    ON
        r1.title = r2.title AND r1.author != r2.author
""").show(5)

+--------------+-------+----------+------+----------------+-----+------+----------------+
|   movie_title|author1|     date1|score1|         author2|date2|score2|score_difference|
+--------------+-------+----------+------+----------------+-----+------+----------------+
|Dekalog (1988)|   Siza|2023-11-22|   100|   Kenneth Turan| NULL|   100|             0.0|
|Dekalog (1988)|   Siza|2023-11-22|   100|   David Ehrlich| NULL|   100|             0.0|
|Dekalog (1988)|   Siza|2023-11-22|   100|    Andrew Crump| NULL|   100|             0.0|
|Dekalog (1988)|   Siza|2023-11-22|   100|Lisa Schwarzbaum| NULL|   100|             0.0|
|Dekalog (1988)|   Siza|2023-11-22|   100|  Stephen Holden| NULL|   100|             0.0|
+--------------+-------+----------+------+----------------+-----+------+----------------+
only showing top 5 rows



In [30]:
window = Window.partitionBy()

df_with_zscores = joined_df \
    .withColumn('metascore_double',F.col('metascore').cast('double')) \
    .withColumn('userscore_double', F.col('userscore').cast('double') * 10) \
    .withColumn('metascore_mean', F.mean('metascore_double').over(window)) \
    .withColumn('metascore_stddev', F.stddev('metascore_double').over(window)) \
    .withColumn('userscore_mean', F.mean('userscore_double').over(window)) \
    .withColumn('userscore_stddev', F.stddev('userscore_double').over(window)) \
    .withColumn('metascore_zscore', F.col('metascore_double') - F.col('metascore_mean')) \
    .withColumn('userscore_zscore', F.col('userscore_double') - F.col('userscore_mean'))

divergent_movies = df_with_zscores \
    .withColumn("zscore_difference", F.abs(F.col("metascore_zscore") - F.col("userscore_zscore"))) \
    .select(
    "movie_title",
    "metascore_double",
    "userscore_double",
    "metascore_zscore",
    "userscore_zscore",
    "zscore_difference"
)

threshold = 2.0
significant_differences = divergent_movies \
    .filter(F.col("zscore_difference") > threshold) \
    .orderBy(F.col("zscore_difference").desc())

significant_differences.show(10, truncate=False)

+-------------------+----------------+----------------+-----------------+------------------+-----------------+
|movie_title        |metascore_double|userscore_double|metascore_zscore |userscore_zscore  |zscore_difference|
+-------------------+----------------+----------------+-----------------+------------------+-----------------+
|Shoah: Four Sisters|93.0            |0.0             |30.30402537431783|-641.9907265245962|672.294751898914 |
|Shoah: Four Sisters|93.0            |0.0             |30.30402537431783|-641.9907265245962|672.294751898914 |
|Shoah: Four Sisters|93.0            |0.0             |30.30402537431783|-641.9907265245962|672.294751898914 |
|Shoah: Four Sisters|93.0            |0.0             |30.30402537431783|-641.9907265245962|672.294751898914 |
|Shoah: Four Sisters|93.0            |0.0             |30.30402537431783|-641.9907265245962|672.294751898914 |
|Shoah: Four Sisters|93.0            |0.0             |30.30402537431783|-641.9907265245962|672.294751898914 |
|

In [31]:
spark.sql("""
    WITH zscore_calc AS (
      SELECT
        movie_title,
        CAST(metascore AS DOUBLE) as metascore_double,
        CAST(userscore AS DOUBLE) * 10 as userscore_double,
        AVG(CAST(metascore AS DOUBLE)) OVER () as metascore_mean,
        STDDEV(CAST(metascore AS DOUBLE)) OVER () as metascore_stddev,
        AVG(CAST(userscore AS DOUBLE) * 10) OVER () as userscore_mean,
        STDDEV(CAST(userscore AS DOUBLE) * 10) OVER () as userscore_stddev
      FROM movie_reviews
    ),
    zscores AS (
      SELECT
        movie_title,
        metascore_double,
        userscore_double,
        (metascore_double - metascore_mean) as metascore_zscore,
        (userscore_double - userscore_mean) as userscore_zscore
      FROM zscore_calc
    ),
    differences AS (
      SELECT
        movie_title,
        metascore_double,
        userscore_double,
        metascore_zscore,
        userscore_zscore,
        ABS(metascore_zscore - userscore_zscore) as zscore_difference
      FROM zscores
    )
    SELECT *
    FROM differences
    WHERE zscore_difference > 2.0
    ORDER BY zscore_difference DESC
""").show(5, truncate=False)

+-------------------+----------------+----------------+-----------------+------------------+-----------------+
|movie_title        |metascore_double|userscore_double|metascore_zscore |userscore_zscore  |zscore_difference|
+-------------------+----------------+----------------+-----------------+------------------+-----------------+
|Shoah: Four Sisters|93.0            |0.0             |30.30402537431783|-641.9907265245962|672.294751898914 |
|Shoah: Four Sisters|93.0            |0.0             |30.30402537431783|-641.9907265245962|672.294751898914 |
|Shoah: Four Sisters|93.0            |0.0             |30.30402537431783|-641.9907265245962|672.294751898914 |
|Shoah: Four Sisters|93.0            |0.0             |30.30402537431783|-641.9907265245962|672.294751898914 |
|Shoah: Four Sisters|93.0            |0.0             |30.30402537431783|-641.9907265245962|672.294751898914 |
+-------------------+----------------+----------------+-----------------+------------------+-----------------+
o

In [36]:
joined_df.select(
    F.col('author').alias('user_id'),
    F.col('movie_id'),
    F.col('movie_title'),
    F.col('score')
).filter(
    F.col('author').isNotNull() &
    F.col('score').isNotNull() &
    (F.col('score') != 'NULL')
).withColumn(
    'score',
    F.col('score').cast('double')
).show()

+--------------------+----------+--------------------+-----+
|             user_id|  movie_id|         movie_title|score|
+--------------------+----------+--------------------+-----+
|                Siza|2000545497|      Dekalog (1988)|100.0|
|            Andremax|2000545497|      Dekalog (1988)|100.0|
| Ingmar Bergman a...|2000545497|      Dekalog (1988)| NULL|
|            Daki1105|2000545497|      Dekalog (1988)|100.0|
|            BeastJ18|2000545497|      Dekalog (1988)|100.0|
|         benchwarmer|2000545497|      Dekalog (1988)|100.0|
|  James Berardinelli|2000545497|      Dekalog (1988)|100.0|
|         Roger Ebert|2000545497|      Dekalog (1988)|100.0|
|Staff [Not Credited]|2000545497|      Dekalog (1988)|100.0|
|         Bilge Ebiri|2000545497|      Dekalog (1988)|100.0|
|  Michael Wilmington|2000545497|      Dekalog (1988)|100.0|
|        Scott Tobias|2000545497|      Dekalog (1988)|100.0|
|       Andrew Sarris|2000545497|      Dekalog (1988)|100.0|
|      Stephen Holden|20

In [39]:
genre_quote_df = joined_df.select(
    'genres', 'quote'
).filter(
    F.col('genres').isNotNull() & (F.col('genres') != '') &
    F.col('quote').isNotNull() & (F.col('quote') != '')
)