In [8]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Initialize Spark Session
spark = SparkSession.builder.appName("movies").getOrCreate()

# Load CSV into DataFrame
df = spark.read.option("header", "true").csv("BDAData\movies.csv")
rdd = df.rdd

# Show first few rows
df.show(5)

+--------------------+------+---------+----+--------------------+-----+---------+---------------+--------------------+--------------+--------------+----------+-----------+------------------+-------+
|                name|rating|    genre|year|            released|score|    votes|       director|              writer|          star|       country|    budget|      gross|           company|runtime|
+--------------------+------+---------+----+--------------------+-----+---------+---------------+--------------------+--------------+--------------+----------+-----------+------------------+-------+
|         The Shining|     R|    Drama|1980|June 13, 1980 (Un...|  8.4| 927000.0|Stanley Kubrick|        Stephen King|Jack Nicholson|United Kingdom|19000000.0| 46998772.0|      Warner Bros.|  146.0|
|     The Blue Lagoon|     R|Adventure|1980|July 2, 1980 (Uni...|  5.8|  65000.0| Randal Kleiser|Henry De Vere Sta...|Brooke Shields| United States| 4500000.0| 58853106.0| Columbia Pictures|  104.0|
|Star

In [9]:
# Hash Partitioning:
# Repartitions the dataset into 4 partitions using hash partitioning based on "genre"
df_genre_partitioned = df.repartition(4, "genre")
print(f"Number of partitions: {df_genre_partitioned.rdd.getNumPartitions()}")


Number of partitions: 4


In [10]:
#Range Partitioning
# This orders the movies chronologically by their release year
df_year_sorted = df.orderBy("year")


In [11]:
#  Summary statistics by "genre"
#  Counts total movies per genre
#  Calculates the average IMDb score for each genre
#  Computes the average box office revenue per genre
df_summary = df.groupBy("genre").agg(
    F.count("*").alias("Total Movies"),
    F.avg("score").alias("Avg IMDb Score"),
    F.avg("gross").alias("Avg Box Office Revenue")
)
df_summary.show()


+---------+------------+------------------+----------------------+
|    genre|Total Movies|    Avg IMDb Score|Avg Box Office Revenue|
+---------+------------+------------------+----------------------+
|    Crime|         551| 6.671506352087112|   3.940119616605166E7|
|  Romance|          10| 6.409999999999999|        2.3549374875E7|
| Thriller|          16|5.9125000000000005|  2.6935259416666668E7|
|Adventure|         427| 6.291569086651052|  1.0932523022142857E8|
|    Drama|        1518|  6.69366754617415|   3.893095949386921E7|
|   Family|          11| 6.363636363636362|  1.9617249245454547E8|
|  Fantasy|          44| 6.006818181818182|   3.870932872093023E7|
|  History|           1|               8.3|                  NULL|
|  Mystery|          20| 6.665000000000001|         1.011835277E8|
|  Musical|           2|              8.05|             2595346.0|
|Animation|         338| 6.769230769230768|  2.3922998745074627E8|
|    Music|           1|               7.2|              11001

In [12]:
# Movies with IMDb score ≥ 8.0
df_high_rated = df.filter(df["score"] >= 8.0)
df_high_rated.show(5)


+--------------------+------+---------+----+--------------------+-----+---------+----------------+-----------------+--------------+--------------+----------+-----------+--------------------+-------+
|                name|rating|    genre|year|            released|score|    votes|        director|           writer|          star|       country|    budget|      gross|             company|runtime|
+--------------------+------+---------+----+--------------------+-----+---------+----------------+-----------------+--------------+--------------+----------+-----------+--------------------+-------+
|         The Shining|     R|    Drama|1980|June 13, 1980 (Un...|  8.4| 927000.0| Stanley Kubrick|     Stephen King|Jack Nicholson|United Kingdom|19000000.0| 46998772.0|        Warner Bros.|  146.0|
|Star Wars: Episod...|    PG|   Action|1980|June 20, 1980 (Un...|  8.7|1200000.0|  Irvin Kershner|   Leigh Brackett|   Mark Hamill| United States|18000000.0|538375067.0|           Lucasfilm|  124.0|
|    

In [13]:
# Movies by highest box office revenue (descending order)
df_sorted_revenue = df.orderBy(F.desc("gross"))
df_sorted_revenue.show(5)


+--------------------+---------+---------+----+--------------------+-----+-------+-------------------+------------------+---------------+-------------+----------+----------+--------------------+-------+
|                name|   rating|    genre|year|            released|score|  votes|           director|            writer|           star|      country|    budget|     gross|             company|runtime|
+--------------------+---------+---------+----+--------------------+-----+-------+-------------------+------------------+---------------+-------------+----------+----------+--------------------+-------+
|             Angel-A|        R|   Comedy|2005|December 21, 2005...|  7.1|33000.0|         Luc Besson|        Luc Besson|  Rie Rasmussen|       France|      NULL| 9995168.0|          EuropaCorp|   91.0|
|         In the Mood|    PG-13|   Comedy|1987|September 16, 198...|  6.2|  905.0|Phil Alden Robinson|    Robert Kosberg|Patrick Dempsey|United States| 7000000.0|  999382.0|Kings Road Ente

In [14]:
# Count total movies directed by each director
# Shows Top 5 directors with the most movies
df_directors = df.groupBy("director").agg(F.count("*").alias("Total Movies"))
df_top_directors = df_directors.orderBy(F.desc("Total Movies"))
df_top_directors.show(5)


+----------------+------------+
|        director|Total Movies|
+----------------+------------+
|     Woody Allen|          38|
|  Clint Eastwood|          31|
|       Directors|          28|
|Steven Spielberg|          27|
|      Ron Howard|          24|
+----------------+------------+
only showing top 5 rows

