In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
print(pyspark.__version__)

3.5.5


In [3]:
spark = SparkSession.builder \
        .appName("Movie_ratings") \
        .master("local[*]") \
        .getOrCreate()

25/05/04 18:29:46 WARN Utils: Your hostname, Ameys-Mac-mini.local resolves to a loopback address: 127.0.0.1; using 192.168.1.12 instead (on interface en1)
25/05/04 18:29:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/04 18:29:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
df = spark.read.csv("movie_ratings_project.csv", header=True, inferSchema=True)

In [5]:
df.show()

+-------+--------------------+--------+------+-----------+
|user_id|         movie_title|   genre|rating|review_date|
+-------+--------------------+--------+------+-----------+
|    445|     Scene give open|  Comedy|     2| 2024-04-06|
|    870|      Ok nature race|  Sci-Fi|     4| 2024-05-03|
|    177|      Response along|  Sci-Fi|     5| 2024-01-25|
|    401|       Prepare phone| Romance|     5| 2024-09-03|
|    308|         Alone above|  Action|     5| 2024-10-08|
|    899|Industry operatio...|  Action|     1| 2023-11-26|
|    347|      Determine meet| Romance|     2| 2025-01-25|
|    314|      Reach election|  Sci-Fi|     5| 2023-07-19|
|    788|          View color|   Drama|     4| 2024-02-16|
|    426|Probably result give|  Sci-Fi|     5| 2025-01-14|
|    885|            Pressure|Thriller|     2| 2023-12-11|
|    859|             Science|  Horror|     3| 2023-06-08|
|    482|            Pressure|  Action|     1| 2024-02-02|
|    929|              Speech|  Sci-Fi|     3| 2023-07-2

## Data Exploration and Cleaning

### Print the schema and display the first 5 records.

In [6]:
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_date: date (nullable = true)



In [7]:
df.show(5)

+-------+---------------+-------+------+-----------+
|user_id|    movie_title|  genre|rating|review_date|
+-------+---------------+-------+------+-----------+
|    445|Scene give open| Comedy|     2| 2024-04-06|
|    870| Ok nature race| Sci-Fi|     4| 2024-05-03|
|    177| Response along| Sci-Fi|     5| 2024-01-25|
|    401|  Prepare phone|Romance|     5| 2024-09-03|
|    308|    Alone above| Action|     5| 2024-10-08|
+-------+---------------+-------+------+-----------+
only showing top 5 rows



### Find the number of unique users.

In [8]:
df.createOrReplaceTempView("movie_ratings")

In [9]:
spark.sql(
    """
    SELECT
        count(distinct user_id) as unique_users
    FROM movie_ratings
    """
).show()

+------------+
|unique_users|
+------------+
|          70|
+------------+



In [10]:
df.select(countDistinct("user_id").alias("unique_users")).show()

+------------+
|unique_users|
+------------+
|          70|
+------------+



### List all distinct genres available.

In [11]:
spark.sql(
    """
    SELECT 
        distinct genre
    FROM movie_ratings
    """
).show()

+--------+
|   genre|
+--------+
| Romance|
|Thriller|
|   Drama|
|  Horror|
|  Comedy|
|  Action|
|  Sci-Fi|
+--------+



In [12]:
df.select("genre") \
    .distinct() \
    .show()

+--------+
|   genre|
+--------+
| Romance|
|Thriller|
|   Drama|
|  Horror|
|  Comedy|
|  Action|
|  Sci-Fi|
+--------+



### Count how many reviews exist for each genre.

In [13]:
spark.sql(
    """
    SELECT
        genre,
        count(user_id) as review_count
    FROM movie_ratings
    GROUP BY genre
    """
).show()

+--------+------------+
|   genre|review_count|
+--------+------------+
| Romance|          11|
|Thriller|          11|
|   Drama|          13|
|  Horror|          10|
|  Comedy|           8|
|  Action|           6|
|  Sci-Fi|          11|
+--------+------------+



In [14]:
df.groupBy("genre") \
    .agg(count("user_id").alias("review_count")) \
    .show()

+--------+------------+
|   genre|review_count|
+--------+------------+
| Romance|          11|
|Thriller|          11|
|   Drama|          13|
|  Horror|          10|
|  Comedy|           8|
|  Action|           6|
|  Sci-Fi|          11|
+--------+------------+



### Find if there are any null values in the dataset.

In [15]:
null_cnt = [(df.filter(col(c).isNull()).count(), c) for c in df.columns]
print(null_cnt)

[(0, 'user_id'), (0, 'movie_title'), (0, 'genre'), (0, 'rating'), (0, 'review_date')]


## Aggregations & Grouping

### Calculate the average rating for each genre.

In [16]:
spark.sql(
    """
    SELECT
        genre,
        round(avg(rating), 2) as avg_rating
    FROM movie_ratings
    GROUP BY genre
    """
).show()

+--------+----------+
|   genre|avg_rating|
+--------+----------+
| Romance|       4.0|
|Thriller|      3.36|
|   Drama|       3.0|
|  Horror|       2.0|
|  Comedy|      3.38|
|  Action|       2.5|
|  Sci-Fi|      3.27|
+--------+----------+



In [17]:
df.groupBy("genre") \
    .agg(round(avg("rating"), 2).alias("avg_rating")) \
    .show()

+--------+----------+
|   genre|avg_rating|
+--------+----------+
| Romance|       4.0|
|Thriller|      3.36|
|   Drama|       3.0|
|  Horror|       2.0|
|  Comedy|      3.38|
|  Action|       2.5|
|  Sci-Fi|      3.27|
+--------+----------+



### Identify the top 5 highest-rated movies (average rating).

In [18]:
spark.sql(
    """
    SELECT 
        movie_title,
        avg(rating) as avg_rating
    FROM movie_ratings
    GROUP BY movie_title
    ORDER BY avg_rating DESC
    LIMIT 5
    """
).show()

+--------------+----------+
|   movie_title|avg_rating|
+--------------+----------+
|   Alone above|       5.0|
|Candidate song|       4.5|
| Prepare phone|       4.5|
|Response along|       4.5|
|       Science|       4.0|
+--------------+----------+



In [19]:
df.groupBy("movie_title") \
    .agg(avg("rating").alias("avg_rating")) \
    .orderBy("avg_rating", ascending = False) \
    .limit(5) \
    .show()

+--------------+----------+
|   movie_title|avg_rating|
+--------------+----------+
|   Alone above|       5.0|
|Candidate song|       4.5|
| Prepare phone|       4.5|
|Response along|       4.5|
|       Science|       4.0|
+--------------+----------+



### Find the total number of reviews each year.

In [20]:
spark.sql(
    """
    SELECT
        year(review_date) as release_year,
        count(user_id) as review_count
    FROM movie_ratings
    GROUP BY release_year
    ORDER BY release_year DESC
    """
).show()

+------------+------------+
|release_year|review_count|
+------------+------------+
|        2025|           9|
|        2024|          38|
|        2023|          23|
+------------+------------+



In [21]:
df.groupBy(year("review_date").alias("release_year")) \
    .agg(count("user_id").alias("review_count")) \
    .orderBy("release_year", ascending = False) \
    .show()

+------------+------------+
|release_year|review_count|
+------------+------------+
|        2025|           9|
|        2024|          38|
|        2023|          23|
+------------+------------+



### Which movie received the most reviews?

In [22]:
spark.sql(
    """
    SELECT
        movie_title,
        count(user_id) as review_count
    FROM movie_ratings
    GROUP BY movie_title
    ORDER BY review_count DESC
    """
).show()

+--------------------+------------+
|         movie_title|review_count|
+--------------------+------------+
|             May air|           6|
|      Reach election|           4|
|Arrive open someb...|           4|
|               Agent|           4|
|Respond actually ...|           4|
|            Pressure|           4|
|          View color|           3|
|    Everything stand|           3|
|      Determine meet|           3|
|     Scene give open|           3|
|Probably result give|           3|
|             Science|           2|
|              Speech|           2|
|      Candidate song|           2|
|          Develop at|           2|
|Industry operatio...|           2|
|      Ok nature race|           2|
|       Prepare phone|           2|
|         Learn paper|           2|
|      Response along|           2|
+--------------------+------------+
only showing top 20 rows



In [23]:
df.groupBy("movie_title") \
    .agg(count("user_id").alias("review_count")) \
    .orderBy("review_count", ascending=False) \
    .show()

+--------------------+------------+
|         movie_title|review_count|
+--------------------+------------+
|             May air|           6|
|      Reach election|           4|
|Arrive open someb...|           4|
|               Agent|           4|
|Respond actually ...|           4|
|            Pressure|           4|
|          View color|           3|
|    Everything stand|           3|
|      Determine meet|           3|
|     Scene give open|           3|
|Probably result give|           3|
|             Science|           2|
|              Speech|           2|
|      Candidate song|           2|
|          Develop at|           2|
|Industry operatio...|           2|
|      Ok nature race|           2|
|       Prepare phone|           2|
|         Learn paper|           2|
|      Response along|           2|
+--------------------+------------+
only showing top 20 rows



### Show the distribution (count) of ratings (1-5 stars).

In [24]:
spark.sql(
    """
    SELECT
        rating,
        count(rating) as rating_count
    FROM movie_ratings
    GROUP BY rating
    ORDER BY rating 
    """
).show()

+------+------------+
|rating|rating_count|
+------+------------+
|     1|          12|
|     2|          14|
|     3|          12|
|     4|          18|
|     5|          14|
+------+------------+



In [25]:
df.groupBy("rating") \
    .agg(count("rating").alias("rating_count")) \
    .orderBy("rating") \
    .show()

+------+------------+
|rating|rating_count|
+------+------------+
|     1|          12|
|     2|          14|
|     3|          12|
|     4|          18|
|     5|          14|
+------+------------+



## Insights

### Find the average rating given by each user.

In [26]:
spark.sql(
    """
    SELECT
        user_id,
        avg(rating) as avg_rating
    FROM movie_ratings
    GROUP BY user_id
    """
).show()

+-------+----------+
|user_id|avg_rating|
+-------+----------+
|    496|       1.0|
|    623|       1.0|
|    879|       3.0|
|    898|       4.0|
|    961|       3.0|
|    876|       5.0|
|    375|       4.0|
|    744|       5.0|
|    183|       4.0|
|    787|       2.0|
|    577|       3.0|
|    501|       3.0|
|    625|       4.0|
|    225|       4.0|
|    190|       4.0|
|    177|       5.0|
|    152|       1.0|
|    748|       4.0|
|    182|       2.0|
|    699|       5.0|
+-------+----------+
only showing top 20 rows



In [27]:
df.groupBy("user_id") \
    .agg(avg("rating").alias("avg_rating")) \
    .show()

+-------+----------+
|user_id|avg_rating|
+-------+----------+
|    496|       1.0|
|    623|       1.0|
|    879|       3.0|
|    898|       4.0|
|    961|       3.0|
|    876|       5.0|
|    375|       4.0|
|    744|       5.0|
|    183|       4.0|
|    787|       2.0|
|    577|       3.0|
|    501|       3.0|
|    625|       4.0|
|    225|       4.0|
|    190|       4.0|
|    177|       5.0|
|    152|       1.0|
|    748|       4.0|
|    182|       2.0|
|    699|       5.0|
+-------+----------+
only showing top 20 rows



### Which genre has the highest average rating?

In [28]:
spark.sql(
    """
    SELECT
        genre,
        avg(rating) as avg_rating
    FROM movie_ratings
    GROUP BY genre
    ORDER BY avg_rating DESC
    """
).show()

+--------+------------------+
|   genre|        avg_rating|
+--------+------------------+
| Romance|               4.0|
|  Comedy|             3.375|
|Thriller|3.3636363636363638|
|  Sci-Fi| 3.272727272727273|
|   Drama|               3.0|
|  Action|               2.5|
|  Horror|               2.0|
+--------+------------------+



In [29]:
df.groupBy("genre") \
    .agg(avg("rating").alias("avg_rating")) \
    .orderBy("avg_rating", ascending=False) \
    .show()

+--------+------------------+
|   genre|        avg_rating|
+--------+------------------+
| Romance|               4.0|
|  Comedy|             3.375|
|Thriller|3.3636363636363638|
|  Sci-Fi| 3.272727272727273|
|   Drama|               3.0|
|  Action|               2.5|
|  Horror|               2.0|
+--------+------------------+



### Add a new column called rating_category 
Where

- Rating 4 or 5 = 'Positive'
- Rating 3 = 'Neutral'
- Rating 1 or 2 = 'Negative'
- Expected Output: Updated DataFrame with new column.


In [30]:
spark.sql(
    """
    SELECT
        *,
        CASE
            WHEN rating = 4 OR rating = 5 THEN "Positive"
            WHEN rating = 3 THEN "Neutral"
            WHEN rating = 1 OR rating = 2 THEN "Negative"
        END as rating_category
    FROM movie_ratings
    """
).show()

+-------+--------------------+--------+------+-----------+---------------+
|user_id|         movie_title|   genre|rating|review_date|rating_category|
+-------+--------------------+--------+------+-----------+---------------+
|    445|     Scene give open|  Comedy|     2| 2024-04-06|       Negative|
|    870|      Ok nature race|  Sci-Fi|     4| 2024-05-03|       Positive|
|    177|      Response along|  Sci-Fi|     5| 2024-01-25|       Positive|
|    401|       Prepare phone| Romance|     5| 2024-09-03|       Positive|
|    308|         Alone above|  Action|     5| 2024-10-08|       Positive|
|    899|Industry operatio...|  Action|     1| 2023-11-26|       Negative|
|    347|      Determine meet| Romance|     2| 2025-01-25|       Negative|
|    314|      Reach election|  Sci-Fi|     5| 2023-07-19|       Positive|
|    788|          View color|   Drama|     4| 2024-02-16|       Positive|
|    426|Probably result give|  Sci-Fi|     5| 2025-01-14|       Positive|
|    885|            Pres

In [31]:
df2 = df.withColumn(
    "rating_category",
    when(col("rating").isin([4, 5]), "Positive")
    .when(col("rating") == 3, "Neutral")
    .when(col("rating").isin([1, 2]), "Negative")
)
df2.show()

+-------+--------------------+--------+------+-----------+---------------+
|user_id|         movie_title|   genre|rating|review_date|rating_category|
+-------+--------------------+--------+------+-----------+---------------+
|    445|     Scene give open|  Comedy|     2| 2024-04-06|       Negative|
|    870|      Ok nature race|  Sci-Fi|     4| 2024-05-03|       Positive|
|    177|      Response along|  Sci-Fi|     5| 2024-01-25|       Positive|
|    401|       Prepare phone| Romance|     5| 2024-09-03|       Positive|
|    308|         Alone above|  Action|     5| 2024-10-08|       Positive|
|    899|Industry operatio...|  Action|     1| 2023-11-26|       Negative|
|    347|      Determine meet| Romance|     2| 2025-01-25|       Negative|
|    314|      Reach election|  Sci-Fi|     5| 2023-07-19|       Positive|
|    788|          View color|   Drama|     4| 2024-02-16|       Positive|
|    426|Probably result give|  Sci-Fi|     5| 2025-01-14|       Positive|
|    885|            Pres

### Display the number of Positive, Neutral, and Negative reviews.

In [32]:
df2.createOrReplaceTempView("movie_ratings_2")

In [33]:
spark.sql(
    """
    SELECT
        rating_category,
        count(rating_category) as rating_count
    FROM movie_ratings_2
    GROUP BY rating_category
    """
).show()

+---------------+------------+
|rating_category|rating_count|
+---------------+------------+
|       Positive|          32|
|        Neutral|          12|
|       Negative|          26|
+---------------+------------+



In [34]:
df2.groupBy("rating_category") \
    .agg(count("rating_category").alias("rating_count")) \
    .show()

+---------------+------------+
|rating_category|rating_count|
+---------------+------------+
|       Positive|          32|
|        Neutral|          12|
|       Negative|          26|
+---------------+------------+



### Find the month with the highest number of reviews.

In [35]:
spark.sql(
    """
    SELECT 
        month(review_date) as review_month,
        count(user_id) as review_count
    FROM movie_ratings_2
    GROUP BY review_month
    ORDER BY review_month
    """
).show()

+------------+------------+
|review_month|review_count|
+------------+------------+
|           1|           5|
|           2|           8|
|           3|           1|
|           4|          11|
|           5|           3|
|           6|           2|
|           7|           4|
|           8|           3|
|           9|           9|
|          10|          11|
|          11|           8|
|          12|           5|
+------------+------------+



In [36]:
df2.groupBy(month("review_date").alias("review_month")) \
    .agg(count("user_id").alias("review_count")) \
    .orderBy("review_month") \
    .show()

+------------+------------+
|review_month|review_count|
+------------+------------+
|           1|           5|
|           2|           8|
|           3|           1|
|           4|          11|
|           5|           3|
|           6|           2|
|           7|           4|
|           8|           3|
|           9|           9|
|          10|          11|
|          11|           8|
|          12|           5|
+------------+------------+



## Bonus

### Using SQL, find the top 3 genres with the most Positive reviews.

In [37]:
spark.sql(
    """
    SELECT
        genre,
        rating_category,
        count(rating_category) as rating_count
    FROM movie_ratings_2
    WHERE rating_category = "Positive"
    GROUP BY 1, 2
    ORDER BY rating_count DESC
    LIMIT 3
    """
).show()

+--------+---------------+------------+
|   genre|rating_category|rating_count|
+--------+---------------+------------+
| Romance|       Positive|           9|
|Thriller|       Positive|           6|
|   Drama|       Positive|           6|
+--------+---------------+------------+



### Using SQL, show the number of reviews per year for each genre.

In [38]:
spark.sql(
    """
    SELECT
        year(review_date) as review_year,
        genre,
        count(user_id) as review_count
    FROM movie_ratings_2
    GROUP BY 1, 2
    ORDER BY review_year DESC
    """
).show()

+-----------+--------+------------+
|review_year|   genre|review_count|
+-----------+--------+------------+
|       2025|Thriller|           3|
|       2025| Romance|           1|
|       2025|  Action|           1|
|       2025|  Comedy|           1|
|       2025|  Horror|           1|
|       2025|  Sci-Fi|           2|
|       2024|   Drama|          10|
|       2024|  Horror|           6|
|       2024|  Action|           2|
|       2024|  Comedy|           5|
|       2024| Romance|           4|
|       2024|Thriller|           4|
|       2024|  Sci-Fi|           7|
|       2023|  Comedy|           2|
|       2023|  Action|           3|
|       2023|   Drama|           3|
|       2023|  Sci-Fi|           2|
|       2023|Thriller|           4|
|       2023|  Horror|           3|
|       2023| Romance|           6|
+-----------+--------+------------+



### Filter and show all Sci-Fi movies that received a 5-star rating.

In [39]:
df2.filter((col("genre") == "Sci-Fi") & (col("rating") == 5)) \
    .select("movie_title", "genre", "rating") \
    .show()

+--------------------+------+------+
|         movie_title| genre|rating|
+--------------------+------+------+
|      Response along|Sci-Fi|     5|
|      Reach election|Sci-Fi|     5|
|Probably result give|Sci-Fi|     5|
|             Science|Sci-Fi|     5|
+--------------------+------+------+

