In [29]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, when

# Start (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName("movies")
    .getOrCreate()
)

# Read the movies CSV, taking column names from the first line.
# No schema or inferSchema option is supplied here, so column types are
# whatever the CSV reader assigns by default.
df = spark.read.csv(
    "C://Users//trebo//Desktop//BDAData//movies.csv",
    header=True,
)
# RDD handle onto the same rows as `df`.
rdd = df.rdd

# Preview the first five rows.
df.show(n=5)

+--------------------+------+---------+----+--------------------+-----+---------+---------------+--------------------+--------------+--------------+----------+-----------+------------------+-------+
|                name|rating|    genre|year|            released|score|    votes|       director|              writer|          star|       country|    budget|      gross|           company|runtime|
+--------------------+------+---------+----+--------------------+-----+---------+---------------+--------------------+--------------+--------------+----------+-----------+------------------+-------+
|         The Shining|     R|    Drama|1980|June 13, 1980 (Un...|  8.4| 927000.0|Stanley Kubrick|        Stephen King|Jack Nicholson|United Kingdom|19000000.0| 46998772.0|      Warner Bros.|  146.0|
|     The Blue Lagoon|     R|Adventure|1980|July 2, 1980 (Uni...|  5.8|  65000.0| Randal Kleiser|Henry De Vere Sta...|Brooke Shields| United States| 4500000.0| 58853106.0| Columbia Pictures|  104.0|
|Star

In [30]:
# Data Filtering
# Thresholds (all inclusive) used by the three filters below.
MIN_YEAR = 2000
MIN_VOTES = 50000
MIN_BUDGET = 10000000

# The CSV is read without a schema, so these columns arrive as text.
# Cast explicitly to double before comparing instead of leaning on Spark's
# implicit string-to-number coercion, whose rules depend on the Spark
# version / ANSI settings. The cast only affects the predicate; the
# columns themselves are left unchanged in the result.

# Keep only movies released in MIN_YEAR or later
filtered_year = df.filter(col("year").cast("double") >= MIN_YEAR)

# Keep movies with at least MIN_VOTES votes
filtered_votes = filtered_year.filter(col("votes").cast("double") >= MIN_VOTES)

# Keep movies with a budget of at least MIN_BUDGET
# (rows whose budget is null fail the comparison and are dropped here too)
filtered_data = filtered_votes.filter(col("budget").cast("double") >= MIN_BUDGET)

# Show first 10 rows
filtered_data.show(10)


+-----------------+------+---------+----+--------------------+-----+---------+-------------------+-------------------+---------------+-------------+-----------+-----------+--------------------+-------+
|             name|rating|    genre|year|            released|score|    votes|           director|             writer|           star|      country|     budget|      gross|             company|runtime|
+-----------------+------+---------+----+--------------------+-----+---------+-------------------+-------------------+---------------+-------------+-----------+-----------+--------------------+-------+
|    Almost Famous|     R|Adventure|2000|September 22, 200...|  7.9| 260000.0|      Cameron Crowe|      Cameron Crowe|   Billy Crudup|United States| 60000000.0| 47386287.0|   Columbia Pictures|  122.0|
|        Gladiator|     R|   Action|2000|May 5, 2000 (Unit...|  8.5|1400000.0|       Ridley Scott|     David Franzoni|  Russell Crowe|United States|103000000.0|465380802.0| Dreamworks Pictures

In [31]:
# Data Aggregation
# 1. Per-genre mean of the budget and gross columns.
# One aliased avg() expression is built per column: avg_budget, avg_gross.
genre_agg = filtered_data.groupBy("genre").agg(
    *[avg(name).alias(f"avg_{name}") for name in ("budget", "gross")]
)

# Display the aggregated table
genre_agg.show()


+---------+--------------------+--------------------+
|    genre|          avg_budget|           avg_gross|
+---------+--------------------+--------------------+
|    Crime| 3.560094339622641E7|1.1091751068867925E8|
|  Romance|              5.02E7|        1.27869379E8|
| Thriller|               2.6E7|         9.6258201E7|
|Adventure| 8.606846846846847E7| 2.932509976306306E8|
|    Drama|3.8851931330472104E7|1.1873864672103004E8|
|   Family|               1.6E8|       1.264434525E9|
|  Fantasy|3.5333333333333336E7|1.0824172783333333E8|
|  Mystery|              5.94E7|       2.832377132E8|
|Animation|1.0844154929577465E8| 4.378985987394366E8|
|   Horror| 2.435737704918033E7|1.1712742713114753E8|
|Biography|3.5818803418803416E7| 1.128766508974359E8|
|   Comedy| 3.897433155080214E7|1.2573126355347593E8|
|   Action| 8.802018150791366E7|2.7152104056258994E8|
|   Sci-Fi|              5.35E7|       1.010100065E8|
+---------+--------------------+--------------------+



In [32]:
# 2. Number of movies in each rating category.
# GroupedData.count() emits a column named "count"; rename it to movie_count.
rating_count = (
    filtered_data
    .groupBy("rating")
    .count()
    .withColumnRenamed("count", "movie_count")
)

# Display the per-rating totals
rating_count.show()

+---------+-----------+
|   rating|movie_count|
+---------+-----------+
|Not Rated|          3|
|  Unrated|          1|
|       PG|        253|
|    NC-17|          1|
|        R|        738|
|        G|         29|
|    PG-13|        830|
+---------+-----------+



In [33]:
# Data Cleaning
# 1. Discard every row whose budget is missing.
filtered_data = filtered_data.na.drop(subset=["budget"])

# Preview the first ten rows after the drop
filtered_data.show(n=10)

+-----------------+------+---------+----+--------------------+-----+---------+-------------------+-------------------+---------------+-------------+-----------+-----------+--------------------+-------+
|             name|rating|    genre|year|            released|score|    votes|           director|             writer|           star|      country|     budget|      gross|             company|runtime|
+-----------------+------+---------+----+--------------------+-----+---------+-------------------+-------------------+---------------+-------------+-----------+-----------+--------------------+-------+
|    Almost Famous|     R|Adventure|2000|September 22, 200...|  7.9| 260000.0|      Cameron Crowe|      Cameron Crowe|   Billy Crudup|United States| 60000000.0| 47386287.0|   Columbia Pictures|  122.0|
|        Gladiator|     R|   Action|2000|May 5, 2000 (Unit...|  8.5|1400000.0|       Ridley Scott|     David Franzoni|  Russell Crowe|United States|103000000.0|465380802.0| Dreamworks Pictures

In [34]:
# 2. Normalise the rating column: a missing rating or the literal "Unrated"
#    becomes "Not Rated"; every other value is kept as it is.
filtered_data = filtered_data.withColumn(
    "rating",
    when(
        (col("rating") == "Unrated") | col("rating").isNull(),
        "Not Rated",
    ).otherwise(col("rating")),
)

# Preview the first ten rows after relabelling
filtered_data.show(n=10)

+-----------------+------+---------+----+--------------------+-----+---------+-------------------+-------------------+---------------+-------------+-----------+-----------+--------------------+-------+
|             name|rating|    genre|year|            released|score|    votes|           director|             writer|           star|      country|     budget|      gross|             company|runtime|
+-----------------+------+---------+----+--------------------+-----+---------+-------------------+-------------------+---------------+-------------+-----------+-----------+--------------------+-------+
|    Almost Famous|     R|Adventure|2000|September 22, 200...|  7.9| 260000.0|      Cameron Crowe|      Cameron Crowe|   Billy Crudup|United States| 60000000.0| 47386287.0|   Columbia Pictures|  122.0|
|        Gladiator|     R|   Action|2000|May 5, 2000 (Unit...|  8.5|1400000.0|       Ridley Scott|     David Franzoni|  Russell Crowe|United States|103000000.0|465380802.0| Dreamworks Pictures

In [35]:
# 3. Keep only the rows that have a company value.
filtered_data = filtered_data.where(col("company").isNotNull())

# Preview the first ten rows after the drop
filtered_data.show(n=10)

+-----------------+------+---------+----+--------------------+-----+---------+-------------------+-------------------+---------------+-------------+-----------+-----------+--------------------+-------+
|             name|rating|    genre|year|            released|score|    votes|           director|             writer|           star|      country|     budget|      gross|             company|runtime|
+-----------------+------+---------+----+--------------------+-----+---------+-------------------+-------------------+---------------+-------------+-----------+-----------+--------------------+-------+
|    Almost Famous|     R|Adventure|2000|September 22, 200...|  7.9| 260000.0|      Cameron Crowe|      Cameron Crowe|   Billy Crudup|United States| 60000000.0| 47386287.0|   Columbia Pictures|  122.0|
|        Gladiator|     R|   Action|2000|May 5, 2000 (Unit...|  8.5|1400000.0|       Ridley Scott|     David Franzoni|  Russell Crowe|United States|103000000.0|465380802.0| Dreamworks Pictures

In [36]:
# Collect the cleaned Spark DataFrame to the driver as a pandas DataFrame.
cleaned_df = filtered_data.toPandas()

# Write it to clean_movies.csv (relative to the working directory),
# leaving out the pandas index column.
cleaned_df.to_csv(
    "clean_movies.csv",
    index=False,
)

print("File saved successfully as clean_movies.csv")


File saved successfully as clean_movies.csv
