<a href="https://colab.research.google.com/github/ananyabadkar/movie-ratings-Spark-practice-notebooks-/blob/main/notebooks/%20spark_colab_ratings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🚀 Spark in Google Colab – Movie Ratings Example
This notebook shows how to set up PySpark in Google Colab and analyze a tab-separated `ratings.data` file (columns: user, movie, rating, timestamp) without installing Spark/Anaconda locally.

In [None]:
!pip install pyspark -q

In [None]:
from pyspark.sql import SparkSession

# Configure the session first, then start it (or reuse one that is already
# running in this kernel).
session_builder = SparkSession.builder.appName("MovieRatings")
spark = session_builder.getOrCreate()

# Leave the session as the last expression so the notebook displays it.
spark

In [None]:
from google.colab import files

# Opens Colab's file picker; `uploaded` maps each uploaded file name to its
# contents.
uploaded = files.upload()   # select ratings.data from your computer

# The load cell reads the literal path "ratings.data". Warn right away (without
# aborting) if that is not the name the upload was stored under, instead of
# letting the mismatch show up later as a Spark read error or a stale file.
if "ratings.data" not in uploaded:
    print(
        "Warning: expected an upload named 'ratings.data' but received "
        f"{sorted(uploaded)}. The load cell reads 'ratings.data'."
    )


Saving ratings.data to ratings.data


In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

# Explicit column layout for ratings.data; every column is nullable.
schema = (
    StructType()
    .add(StructField("user", IntegerType(), True))
    .add(StructField("movie", IntegerType(), True))
    .add(StructField("rating", IntegerType(), True))
    .add(StructField("timestamp", LongType(), True))
)

# Read the file as tab-separated text, applying the schema above rather than
# letting Spark infer column names and types.
ratings_reader = spark.read.schema(schema).option("sep", "\t")
df = ratings_reader.csv("ratings.data")

In [None]:
# Quick sanity check of the load: column types, a preview of the first rows,
# and the total number of rows.
df.printSchema()
df.show(n=8)

row_total = df.count()
row_total

root
 |-- user: integer (nullable = true)
 |-- movie: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: long (nullable = true)

+----+-----+------+---------+
|user|movie|rating|timestamp|
+----+-----+------+---------+
|   1|   50|     4|881250949|
|   2|   50|     5|891717742|
|   3|   10|     2|880606923|
|   4|   20|     5|886397596|
|   5|   30|     3|888550774|
|   6|   40|     4|892430093|
|   7|   60|     5|878887116|
|   8|   70|     3|880606923|
+----+-----+------+---------+
only showing top 8 rows



10

In [None]:
from pyspark.sql import functions as F

# Mean rating for each movie.
ratings_by_movie = df.groupBy("movie")
avg_df = ratings_by_movie.agg(F.avg("rating").alias("avg_rating"))

# Show the ten best-rated movies first.
avg_df.sort(F.col("avg_rating").desc()).show(10)

+-----+----------+
|movie|avg_rating|
+-----+----------+
|   20|       5.0|
|   60|       5.0|
|   50|       4.5|
|   40|       4.0|
|   80|       4.0|
|   70|       3.0|
|   30|       3.0|
|   10|       2.0|
|   90|       2.0|
+-----+----------+



In [None]:
# Number of ratings each movie received.
count_df = df.groupBy("movie").agg(F.count(F.lit(1)).alias("num_ratings"))

# Most-rated movies first.
count_df.sort(F.col("num_ratings").desc()).show(10)

+-----+-----------+
|movie|num_ratings|
+-----+-----------+
|   50|          2|
|   20|          1|
|   40|          1|
|   10|          1|
|   80|          1|
|   70|          1|
|   60|          1|
|   90|          1|
|   30|          1|
+-----+-----------+



In [None]:
# Combine the per-movie average rating and rating count into one table.
stats_df = avg_df.join(count_df, "movie")

# Minimum number of ratings a movie needs before its average is trusted.
# Tune this to the size of the dataset: with the 10-row sample shown in this
# notebook no movie has more than 2 ratings, so a threshold of 5 filters out
# every row and the table below is empty.
MIN_RATINGS = 5

popular_df = stats_df.filter(F.col("num_ratings") >= MIN_RATINGS)

# Best average first; ties broken by how many ratings back the average.
popular_df.orderBy(F.desc("avg_rating"), F.desc("num_ratings")).show(10)

+-----+----------+-----------+
|movie|avg_rating|num_ratings|
+-----+----------+-----------+
+-----+----------+-----------+



In [None]:
from google.colab import files

# Rank the popular movies (best average first, then most ratings) and keep
# at most the first 20.
ranked_df = popular_df.orderBy(F.desc("avg_rating"), F.desc("num_ratings"))
top20 = ranked_df.limit(20)

# Collect the result to the driver as a pandas DataFrame and write it to CSV
# without the pandas index column.
pdf = top20.toPandas()
pdf.to_csv("top20_movies.csv", index=False)

# Hand the CSV to the browser as a download.
files.download("top20_movies.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>