In [1]:
!pip install pyspark py4j
!pip install findspark
!pip install gradio

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
Collecting gradio
  Downloading gradio-5.13.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.6.0 (from gradio)
  Downloading gradio_client-1.6.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting pyt

In [2]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName('MovieLens').getOrCreate()

In [3]:
# Load data
ratings = spark.read.option("header", "true").csv("/content/ratings.csv")
movies = spark.read.option("header", "true").csv("/content/movies.csv")


In [4]:
# Show a sample of the data
ratings.show(5)
movies.show(5)
ratings.printSchema()


+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [5]:
from pyspark.sql.functions import *

# Most popular movies by the number of ratings
most_popular = ratings \
    .groupBy("movieId") \
    .agg(count("userId").alias("num_ratings")) \
    .sort(desc("num_ratings"))

# Join with movie details
most_popular_movies = most_popular.join(movies, "movieId")


In [6]:
# Top-rated movies by average rating
top_rated = ratings \
    .groupBy("movieId") \
    .agg(avg(col("rating")).alias("avg_rating"), count("userId").alias("num_ratings")) \
    .sort(desc("avg_rating"), desc("num_ratings"))

top_rated_movies = top_rated.join(movies, "movieId")


In [7]:
# Movies with high standard deviation in ratings (marmite movies)
ratings_stddev = ratings \
    .groupBy("movieId") \
    .agg(count("userId").alias("num_ratings"),
         avg(col("rating")).alias("avg_rating"),
         stddev(col("rating")).alias("std_rating")) \
    .where("num_ratings > 500")

marmite_movies = ratings_stddev.join(movies, "movieId").sort(desc("std_rating"))


In [8]:
# New feature: Rating deviation (how much the ratings deviate from the average)
rating_deviation = ratings \
    .groupBy("movieId") \
    .agg(
        avg(col("rating")).alias("avg_rating"),
        count("userId").alias("num_ratings"),
        stddev(col("rating")).alias("std_rating")
    ).withColumn(
        "rating_deviation", abs(col("avg_rating") - 3)  # Assuming 3 is the neutral rating
    )


In [9]:
# Functions to return data for the Gradio interface
def show_most_popular():
    return most_popular_movies.limit(10).toPandas()

def show_top_rated():
    return top_rated_movies.limit(10).toPandas()

def show_marmite_movies():
    return marmite_movies.limit(10).toPandas()

def show_rating_deviation():
    return rating_deviation.limit(10).toPandas()


In [12]:
import gradio as gr

# Gradio interface with added PySpark logo in the Project Overview tab
with gr.Blocks(css=".gr-button {background-color: #4CAF50; color: white; padding: 10px 20px; border-radius: 5px;} .gr-markdown {font-family: 'Arial', sans-serif;}") as demo:
    with gr.Tabs():
        with gr.Tab("Project Overview"):
            gr.Markdown("## MovieLens Data Analysis Project")

            # Add PySpark logo image
            gr.Image("/content/EssentialMovies.jpg", label="Powered by PySpark", height=150)

            gr.Markdown("""
            This project explores the MovieLens dataset to derive insights on movie ratings and popularity.
            Key features include identifying the most popular movies, top-rated movies, and movies with high rating deviation.

            ### Key Features
            - **Most Popular Movies**: Movies ranked by the number of ratings.
            - **Top-Rated Movies**: Movies ranked by average rating.
            - **Marmite Movies**: Movies with high variability in ratings.
            - **Rating Deviation**: How much ratings deviate from the average.

            ### Tools Used
            - PySpark for data processing
            - Gradio for the user interface
            """)

        with gr.Tab("Data Insights"):
            gr.Markdown("## Explore MovieLens Data Insights")
            gr.Markdown("Click a button to view different insights about the movies.")

            with gr.Row():
                most_popular_button = gr.Button("Show Most Popular Movies")
                top_rated_button = gr.Button("Show Top Rated Movies")
                marmite_button = gr.Button("Show Marmite Movies")
                rating_deviation_button = gr.Button("Show Rating Deviation")

            output = gr.DataFrame()

            # Link buttons to their respective functions
            most_popular_button.click(show_most_popular, outputs=output)
            top_rated_button.click(show_top_rated, outputs=output)
            marmite_button.click(show_marmite_movies, outputs=output)
            rating_deviation_button.click(show_rating_deviation, outputs=output)

# Launch the app
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9cc099b1e066a5ef32.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


