In [1]:
!pip install kagglehub ipywidgets

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [2]:
import os

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Row

import ipywidgets as widgets
from IPython.display import display, HTML


In [4]:
spark = SparkSession.builder \
    .appName("HybridRecommenderDashboard") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.driver.memory","6g") \
    .getOrCreate()

In [5]:
import kagglehub, os

ml20_path = kagglehub.dataset_download("grouplens/movielens-20m-dataset")
print("MovieLens path:", ml20_path)

files = os.listdir(ml20_path)
print(files)

rating_file = "rating.csv" if "rating.csv" in files else "ratings.csv"
movie_file  = "movie.csv"  if "movie.csv"  in files else "movies.csv"

ratings_df = spark.read.csv(f"{ml20_path}/{rating_file}", header=True, inferSchema=True)
movies_df  = spark.read.csv(f"{ml20_path}/{movie_file}",  header=True, inferSchema=True)

Downloading from https://www.kaggle.com/api/v1/datasets/download/grouplens/movielens-20m-dataset?dataset_version_number=1...


100%|██████████| 195M/195M [00:05<00:00, 35.5MB/s]

Extracting files...





MovieLens path: /root/.cache/kagglehub/datasets/grouplens/movielens-20m-dataset/versions/1
['genome_tags.csv', 'genome_scores.csv', 'link.csv', 'rating.csv', 'movie.csv', 'tag.csv']


In [6]:
ml20_path = "/root/.cache/kagglehub/datasets/grouplens/movielens-20m-dataset/versions/1"

movies_path = ml20_path + "/movie.csv"
ratings_path = ml20_path + "/rating.csv"

movies_df = spark.read.csv(movies_path, header=True, inferSchema=True)
ratings_df = spark.read.csv(ratings_path, header=True, inferSchema=True)

movies_df.show(5)
ratings_df.show(5)


+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
+------+-------+------+-------------------+
only showing top 5 rows



In [7]:
movies_df.count()

27278

In [8]:
tmdb_path = kagglehub.dataset_download("ashishkumarjayswal/movies-updated-data")
print("TMDB Path:", tmdb_path)

files = os.listdir(tmdb_path)
print("Available Files:", files)

possible_names = [
    "movies_metadata.csv",
    "Movies.csv",
    "movies.csv",
    "tmdb_5000_movies.csv",
    "updated_movies.csv",
    "movie_metadata.csv"
]

tmdb_file = None
for name in possible_names:
    if name in files:
        tmdb_file = name
        break


if tmdb_file is None:
    csv_files = [f for f in files if f.lower().endswith(".csv")]
    if len(csv_files) == 0:
        raise Exception("❌ No CSV file found in TMDB dataset.")
    tmdb_file = csv_files[0]

print(f"Using TMDB file: {tmdb_file}")


tmdb_csv = f"{tmdb_path}/{tmdb_file}"
tmdb_df = spark.read.csv(tmdb_csv, header=True, inferSchema=True)


tmdb_df.printSchema()
tmdb_df.show(5, truncate=120)


Downloading from https://www.kaggle.com/api/v1/datasets/download/ashishkumarjayswal/movies-updated-data?dataset_version_number=1...


100%|██████████| 215k/215k [00:00<00:00, 728kB/s]

Extracting files...





TMDB Path: /root/.cache/kagglehub/datasets/ashishkumarjayswal/movies-updated-data/versions/1
Available Files: ['movies_updated.csv']
Using TMDB file: movies_updated.csv
root
 |-- name: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- released: string (nullable = true)
 |-- score: double (nullable = true)
 |-- votes: integer (nullable = true)
 |-- director: string (nullable = true)
 |-- writer: string (nullable = true)
 |-- star: string (nullable = true)
 |-- country: string (nullable = true)
 |-- budget: integer (nullable = true)
 |-- gross: long (nullable = true)
 |-- company: string (nullable = true)
 |-- runtime,,: string (nullable = true)

+----------------------------------------------+------+---------+----+-----------------------------+-----+-------+---------------+-----------------------+--------------+--------------+--------+---------+------------------+---------+
|                     

In [9]:
from pyspark.sql.functions import regexp_replace, trim, lower, col, coalesce, lit


movies_norm = movies_df.withColumn(
    "clean_title",
    lower(
        trim(
            regexp_replace(
                regexp_replace(col("title"), r"\([^)]*\)", ""),
                r"[^a-zA-Z0-9 ]", ""
            )
        )
    )
)


tmdb_norm = tmdb_df.withColumn(
    "clean_title",
    lower(
        trim(
            regexp_replace(
                regexp_replace(col("name"), r"\([^)]*\)", ""),
                r"[^a-zA-Z0-9 ]", ""
            )
        )
    )
)

tmdb_norm = tmdb_norm.withColumn(
    "sentiment_score",
    (col("score") / 10)
).fillna({"sentiment_score": 0.5})


movies_with_sentiment = movies_norm.join(
    tmdb_norm.select(
        "clean_title", "sentiment_score", "genre", "year",
        "released", "score", "votes"
    ),
    on="clean_title",
    how="left"
)


movies_with_sentiment = movies_with_sentiment.withColumn(
    "sentiment_score",
    coalesce(col("sentiment_score"), lit(0.5))
)


movies_with_sentiment = movies_with_sentiment.fillna({
    "genre": "Unknown",
    "year": "Unknown",
    "released": "Unknown",
    "score": 0.0,
    "votes": 0
})


movies_with_sentiment.select("title", "sentiment_score").show(15, truncate=False)


+----------------------------------+------------------+
|title                             |sentiment_score   |
+----------------------------------+------------------+
|Toy Story (1995)                  |0.8300000000000001|
|Jumanji (1995)                    |0.7               |
|Grumpier Old Men (1995)           |0.67              |
|Waiting to Exhale (1995)          |0.6               |
|Father of the Bride Part II (1995)|0.61              |
|Heat (1995)                       |0.82              |
|Heat (1995)                       |0.5599999999999999|
|Sabrina (1995)                    |0.63              |
|Tom and Huck (1995)               |0.55              |
|Sudden Death (1995)               |0.58              |
|Sudden Death (1995)               |0.48              |
|GoldenEye (1995)                  |0.72              |
|American President, The (1995)    |0.5               |
|Dracula: Dead and Loving It (1995)|0.5900000000000001|
|Balto (1995)                      |0.71        

In [10]:
FRACTION = 0.52  # adjust if needed

ratings_sample = ratings_df.sample(False, FRACTION, seed=42)

print("Original:", ratings_df.count())
print("Sampled:", ratings_sample.count())

Original: 20000263
Sampled: 10399471


In [11]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, count

train_r, val_r = ratings_sample.randomSplit([0.8, 0.2], seed=42)

als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    rank=50,
    maxIter=10,
    regParam=0.1,
    coldStartStrategy="drop",
    nonnegative=True
)

als_model = als.fit(train_r)

val_preds = als_model.transform(val_r)

evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse")

rmse = evaluator.evaluate(val_preds)
print("ALS RMSE:", rmse)


ALS RMSE: 0.8214651837390587


In [12]:
ratings_df.groupBy("userId").count().orderBy(col("count").desc()).show(20)

+------+-----+
|userId|count|
+------+-----+
|118205| 9254|
|  8405| 7515|
| 82418| 5646|
|121535| 5520|
|125794| 5491|
| 74142| 5447|
| 34576| 5356|
|131904| 5330|
| 83090| 5169|
| 59477| 4988|
|130767| 4785|
| 79159| 4707|
|  8963| 4524|
| 15617| 4354|
| 92011| 4236|
| 71975| 4182|
| 20132| 4101|
| 46470| 4094|
| 88820| 4093|
| 63147| 3958|
+------+-----+
only showing top 20 rows



In [18]:
# ================= ENHANCED BIG-DATA MOVIE DASHBOARD =================
# Features:
# - Dropdown movie selection
# - Dark mode Netflix-style card
# - Your updated decision logic
# - Weighted ALS + Sentiment score for ranking
# - Top 10 recommendations table
# - Similar movies by genre
# - Top 5 title matches
# - Threshold sliders
# - Simple charts (sentiment + ALS score distributions)

import ipywidgets as widgets
from IPython.display import display, HTML
from pyspark.sql import Row
from pyspark.sql.functions import lower, col, explode
import matplotlib.pyplot as plt

# ---------- CONFIG ----------
ALS_WEIGHT = 0.7
SENTIMENT_WEIGHT = 0.3


EXAMPLE_USER_ID = 118205

# ---------- FULL MOVIE LIST ----------
movie_list = movies_df.select("title").toPandas()["title"].tolist()
movie_list_sorted = sorted(movie_list)

# ---------- WIDGETS ----------

# Movie dropdown
movie_dropdown = widgets.Dropdown(
    options=movie_list_sorted,
    description="🎬 Movie:",
    layout=widgets.Layout(width="600px")
)

# Buttons
analyze_button = widgets.Button(
    description="Analyze movie",
    button_style="info",
    layout=widgets.Layout(width="160px")
)

toprecs_button = widgets.Button(
    description="Top 10 for me",
    button_style="success",
    layout=widgets.Layout(width="160px")
)

similar_button = widgets.Button(
    description="Similar movies",
    button_style="warning",
    layout=widgets.Layout(width="160px")
)

charts_button = widgets.Button(
    description="Show charts",
    button_style="",
    layout=widgets.Layout(width="140px")
)

# Threshold sliders (your logic, but tweakable)
strong_thr_slider = widgets.FloatSlider(
    value=3.5,
    min=2.0,
    max=5.0,
    step=0.1,
    description='Strong ≥',
    continuous_update=False,
    layout=widgets.Layout(width="250px")
)

mid_thr_slider = widgets.FloatSlider(
    value=3.0,
    min=1.0,
    max=4.0,
    step=0.1,
    description='Mid ≥',
    continuous_update=False,
    layout=widgets.Layout(width="250px")
)

sent_thr_slider = widgets.FloatSlider(
    value=0.5,
    min=0.0,
    max=1.0,
    step=0.05,
    description='Sent ≥',
    continuous_update=False,
    layout=widgets.Layout(width="250px")
)

output_movie = widgets.Output()
output_recs = widgets.Output()
output_similar = widgets.Output()
output_charts = widgets.Output()


# ---------- DECISION LOGIC (YOUR UPDATED VERSION) ----------
def get_decision(predicted_rating, sentiment_score,
                 strong_thr, mid_thr, sent_thr):
    """
    Your updated logic:
    - if pred >= strong_thr and sentiment >= sent_thr -> Strong Recommend
    - elif mid_thr <= pred < strong_thr -> soft Recommend
    - else -> Not recommended
    """
    if predicted_rating is None:
        return "⚠ ALS cannot predict (cold start).", "#FFCC66"

    if predicted_rating >= strong_thr and sentiment_score >= sent_thr:
        return "✅ RECOMMEND — Strong predicted rating AND positive sentiment.", "#6CFF6C"
    elif mid_thr <= predicted_rating < strong_thr:
        return "🤔 Recommend (Rating is good, but not very strong).", "#FFDD66"
    else:
        return "❌ DO NOT RECOMMEND — Rating too low.", "#FF7777"


# ---------- ANALYZE SINGLE MOVIE ----------
def analyze_movie(_btn):
    with output_movie:
        output_movie.clear_output()

        movie_query = movie_dropdown.value.strip()
        if not movie_query:
            display(HTML("<p style='color:#ff7777;'>❌ Please choose a movie.</p>"))
            return

        # Try exact match first
        matches_df = movies_with_sentiment.filter(
            lower(col("title")) == movie_query.lower()
        ).limit(5)

        # Fallback to partial search
        if matches_df.count() == 0:
            matches_df = movies_with_sentiment.filter(
                lower(col("title")).contains(movie_query.lower())
            ).limit(5)

        matches = matches_df.collect()
        if len(matches) == 0:
            display(HTML(f"<p style='color:#ff7777;'>❌ No movie found matching '{movie_query}'.</p>"))
            return

        movie = matches[0]
        movie_id = int(movie["movieId"])
        title = movie["title"]
        genres = movie["genres"] if "genres" in movie else "Unknown"
        sentiment_score = float(movie["sentiment_score"])
        poster_url = getattr(movie, "poster_url", None)  # if you ever add poster_url

        # ALS prediction
        test_df = spark.createDataFrame([Row(userId=EXAMPLE_USER_ID, movieId=movie_id)])
        result = als_model.transform(test_df).collect()
        if len(result) == 0 or result[0]["prediction"] is None:
            predicted_rating = None
        else:
            predicted_rating = float(result[0]["prediction"])

        # Decision using sliders (but same structure as your logic)
        strong_thr = strong_thr_slider.value
        mid_thr = mid_thr_slider.value
        sent_thr = sent_thr_slider.value

        decision_text, decision_color = get_decision(
            predicted_rating, sentiment_score,
            strong_thr, mid_thr, sent_thr
        )

        # Alternate title matches
        alt_titles_html = ""
        if len(matches) > 1:
            alt_titles_html = "<ul style='padding-left:20px;'>"
            for m in matches[1:]:
                alt_titles_html += f"<li>{m['title']}</li>"
            alt_titles_html += "</ul>"

        # Weighted score (for explanation)
        if predicted_rating is not None:
            final_score = ALS_WEIGHT * predicted_rating + SENTIMENT_WEIGHT * sentiment_score
        else:
            final_score = None

        # Dark Netflix-like card
        display(HTML(f"""
        <div style="
            background:#141414;
            color:#e0e0e0;
            padding:20px;
            border-radius:14px;
            width:840px;
            font-family:Arial, sans-serif;
            box-shadow:0px 0px 15px rgba(0,0,0,0.6);
        ">
            <div style="display:flex; gap:20px;">
                {
                    "<div style='flex:0 0 220px;'><div style='width:220px;height:310px;border-radius:12px;background:#333;display:flex;align-items:center;justify-content:center;color:#777;'>No Poster</div></div>"
                    if not poster_url else
                    f"<div style='flex:0 0 220px;'><img src='{poster_url}' style='width:220px;height:310px;border-radius:12px;object-fit:cover;'/></div>"
                }
                <div style="flex:1;">
                    <h2 style="margin-top:0;color:#fff;">{title}</h2>
                    <p><b style="color:#aaa;">Genres:</b> {genres}</p>
                    <p><b style="color:#aaa;">Sentiment Score:</b> {sentiment_score:.2f}</p>
                    <p><b style="color:#aaa;">ALS Predicted Rating (User {EXAMPLE_USER_ID}):</b> {('N/A' if predicted_rating is None else f'{predicted_rating:.2f}')}</p>
                    <p><b style="color:#aaa;">Weighted Score (0.7·ALS + 0.3·Sentiment):</b> {('N/A' if final_score is None else f'{final_score:.2f}')}</p>
                </div>
            </div>

            <hr style="border:0; border-top:1px solid #444; margin:15px 0;">

            <h3 style="color:{decision_color}; font-size:22px; margin-bottom:8px;">
                {decision_text}
            </h3>

            <p style="margin:4px 0;"><b>Reasoning:</b></p>
            <ul style="margin-top:4px; padding-left:18px; color:#ccc;">
                <li>ALS Rating = {('N/A' if predicted_rating is None else f'{predicted_rating:.2f}')} (Strong ≥ {strong_thr:.1f}, Mid ≥ {mid_thr:.1f})</li>
                <li>Sentiment Score = {sentiment_score:.2f} (threshold ≥ {sent_thr:.2f})</li>
                <li>Weighted Score = {('N/A' if final_score is None else f'{final_score:.2f}')} (0.7·ALS + 0.3·Sentiment)</li>
            </ul>

            {"<p style='margin-top:10px; color:#bbb;'><b>Other matching titles:</b>" + alt_titles_html + "</p>" if len(matches) > 1 else ""}

            <hr style="border:0; border-top:1px solid #444; margin:15px 0;">

            <p><i style="color:#aaa;">💬 Leave a quick review to help other users:</i></p>
            <textarea style="
                width:100%;
                height:70px;
                background:#222;
                color:#eee;
                border-radius:8px;
                border:1px solid #444;
                padding:8px;
            "></textarea>
        </div>
        """))


# ---------- TOP 10 RECOMMENDATIONS ----------
def show_top_recommendations(_btn):
    with output_recs:
        output_recs.clear_output()

        # Get ALS top 30 for this user
        user_df = spark.createDataFrame([Row(userId=EXAMPLE_USER_ID)])
        user_recs = als_model.recommendForUserSubset(user_df, 30)

        if user_recs.count() == 0:
            display(HTML("<p style='color:#ff7777;'>⚠ No recommendations available for this user.</p>"))
            return

        recs = (user_recs
                .select(explode("recommendations").alias("rec"))
                .select(col("rec.movieId").alias("movieId"),
                        col("rec.rating").alias("als_score")))

        recs_joined = (recs.join(
            movies_with_sentiment.select("movieId", "title", "genres", "sentiment_score"),
            on="movieId",
            how="left"
        )
        .withColumn("final_score", ALS_WEIGHT*col("als_score") + SENTIMENT_WEIGHT*col("sentiment_score"))
        .orderBy(col("final_score").desc())
        .limit(10))

        recs_pd = recs_joined.toPandas()

        strong_thr = strong_thr_slider.value
        mid_thr = mid_thr_slider.value
        sent_thr = sent_thr_slider.value

        rows_html = ""
        for _, r in recs_pd.iterrows():
            decision_text, decision_color = get_decision(
                r["als_score"], r["sentiment_score"],
                strong_thr, mid_thr, sent_thr
            )
            rows_html += f"""
            <tr>
                <td>{r['title']}</td>
                <td>{r['genres']}</td>
                <td style="text-align:center;">{r['als_score']:.2f}</td>
                <td style="text-align:center;">{r['sentiment_score']:.2f}</td>
                <td style="text-align:center;">{r['final_score']:.2f}</td>
                <td style="color:{decision_color}; text-align:center;">{decision_text.split(' — ')[0]}</td>
            </tr>
            """

        display(HTML(f"""
        <div style="
            background:#101010;
            color:#e0e0e0;
            padding:20px;
            border-radius:14px;
            width:900px;
            font-family:Arial, sans-serif;
            margin-top:15px;
            box-shadow:0px 0px 12px rgba(0,0,0,0.7);
        ">
            <h2 style="margin-top:0; color:#fff;">🔥 Top 10 Recommendations for User {EXAMPLE_USER_ID}</h2>
            <table style="width:100%; border-collapse:collapse; font-size:14px;">
                <thead>
                    <tr style="background:#222;">
                        <th style="padding:8px; border-bottom:1px solid #444; text-align:left;">Title</th>
                        <th style="padding:8px; border-bottom:1px solid #444; text-align:left;">Genres</th>
                        <th style="padding:8px; border-bottom:1px solid #444; text-align:center;">ALS</th>
                        <th style="padding:8px; border-bottom:1px solid #444; text-align:center;">Sent.</th>
                        <th style="padding:8px; border-bottom:1px solid #444; text-align:center;">Weighted</th>
                        <th style="padding:8px; border-bottom:1px solid #444; text-align:center;">Decision</th>
                    </tr>
                </thead>
                <tbody>
                    {rows_html}
                </tbody>
            </table>
        </div>
        """))


# ---------- SIMILAR MOVIES MODULE ----------
def show_similar_movies(_btn):
    with output_similar:
        output_similar.clear_output()

        movie_query = movie_dropdown.value.strip()
        if not movie_query:
            display(HTML("<p style='color:#ff7777;'>❌ Choose a base movie first.</p>"))
            return

        base_df = movies_with_sentiment.filter(
            lower(col("title")) == movie_query.lower()
        ).limit(1)
        if base_df.count() == 0:
            display(HTML("<p style='color:#ff7777;'>❌ Base movie not found.</p>"))
            return

        base_movie = base_df.first()
        base_genres = base_movie.genres or "Unknown"
        base_main_genre = base_genres.split('|')[0] if base_genres != "Unknown" else None

        if not base_main_genre:
            display(HTML("<p style='color:#ff7777;'>⚠ No genres available for this movie.</p>"))
            return

        # Find movies sharing main genre
        similar_df = (movies_with_sentiment
                      .filter(col("genres").contains(base_main_genre))
                      .filter(col("movieId") != base_movie.movieId)
                      .limit(200))

        # Attach ALS scores for user
        sim_pd = similar_df.select("movieId", "title", "genres", "sentiment_score").toPandas()
        if sim_pd.empty:
            display(HTML("<p style='color:#ff7777;'>⚠ No similar movies found.</p>"))
            return

        # For simplicity: treat ALS scores via a small Spark batch
        sim_ids = [int(m) for m in sim_pd["movieId"].tolist()]
        als_input_df = spark.createDataFrame([Row(userId=EXAMPLE_USER_ID, movieId=int(m)) for m in sim_ids])
        als_out = als_model.transform(als_input_df).toPandas()
        als_map = dict(zip(als_out["movieId"], als_out["prediction"]))

        sim_pd["als_score"] = sim_pd["movieId"].map(als_map)
        sim_pd = sim_pd.dropna(subset=["als_score"])

        sim_pd["final_score"] = ALS_WEIGHT*sim_pd["als_score"] + SENTIMENT_WEIGHT*sim_pd["sentiment_score"]
        sim_pd = sim_pd.sort_values("final_score", ascending=False).head(10)

        rows = ""
        for _, r in sim_pd.iterrows():
            rows += f"""
            <tr>
                <td>{r['title']}</td>
                <td>{r['genres']}</td>
                <td style="text-align:center;">{r['als_score']:.2f}</td>
                <td style="text-align:center;">{r['sentiment_score']:.2f}</td>
                <td style="text-align:center;">{r['final_score']:.2f}</td>
            </tr>
            """

        display(HTML(f"""
        <div style="
            background:#101010;
            color:#e0e0e0;
            padding:20px;
            border-radius:14px;
            width:900px;
            font-family:Arial, sans-serif;
            margin-top:15px;
            box-shadow:0px 0px 12px rgba(0,0,0,0.7);
        ">
            <h2 style="margin-top:0; color:#fff;">🎬 Movies similar to: {base_movie.title}</h2>
            <p style="color:#bbb;">Based on main genre: <b>{base_main_genre}</b></p>
            <table style="width:100%; border-collapse:collapse; font-size:14px;">
                <thead>
                    <tr style="background:#222;">
                        <th style="padding:8px; border-bottom:1px solid #444; text-align:left;">Title</th>
                        <th style="padding:8px; border-bottom:1px solid #444; text-align:left;">Genres</th>
                        <th style="padding:8px; border-bottom:1px solid #444; text-align:center;">ALS</th>
                        <th style="padding:8px; border-bottom:1px solid #444; text-align:center;">Sentiment</th>
                        <th style="padding:8px; border-bottom:1px solid #444; text-align:center;">Weighted</th>
                    </tr>
                </thead>
                <tbody>
                    {rows}
                </tbody>
            </table>
        </div>
        """))


# ---------- CHARTS ----------
def show_charts(_btn):
    with output_charts:
        output_charts.clear_output()

        # sample for chart speed
        sample_df = movies_with_sentiment.sample(False, 0.15, seed=42)
        pdf = sample_df.select("sentiment_score").toPandas()

        fig, ax = plt.subplots(figsize=(6,4))
        ax.hist(pdf["sentiment_score"], bins=20)
        ax.set_title("Sentiment Score Distribution (sampled)")
        ax.set_xlabel("Sentiment score")
        ax.set_ylabel("Count")
        plt.tight_layout()
        display(fig)
        plt.close(fig)



analyze_button.on_click(analyze_movie)
toprecs_button.on_click(show_top_recommendations)
similar_button.on_click(show_similar_movies)
charts_button.on_click(show_charts)

button_row1 = widgets.HBox([analyze_button, toprecs_button, similar_button, charts_button])
slider_row1 = widgets.HBox([strong_thr_slider, mid_thr_slider, sent_thr_slider])

display(movie_dropdown, button_row1, slider_row1,
        output_movie, output_recs, output_similar, output_charts)


Dropdown(description='🎬 Movie:', layout=Layout(width='600px'), options=('""Great Performances"" Cats (1998)', …

HBox(children=(Button(button_style='info', description='Analyze movie', layout=Layout(width='160px'), style=Bu…

HBox(children=(FloatSlider(value=3.5, continuous_update=False, description='Strong ≥', layout=Layout(width='25…

Output()

Output()

Output()

Output()

In [19]:

from pyspark.sql.functions import col, split, explode

print("=== TOP 10 RECOMMENDATIONS FOR USER ===")
top10 = als_model.recommendForAllUsers(10)
top10.filter(col("userId") == EXAMPLE_USER_ID).show(truncate=False)

print("\n=== SENTIMENT SCORE SAMPLE ===")
movies_with_sentiment.select("title", "sentiment_score").show(10, truncate=False)

print("\n=== GENRE SENTIMENT AVERAGES ===")
genre_sent = movies_with_sentiment.withColumn(
    "genre", explode(split(col("genres"), "\\|"))
)
genre_sent.groupBy("genre").avg("sentiment_score") \
    .orderBy("avg(sentiment_score)", ascending=False).show(20, truncate=False)

print("\n=== RANDOM MOVIE RATING & SENTIMENT COMPARISON ===")
sample_compare = movies_with_sentiment.join(
    ratings_sample.groupBy("movieId").avg("rating"),
    on="movieId", how="left"
).select("title", "sentiment_score", "avg(rating)")
sample_compare.show(15, truncate=False)


=== TOP 10 RECOMMENDATIONS FOR USER ===
+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                                        |
+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|118205|[{129401, 5.308905}, {128812, 5.308905}, {128091, 5.308905}, {82055, 5.1622553}, {34464, 5.122144}, {117907, 5.014639}, {112423, 4.729891}, {86368, 4.6899037}, {107623, 4.686054}, {102107, 4.686054}]|
+------+--------------------------------------------------------------------------------------------------------------------