# Data Visualisation

In [None]:
pip install plotly

In [10]:
pip install pyarrow

Collecting pyarrow
  Obtaining dependency information for pyarrow from https://files.pythonhosted.org/packages/a0/8e/9adee63dfa3911be2382fb4d92e4b2e7d82610f9d9f668493bebaa2af50f/pyarrow-20.0.0-cp312-cp312-win_amd64.whl.metadata
  Downloading pyarrow-20.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Downloading pyarrow-20.0.0-cp312-cp312-win_amd64.whl (25.7 MB)
   ---------------------------------------- 0.0/25.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/25.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/25.7 MB 495.5 kB/s eta 0:00:52
    --------------------------------------- 0.4/25.7 MB 3.6 MB/s eta 0:00:08
   - -------------------------------------- 1.0/25.7 MB 6.4 MB/s eta 0:00:04
   -- ------------------------------------- 1.7/25.7 MB 8.5 MB/s eta 0:00:03
   --- ------------------------------------ 2.5/25.7 MB 10.0 MB/s eta 0:00:03
   ----- ---------------------------------- 3.5/25.7 MB 11.9 MB/s eta 0:00:02
   ------- -------------------


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
pip install fastparquet

Collecting fastparquet
  Obtaining dependency information for fastparquet from https://files.pythonhosted.org/packages/b1/f9/98cd0c39115879be1044d59c9b76e8292776e99bb93565bf990078fd11c4/fastparquet-2024.11.0-cp312-cp312-win_amd64.whl.metadata
  Downloading fastparquet-2024.11.0-cp312-cp312-win_amd64.whl.metadata (4.3 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Obtaining dependency information for cramjam>=2.3 from https://files.pythonhosted.org/packages/26/c7/baf6b960403313f9df3217f7b8039bb2e403559c95641e23a0b0056283c2/cramjam-2.10.0-cp312-cp312-win_amd64.whl.metadata
  Downloading cramjam-2.10.0-cp312-cp312-win_amd64.whl.metadata (5.1 kB)
Collecting fsspec (from fastparquet)
  Obtaining dependency information for fsspec from https://files.pythonhosted.org/packages/bb/61/78c7b3851add1481b048b5fdc29067397a1784e2910592bc81bb3f608635/fsspec-2025.5.1-py3-none-any.whl.metadata
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Downloading fastparquet-2024.11.0-cp312-cp312-


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
pip install seaborn

Collecting seaborn
  Obtaining dependency information for seaborn from https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl.metadata
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
   ---------------------------------------- 0.0/294.9 kB ? eta -:--:--
   ---------------------------------------- 0.0/294.9 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/294.9 kB 660.6 kB/s eta 0:00:01
   ---------------------------------------- 294.9/294.9 kB 3.6 MB/s eta 0:00:00
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [33]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from filmsdk_ibrahim import MovieClient, MovieConfig
import time
import json
from collections import Counter, defaultdict
from pathlib import Path
import fastparquet
import pyarrow
import re
import pickle
import os
from statistics import mean

In [29]:
# Output folder
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

In [30]:
# API connection

config = MovieConfig(movie_base_url="http://localhost")
client = MovieClient(config=config)

client.health_check()

MOVIE_API_BASE_URL in MovieConfig init: http://localhost


{'message': 'api is working well'}

# Top 10 genres by number of films

In [12]:
analytics = client.get_analytics()
print(analytics)

api_movie_count = analytics.movie_count
print(api_movie_count)

genre_data_file = output_dir / "genre_df.parquet"
meta_file = output_dir / "meta.json"

# Read the metadata file if it exists
if meta_file.exists():
    with open(meta_file, "r") as f:
        meta = json.load(f)
    cached_movie_count = meta.get("movie_count", 0)
else:
    meta = {}
    cached_movie_count = 0

# Decision: use cache or recalculate
if genre_data_file.exists() and cached_movie_count == api_movie_count:
    print("Loading data from cache...")
    genre_df = pd.read_parquet(genre_data_file)
else:
    print("Updating data from the API...")
    # Initialize the genre counter
    genre_counter = Counter()

    # Parameters for batching
    limit = 1000
    skip = 0

    while True:
        batch = client.list_movies(skip=skip, limit=limit, output_format="dict")
        if not batch:
            break

        # Extract genres from the batch and count them
        for movie in batch:
            genres = movie.get("genres", "")
            genre_list = genres.split("|") if genres else []
            genre_counter.update(genre_list)

        skip += limit
        time.sleep(0.5)  # To comply with the API

    # Convert the Counter to a DataFrame
    genre_df = pd.DataFrame(genre_counter.items(), columns=["genre", "count"])
    genre_df = genre_df.sort_values("count", ascending=False).head(10)

    # Save the data
    genre_df.to_parquet(genre_data_file, index=False)
    with open(meta_file, "w") as f:
        json.dump({"movie_count": api_movie_count}, f)

genre_df

movie_count=9742 rating_count=100836 tag_count=3683 link_count=9742
9742
Loading data from cache...


Unnamed: 0,genre,count
0,Drama,4361
1,Comedy,3756
2,Thriller,1894
3,Action,1828
4,Romance,1596
5,Adventure,1263
6,Crime,1199
7,Sci-Fi,980
8,Horror,978
9,Fantasy,779


In [13]:
# Plotly display
fig = px.bar(
    genre_df,
    x="count",
    y="genre",
    title="Top 10 genres by number of films",
    labels={"genre": "Genre", "count": "Number of films"},
    color="count",
    color_continuous_scale="viridis",
    orientation='h'
)

fig.update_layout(
    yaxis={'categoryorder':'total ascending'},
    height=500
)

fig.show()

# Analyse of user behaviour

In [35]:
# File paths
output_dir = "output"
ratings_path = os.path.join(output_dir, "ratings.parquet")

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Get the current total number of ratings
current_rating_count = analytics.rating_count

# Reload or regenerate the data
if os.path.exists(ratings_path):
    ratings_df = pd.read_parquet(ratings_path)
    # Check if cached data matches the current rating count
    if ratings_df.shape[0] != current_rating_count:
        print("Data has changed, reloading from the API...")
        ratings_df = []
        limit, skip = 1000, 0
        while True:
            batch = client.list_ratings(skip=skip, limit=limit, output_format="pandas")
            if batch.empty:
                break
            ratings_df.append(batch)
            skip += limit
            time.sleep(0.5)
        ratings_df = pd.concat(ratings_df, ignore_index=True)
        ratings_df.to_parquet(ratings_path, index=False)
    else:
        print("Reusing cached data.")
else:
    print("No cached data found, performing initial download...")
    ratings_df = []
    limit, skip = 1000, 0
    while True:
        batch = client.list_ratings(skip=skip, limit=limit, output_format="pandas")
        if batch.empty:
            break
        ratings_df.append(batch)
        skip += limit
        time.sleep(0.5)
    ratings_df = pd.concat(ratings_df, ignore_index=True)
    ratings_df.to_parquet(ratings_path, index=False)


No cached data found, performing initial download...


In [36]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [37]:
# Graph 1: Number of ratings per user
ratings_per_user = ratings_df['userId'].value_counts().reset_index()
ratings_per_user.columns = ['userId', 'rating_count']
top_users = ratings_per_user.head(10)

# Horizontal bar plot
fig1 = px.bar(
    top_users,
    x="rating_count",
    y=top_users["userId"].astype(str),  # convert to string to avoid numeric scale on y-axis
    orientation="h",
    title="Top 10 Users by Number of Ratings",
    labels={"userId": "User", "rating_count": "Number of Ratings"},
    color="rating_count",
    color_continuous_scale="viridis"
)

# Sort so the largest value is on top
fig1.update_layout(
    yaxis={'categoryorder':'total ascending'},
    height=500
)

fig1.show()

In [38]:
# Graph 2: Distribution of given ratings
fig2 = px.histogram(
    ratings_df,
    x="rating",
    nbins=10,
    title="Distribution of Ratings Given",
    labels={"rating": "Rating"},
)
fig2.update_layout(bargap=0.1)
fig2.show()


In [39]:
# Graph 3: Very positive vs very critical users
avg_rating_per_user = ratings_df.groupby("userId")["rating"].mean().reset_index()
fig3 = px.histogram(
    avg_rating_per_user,
    x="rating",
    nbins=50,
    title="Distribution of Average Ratings per User",
    labels={"rating": "Average Rating"},
)
fig3.update_layout(bargap=0.1)
fig3.show()


# Total number of films per year

In [14]:
yearly_data_file = output_dir / "movies_by_year.parquet"
meta_file = output_dir / "meta_movies_by_year.json"

# === Retrieve the total number of movies via analytics ===
#analytics = client.get_analytics()
api_movie_count = analytics.movie_count

# === Read from cache if it exists ===
if meta_file.exists():
    with open(meta_file, "r") as f:
        meta = json.load(f)
    cached_movie_count = meta.get("movie_count", 0)
else:
    cached_movie_count = 0

# === Use cache or recalculate ===
if yearly_data_file.exists() and cached_movie_count == api_movie_count:
    print("Loading data from cache...")
    df_yearly = pd.read_parquet(yearly_data_file)

else:
    print("Extracting years from the API...")

    # === Initialization ===
    year_counter = Counter()
    skip = 0
    limit = 500
    year_pattern = re.compile(r"\((\d{4})\)$")

    while True:
        batch = client.list_movies(skip=skip, limit=limit, output_format="dict")
        if not batch:
            break

        for movie in batch:
            title = movie.get("title", "")
            match = year_pattern.search(title)
            if match:
                year = int(match.group(1))
                year_counter[year] += 1

        skip += limit
        time.sleep(0.5)

    # === Build the DataFrame ===
    df_yearly = pd.DataFrame(sorted(year_counter.items()), columns=["year", "movie_count"])

    # === Save to cache ===
    df_yearly.to_parquet(yearly_data_file, index=False)
    with open(meta_file, "w") as f:
        json.dump({"movie_count": api_movie_count}, f)
df_yearly

Loading data from cache...


Unnamed: 0,year,movie_count
0,1902,1
1,1903,1
2,1908,1
3,1915,1
4,1916,4
...,...,...
101,2014,277
102,2015,274
103,2016,218
104,2017,147


In [15]:
# === Display with Plotly ===
fig = px.bar(
    df_yearly,
    x="year",
    y="movie_count",
    title="Total Number of Films per Year (Based on Title)",
    labels={"year": "Year", "movie_count": "Number of Films"},
)

fig.update_layout(
    xaxis_title="Year",
    yaxis_title="Number of Films",
    height=500
)

fig.show()


# Top 20 films by number of ratings

In [16]:
top_movies_file = output_dir / "top_movies_by_ratings.parquet"
meta_file = output_dir / "meta_top_movies.json"

# === Retrieve API metrics ===
#analytics = client.get_analytics()
api_movie_count = analytics.movie_count
api_rating_count = analytics.rating_count

# === Check the cache ===
if meta_file.exists():
    with open(meta_file, "r") as f:
        meta = json.load(f)
    cached_movie_count = meta.get("movie_count", 0)
    cached_rating_count = meta.get("rating_count", 0)
else:
    cached_movie_count = 0
    cached_rating_count = 0

# === Use cache or recalculate ===
if (
    top_movies_file.exists()
    and cached_movie_count == api_movie_count
    and cached_rating_count == api_rating_count
):
    print("Loading data from cache...")
    top_movies_df = pd.read_parquet(top_movies_file)

else:
    print("Fetching ratings from the API...")

    # === Initialize counters ===
    movie_rating_count = defaultdict(int)
    movie_rating_sum = defaultdict(float)

    # === Batch process ratings ===
    limit = 500
    skip = 0

    while True:
        batch = client.list_ratings(skip=skip, limit=limit, output_format="dict")
        if not batch:
            break

        for rating in batch:
            movie_id = rating["movieId"]
            score = rating["rating"]
            movie_rating_count[movie_id] += 1
            movie_rating_sum[movie_id] += score

        skip += limit
        time.sleep(0.5)

    # === Build stats DataFrame ===
    stats = [
        {
            "movieId": movie_id,
            "rating_count": movie_rating_count[movie_id],
            "avg_rating": movie_rating_sum[movie_id] / movie_rating_count[movie_id]
        }
        for movie_id in movie_rating_count
    ]

    stats_df = pd.DataFrame(stats)
    top_movies_df = stats_df.sort_values("rating_count", ascending=False).head(20)

    # === Add movie titles via API ===
    movie_titles = {}
    for movie_id in top_movies_df["movieId"]:
        try:
            movie_data = client.get_movie(movie_id)
            movie_titles[movie_id] = movie_data.title
        except Exception as e:
            print(f"Error retrieving title for movieId {movie_id}: {e}")
            movie_titles[movie_id] = f"Movie {movie_id}"

    top_movies_df["title"] = top_movies_df["movieId"].map(movie_titles)

    # === Save to cache ===
    top_movies_df.to_parquet(top_movies_file, index=False)
    with open(meta_file, "w") as f:
        json.dump(
            {
                "movie_count": api_movie_count,
                "rating_count": api_rating_count
            },
            f
        )
top_movies_df

Loading data from cache...


Unnamed: 0,movieId,rating_count,avg_rating,title
0,356,329,4.164134,Forrest Gump (1994)
1,318,317,4.429022,"Shawshank Redemption, The (1994)"
2,296,307,4.197068,Pulp Fiction (1994)
3,593,279,4.16129,"Silence of the Lambs, The (1991)"
4,2571,278,4.192446,"Matrix, The (1999)"
5,260,251,4.231076,Star Wars: Episode IV - A New Hope (1977)
6,480,238,3.75,Jurassic Park (1993)
7,110,237,4.031646,Braveheart (1995)
8,589,224,3.970982,Terminator 2: Judgment Day (1991)
9,527,220,4.225,Schindler's List (1993)


In [17]:
# === Display with Plotly ===
fig = px.bar(
    top_movies_df.sort_values("rating_count", ascending=True),
    x="rating_count",
    y="title",
    color="avg_rating",
    orientation="h",
    title="Top 20 Movies by Number of Ratings",
    labels={
        "title": "Movie Title",
        "rating_count": "Number of Ratings",
        "avg_rating": "Average Rating"
    },
    color_continuous_scale="viridis"
)

fig.update_layout(
    yaxis={'categoryorder': 'total ascending'},
    height=700
)

fig.show()

# Top tags utilisés par les utilisateurs de la plateforme

In [18]:
tag_usage_file = output_dir / "user_tag_stats.parquet"
meta_file = output_dir / "meta_users_behavior.json"

# Retrieve API metrics to monitor changes:
#analytics = client.get_analytics()
api_rating_count = analytics.rating_count
api_tag_count = analytics.tag_count

if meta_file.exists():
    with open(meta_file, "r") as f:
        meta = json.load(f)
else:
    meta = {}

cached_rating_count = meta.get("rating_count", 0)
cached_tag_count = meta.get("tag_count", 0)

# Frequently used tags by users

if tag_usage_file.exists() and cached_tag_count == api_tag_count:
    print("Loading cache: user-used tags")
    tag_df = pd.read_parquet(tag_usage_file)
else:
    print("Recomputing: user-used tags")
    tag_counter = Counter()
    limit = 500
    skip = 0

    while True:
        batch = client.list_tags(skip=skip, limit=limit, output_format="dict")
        if not batch:
            break
        for tag in batch:
            tag_text = tag.get("tag", "")
            tag_counter[tag_text] += 1
        skip += limit
        time.sleep(0.5)

    tag_df = pd.DataFrame(tag_counter.items(), columns=["tag", "count"])
    tag_df = tag_df[tag_df["tag"].str.strip() != ""]
    tag_df = tag_df.sort_values("count", ascending=False).head(20)
    tag_df.to_parquet(tag_usage_file, index=False)

Loading cache: user-used tags


In [19]:
tag_df

Unnamed: 0,tag,count
0,In Netflix queue,131
1,atmospheric,36
2,superhero,24
3,thought-provoking,24
4,Disney,23
5,surreal,23
6,funny,23
7,religion,22
8,dark comedy,21
9,sci-fi,21


In [20]:
fig4 = px.bar(
    tag_df, x="count", y="tag", orientation="h",
    title="Top Tags Used by Users",
    labels={"count": "Number of Uses", "tag": "Tag"},
    color="count", color_continuous_scale="viridis"
)
fig4.update_layout(yaxis={'categoryorder': 'total ascending'})
fig4.show()

with open(meta_file, "w") as f:
    json.dump({
        "rating_count": api_rating_count,
        "tag_count": api_tag_count
    }, f)


# Other insights on tags

In [21]:
analytics_path = os.path.join(output_dir, "analytics.pkl")
tags_by_genre_path = os.path.join(output_dir, "tags_by_genre.parquet")
tags_good_rating_path = os.path.join(output_dir, "tags_good_rating.parquet")
tags_compare_path = os.path.join(output_dir, "tags_compare.parquet")

# Retrieve current statistics from the API
current_stats = client.get_analytics().__dict__

# Utility function to load or regenerate a cache
def use_or_generate(path, current_stats, compute_fn):
    if os.path.exists(path) and os.path.exists(analytics_path):
        with open(analytics_path, "rb") as f:
            saved_stats = pickle.load(f)
        if saved_stats == current_stats:
            return pd.read_parquet(path)

    df = compute_fn()
    df.to_parquet(path, index=False)
    with open(analytics_path, "wb") as f:
        pickle.dump(current_stats, f)
    return df

# -------------------------------
# 1. Most used tags by genre
# -------------------------------
def compute_tags_by_genre():
    genre_tag_counter = defaultdict(Counter)

    # Batch loading
    skip = 0
    limit = 500
    while True:
        movies = client.list_movies(skip=skip, limit=limit, output_format="dict")
        if not movies:
            break
        movie_dict = {
            m["movieId"]: m["genres"].split("|") if m["genres"] else []
            for m in movies
        }

        tags = client.list_tags(skip=skip, limit=limit, output_format="dict")
        for tag in tags:
            genres = movie_dict.get(tag["movieId"], [])
            for genre in genres:
                genre_tag_counter[genre][tag["tag"]] += 1

        skip += limit
        time.sleep(0.5)

    records = []
    for genre, tag_counter in genre_tag_counter.items():
        for tag, count in tag_counter.items():
            records.append({"genre": genre, "tag": tag, "count": count})
    df = pd.DataFrame(records)
    df = df.sort_values(["genre", "count"], ascending=[True, False])
    return df

In [22]:
tags_by_genre_df = use_or_generate(tags_by_genre_path, current_stats, compute_tags_by_genre)
# Top 3 tags by genre
top_tags_by_genre = tags_by_genre_df.groupby("genre").apply(lambda g: g.nlargest(3, 'count')).reset_index(drop=True)
# Concat genre + tag
top_tags_by_genre["tag_label"] = top_tags_by_genre["tag"] + " (" + top_tags_by_genre["genre"] + ")"
tags_by_genre_df





Unnamed: 0,genre,tag,count
0,Action,sci-fi,4
1,Action,superhero,4
2,Action,aliens,3
3,Action,boxing,3
4,Action,classic,2
...,...,...,...
760,Western,James Fennimore Cooper,1
761,Western,music,1
762,Western,dark humor,1
763,Western,easygoing,1


In [23]:
fig = px.bar(
    top_tags_by_genre.sort_values("count"),
    x="count",
    y="tag_label",
    color="genre",
    orientation="h",
    title="Top 3 Most Used Tags by Genre",
    labels={"count": "Number of Occurrences", "tag_label": "Tag (Genre)"},
    height=800
)
fig.update_layout(yaxis=dict(categoryorder='total ascending'))
fig.show()

In [24]:
# -------------------------------
# 2. Most frequent tags in well-rated movies (>= 4)
# -------------------------------
def compute_tags_for_good_ratings():
    good_ratings = []
    tags_by_movie = defaultdict(list)

    # Load ratings >= 4
    skip = 0
    limit = 500
    while True:
        ratings = client.list_ratings(skip=skip, limit=limit, output_format="dict")
        if not ratings:
            break
        good_ratings += [r for r in ratings if r["rating"] >= 4]
        skip += limit
        time.sleep(0.5)

    # Associate tags with well-rated movieIds
    movie_ids = set([r["movieId"] for r in good_ratings])
    skip = 0
    limit = 500
    while True:
        tags = client.list_tags(skip=skip, limit=limit, output_format="dict")
        if not tags:
            break
        for tag in tags:
            if tag["movieId"] in movie_ids:
                tags_by_movie[tag["tag"]].append(tag["movieId"])
        skip += limit
        time.sleep(0.5)

    df = pd.DataFrame([(tag, len(movies)) for tag, movies in tags_by_movie.items()],
                      columns=["tag", "count"])
    df = df.sort_values("count", ascending=False).head(20)
    return df

tags_good_rating_df = use_or_generate(tags_good_rating_path, current_stats, compute_tags_for_good_ratings)
tags_good_rating_df


Unnamed: 0,tag,count
0,In Netflix queue,102
1,atmospheric,33
2,Disney,23
3,funny,23
4,thought-provoking,23
5,surreal,22
6,superhero,22
7,quirky,21
8,sci-fi,21
9,psychology,21


In [25]:
# Visualization: Most frequent tags in well-rated movies
fig2 = px.bar(
    tags_good_rating_df,
    x="count",
    y="tag",
    orientation="h",
    title="Most Frequent Tags in Well-Rated Movies (Rating ≥ 4)",
    labels={"count": "Number of Occurrences", "tag": "Tag"},
    color="count",
    color_continuous_scale="viridis"
)
fig2.update_layout(yaxis={'categoryorder':'total ascending'})
fig2.show()

In [26]:
# -------------------------------
# 3. Comparison: Tags in well-rated vs poorly-rated movies
# -------------------------------
def compute_tags_compare():
    tag_counter_good = Counter()
    tag_counter_bad = Counter()

    # Ratings in batches
    skip = 0
    limit = 500
    rating_map = {}
    while True:
        ratings = client.list_ratings(skip=skip, limit=limit, output_format="dict")
        if not ratings:
            break
        for r in ratings:
            rating_map[r["movieId"]] = rating_map.get(r["movieId"], []) + [r["rating"]]
        skip += limit
        time.sleep(0.5)

    # Average rating per movie
    movie_avg_rating = {
        mid: sum(ratings)/len(ratings)
        for mid, ratings in rating_map.items()
    }

    # Tags in batches
    skip = 0
    while True:
        tags = client.list_tags(skip=skip, limit=limit, output_format="dict")
        if not tags:
            break
        for tag in tags:
            avg_rating = movie_avg_rating.get(tag["movieId"])
            if avg_rating is not None:
                if avg_rating >= 4:
                    tag_counter_good[tag["tag"]] += 1
                elif avg_rating < 3:
                    tag_counter_bad[tag["tag"]] += 1
        skip += limit
        time.sleep(0.5)

    tags = set(tag_counter_good.keys()) | set(tag_counter_bad.keys())
    data = []
    for tag in tags:
        data.append({
            "tag": tag,
            "count_good": tag_counter_good.get(tag, 0),
            "count_bad": tag_counter_bad.get(tag, 0)
        })
    df = pd.DataFrame(data)
    df["total"] = df["count_good"] + df["count_bad"]
    df = df[df["total"] > 5].sort_values("total", ascending=False).head(20)
    return df

tags_compare_df = use_or_generate(tags_compare_path, current_stats, compute_tags_compare)
tags_compare_df

Unnamed: 0,tag,count_good,count_bad,total
0,In Netflix queue,54,0,54
1,atmospheric,16,0,16
2,thought-provoking,13,1,14
3,dark comedy,13,0,13
4,suspense,11,1,12
5,religion,7,4,11
6,dark,9,1,10
7,psychology,10,0,10
8,surreal,10,0,10
9,superhero,1,9,10


In [27]:
# Visualization: Comparison of tags
fig3 = px.bar(
    tags_compare_df.melt(id_vars="tag", value_vars=["count_good", "count_bad"],
                         var_name="Type", value_name="count"),
    x="count",
    y="tag",
    color="Type",
    barmode="group",
    title="Tag Comparison: Well-Rated vs Poorly-Rated Movies",
    labels={"count": "Number of Occurrences", "tag": "Tag"}
)
fig3.update_layout(yaxis={'categoryorder':'total ascending'}, height=600)
fig3.show()
