In [45]:
import polars as pl
import pandas as pd
import numpy as np
import altair as alt

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
(
    pl.scan_parquet("data/processed/processed.parquet").schema
)

{'userId': Int64,
 'movieId': Int64,
 'rating': Float64,
 'timestamp': Int64,
 'imdbId': Int64,
 'tmdbId': Int64,
 'id': Int32,
 'original_title': Utf8,
 'popularity': Float64,
 'release_date': Date,
 'revenue': Int64,
 'runtime': Float64,
 'vote_average': Float64,
 'vote_count': Int64}

In [3]:
lf = (
    pl.scan_parquet("data/processed/processed.parquet", low_memory=True)
    .select(["userId", "movieId", "rating", "timestamp", "original_title"])
)

In [4]:
movie_titles = (
    lf
    .select(["movieId", "original_title"])
    .unique(["movieId"])
    .collect()
)
movie_titles.head(5)

movieId,original_title
i64,str
117736,"""The Dark Horse…"
168928,"""Uncle Kent 2"""
103424,"""Hypocrites"""
116664,"""Advanced Style…"
51088,"""Reno 911!: Mia…"


In [5]:
minimum_movie_ratings = 10000
minimum_user_rated = 200

allowed_movies = (
    lf
    .select("movieId")
    .collect()
    .to_series()
    .value_counts()
    .filter(pl.col("counts") > minimum_movie_ratings)
    .select("movieId")
    .to_series()
)

allowed_users = (
    lf
    .select("userId")
    .collect()
    .to_series()
    .value_counts()
    .filter(pl.col("counts") > minimum_user_rated)
    .select("userId")
    .to_series()
)

f_df = (
    lf
    .filter(pl.col("movieId").is_in(allowed_movies))
    .filter(pl.col("userId").is_in(allowed_users))
    .collect()
)

In [28]:
n = 1_000_000
f_df.select(["movieId", "userId", "rating"]).sample(fraction=1)
pdf_train = f_df[:-n].to_pandas()
pdf_test = f_df[-n:].to_pandas()


In [36]:
pdf_train_p = pdf_train.pivot_table(index="userId", columns="movieId", values="rating")

In [37]:
n = 10

ratings_mean = pdf_train_p.mean(axis=0).sort_values(ascending=False).rename("rating_mean").to_frame()
ratings_count = pdf_train_p.count(axis=0).rename("rating_count").to_frame()

ranking_mean_rating = ratings_mean.head(n).join(ratings_count).join(movie_titles.to_pandas())

In [38]:
df_pred = pdf_test.set_index("movieId").join(ratings_mean)[["rating", "rating_mean"]]
y_true = df_pred["rating"]
y_pred = df_pred["rating_mean"]


In [39]:
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))

In [40]:
ranking_mean_rating = ranking_mean_rating.sort_values("rating_mean", ascending=False)

# Generate the text column
ranking_mean_rating["text"] = (
    ranking_mean_rating["original_title"].astype(str)
    + ": "
    + ranking_mean_rating["rating_count"].astype(str)
    + " Ratings"
)

# Define the chart
chart = (
    alt.Chart(ranking_mean_rating)
    .mark_bar(color="#db0000")
    .encode(
        x=alt.X(
            "rating_mean:Q",
            title="Mean-Rating",
            scale=alt.Scale(domain=[4, 5], clamp=True),
        ),
        y=alt.Y(
            "movieId:O",
            title="Movie",
            sort=alt.EncodingSortField(
                field="rating_mean", op="mean", order="descending"
            ),
        ),
        text=alt.Text("txt:N", title="Rating Count"),
    )
)

# Set the text position
text = chart.mark_text(
    align="left",
    baseline="middle",
    dx=5,  # Adjust the text position
    color="#000000",
).encode(text="text:N")

# Combine the chart and text
fig = (chart + text).properties(
    title="Ranking Of Top {} Mean-Movie-Ratings: {:.4f} RMSE".format(n, rmse)
)

fig

In [41]:
min_votes = 1000

mean_all = pdf_train_p.stack().mean()
mean_sep = pdf_train_p.mean(axis=0).values
count_sep = pdf_train_p.count().values

In [51]:
weighted_score = (count_sep / (count_sep + min_votes) * mean_sep) + (
    min_votes / (count_sep + min_votes) * mean_all
)
weighted_ranking = np.argsort(weighted_score)[::-1]
weighted_sort = np.sort(weighted_score)[::-1]
weighted_movie_id = pdf_train_p.columns[weighted_ranking]

In [52]:
df_pred = pdf_test.set_index("movieId").join(pd.DataFrame(weighted_score, index=weighted_movie_id, columns=["prediction"]))[["rating", "prediction"]]

In [56]:
y_true = df_pred["rating"]
y_pred = df_pred["prediction"]

In [57]:
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

In [76]:
df_plot = pd.DataFrame(weighted_score[:n], columns=['rating'])
df_plot.index = weighted_movie_id[:10]
ranking_weighted_rating = df_plot.join(ratings_count).join(movie_titles.to_pandas())
del df_plot

In [77]:
ranking_weighted_rating

Unnamed: 0_level_0,rating,rating_count,movieId,original_title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
318,3.837475,20978,146608,Geppo il folle
858,3.113709,17475,88816,My Son John
50,3.085025,18058,157168,Forbidden Island
1221,2.995042,13554,138120,The Expedition
296,3.826571,22299,55112,Shanghai Kiss
2959,3.267953,18727,164344,Sidewalk Stories
527,3.379097,17456,130408,A Rumor Of War
2019,3.526841,6594,2424,You've Got Mail
904,3.791968,9580,60544,Drabet
1193,3.799178,14512,141992,Nieulotne


In [99]:
# Define the chart
chart = (
    alt.Chart(ranking_weighted_rating)
    .mark_bar(color="#db0000")
    .encode(
        x=alt.X(
            "rating:Q",
            title="Weighted Rating",
            scale=alt.Scale(domain=[3, 4.6], clamp=True),
        ),
        y=alt.Y(
            "row_number:O",
            title="Movie",
            sort=alt.EncodingSortField(field="rating", op="mean", order="descending"),
        ),
        text=alt.Text(
            "original_title:N", title="Movie Name", format="original_title:s"
        ),
    )
    .transform_window(
        row_number="row_number(rating)",
        sort=[alt.EncodingSortField(field="rating", order="descending")]
    )
)

# Set the text position
text = chart.mark_text(
    align="left",
    baseline="middle",
    dx=5,  # Adjust the text position
    color="#000000",
).encode(text="original_title:N")


# Combine the chart and text
fig = (chart + text).properties(
    title="Ranking Of Top {} Weighted-Movie-Ratings: {:.4f} RMSE".format(n, rmse)
)

# Show the plot
fig