In [1]:
import polars as pl
import altair as alt

In [2]:
# Show the column names and dtypes of the processed dataset via a lazy scan.
pl.scan_parquet("data/processed/processed.parquet").schema

{'userId': Int64,
 'movieId': Int64,
 'rating': Float64,
 'timestamp': Int64,
 'imdbId': Int64,
 'tmdbId': Int64,
 'id': Int32,
 'original_title': Utf8,
 'popularity': Float64,
 'release_date': Date,
 'revenue': Int64,
 'runtime': Float64,
 'vote_average': Float64,
 'vote_count': Int64}

In [3]:
# Lazy frame over the processed dataset, restricted to the four
# rating-event columns used by the cells below.
lf = pl.scan_parquet(
    "data/processed/processed.parquet",
    low_memory=True,
).select(
    ["userId", "movieId", "rating", "timestamp"]
)

In [4]:
# The 30 movies with the most ratings, most-rated first.
# NOTE: after the aggregation the "rating" column holds the *number* of
# ratings per movie, not a rating value.
top_df = (
    lf.groupby("movieId")
    .agg(pl.col("rating").count())
    .sort("rating", descending=True)
    .with_row_count()  # adds a row-number column reflecting the sorted order
    .limit(30)
    .collect()
)

In [6]:
# Bar chart of the most-rated movies held in `top_df`, tallest bar first.
# The "rating" column of `top_df` is the result of a count aggregation, i.e.
# the number of ratings per movie, so the axis and tooltip are titled
# accordingly instead of showing the misleading raw column name "rating".
alt.Chart(top_df.to_pandas()).mark_bar().encode(
    x=alt.X("movieId:O", sort="-y", title="Movie ID"),
    y=alt.Y("rating:Q", title="Number of ratings"),
    tooltip=[
        alt.Tooltip("movieId", title="Movie ID"),
        alt.Tooltip("rating", title="Number of ratings"),
    ],
).properties(
    title="Most-rated movies",
    width=1000,  # Specify the desired width of the chart
    height=400,
).configure_axis(
    labelFontSize=14,  # Increase the font size of the axis labels
).configure_mark(
    size=10,  # Decrease the size of the bars
)

In [8]:
# Distribution of rating values: every rating is assigned to one of the bins
# delimited by 0, 1, 2, 3, 4, 5 and the rows per bin are counted.
# NOTE(review): the bin's break point is used as the "rating" label, so any
# fractional rating would be counted under the next whole number -- confirm
# that this is the intended grouping.
rating_top_df = (
    lf.select("rating")
    .collect()
    .get_column("rating")
    .cut(bins=[0, 1, 2, 3, 4, 5])
    .groupby("break_point")
    .agg(pl.count())
    .rename({"break_point": "rating"})
)

In [10]:
# Display the per-bin rating counts (columns: "rating", "count").
rating_top_df

rating,count
f64,u32
5.0,5966742
2.0,2162582
1.0,1244643
4.0,10095163
3.0,6502925


In [11]:
# Bar chart of the rating distribution held in `rating_top_df`.
(
    alt.Chart(rating_top_df.to_pandas())
    .mark_bar()
    .encode(
        x=alt.X("rating:O"),
        y=alt.Y("count"),
        tooltip=["rating", "count"],
    )
    .properties(width=1000, height=400)  # fixed chart size
    .configure_axis(labelFontSize=14, labelAngle=0)  # larger, horizontal axis labels
    .configure_mark(size=10)  # smaller bars
)

In [12]:
# Number of ratings per calendar month.
rating_month_df = (
    lf.select("timestamp")
    # Interpret "timestamp" as epoch seconds and keep only the date part.
    .with_columns(date=pl.from_epoch("timestamp", time_unit="s").dt.date())
    # Truncating to "1mo" groups each date under the first day of its month.
    .groupby(pl.col("date").dt.truncate("1mo"))
    .agg(pl.count())
    .collect()
)

In [13]:
# Display the monthly rating counts (columns: "date", "count").
rating_month_df

date,count
date,u32
2002-01-01,104423
2012-09-01,56923
2005-05-01,164152
2014-10-01,41257
2008-05-01,70813
2009-09-01,72144
2009-03-01,104428
2002-05-01,68103
2011-10-01,58074
1997-04-01,93801


In [14]:
# Line chart of the monthly rating counts held in `rating_month_df`.
(
    alt.Chart(rating_month_df.to_pandas())
    .mark_line()
    .encode(
        x=alt.X("yearmonth(date):T"),
        y=alt.Y("count:Q"),
        tooltip=["date", "count"],
    )
    .configure_axis(labelFontSize=14, labelAngle=40)  # larger, slanted axis labels
    .properties(width=1000, height=400)  # fixed chart size
)

In [144]:
# How many movies fall into each "number of ratings" bucket?
# Per-movie rating counts are cut at break points 0, 100, ..., 10000.
count_counter_df = (
    lf.groupby("movieId")
    .agg(pl.count())
    .select("count")
    .collect()
    .get_column("count")
    .cut(range(0, 10001, 100))
    # Cap break points at 2000 so that every larger bucket is merged into a
    # single group by the aggregation below.
    .with_columns(break_point=pl.col("break_point").clip_max(2000))
    .groupby("break_point")
    .agg(pl.count())
)
count_counter_df.limit(5)

break_point,count
f64,u32
1900.0,99
100.0,34734
1600.0,114
600.0,430
1300.0,145


In [145]:
# Histogram of movies by how many ratings they received.
# `count_counter_df` buckets per-movie rating counts in steps of 100 and caps
# the bucket break point at 2000, so the last bar aggregates every movie above
# the 1900 break point. The titles spell this out instead of exposing the
# internal column names "break_point" and "count".
alt.Chart(count_counter_df.to_pandas()).mark_bar().encode(
    x=alt.X(
        "break_point:O",
        title="Ratings per movie (bucket upper bound; 2000 = more than 1900)",
    ),
    y=alt.Y("count:Q", title="Number of movies"),
    tooltip=[
        alt.Tooltip("break_point", title="Bucket upper bound"),
        alt.Tooltip("count", title="Number of movies"),
    ],
).configure_axis(
    labelFontSize=14,  # Increase the font size of the axis labels
    labelAngle=90
).properties(
    title="Movies by number of ratings received",
    width=1000,  # Specify the desired width of the chart
    height=400
)

In [149]:
# For each total number of ratings, how many users have rated that many movies?
# Per-user rating counts are cut at break points 0, 10, ..., 990.
rating_total_user_count_df = (
    lf.select("userId")
    .groupby("userId")
    .agg(pl.count())
    .collect()
    .select(pl.col("count").alias("rated_times"))
    .to_series()
    .cut(bins=range(0, 1000, 10))
    # Cap break points at 250 so that every larger bucket is merged into a
    # single group by the aggregation below.
    .with_columns(break_point=pl.col("break_point").clip_max(250))
    .groupby("break_point")
    .agg(pl.count().alias("user_count"))
)
rating_total_user_count_df.limit(5)

break_point,user_count
f64,u32
250.0,26388
120.0,4308
180.0,2166
230.0,1457
60.0,11248


In [150]:
# Histogram of users by how many movies they have rated.
# `rating_total_user_count_df` buckets per-user rating counts in steps of 10
# and caps the bucket break point at 250, so the last bar aggregates every
# user above the 240 break point. The titles spell this out instead of
# exposing the internal column name "break_point".
alt.Chart(rating_total_user_count_df.to_pandas()).mark_bar().encode(
    x=alt.X(
        "break_point:O",
        title="Ratings per user (bucket upper bound; 250 = more than 240)",
    ),
    y=alt.Y("user_count:Q", title="Number of users"),
    tooltip=[
        alt.Tooltip("break_point", title="Bucket upper bound"),
        alt.Tooltip("user_count", title="Number of users"),
    ],
).configure_axis(
    labelFontSize=14,  # Increase the font size of the axis labels
    labelAngle=90
).properties(
    title="Users by number of ratings given",
    width=1000,  # Specify the desired width of the chart
    height=400
)