In [249]:
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
from bigdata_a3_utils import VALID_CATEGORIES

In [250]:
# Categories whose intermediate parquet files this notebook works with.
# NOTE(review): this assignment rebinds the VALID_CATEGORIES name imported from
# bigdata_a3_utils in the import cell, so the imported value is not used from
# here on — confirm that restricting to these three categories is intentional.
VALID_CATEGORIES: list[str] = ["Gift_Cards", "Subscription_Boxes", "Magazine_Subscriptions"]

In [251]:
# Read each category's intermediate parquet file and stack them, in
# VALID_CATEGORIES order, into a single frame.
parquet_paths = [
    f"data/intermediate/{category}.parquet" for category in VALID_CATEGORIES
]
df: pl.DataFrame = pl.concat([pl.read_parquet(path) for path in parquet_paths])

In [258]:
# Stacked histogram of the `rating` column, with bars split and coloured by
# the `verified_purchase` column.
fig = px.histogram(
    df,
    x="rating",
    width=900,
    height=900,
    color="verified_purchase",
    barmode="stack",
    color_discrete_sequence=px.colors.qualitative.Set2,
)

fig.update_layout(
    bargap=0.2,
    title=dict(
        text="Distribution of Star Ratings",
        x=0.5,
        font_family="Arial Black",
    ),
    # Explicit axis titles, matching the other plotting cells, instead of the
    # auto-generated column-name labels.
    xaxis_title="Star Rating",
    yaxis_title="Total Review Count",
)

# Render explicitly, as the other plotting cells do, rather than relying on the
# notebook echoing the figure returned by update_layout() as the cell's last
# expression (which shows nothing when the code is run outside a notebook).
fig.show()

In [253]:
# Ten most frequent values of `main_category`, ranked by row count.
#
# NOTE(review): the original expression here was
# `pl.col("main_category").fill_null(pl.col("main_category"))`, which replaces
# nulls with the same (null) value and is therefore a no-op. It is reduced to a
# plain alias so the result is unchanged; if a fallback column or a literal
# label was intended for missing categories, add it back via fill_null(...) —
# confirm against the preprocessing step.
top_categories = (
    df.with_columns(pl.col("main_category").alias("category"))
    .group_by("category")
    .len()
    .sort("len", descending=True)
    .head(10)
)

fig = px.bar(
    top_categories,
    x="len",
    y="category",
    height=900,
    color="category",
    color_discrete_sequence=px.colors.qualitative.Prism,
    orientation="h",
)

fig.update_layout(
    title=dict(
        text="Top 10 Categories",
        x=0.5,
        font_family="Arial Black",
    ),
    # Horizontal bars: the counts (`len`) run along x and the category names
    # along y, so the titles must follow that orientation.
    xaxis_title="Total Review Count",
    yaxis_title="Category",
    showlegend=False,
    # Wider left margin, presumably so long category names are not clipped.
    margin_l=200,
)

fig.show()

In [254]:
# Ten most frequent brands by row count, excluding rows whose brand is the
# literal "Unknown" (rows with a null brand are dropped by the filter as well,
# exactly as with the original negated equality).
top_brands: pl.DataFrame = (
    df.filter(pl.col("brand") != "Unknown")
    .group_by("brand")
    .len()
    .sort("len", descending=True)
    .head(10)
)

fig = px.bar(
    top_brands,
    x="len",
    y="brand",
    width=900,
    height=900,
    color="brand",
    color_discrete_sequence=px.colors.qualitative.Prism,
    orientation="h",
)

fig.update_layout(
    title=dict(
        text="Top 10 Brands",
        x=0.5,
        font_family="Arial Black",
    ),
    # Horizontal bars: the counts (`len`) run along x and the brand names along
    # y, so the titles must follow that orientation.
    xaxis_title="Total Review Count",
    yaxis_title="Brand",
    showlegend=False,
    # Wider left margin, presumably so long brand names are not clipped.
    margin_l=200,
)

fig.show()

In [255]:
# Mean of `rating` for each distinct `year`, sorted by year ascending.
ratings_by_year = (
    df.select("rating", "year")
    .group_by("year")
    .agg(pl.col("rating").mean())
    .sort("year")
)

fig = px.line(ratings_by_year, x="year", y="rating", width=900, height=900)

fig.update_layout(
    title_text="Average Star Rating Per Year",
    title_x=0.5,
    title_font_family="Arial Black",
    xaxis_title="Year",
    yaxis_title="Average Rating",
)

# Tick spacing of 2 on the x-axis (every second year if `year` is numeric).
fig.update_xaxes(dtick=2)

fig.show()


In [256]:
# Correlation matrix of `review_length` and `rating`, converted to a NumPy
# array so it can be drawn as a heatmap.
corr_matrix = df.select("review_length", "rating").corr().to_numpy()

# Heatmap with each cell annotated by its value; rows and columns carry the
# same two labels.
fig = px.imshow(
    corr_matrix,
    x=["review length", "rating"],
    y=["review length", "rating"],
    color_continuous_scale=px.colors.sequential.Purpor_r,
    text_auto=True,
)

fig.update_layout(
    width=900,
    height=900,
    title_text="Correlation Between Review Length and Star Rating",
    title_x=0.5,
    title_font_family="Arial Black",
)

fig.show()

In [257]:
# Correlation matrix of `verified_purchase` and `rating`, converted to a NumPy
# array so it can be drawn as a heatmap.
corr_matrix = df.select("verified_purchase", "rating").corr().to_numpy()

# Heatmap with each cell annotated by its value; rows and columns carry the
# same two labels.
fig = px.imshow(
    corr_matrix,
    x=["verified purchase", "rating"],
    y=["verified purchase", "rating"],
    color_continuous_scale=px.colors.sequential.Purpor_r,
    text_auto=True,
)

fig.update_layout(
    width=900,
    height=900,
    title_text="Correlation Between Verified Purchase and Star Rating",
    title_x=0.5,
    title_font_family="Arial Black",
)

fig.show()