In [1]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
from load_scrape_data import get_clean_data_df

from constants import (
    CRIT_SCORE_K,
    DATE_K,
    SCORE_KS,
    USER_SCORE_K,
)

In [2]:
# Set plotly's default renderer once, so the later `fig.show()` call needs no
# explicit `renderer=` argument.
# NOTE(review): 'iframe_connected' presumably loads plotly.js from a CDN rather
# than embedding it — confirm if this notebook must render offline.
pio.renderers.default = 'iframe_connected'

In [3]:
# Load the cleaned data via the project helper from `load_scrape_data`.
# Later cells read the SCORE_KS columns and the DATE_K column from this frame;
# DATE_K is compared against a timestamp and used with `.dt`, so it is assumed
# to be datetime-typed — TODO confirm in get_clean_data_df.
df = get_clean_data_df()

In [4]:
# Quarterly mean and standard deviation of the scores for rows dated
# 2004-01-01 or later. Nothing here mutates `df` or a slice of it, and the
# result name is bound exactly once, so the cell can be re-run on its own.
useful_cols = list(SCORE_KS) + [DATE_K]

# masking: keep rows with at least one non-missing score, inside the date window
has_score_mask = df[list(SCORE_KS)].notna().any(axis=1)
post_2003_mask = df[DATE_K] >= pd.to_datetime('2004-01-01')
scored_post_2003_mask = has_score_mask & post_2003_mask

# dates to quarters
quarterly_dates = (
    df.loc[scored_post_2003_mask, DATE_K]
    .dt.to_period('Q')
    .dt.to_timestamp()  # Qs to dates for compatibility
    + pd.Timedelta(days=45)  # plot Q1 as Feb 15 not Jan 1
)

# `rename` and `assign` return new frames, so the `.loc` selection is never
# edited in place (in-place edits of a slice can raise SettingWithCopyWarning).
quarterly_scores = (
    df.loc[scored_post_2003_mask, useful_cols]
    .rename(  # more human readable plot legend
        columns={
            CRIT_SCORE_K: "critics' score",
            USER_SCORE_K: "users' score",
        }
    )
    .assign(**{DATE_K: quarterly_dates})  # same index as the selection above
)

# aggregate w/ mean & std (one groupby shared by both statistics)
by_quarter = quarterly_scores.groupby(DATE_K)
means = by_quarter.mean().rename(columns='μ({})'.format)
stds = by_quarter.std().rename(columns='σ({})'.format)
scored_post_2003_df = means.join(stds)  # wide: one μ and one σ column per score

scored_post_2003_df.head(2)

Unnamed: 0_level_0,μ(users' score),μ(critics' score),σ(users' score),σ(critics' score)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-02-15,71.494118,68.574074,15.226562,11.855204
2004-05-16,67.666667,67.09434,17.799051,15.333454


In [5]:
# reformat above for plotly symbols per line: wide (one column per statistic)
# to long (one row per date and statistic).
# The input and output share one name, so guard against a second run: once
# melted, the frame already has DATE_K as a column and must not be melted again.
if DATE_K not in scored_post_2003_df.columns:
    scored_post_2003_df = (
        scored_post_2003_df
        .rename_axis(DATE_K)  # the index holds the quarter dates
        .reset_index()
        # no value_vars: melt every statistic column instead of relying on
        # the first four column positions
        .melt(id_vars=DATE_K)
    )
scored_post_2003_df.head(2)

Unnamed: 0,date,variable,value
0,2004-02-15,μ(users' score),71.494118
1,2004-05-16,μ(users' score),67.666667


In [6]:
# Line chart of the long-format statistics: one trace per `variable` value,
# each with its own colour and marker symbol.
fig = px.line(
    scored_post_2003_df,
    x=DATE_K,
    y='value',
    color='variable',
    symbol='variable',
    labels={DATE_K: 'release date'},
)

# Layout tweaks, written with plotly's magic-underscore keywords.
fig.update_layout(
    font_size=24,
    legend_x=0.01,
    legend_y=0.3,
    legend_title=None,  # drop the legend heading
)
fig.update_traces(marker_size=10)

# Fixed axis windows; the y axis carries no title.
fig.update_xaxes(range=['2004-01-02', '2023-12-31'])
fig.update_yaxes(range=[1, 80], title='')
fig.show()