In [3]:
import polars as pl
import os
from pathlib import Path

VGDB_PROJECT_DIR = Path(os.environ['VGDB_PROJECT_DIR'])

FACT_GAME_METRICS_PATH = VGDB_PROJECT_DIR / "data/processed/analytics/fact_game_metrics.parquet"
fact = pl.read_parquet(FACT_GAME_METRICS_PATH)

In [4]:
fact.head()

game_id,platform_id,released_date,release_year,rating,ratings_count,reviews_count,added_status_owned,added_status_playing
i64,u32,date,i32,f64,i64,i64,i64,i64
1,32,2015-10-23,2015,0.0,2,4,88,0
1,24,2015-10-23,2015,0.0,2,4,88,0
1,50,2015-10-23,2015,0.0,2,4,88,0
1,36,2015-10-23,2015,0.0,2,4,88,0
1,31,2015-10-23,2015,0.0,2,4,88,0


In [10]:
feature_cols = [
    "platform_id", 
    "ratings_count", 
    "reviews_count", 
    "added_status_owned", 
    "added_status_playing", 
    "release_year"
]

target_col = "rating"

X = fact.select(feature_cols).to_numpy()
y = fact.select(target_col).to_numpy().ravel()

In [12]:
X

array([[  32,    2,    4,   88,    0, 2015],
       [  24,    2,    4,   88,    0, 2015],
       [  50,    2,    4,   88,    0, 2015],
       ...,
       [  32,    0,    0,    0,    0, 2016],
       [  32,    0,    0,    0,    0, 2017],
       [  32,    0,    0,    0,    0, 2015]], shape=(612462, 6))

In [13]:
y

array([0., 0., 0., ..., 0., 0., 0.], shape=(612462,))

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 0.01


In [25]:
platform_stats = (
    fact.group_by("platform_id")
        .agg([
            pl.mean("rating").alias("avg_rating"),
            pl.sum("ratings_count").alias("total_ratings"),
            pl.sum("added_status_owned").alias("total_owned")
        ])
        .sort("avg_rating", descending=False)
)

platform_stats

platform_id,avg_rating,total_ratings,total_owned
i32,f64,i64,i64
45,0.001795,7862,31158
21,0.081754,157015,789905
32,0.113361,854857,4938938
23,0.149661,215529,1455443
24,0.165009,330774,2114101
…,…,…,…
18,1.877286,3728,5103
37,1.908955,11971,38322
49,2.040898,295059,1120067
22,2.065185,1212,2164


In [27]:
top_games_per_year = (
    fact.group_by("release_year")
        .agg([
            pl.max("rating").alias("max_rating"),
            pl.max("added_status_owned").alias("most_owned")
        ])
        .sort("release_year")
)

top_games_per_year

release_year,max_rating,most_owned
i32,f64,i64
1971,3.73,11
1972,3.9,13
1973,0.0,3
1974,0.0,4
1975,0.0,5
…,…,…
2018,4.83,4745
2019,4.62,3495
2020,4.81,3507
2021,4.67,186
