# arXiv heatmaps

We try to predict the `math` totals only.  First we import the data.

In [1]:
import pandas as pd

full_df = pd.read_parquet("../../data/arxiv-totals.parquet")

Then we sum all the `math` categories.

In [2]:
df = (
    pd.DataFrame(full_df[list(full_df.filter(regex="math."))].sum(axis=1))
    .reset_index(inplace=False)
    .rename(columns={"date": "ds", 0: "y"})
)

One-hot encode weekday and month.

In [3]:
from calendar import day_name

df["weekday"] = df["ds"].apply(lambda date: day_name[date.weekday()])
df["month"] = df["ds"].apply(lambda date: date.month)

one_hot_weekday = (
    pd.get_dummies(df.weekday, dtype=int).drop("Friday", axis=1).iloc[:, [0, 2, 3, 1]]
)
one_hot_month = pd.get_dummies(df.month, dtype=int).drop(12, axis=1)
df = df.join(one_hot_weekday).join(one_hot_month)

df.columns = df.columns.astype(str)

Make the train/test split.

In [4]:
df_train = df[
    (df.ds >= pd.Timestamp(2001, 1, 1)) & (df.ds <= pd.Timestamp(2025, 3, 14))
]
df_test = df[df.ds >= pd.Timestamp(2025, 3, 17)]

## The models with 150 days window

In [6]:
from sklearn.model_selection import TimeSeriesSplit

ts_cv = TimeSeriesSplit(n_splits=5, test_size=15, max_train_size=150)

splits = list(ts_cv.split(df_train))

In [None]:
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error

from prophet import Prophet

import numpy as np

import json

dummy = DummyRegressor(strategy="mean")
t_reg = LinearRegression()
day_reg = LinearRegression()
mday_reg = LinearRegression()
tday_reg = LinearRegression()
gbrt = HistGradientBoostingRegressor(categorical_features=["weekday", "month"])

dummy_rmses = np.zeros(5)
t_rmses = np.zeros(5)
day_rmses = np.zeros(5)
mday_rmses = np.zeros(5)
tday_rmses = np.zeros(5)
gbrt_rmses = np.zeros(5)
prophet_rmses = np.zeros(5)

with open("../../data/arxiv-categories.json", "r") as f:
    arxiv_categories_descriptions = json.load(f)

for i, (train_index, test_index) in enumerate(splits):
    # prepare train data
    df_tt = df_train.iloc[train_index, :]
    df_tt.reset_index(
        inplace=True
    )  # reset index twice to get a column of indexes to use as feature

    # prepare validation data
    df_holdout = df_train.iloc[test_index, :]
    df_holdout.reset_index(inplace=True)

    # dummy model
    dummy.fit(df_tt[["index"]], df_tt["y"])
    dummy_preds = dummy.predict(df_holdout[["index"]])
    dummy_rmses[i] = root_mean_squared_error(df_holdout["y"], dummy_preds)

    # t-linear model
    t_reg.fit(df_tt[["index"]], df_tt["y"])
    t_preds = t_reg.predict(df_holdout[["index"]])
    t_rmses[i] = root_mean_squared_error(df_holdout["y"], t_preds)

    # day-linear model
    day_reg.fit(df_tt[["Monday", "Tuesday", "Wednesday", "Thursday"]], df_tt["y"])
    day_preds = day_reg.predict(
        df_holdout[["Monday", "Tuesday", "Wednesday", "Thursday"]]
    )
    day_rmses[i] = root_mean_squared_error(df_holdout["y"], day_preds)

    # day-linear model
    mday_reg.fit(
        df_tt[
            [
                "1",
                "2",
                "3",
                "4",
                "5",
                "6",
                "7",
                "8",
                "9",
                "10",
                "11",
                "Monday",
                "Tuesday",
                "Wednesday",
                "Thursday",
            ]
        ],
        df_tt["y"],
    )
    mday_preds = mday_reg.predict(
        df_holdout[
            [
                "1",
                "2",
                "3",
                "4",
                "5",
                "6",
                "7",
                "8",
                "9",
                "10",
                "11",
                "Monday",
                "Tuesday",
                "Wednesday",
                "Thursday",
            ]
        ]
    )
    mday_rmses[i] = root_mean_squared_error(df_holdout["y"], mday_preds)

    # tday-linear model
    tday_reg.fit(
        df_tt[["index", "Monday", "Tuesday", "Wednesday", "Thursday"]],
        df_tt["y"],
    )
    tday_preds = tday_reg.predict(
        df_holdout[["index", "Monday", "Tuesday", "Wednesday", "Thursday"]]
    )
    tday_rmses[i] = root_mean_squared_error(df_holdout["y"], tday_preds)

    # gradient boosting
    gbrt.fit(df_tt[["weekday", "month"]], df_tt["y"])
    gbrt_preds = gbrt.predict(df_holdout[["weekday", "month"]])
    gbrt_rmses[i] = root_mean_squared_error(df_holdout["y"], gbrt_preds)

    # prophet
    prophet = Prophet()
    prophet.fit(df_tt)
    prophet_preds = prophet.predict(df_holdout[["ds"]])["yhat"]
    prophet_rmses[i] = root_mean_squared_error(df_holdout["y"], prophet_preds)
    del prophet

16:18:18 - cmdstanpy - INFO - Chain [1] start processing
16:18:18 - cmdstanpy - INFO - Chain [1] done processing
16:18:18 - cmdstanpy - INFO - Chain [1] start processing
16:18:18 - cmdstanpy - INFO - Chain [1] done processing
16:18:18 - cmdstanpy - INFO - Chain [1] start processing
16:18:18 - cmdstanpy - INFO - Chain [1] done processing
16:18:18 - cmdstanpy - INFO - Chain [1] start processing
16:18:18 - cmdstanpy - INFO - Chain [1] done processing
16:18:19 - cmdstanpy - INFO - Chain [1] start processing
16:18:19 - cmdstanpy - INFO - Chain [1] done processing


In [11]:
print("Dummy rmses:", dummy_rmses)
print("  - mean = ", dummy_rmses.mean())
print("Linear reg rmses:", t_rmses)
print("  - mean = ", t_rmses.mean())
print("Weekday reg rmses:", day_rmses)
print("  - mean = ", day_rmses.mean())
print("Month-weekday reg rmses:", mday_rmses)
print("  - mean = ", mday_rmses.mean())
print("Linear-weekday reg rmses:", tday_rmses)
print("  - mean = ", tday_rmses.mean())
print("Gradient boosting rmses:", gbrt_rmses)
print("  - mean = ", gbrt_rmses.mean())
print("Prophet rmses:", prophet_rmses)
print("  - mean = ", prophet_rmses.mean())

Dummy rmses: [ 66.39552361 118.07022317  54.14753549  69.10937418  66.50741011]
  - mean =  74.8460133146663
Linear reg rmses: [ 63.96316405 123.01287056  56.94975938  68.69509773  65.543826  ]
  - mean =  75.63294354560354
Weekday reg rmses: [38.40085358 84.14185113 23.20612467 30.6547948  29.04687782]
  - mean =  41.09010040000369
Month-weekday reg rmses: [38.3335876  93.19557312 50.6747524  31.57441748 25.39006715]
  - mean =  47.833679549633366
Linear-weekday reg rmses: [32.64566846 91.97123533 30.90189787 29.51103125 26.16828907]
  - mean =  42.23962439642128
Gradient boosting rmses: [35.55291677 90.24905791 44.90067874 31.53571178 28.67952289]
  - mean =  46.183577617612926
Prophet rmses: [30.61248243 99.7677942  23.39373717 33.06705749 31.86888653]
  - mean =  43.741991564849535


## Full FB Prophet

In [None]:
ts_cv = TimeSeriesSplit(n_splits=5, test_size=15)

splits = list(ts_cv.split(df_train))

prophet_full_rmses = np.zeros(5)

for i, (train_index, test_index) in enumerate(splits):
    # prepare train data
    df_tt = df_train.iloc[train_index, :]

    # prepare validation data
    df_holdout = df_train.iloc[test_index, :]

    # prophet
    prophet = Prophet()
    prophet.fit(df_tt)
    prophet_preds = prophet.predict(df_holdout[["ds"]])["yhat"]
    prophet_full_rmses[i] = root_mean_squared_error(df_holdout["y"], prophet_preds)
    del prophet

16:24:32 - cmdstanpy - INFO - Chain [1] start processing
16:24:33 - cmdstanpy - INFO - Chain [1] done processing
16:24:33 - cmdstanpy - INFO - Chain [1] start processing
16:24:34 - cmdstanpy - INFO - Chain [1] done processing
16:24:35 - cmdstanpy - INFO - Chain [1] start processing
16:24:35 - cmdstanpy - INFO - Chain [1] done processing
16:24:35 - cmdstanpy - INFO - Chain [1] start processing
16:24:36 - cmdstanpy - INFO - Chain [1] done processing
16:24:37 - cmdstanpy - INFO - Chain [1] start processing
16:24:38 - cmdstanpy - INFO - Chain [1] done processing


In [13]:
print("Dummy rmses:", dummy_rmses)
print("  - mean = ", dummy_rmses.mean())
print("Linear reg rmses:", t_rmses)
print("  - mean = ", t_rmses.mean())
print("Weekday reg rmses:", day_rmses)
print("  - mean = ", day_rmses.mean())
print("Month-weekday reg rmses:", mday_rmses)
print("  - mean = ", mday_rmses.mean())
print("Linear-weekday reg rmses:", tday_rmses)
print("  - mean = ", tday_rmses.mean())
print("Gradient boosting rmses:", gbrt_rmses)
print("  - mean = ", gbrt_rmses.mean())
print("Prophet rmses:", prophet_rmses)
print("  - mean = ", prophet_rmses.mean())
print("Full prophet rmses:", prophet_full_rmses)
print("  - mean = ", prophet_full_rmses.mean())

Dummy rmses: [ 66.39552361 118.07022317  54.14753549  69.10937418  66.50741011]
  - mean =  74.8460133146663
Linear reg rmses: [ 63.96316405 123.01287056  56.94975938  68.69509773  65.543826  ]
  - mean =  75.63294354560354
Weekday reg rmses: [38.40085358 84.14185113 23.20612467 30.6547948  29.04687782]
  - mean =  41.09010040000369
Month-weekday reg rmses: [38.3335876  93.19557312 50.6747524  31.57441748 25.39006715]
  - mean =  47.833679549633366
Linear-weekday reg rmses: [32.64566846 91.97123533 30.90189787 29.51103125 26.16828907]
  - mean =  42.23962439642128
Gradient boosting rmses: [35.55291677 90.24905791 44.90067874 31.53571178 28.67952289]
  - mean =  46.183577617612926
Prophet rmses: [30.61248243 99.7677942  23.39373717 33.06705749 31.86888653]
  - mean =  43.741991564849535
Full prophet rmses: [45.69470014 94.34975619 32.95994785 46.53997394 43.85623571]
  - mean =  52.68012276829133


It looks like the best model is still just regression on one-hot encoded weekdays.