# make_tables.ipynb
**Purpose:** Generate LaTeX tables used in the manuscript (e.g., tuned hyperparameters, feature rankings, thresholds).  
**Inputs:** Excel/CSV outputs from Stage 1 and Stage 2 tuning scripts.  
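**Outputs:** LaTeX `.txt` table files, plus an Excel workbook (`hyperparams.xlsx`) containing the pivoted hyperparameter tables (one sheet per subreddit).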

**Part of repository:** `3_Graphs_Tables`  
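**Reproducibility:** This notebook is deterministic given the provided model output files. Note that the input and output locations are absolute paths and must be adjusted to match your local copy of the repository before running.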


In [17]:
from pathlib import Path

import pandas as pd

In [18]:
# Maps each subreddit key (the folder/file name used in the input and output
# paths) to the display name that is printed in the table captions.
SUBREDDIT_LABELS = dict(
    conspiracy="r/Conspiracy",
    crypto="r/CryptoCurrency",
    politics="r/politics",
)

In [21]:
# For every subreddit: read the "all_hyperparams" sheet of its evaluation
# workbook, pivot it to one row per feature count, write the result as a LaTeX
# table to a .txt file, and keep the pivoted table in `hyperparam_dfs` so that
# later cells can reuse it.
hyperparam_dfs = {}
sheet_name = "all_hyperparams"

# Root folders are defined once so the notebook can be pointed at another
# location by editing two lines.
input_root = Path("/home/cara/Documents/reddit_analyses/thread-size/Outputs/1_thread_start")
output_dir = Path("/home/cara/Documents/reddit_analyses/thread-size/Publication_Outputs/1_Thread_Start/hyperparam_outputs")

for subreddit, label in SUBREDDIT_LABELS.items():
    eval_filepath = input_root / subreddit / "4_model" / "evaluation.xlsx"
    df = pd.read_excel(eval_filepath, sheet_name=sheet_name)

    # Rename positionally; this raises if the sheet does not have exactly
    # three columns.
    df.columns = ["Features", "Parameter", "Value"]

    # Forward-fill the feature count so every row of a block carries its
    # block identifier (presumably only the first row of each block is filled
    # in the sheet -- verify against the tuning script's output).
    df["Features"] = df["Features"].ffill().astype(int)

    # Wide format: one row per feature count, one column per hyperparameter.
    # `pivot` raises if a (Features, Parameter) pair occurs more than once.
    table = df.pivot(index="Features", columns="Parameter", values="Value")

    hyperparam_dfs[subreddit] = table.copy()

    # Move "Features" out of the index so it is printed as the first column.
    table = table.reset_index()

    latex_str = table.to_latex(
        index=False,            # don't print the row index
        header=True,
        float_format="%.4f",    # control float precision
        # One left-aligned column for "Features" and one right-aligned column
        # per hyperparameter, all separated by vertical rules. Derived from
        # the table width so it stays valid if the parameter set changes.
        column_format="|l|" + "r|" * (len(table.columns) - 1),
        # Column names are emitted verbatim (underscores are not escaped).
        # NOTE(review): raw underscores are invalid in LaTeX text mode --
        # confirm the names are escaped or edited before inclusion.
        escape=False,
    )

    # Draw a horizontal rule after every row and drop the booktabs rules.
    latex_str = latex_str.replace(r"\\", r"\\ \hline")
    for rule in (r"\toprule", r"\midrule", r"\bottomrule"):
        latex_str = latex_str.replace(rule, "")

    caption = (
        "Optimal LightGBM tree hyperparameters selected via cross-validated "
        f"Optuna/TPE search for each number of features for {label}. "
        "Values represent cross-fold aggregated hyperparameters, using the "
        "mode for integer parameters and the mean for continuous parameters. "
        "These configurations were used for the final thread start model "
        "evaluation."
    )

    # Use the display label (e.g. "r/Conspiracy") in the title, consistent
    # with the caption, rather than the raw folder key.
    title = f"{label} tuned thread start LightGBM hyperparameters by number of features."

    # The leading whitespace inside the raw triple-quoted strings is written
    # to the file verbatim, so those lines must keep their indentation.
    full_latex = (
        r"\paragraph*{S Table.}{\bf " + title + r"""}
    \label{S-Table.}
    \begin{table}[!ht]
        \centering""" + latex_str + r"""
    \end{table}
    """ + caption
    )

    # Save to file; create the output folder on first use and pin the
    # encoding so the result does not depend on the platform default.
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / f"{subreddit}.txt", "w", encoding="utf-8") as f:
        f.write(full_latex)

In [11]:
# Export every pivoted hyperparameter table collected in `hyperparam_dfs` to a
# single Excel workbook, one sheet per subreddit (sheet name = subreddit key).
outfile = "/home/cara/Documents/reddit_analyses/thread-size/Publication_Outputs/1_Thread_Start/hyperparam_outputs/hyperparams.xlsx"
with pd.ExcelWriter(outfile) as writer:
    for sheet, frame in hyperparam_dfs.items():
        frame.to_excel(writer, sheet_name=sheet)
