# make_tables.ipynb
**Purpose:** Generate LaTeX tables used in the manuscript (e.g., tuned hyperparameters, feature rankings, thresholds).  
**Inputs:** Excel/CSV outputs from Stage 1 and Stage 2 tuning scripts.  
**Outputs:** LaTeX `.txt` table files.

**Part of repository:** `3_Graphs_Tables`  
**Reproducibility:** This notebook is deterministic given the provided model output files.


In [1]:
import pandas as pd
import ast
import re
from make_outputs import format_label

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SUBREDDIT_LABELS = {
    "conspiracy": "r/Conspiracy",
    "crypto": "r/CryptoCurrency",
    "politics": "r/politics",
}

In [58]:
# want header - title - table_header - tabular - table_footer - caption - footer

supp_mat_table = {
    "header": r"\paragraph*{S Table.}{\bf ",
    "table_header":
r"""}
\label{S-Table.}
\begin{table}[!ht]
\centering""",
    "table_footer":
r"""
\end{table}
"""
    "footer":
r"""
\FloatBarrier
"""
}

SyntaxError: invalid syntax (1136838261.py, line 14)

In [59]:
def built_supp_mat_table(title, tabular, caption):
    tabular = tabular.replace(r"\\", r"\\ \hline")
    to_repl = [r'\toprule', r'\midrule', r'\bottomrule']
    for s in to_repl:
        tabular = tabular.replace(s, "")
    return supp_mat_table["header"] + title + supp_mat_table["table_header"] + tabular + supp_mat_table["table_footer"] + caption + supp_mat_table["footer"]

In [None]:
hyperparam_dfs = {}
sheet_name = "all_hyperparams"
for subreddit in SUBREDDIT_LABELS:
    eval_filepath = f"/home/cara/Documents/reddit_analyses/thread-size/Outputs/1_thread_start/{subreddit}/4_model/evaluation.xlsx"
    df = pd.read_excel(eval_filepath, sheet_name=sheet_name)
    
    # Ensure correct column names
    df.columns = ["Features", "Parameter", "Value"]

    # Forward-fill N_feats to propagate block identifiers
    df["Features"] = df["Features"].ffill().astype(int)

    # Pivot into the desired wide format
    table = df.pivot(index="Features", columns="Parameter", values="Value")

    hyperparam_dfs[subreddit] = table.copy()

    table = table.reset_index()  # if N_feats was the index

    latex_str = table.to_latex(
        index=False,                    # don’t print the row index
        header=True,
        float_format="%.4f",         # control float precision
        column_format="|l|r|r|r|r|r|r|r|r|",      # LaTeX alignment (1 left + 8 right)
        #caption=f"{SUBREDDIT_LABELS[subreddit]} tuned thread start LightGBM hyperparameters by number of features.",
        #label=f"tab:s1-{subreddit}-hyperparams",
        escape=True                    # so underscores in col names are not escaped weirdly
    )
    
    caption = f"Optimal LightGBM tree hyperparameters selected via cross-validated Optuna/TPE search for each number of features for {SUBREDDIT_LABELS[subreddit]}. Values represent cross-fold aggregated hyperparameters, using the mode for integer parameters and the mean for continuous parameters. These configurations were used for the final thread start model evaluation."
    title = f"{subreddit} tuned thread start LightGBM hyperparameters by number of features."
    full_latex = built_supp_mat_table(title, latex_str, caption)
    # Save to file
    with open(f"/home/cara/Documents/reddit_analyses/thread-size/Publication_Outputs/1_Thread_Start/hyperparam_outputs/{subreddit}.txt", "w") as f:
        f.write(full_latex)

outfile = f"/home/cara/Documents/reddit_analyses/thread-size/Publication_Outputs/1_Thread_Start/hyperparam_outputs/hyperparams.xlsx"
with pd.ExcelWriter(outfile) as writer:
    for sub, df in hyperparam_dfs.items():
        df.to_excel(writer, sheet_name=sub)


In [13]:
model_threshold_dfs = {}
sheet_name = "all_params"
for subreddit in SUBREDDIT_LABELS:
    eval_filepath = f"/home/cara/Documents/reddit_analyses/thread-size/Outputs/1_thread_start/{subreddit}/4_model/evaluation.xlsx"
    df = pd.read_excel(eval_filepath, sheet_name=sheet_name)
    
    # Ensure correct column names
    df.columns = ["Features", "param_index", "Parameter", "Value"]

    # Forward-fill N_feats to propagate block identifiers
    df["Features"] = df["Features"].ffill().astype(int)

    # Pivot into the desired wide format
    table = df.pivot(index="Features", columns="Parameter", values="Value")

    model_threshold_dfs[subreddit] = table.copy()[['model_threshold']]


In [16]:
pd.concat(model_threshold_dfs, axis=1).to_csv(f"/home/cara/Documents/reddit_analyses/thread-size/Publication_Outputs/1_Thread_Start/metrics/model_thresholds.xlsx")

In [None]:

outfile = f"/home/cara/Documents/reddit_analyses/thread-size/Publication_Outputs/1_Thread_Start/hyperparam_outputs/hyperparams.xlsx"
with pd.ExcelWriter(outfile) as writer:
    for sub, df in hyperparam_dfs.items():
        df.to_excel(writer, sheet_name=sub)

In [13]:
s2_cw_dfs = {}
sheet_name = "params"
for subreddit in SUBREDDIT_LABELS:
    eval_filepath = f"/home/cara/Documents/reddit_analyses/thread-size/Outputs/2_thread_size/{subreddit}/2_tuning/tuning_outputs.xlsx"
    df = pd.read_excel(eval_filepath, sheet_name=sheet_name)
    
    # Ensure correct column names
    df = df[['n_feats', 'final_class_weights']].rename(columns={
        "final_class_weights": "cws"
    })

    s2_cw_dfs[subreddit] = df

In [23]:
def parse_cw_string(s):
    cleaned = re.sub(r"np\.float64\(([^)]+)\)", r"\1", s)
    return ast.literal_eval(cleaned)

In [36]:
CLASS_NAMES = ["Stalled", "Small", "Medium", "Large"]

In [38]:
outfile = f"/home/cara/Documents/reddit_analyses/thread-size/Publication_Outputs/2_Thread_Size/tuning_outputs/class_weights.xlsx"

with pd.ExcelWriter(outfile) as writer:
    for sub, df in s2_cw_dfs.items():
        expanded = pd.json_normalize(df["cws"].apply(parse_cw_string))
        df = df.join(expanded)[["n_feats", 0,1,2,3]]
        ratios = df[['n_feats']].copy()
        for col in [0,1,2,3]:
            ratios[CLASS_NAMES[col]] = df[col]/df[0]
        ratios.to_excel(writer, index=False, sheet_name=sub)
        


In [37]:
def round_dec(val, d=2):
    return round(val, d)

In [40]:
s2_feat_dfs = {}
sheet_name = "feature_importances"
for subreddit in SUBREDDIT_LABELS:
    eval_filepath = f"/home/cara/Documents/reddit_analyses/thread-size/Outputs/2_thread_size/{subreddit}/2_tuning/tuning_outputs.xlsx"
    df = pd.read_excel(eval_filepath, sheet_name=sheet_name)
    
    # Ensure correct column names
    df = df[['feature', 'mean_importance', 'mean_split', 'mean_gain']].rename(columns={
        'feature': 'Feature',
        'mean_importance': 'Scaled',
        'mean_split': 'Split',
        'mean_gain': 'Gain'
    })
    df['Feature'] = df['Feature'].apply(format_label)
    df['Scaled'] = df["Scaled"].apply(round_dec, d=4)
    df['Split'] = df["Split"].apply(round_dec, d=0)
    df['Gain'] = df["Gain"].apply(round_dec, d=0)

    s2_feat_dfs[subreddit] = df

outfile = f"/home/cara/Documents/reddit_analyses/thread-size/Publication_Outputs/2_Thread_Size/tuning_outputs/feature_importances.xlsx"

with pd.ExcelWriter(outfile) as writer:
    for sub, df in s2_feat_dfs.items():
        df.to_excel(writer, sheet_name=sub)

In [54]:
col_digits = {'colsample_bytree': 3, 'learning_rate': 3, 'max_depth':0, 'min_child_samples':0,
       'num_leaves':0, 'reg_alpha':3, 'reg_lambda':3, 'subsample':3}
int_cols = ['Features', 'max_depth', 'min_child_samples', 'num_leaves']

In [57]:
hyperparam_dfs = {}
sheet_name = "hyperparams"
for subreddit in SUBREDDIT_LABELS:
    eval_filepath = f"/home/cara/Documents/reddit_analyses/thread-size/Outputs/2_thread_size/{subreddit}/4_model/evaluation.xlsx"
    df = pd.read_excel(eval_filepath, sheet_name=sheet_name)
    # Ensure correct column names
    df.columns = ["Parameter", "Value", "Features"]
    # Forward-fill N_feats to propagate block identifiers
    df["Features"] = df["Features"].ffill().astype(int)

    # Pivot into the desired wide format
    table = df.pivot(index="Features", columns="Parameter", values="Value")
    for col in table.columns:
        table[col] = table[col].apply(round_dec, d=col_digits[col])
    for col in [x for x in table.columns if x in int_cols]:
        table[col] = table[col].astype(int)
    hyperparam_dfs[subreddit] = table.copy()

    table = table.reset_index()  # if N_feats was the index
    

    latex_str = table.to_latex(
        index=False,                    # don’t print the row index
        header=True,
        float_format="%.3f",
        column_format="|l|r|r|r|r|r|r|r|r|",      # LaTeX alignment (1 left + 8 right)
        #caption=f"{SUBREDDIT_LABELS[subreddit]} tuned thread start LightGBM hyperparameters by number of features.",
        #label=f"tab:s1-{subreddit}-hyperparams",
        escape=True                    # so underscores in col names are not escaped weirdly
    )
    
    caption = f"Optimal LightGBM tree hyperparameters selected via cross-validated Optuna/TPE search for each number of features for {SUBREDDIT_LABELS[subreddit]}. Values represent cross-fold aggregated hyperparameters, using the mode for integer parameters and the mean for continuous parameters. These configurations were used for the final thread size model evaluation."
    title = f"{SUBREDDIT_LABELS[subreddit]} tuned thread size LightGBM hyperparameters by number of features."
    full_latex = built_supp_mat_table(title, latex_str, caption)
    # Save to file
    with open(f"/home/cara/Documents/reddit_analyses/thread-size/Publication_Outputs/2_Thread_Size/tuning_outputs/{subreddit}_hparams.txt", "w") as f:
        f.write(full_latex)